diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
index c8db951381b0bd8b4c36ffe0b97c2155aea5c52b..0745da8dc418d478d84df9c45978f5da19152f6c 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.9
+#   pip install "lm-eval[api]>=0.4.9.2"
 
 usage() {
     echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
index 897f84d1e360de11ceb10d77baf0ff9f8453cdfd..5c17a06245bcf6277decc55bb3236fd2e618eb34 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"
 
 usage() {
     echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 792f355c47a5178801b2624f1a9e06c69707f0ce..1b617ff17c41c3f7e2b4e13aed8ad9b0938fa2e8 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"
 
 usage() {
     echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
index d85a1721db9a59d46ab9a7fdaf52b68c8dc13186..12336d7f85bc918cd5776d82fffeca518f474180 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"
 
 usage() {
     echo``
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index f94d681197d2d5e0fce5d1de23cf47840309c78b..a22abe73e39f72abdab84e51a38324b696ef7cf0 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -60,6 +60,7 @@ def launch_lm_eval(eval_config, tp_size):
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
+        "allow_deprecated_quantization=True,"
     )
 
     env_vars = eval_config.get("env_vars", None)
diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
index 015f48c2520d60dee4782b0a1bde445cd0ed76fa..289877e504bbda8cd4fba7b2fb4b32ae50c19977 100644
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 
 Runtime environment variables:
 
-- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
@@ -34,8 +34,9 @@ Runtime environment variables:
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
-For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
->
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
+> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
+
 ### Latency test
 
 Here is an example of one test inside `latency-tests.json`:
@@ -175,19 +176,6 @@ If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
 
-The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
-When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
-If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
-
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-
-|   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
-|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
-| 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982                             | 156.526018                             | 1.097396 |
-| 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334                             | 294.018783                             | 1.216863 |
+#### Performance Results Comparison  
 
-A comparison diagram will be generated below the table.
-Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
-<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
+Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
index c8bf7b0453662d71dff5a6be0f48d2ceb63785e3..b3d0a2d3bbce0b6804a4fdd0ac177628bd860ebb 100644
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -1,8 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
 import argparse
+import html as _html
 import json
 import os
+from dataclasses import dataclass
 from importlib import util
 
 import pandas as pd
@@ -10,27 +15,49 @@ import pandas as pd
 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
 
-
+DEFAULT_INFO_COLS = [
+    "Model",
+    "Dataset Name",
+    "Input Len",
+    "Output Len",
+    #    "TP Size",
+    #    "PP Size",
+    "# of max concurrency.",
+    "qps",
+]
+
+# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
+pd.set_option("display.precision", 2)
+pd.set_option("display.float_format", lambda x: f"{x:.2f}")
+
+
+# -----------------------------
+# Core data compare
+# -----------------------------
 def compare_data_columns(
-    files, name_column, data_column, info_cols, drop_column, debug=False
+    files: list[str],
+    name_column: str,
+    data_column: str,
+    info_cols: list[str],
+    drop_column: str,
+    debug: bool = False,
 ):
     """
     Align concatenation by keys derived from info_cols instead of row order.
     - Pick one canonical key list: subset of info_cols present in ALL files.
     - For each file: set index to those keys, aggregate duplicates
-    - (mean for metric, first for names).
+      (mean for metric, first for names).
     - Concat along axis=1 (indexes align), then reset_index so callers can
-    - group by columns.
+      group by columns.
     - If --debug, add a <file_label>_name column per file.
     """
     print("\ncompare_data_column:", data_column)
 
     frames = []
-    raw_data_cols = []
+    raw_data_cols: list[str] = []
     compare_frames = []
 
-    # 1) choose a canonical key list from info_cols that exists in ALL files
-    cols_per_file = []
+    cols_per_file: list[set] = []
     for f in files:
         try:
             df_tmp = pd.read_json(f, orient="records")
@@ -40,24 +67,20 @@ def compare_data_columns(
 
     key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
     if not key_cols:
-        # soft fallback: use any info_cols present in the first file
         key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
     if not key_cols:
         raise ValueError(
             "No common key columns found from info_cols across the input files."
         )
 
-    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
     meta_added = False
 
     for file in files:
         df = pd.read_json(file, orient="records")
 
-        # Keep rows that actually have the compared metric (same as original behavior)
         if drop_column in df.columns:
             df = df.dropna(subset=[drop_column], ignore_index=True)
 
-        # Stabilize numeric key columns (harmless if missing)
         for c in (
             "Input Len",
             "Output Len",
@@ -69,32 +92,26 @@ def compare_data_columns(
             if c in df.columns:
                 df[c] = pd.to_numeric(df[c], errors="coerce")
 
-        # Ensure all key columns exist
         for c in key_cols:
             if c not in df.columns:
                 df[c] = pd.NA
 
-        # Set index = key_cols and aggregate duplicates → unique MultiIndex
         df_idx = df.set_index(key_cols, drop=False)
 
-        # meta (key columns), unique per key
         meta = df_idx[key_cols]
         if not meta.index.is_unique:
             meta = meta.groupby(level=key_cols, dropna=False).first()
 
-        # metric series for this file, aggregated to one row per key
         file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
         s = df_idx[data_column]
         if not s.index.is_unique:
             s = s.groupby(level=key_cols, dropna=False).mean()
-        s.name = file_label  # column label like original
+        s.name = file_label
 
-        # add meta once (from first file) so keys are the leftmost columns
         if not meta_added:
             frames.append(meta)
             meta_added = True
 
-        # (NEW) debug: aligned test-name column per file
         if debug and name_column in df_idx.columns:
             name_s = df_idx[name_column]
             if not name_s.index.is_unique:
@@ -106,26 +123,19 @@ def compare_data_columns(
         raw_data_cols.append(file_label)
         compare_frames.append(s)
 
-        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
         if len(compare_frames) >= 2:
             base = compare_frames[0]
             current = compare_frames[-1]
             if "P99" in data_column or "Median" in data_column:
-                ratio = base / current  # for latency
+                ratio = base / current
             else:
                 ratio = current / base
-            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
+            ratio = ratio.mask(base == 0)
             ratio.name = f"Ratio 1 vs {len(compare_frames)}"
             frames.append(ratio)
 
-    # 4) concat on columns with aligned MultiIndex;
-    # then reset_index to return keys as columns
-    concat_df = pd.concat(frames, axis=1)
-    concat_df = concat_df.reset_index(drop=True).reset_index()
-    if "index" in concat_df.columns:
-        concat_df = concat_df.drop(columns=["index"])
+    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
 
-    # Ensure key/info columns appear first (in your info_cols order)
     front = [c for c in info_cols if c in concat_df.columns]
     rest = [c for c in concat_df.columns if c not in front]
     concat_df = concat_df[front + rest]
@@ -134,20 +144,15 @@ def compare_data_columns(
     return concat_df, raw_data_cols
 
 
+# -----------------------------
+# Split helper
+# -----------------------------
 def split_json_by_tp_pp(
     input_file: str = "benchmark_results.json", output_root: str = "."
 ) -> list[str]:
-    """
-    Split a benchmark JSON into separate folders by (TP Size, PP Size).
-
-    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
-    Returns: list of file paths written.
-    """
-    # Load JSON data into DataFrame
     with open(input_file, encoding="utf-8") as f:
         data = json.load(f)
 
-    # If the JSON is a dict with a list under common keys, use that list
     if isinstance(data, dict):
         for key in ("results", "serving_results", "benchmarks", "data"):
             if isinstance(data.get(key), list):
@@ -156,7 +161,6 @@ def split_json_by_tp_pp(
 
     df = pd.DataFrame(data)
 
-    # Keep only "serving" tests
     name_col = next(
         (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
     )
@@ -165,7 +169,6 @@ def split_json_by_tp_pp(
             df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
         ].copy()
 
-    # Handle alias column names
     rename_map = {
         "tp_size": "TP Size",
         "tensor_parallel_size": "TP Size",
@@ -176,21 +179,14 @@ def split_json_by_tp_pp(
         columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
     )
 
-    # Ensure TP/PP columns exist (default to 1 if missing)
     if "TP Size" not in df.columns:
         df["TP Size"] = 1
     if "PP Size" not in df.columns:
         df["PP Size"] = 1
 
-    # make sure TP/PP are numeric ints with no NaN
-    df["TP Size"] = (
-        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-    df["PP Size"] = (
-        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
+    df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
+    df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)
 
-    # Split into separate folders
     saved_paths: list[str] = []
     for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
         folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
@@ -203,32 +199,9 @@ def split_json_by_tp_pp(
     return saved_paths
 
 
-def _add_limit_line(fig, y_value, label):
-    # Visible dashed line + annotation
-    fig.add_hline(
-        y=y_value,
-        line_dash="dash",
-        line_color="red" if "ttft" in label.lower() else "blue",
-        annotation_text=f"{label}: {y_value} ms",
-        annotation_position="top left",
-    )
-    # Optional: add a legend item (as a transparent helper trace)
-    if plot and plotly_found:
-        import plotly.graph_objects as go
-
-        fig.add_trace(
-            go.Scatter(
-                x=[None],
-                y=[None],
-                mode="lines",
-                line=dict(
-                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
-                ),
-                name=f"{label}",
-            )
-        )
-
-
+# -----------------------------
+# Styling helpers
+# -----------------------------
 def _find_concurrency_col(df: pd.DataFrame) -> str:
     for c in [
         "# of max concurrency.",
@@ -239,7 +212,6 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
     ]:
         if c in df.columns:
             return c
-    # Fallback: guess an integer-like column (harmless if unused)
     for c in df.columns:
         if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
             return c
@@ -248,8 +220,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
 
 def _highlight_threshold(
     df: pd.DataFrame, threshold: float
-) -> "pd.io.formats.style.Styler":
-    """Highlight numeric per-configuration columns with value <= threshold."""
+) -> pd.io.formats.style.Styler:
     conc_col = _find_concurrency_col(df)
     key_cols = [
         c
@@ -260,6 +231,7 @@ def _highlight_threshold(
         c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
     ]
     conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+
     return df.style.map(
         lambda v: "background-color:#e6ffe6;font-weight:bold;"
         if pd.notna(v) and v <= threshold
@@ -268,7 +240,264 @@ def _highlight_threshold(
     )
 
 
-if __name__ == "__main__":
+def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
+    ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
+    if not ratio_cols:
+        return styler
+
+    styler = styler.apply(
+        lambda _: ["background-color: #fff3b0"] * len(styler.data),
+        subset=ratio_cols,
+        axis=0,
+    )
+
+    styler = styler.set_table_styles(
+        [
+            {
+                "selector": f"th.col_heading.level0.col{i}",
+                "props": [("background-color", "#fff3b0")],
+            }
+            for i, col in enumerate(styler.data.columns)
+            if col in ratio_cols
+        ],
+        overwrite=False,
+    )
+    return styler
+
+
+def _apply_two_decimals(
+    styler: pd.io.formats.style.Styler,
+) -> pd.io.formats.style.Styler:
+    df = styler.data
+    num_cols = df.select_dtypes("number").columns
+    if len(num_cols) == 0:
+        return styler
+    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
+
+
+# -----------------------------
+# Valid max concurrency summary helpers
+# -----------------------------
+def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
+        if c in df.columns
+    ]
+    exclude = set(key_cols + [conc_col, "qps", "QPS"])
+
+    cols: list[str] = []
+    for c in df.columns:
+        if c in exclude:
+            continue
+        lc = str(c).lower()
+        if lc.startswith("ratio"):
+            continue
+        if lc.endswith("_name") or lc == "test name" or lc == "test_name":
+            continue
+        if pd.api.types.is_numeric_dtype(df[c]):
+            cols.append(c)
+    return cols
+
+
+def _max_concurrency_ok(
+    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
+):
+    if df is None or conc_col not in df.columns or cfg_col not in df.columns:
+        return pd.NA
+
+    d = df[[conc_col, cfg_col]].copy()
+    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+    d = d.dropna(subset=[conc_col, cfg_col])
+
+    if d.empty:
+        return pd.NA
+
+    ok = d[d[cfg_col] <= threshold]
+    if ok.empty:
+        return pd.NA
+
+    return ok[conc_col].max()
+
+
+def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
+    if (
+        df is None
+        or conc_col not in df.columns
+        or cfg_col not in df.columns
+        or pd.isna(conc_value)
+    ):
+        return pd.NA
+
+    d = df[[conc_col, cfg_col]].copy()
+    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+
+    conc_value = pd.to_numeric(conc_value, errors="coerce")
+    if pd.isna(conc_value):
+        return pd.NA
+
+    hit = d[d[conc_col] == conc_value]
+    if hit.empty:
+        return pd.NA
+    return hit[cfg_col].iloc[0]
+
+
+def build_valid_max_concurrency_summary_html(
+    tput_group_df: pd.DataFrame | None,
+    ttft_group_df: pd.DataFrame | None,
+    tpot_group_df: pd.DataFrame | None,
+    conc_col: str,
+    args,
+) -> str:
+    if ttft_group_df is None and tpot_group_df is None:
+        return ""
+
+    ttft_cols = (
+        _config_value_columns(ttft_group_df, conc_col)
+        if ttft_group_df is not None
+        else []
+    )
+    tpot_cols = (
+        _config_value_columns(tpot_group_df, conc_col)
+        if tpot_group_df is not None
+        else []
+    )
+    tput_cols = (
+        _config_value_columns(tput_group_df, conc_col)
+        if tput_group_df is not None
+        else []
+    )
+
+    if ttft_group_df is not None and tpot_group_df is not None:
+        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
+        if tput_group_df is not None:
+            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
+    else:
+        cfg_cols = ttft_cols or tpot_cols
+
+    if not cfg_cols:
+        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
+
+    rows = []
+    for cfg in cfg_cols:
+        ttft_max = (
+            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            if ttft_group_df is not None
+            else pd.NA
+        )
+        tpot_max = (
+            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            if tpot_group_df is not None
+            else pd.NA
+        )
+        both = (
+            pd.NA
+            if (pd.isna(ttft_max) or pd.isna(tpot_max))
+            else min(ttft_max, tpot_max)
+        )
+
+        tput_at_both = (
+            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
+            if tput_group_df is not None
+            else pd.NA
+        )
+        ttft_at_both = (
+            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
+            if ttft_group_df is not None
+            else pd.NA
+        )
+        tpot_at_both = (
+            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
+            if tpot_group_df is not None
+            else pd.NA
+        )
+
+        rows.append(
+            {
+                "Configuration": cfg,
+                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (Both)": both,
+                "Output Tput @ Both (tok/s)": tput_at_both,
+                "TTFT @ Both (ms)": ttft_at_both,
+                "TPOT @ Both (ms)": tpot_at_both,
+            }
+        )
+
+    summary_df = pd.DataFrame(rows)
+
+    # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
+    for c in summary_df.columns:
+        if c == "Configuration":
+            continue
+        summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
+
+    both_col = f"Max {conc_col} (Both)"
+
+    # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
+    formatters = {}
+    for c in summary_df.columns:
+        if c == "Configuration":
+            continue
+        # default argument binds per-column formatter correctly
+        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
+
+    styler = summary_df.style.format(formatters)
+
+    def _green(v):
+        return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
+
+    if both_col in summary_df.columns:
+        styler = styler.map(_green, subset=[both_col])
+
+    title = (
+        '<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
+        "Valid Max Concurrency Summary"
+        "</div>\n"
+    )
+    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+# -----------------------------
+# Plot helper
+# -----------------------------
+def _add_limit_line(fig, y_value: float, label: str):
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    if plotly_found:
+        import plotly.graph_objects as go
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash",
+                    color="red" if "ttft" in label.lower() else "blue",
+                ),
+                name=label,
+            )
+        )
+
+
+# -----------------------------
+# Refactored main + group-first report
+# -----------------------------
+@dataclass(frozen=True)
+class MetricPlan:
+    data_cols: list[str]
+    drop_column: str
+
+
+def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-f", "--file", action="append", type=str, help="input file name"
@@ -308,149 +537,289 @@ if __name__ == "__main__":
         default=100.0,
         help="Reference limit for TPOT plots (ms)",
     )
+    return parser
 
-    args = parser.parse_args()
 
+def choose_metrics(latency: str) -> MetricPlan:
+    latency = (latency or "").lower()
     drop_column = "P99"
-    name_column = "Test name"
-    info_cols = [
-        "Model",
-        "Dataset Name",
-        "Input Len",
-        "Output Len",
-        "TP Size",
-        "PP Size",
-        "# of max concurrency.",
-        "qps",
-    ]
 
-    if "median" in args.latency:
-        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-        html_msgs_for_data_cols = [
-            "Compare Output Tokens /n",
-            "Median TTFT /n",
-            "Median TPOT /n",
-        ]
-        drop_column = "P99"
-    elif "p99" in args.latency:
-        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
-        html_msgs_for_data_cols = [
-            "Compare Output Tokens /n",
-            "P99 TTFT /n",
-            "P99 TPOT /n",
-        ]
+    if "median" in latency:
+        return MetricPlan(
+            data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
+            drop_column=drop_column,
+        )
+
+    return MetricPlan(
+        data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
+        drop_column=drop_column,
+    )
+
+
+def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
+    if not args.file:
+        raise ValueError("No input files provided. Use -f/--file.")
 
     if len(args.file) == 1:
         files = split_json_by_tp_pp(args.file[0], output_root="splits")
         info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
     else:
         files = args.file
+
+    return files, info_cols
+
+
+def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
+    y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
+    return info_cols[y_axis_index]
+
+
+def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
+    filtered_info_cols = info_cols[:4]
+    group_cols = [c for c in filtered_info_cols if c in output_df.columns]
+    if not group_cols:
+        raise ValueError(
+            f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
+            f"but DataFrame has: {list(output_df.columns)}"
+        )
+    return group_cols
+
+
+def normalize_group_key(name):
+    return name if isinstance(name, tuple) else (name,)
+
+
+def group_filename(name, prefix: str = "perf_comparison_") -> str:
+    name_vals = normalize_group_key(name)
+    safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
+    return f"{prefix}{safe}.html"
+
+
+def build_group_suffix(group_cols: list[str], name) -> str:
+    name_vals = normalize_group_key(name)
+    return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
+
+
+def render_metric_table_html(
+    display_group: pd.DataFrame,
+    metric_label: str,
+    group_suffix: str,
+    args,
+) -> str:
+    title = (
+        f'<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
+        f"{_html.escape(metric_label)}"
+        f" — {_html.escape(group_suffix)}"
+        f"</div>\n"
+    )
+
+    metric_name = metric_label.lower()
+    if "ttft" in metric_name:
+        styler = _highlight_threshold(display_group, args.ttft_max_ms)
+    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+        styler = _highlight_threshold(display_group, args.tpot_max_ms)
+    else:
+        styler = display_group.style
+
+    styler = _apply_two_decimals(styler)
+    styler = highlight_ratio_columns(styler)
+
+    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+def maybe_write_plot(
+    main_fh,
+    sub_fh,
+    group_df: pd.DataFrame,
+    raw_data_cols: list[str],
+    metric_label: str,
+    y_axis_col: str,
+    args,
+):
+    if not (args.plot and plotly_found):
+        return
+
+    import plotly.express as px
+
+    df = group_df[raw_data_cols].sort_values(by=y_axis_col)
+    df_melted = df.melt(
+        id_vars=y_axis_col,
+        var_name="Configuration",
+        value_name=metric_label,
+    )
+
+    fig = px.line(
+        df_melted,
+        x=y_axis_col,
+        y=metric_label,
+        color="Configuration",
+        title=f"{metric_label} vs {y_axis_col}",
+        markers=True,
+    )
+
+    # Ensure plot hover + y tick labels are also 2 decimals.
+    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
+    fig.update_yaxes(tickformat=".2f")
+
+    metric_name = metric_label.lower()
+    if "ttft" in metric_name:
+        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+    html = fig.to_html(full_html=True, include_plotlyjs="cdn")
+    main_fh.write(html)
+    sub_fh.write(html)
+
+
+def build_group_keys(
+    df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
+):
+    if sort_cols:
+        df = df.sort_values(by=sort_cols)
+    gb = df.groupby(group_cols, dropna=False)
+    return [k for k, _ in gb]
+
+
+def write_report_group_first(
+    files: list[str], info_cols: list[str], plan: MetricPlan, args
+):
+    name_column = "Test name"
+    y_axis_col = get_y_axis_col(info_cols, args.xaxis)
+
     print("comparing : " + ", ".join(files))
-    debug = args.debug
-    plot = args.plot
-    # For Plot feature, assign y axis from one of info_cols
-    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
-    with open("perf_comparison.html", "w") as text_file:
-        for i in range(len(data_cols_to_compare)):
-            output_df, raw_data_cols = compare_data_columns(
-                files,
-                name_column,
-                data_cols_to_compare[i],
-                info_cols,
-                drop_column,
-                debug=debug,
+
+    metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
+    group_cols_canonical: list[str] | None = None
+
+    for metric_label in plan.data_cols:
+        output_df, raw_data_cols = compare_data_columns(
+            files,
+            name_column,
+            metric_label,
+            info_cols,
+            plan.drop_column,
+            debug=args.debug,
+        )
+
+        raw_data_cols = list(raw_data_cols)
+        raw_data_cols.insert(0, y_axis_col)
+
+        group_cols = get_group_cols(output_df, info_cols)
+        if group_cols_canonical is None:
+            group_cols_canonical = group_cols
+        else:
+            group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
+
+        metric_cache[metric_label] = (
+            output_df.sort_values(by=args.xaxis),
+            raw_data_cols,
+        )
+
+    if not group_cols_canonical:
+        raise ValueError("No canonical group columns found across metrics.")
+
+    first_metric = plan.data_cols[0]
+    first_df_sorted, _ = metric_cache[first_metric]
+    group_keys = build_group_keys(
+        first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
+    )
+
+    metric_groupbys = {
+        metric_label: df.groupby(group_cols_canonical, dropna=False)
+        for metric_label, (df, _) in metric_cache.items()
+    }
+
+    with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+        main_fh.write('<meta charset="utf-8">\n')
+        for gkey in group_keys:
+            gkey_tuple = normalize_group_key(gkey)
+            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+            sub_path = group_filename(gkey_tuple)
+            group_header = (
+                '<div style="font-size: 1.4em; font-weight: 700; '
+                'margin: 18px 0 10px 0;">'
+                f"{_html.escape(suffix)}"
+                "</div>\n"
             )
 
-            # For Plot feature, insert y axis from one of info_cols
-            raw_data_cols.insert(0, info_cols[y_axis_index])
-
-            filtered_info_cols = info_cols[:-2]
-            existing_group_cols = [
-                c for c in filtered_info_cols if c in output_df.columns
-            ]
-            if not existing_group_cols:
-                raise ValueError(
-                    f"No valid group-by columns  "
-                    f"Expected subset: {filtered_info_cols}, "
-                    f"but DataFrame has: {list(output_df.columns)}"
-                )
-            # output_df_sorted = output_df.sort_values(by=existing_group_cols)
-            output_df_sorted = output_df.sort_values(by=args.xaxis)
-            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
-            for name, group in output_groups:
-                group_name = (
-                    ",".join(map(str, name)).replace(",", "_").replace("/", "-")
-                )
-                group_html_name = "perf_comparison_" + group_name + ".html"
-
-                metric_name = str(data_cols_to_compare[i]).lower()
-                if "tok/s" in metric_name:
-                    html = group.to_html()
-                elif "ttft" in metric_name:
-                    styler = _highlight_threshold(group, args.ttft_max_ms).format(
-                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
-                        na_rep="—",
-                    )
-                    html = styler.to_html(
-                        table_attributes='border="1" class="dataframe"'
+            main_fh.write(group_header)
+            with open(sub_path, "w", encoding="utf-8") as sub_fh:
+                sub_fh.write('<meta charset="utf-8">\n')
+                sub_fh.write(group_header)
+                tput_group_df = None
+                ttft_group_df = None
+                tpot_group_df = None
+                conc_col = args.xaxis
+
+                for metric_label in plan.data_cols:
+                    gb = metric_groupbys[metric_label]
+                    df_sorted, raw_data_cols = metric_cache[metric_label]
+
+                    try:
+                        group_df = gb.get_group(gkey)
+                    except KeyError:
+                        missing = (
+                            '<div style="font-size: 1.1em; font-weight: 600; '
+                            'margin: 10px 0;">'
+                            f"{_html.escape(metric_label)} — missing for this group"
+                            "</div>\n"
+                        )
+
+                        main_fh.write(missing)
+                        sub_fh.write(missing)
+                        continue
+
+                    if conc_col not in group_df.columns:
+                        conc_col = _find_concurrency_col(group_df)
+
+                    mn = metric_label.lower().strip()
+                    if "tok/s" in mn:
+                        tput_group_df = group_df
+                    elif "ttft" in mn:
+                        ttft_group_df = group_df
+                    elif mn in ("p99", "median") or "tpot" in mn:
+                        tpot_group_df = group_df
+
+                    display_group = group_df.drop(
+                        columns=group_cols_canonical, errors="ignore"
                     )
-                elif (
-                    "tpot" in metric_name
-                    or "median" in metric_name
-                    or "p99" in metric_name
-                ):
-                    styler = _highlight_threshold(group, args.tpot_max_ms).format(
-                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
-                        na_rep="—",
+
+                    html = render_metric_table_html(
+                        display_group, metric_label, suffix, args
                     )
-                    html = styler.to_html(
-                        table_attributes='border="1" class="dataframe"'
+                    main_fh.write(html)
+                    sub_fh.write(html)
+
+                    maybe_write_plot(
+                        main_fh,
+                        sub_fh,
+                        group_df=group_df,
+                        raw_data_cols=raw_data_cols,
+                        metric_label=metric_label,
+                        y_axis_col=y_axis_col,
+                        args=args,
                     )
 
-                text_file.write(html_msgs_for_data_cols[i])
-                text_file.write(html)
-                with open(group_html_name, "a+") as sub_text_file:
-                    sub_text_file.write(html_msgs_for_data_cols[i])
-                    sub_text_file.write(html)
-
-                    if plot and plotly_found:
-                        import plotly.express as px
-
-                        df = group[raw_data_cols]
-                        df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                        # Melt DataFrame for plotting
-                        df_melted = df_sorted.melt(
-                            id_vars=info_cols[y_axis_index],
-                            var_name="Configuration",
-                            value_name=data_cols_to_compare[i],
-                        )
-                        title = (
-                            data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                        )
-                        # Create Plotly line chart
-                        fig = px.line(
-                            df_melted,
-                            x=info_cols[y_axis_index],
-                            y=data_cols_to_compare[i],
-                            color="Configuration",
-                            title=title,
-                            markers=True,
-                        )
+                summary_html = build_valid_max_concurrency_summary_html(
+                    tput_group_df=tput_group_df,
+                    ttft_group_df=ttft_group_df,
+                    tpot_group_df=tpot_group_df,
+                    conc_col=conc_col,
+                    args=args,
+                )
+                if summary_html:
+                    main_fh.write(summary_html)
+                    sub_fh.write(summary_html)
 
-                        # ---- Add threshold lines based on metric name ----
-                        if "ttft" in metric_name:
-                            _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
-                        elif (
-                            "tpot" in metric_name
-                            or "median" in metric_name
-                            or "p99" in metric_name
-                        ):
-                            _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
-
-                        # Export to HTML
-                        text_file.write(
-                            fig.to_html(full_html=True, include_plotlyjs="cdn")
-                        )
-                        sub_text_file.write(
-                            fig.to_html(full_html=True, include_plotlyjs="cdn")
-                        )
+
+def main():
+    args = build_parser().parse_args()
+    info_cols = list(DEFAULT_INFO_COLS)
+    plan = choose_metrics(args.latency)
+    files, info_cols = prepare_input_files(args, info_cols)
+    write_report_group_first(files, info_cols, plan, args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
old mode 100644
new mode 100755
index 34ceefe0996f29c4d9b8957257645bfc9d849b65..6b6a7e472b9c8658f3e79135030f2c5a604ee0b4
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -49,7 +49,11 @@ check_cpus() {
     echo "Need at least 1 NUMA to run benchmarking."
     exit 1
   fi
-  declare -g gpu_type="cpu"
+  if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
+    declare -g gpu_type="arm64-cpu"
+  else
+    declare -g gpu_type="cpu"
+  fi
   echo "GPU type is $gpu_type"
 }
 
@@ -207,8 +211,8 @@ run_latency_tests() {
 
     # check if there is enough GPU to run the test
     tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
       world_size=$(($tp*$pp))
       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -276,8 +280,8 @@ run_throughput_tests() {
 
     # check if there is enough GPU to run the test
     tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
       world_size=$(($tp*$pp))
       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -393,8 +397,8 @@ run_serving_tests() {
 
     # check if there is enough resources to run the test
     tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
       world_size=$(($tp*$pp))
       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -496,9 +500,9 @@ run_serving_tests() {
 main() {
   local ARCH
   ARCH=''
-  if [ "$ON_CPU" == "1" ];then
-     check_cpus
-     ARCH='-cpu'
+  if [[ "$ON_CPU" == "1" ]]; then
+    check_cpus
+    ARCH="-$gpu_type"
   else
      check_gpus
      ARCH="$arch_suffix"
diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
new file mode 100644
index 0000000000000000000000000000000000000000..fba695041e3eef5a40503f8a9e9abf1bcb150595
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
@@ -0,0 +1,26 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
new file mode 100644
index 0000000000000000000000000000000000000000..63f1f8ab887b34e3a5cb8752fc4da4af120c4389
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
@@ -0,0 +1,130 @@
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [
+      12,
+      16,
+      24,
+      32,
+      64,
+      128,
+      200
+    ],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index 8f7200862d20cb1d0d2a8a3e793143479668d44b..25ed7415ec0e48b65e19123493aff4a9977a2296 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -19,10 +19,8 @@
       "block_size": 128,
       "trust_remote_code": "",
       "disable_log_stats": "",
-      "enforce_eager": "",
       "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256,
-      "load_format": "dummy"
+      "max_num_seqs": 256
     },
     "client_parameters": {
       "model": "meta-llama/Llama-3.1-8B-Instruct",
@@ -151,6 +149,45 @@
         "random-output-len": 128
       }
     },
+    {
+      "test_name": "serving_llama8B_int4_tp1_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp2_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp4_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
     {
       "test_name": "serving_llama3B_tp1_random_128_128",
       "server_parameters": {
diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
new file mode 100644
index 0000000000000000000000000000000000000000..da84dd4d0c67aa9887f2105a30eed1e097fcb0ca
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
@@ -0,0 +1,27 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index a9d51557bd9bb67951f5ab97e1748f21abd37ec2..092755ea085c8538f7b552eb7819bea787186b8b 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,6 +1,6 @@
 steps:
   # aarch64 + CUDA builds
-  - label: "Build arm64 wheel - CUDA 12.9"
+  - label: "Build wheel - aarch64 - CUDA 12.9"
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
@@ -11,11 +11,11 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build arm64 wheel - CUDA 13.0"
+  - label: "Build wheel - aarch64 - CUDA 13.0"
     depends_on: ~
     id: build-wheel-arm64-cuda-13-0
     agents:
@@ -26,12 +26,12 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
 
   # aarch64 build
-  - label: "Build arm64 CPU wheel"
+  - label: "Build wheel - aarch64 - CPU"
     depends_on: ~
     id: build-wheel-arm64-cpu
     agents:
@@ -40,39 +40,39 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
 
   # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.9"
+  - label: "Build wheel - x86_64 - CUDA 12.9"
     depends_on: ~
-    id: build-wheel-cuda-12-9
+    id: build-wheel-x86-cuda-12-9
     agents:
       queue: cpu_queue_postmerge
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 13.0"
+  - label: "Build wheel - x86_64 - CUDA 13.0"
     depends_on: ~
-    id: build-wheel-cuda-13-0
+    id: build-wheel-x86-cuda-13-0
     agents:
       queue: cpu_queue_postmerge
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
 
   # x86 CPU wheel build
-  - label: "Build x86 CPU wheel"
+  - label: "Build wheel - x86_64 - CPU"
     depends_on: ~
     id: build-wheel-x86-cpu
     agents:
@@ -81,12 +81,12 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
 
-  # Build release images (12.9)
-  - label: "Build release image (x86)"
+  # Build release images (CUDA 12.9)
+  - label: "Build release image - x86_64 - CUDA 12.9"
     depends_on: ~
     id: build-release-image-x86
     agents:
@@ -99,7 +99,7 @@ steps:
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  - label: "Build release image (arm64)"
+  - label: "Build release image - aarch64 - CUDA 12.9"
     depends_on: ~
     id: build-release-image-arm64
     agents:
@@ -109,34 +109,92 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 
-  # Add job to create multi-arch manifest
-  - label: "Create multi-arch manifest"
+  - label: "Create multi-arch manifest - CUDA 12.9"
     depends_on:
       - build-release-image-x86
       - build-release-image-arm64
     id: create-multi-arch-manifest
     agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
       - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  - label: "Annotate release workflow"
+  - label: "Annotate release workflow - CUDA 12.9"
     depends_on:
       - create-multi-arch-manifest
     id: annotate-release-workflow
     agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"
 
+  - block: "Build CUDA 13.0 release images"
+    key: block-release-image-build-cuda-13-0
+    depends_on: ~
+
+  - label: "Build release image - x86_64 - CUDA 13.0"
+    depends_on: block-release-image-build-cuda-13-0
+    id: build-release-image-x86-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+      # re-tag to default image tag and push, just in case arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
+  - label: "Build release image - aarch64 - CUDA 13.0"
+    depends_on: block-release-image-build-cuda-13-0
+    id: build-release-image-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+
+  - label: "Create multi-arch manifest - CUDA 13.0"
+    depends_on:
+      - build-release-image-x86-cuda-13-0
+      - build-release-image-arm64-cuda-13-0
+    id: create-multi-arch-manifest-cuda-13-0
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
   - input: "Provide Release version here"
     id: input-release-version
     fields:
       - text: "What is the release version?"
         key: release-version
 
+  - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+    key: block-upload-release-wheels
+    depends_on:
+      - input-release-version
+      - build-wheel-x86-cuda-12-9
+      - build-wheel-x86-cuda-13-0
+      - build-wheel-x86-cpu
+      - build-wheel-arm64-cuda-12-9
+      - build-wheel-arm64-cuda-13-0
+      - build-wheel-arm64-cpu
+
+  - label: "Upload release wheels to PyPI and GitHub"
+    depends_on:
+      - block-upload-release-wheels
+    id: upload-release-wheels
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/upload-release-wheels.sh"
+
   - block: "Build CPU release image"
     key: block-cpu-release-image-build
     depends_on: ~
@@ -169,12 +227,30 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - block: "Build ROCm release image"
+    key: block-rocm-release-image-build
+    depends_on: ~
+
+  - label: "Build release image (ROCm)"
+    depends_on: block-rocm-release-image-build
+    id: build-release-image-rocm
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      # Build base image first
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
+      # Build vLLM ROCm image using the base
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
+
+  
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
     if: build.env("NIGHTLY") == "1"
     agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
@@ -196,3 +272,365 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
       DOCKERHUB_USERNAME: "vllmbot"
+
+  # =============================================================================
+  # ROCm Release Pipeline (x86_64 only)
+  # =============================================================================
+  #
+  # vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
+  # To build a specific version, trigger the build from that branch/tag.
+  #
+  # Environment variables for ROCm builds (set via Buildkite UI or schedule):
+  #   ROCM_PYTHON_VERSION: Python version (default: 3.12)
+  #   PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
+  #   ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
+  #   ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
+  #
+  # Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
+  #       (currently rocm/dev-ubuntu-22.04:7.1-complete)
+  #
+  # =============================================================================
+
+  # ROCm Input Step - Collect build configuration (manual trigger only)
+  - input: "ROCm Wheel Release Build Configuration"
+    key: input-rocm-config
+    depends_on: ~
+    if: build.source == "ui"
+    fields:
+      - text: "Python Version"
+        key: "rocm-python-version"
+        default: "3.12"
+        hint: "Python version (e.g., 3.12)"
+      - text: "GPU Architectures"
+        key: "rocm-pytorch-rocm-arch"
+        default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
+        hint: "Semicolon-separated GPU architectures"
+      - select: "Upload Wheels to S3"
+        key: "rocm-upload-wheels"
+        default: "true"
+        options:
+          - label: "No - Build only (nightly/dev)"
+            value: "false"
+          - label: "Yes - Upload to S3 (release)"
+            value: "true"
+      - select: "Force Rebuild Base Wheels"
+        key: "rocm-force-rebuild"
+        default: "false"
+        hint: "Ignore S3 cache and rebuild base wheels from scratch"
+        options:
+          - label: "No - Use cached wheels if available"
+            value: "false"
+          - label: "Yes - Rebuild even if cache exists"
+            value: "true"
+
+  # ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
+  - label: ":rocm: Build ROCm Base Wheels"
+    id: build-rocm-base-wheels
+    depends_on:
+      - step: input-rocm-config
+        allow_failure: true  # Allow failure so non-UI builds can proceed (input step is skipped)
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      # Set configuration and check cache
+      - |
+        set -euo pipefail
+
+        # Get values from meta-data (set by input step) or use defaults
+        PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
+        export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
+
+        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+        export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+        # Check for force rebuild flag
+        ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
+        if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
+          ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
+        fi
+
+        echo "========================================"
+        echo "ROCm Base Wheels Build Configuration"
+        echo "========================================"
+        echo "  PYTHON_VERSION: $${PYTHON_VERSION}"
+        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+        echo "  ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
+        echo "========================================"
+
+        # Save resolved config for later jobs
+        buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
+        buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
+
+        # Check S3 cache for pre-built wheels
+        CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
+        CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
+        echo ""
+        echo "Cache key: $${CACHE_KEY}"
+        echo "Cache path: $${CACHE_PATH}"
+
+        # Save cache key for downstream jobs
+        buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
+
+        CACHE_STATUS="miss"
+        if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
+          CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
+        else
+          echo "Force rebuild requested, skipping cache check"
+        fi
+
+        if [ "$${CACHE_STATUS}" = "hit" ]; then
+          echo ""
+          echo "CACHE HIT! Downloading pre-built wheels..."
+          echo ""
+          .buildkite/scripts/cache-rocm-base-wheels.sh download
+
+          # Set the S3 path for the cached Docker image (for Job 2 to download)
+          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Mark that we used cache (for Docker image handling)
+          buildkite-agent meta-data set "rocm-used-cache" "true"
+
+          echo ""
+          echo "Cache download complete. Skipping Docker build."
+          echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+        else
+          echo ""
+          echo "CACHE MISS. Building from scratch..."
+          echo ""
+
+          # Build full base image (for later vLLM build)
+          DOCKER_BUILDKIT=1 docker buildx build \
+            --file docker/Dockerfile.rocm_base \
+            --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
+            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+            --build-arg USE_SCCACHE=1 \
+            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+            --build-arg SCCACHE_REGION_NAME=us-west-2 \
+            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+            --load \
+            .
+
+          # Build debs_wheel_release stage for wheel extraction
+          DOCKER_BUILDKIT=1 docker buildx build \
+            --file docker/Dockerfile.rocm_base \
+            --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
+            --target debs_wheel_release \
+            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+            --build-arg USE_SCCACHE=1 \
+            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+            --build-arg SCCACHE_REGION_NAME=us-west-2 \
+            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+            --load \
+            .
+
+          # Extract wheels from Docker image
+          mkdir -p artifacts/rocm-base-wheels
+          container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
+          docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
+          docker rm $${container_id}
+          echo "Extracted base wheels:"
+          ls -lh artifacts/rocm-base-wheels/
+
+          # Upload wheels to S3 cache for future builds
+          echo ""
+          echo "Uploading wheels to S3 cache..."
+          .buildkite/scripts/cache-rocm-base-wheels.sh upload
+
+          # Export base Docker image for reuse in vLLM build
+          mkdir -p artifacts/rocm-docker-image
+          docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
+          echo "Docker image size:"
+          ls -lh artifacts/rocm-docker-image/
+
+          # Upload large Docker image to S3 (also cached by cache key)
+          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+          echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
+          aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Save the S3 path for downstream jobs
+          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Mark that we did NOT use cache
+          buildkite-agent meta-data set "rocm-used-cache" "false"
+
+          echo ""
+          echo "Build complete. Wheels cached for future builds."
+        fi
+    artifact_paths:
+      - "artifacts/rocm-base-wheels/*.whl"
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 2: Build vLLM ROCm Wheel
+  - label: ":python: Build vLLM ROCm Wheel"
+    id: build-rocm-vllm-wheel
+    depends_on:
+      - step: build-rocm-base-wheels
+        allow_failure: false
+    agents:
+      queue: cpu_queue_postmerge
+    timeout_in_minutes: 180
+    commands:
+      # Download artifacts and prepare Docker image
+      - |
+        set -euo pipefail
+
+        # Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
+        # This fixes version detection when tags are moved/force-pushed
+        echo "Fetching latest tags from origin..."
+        git fetch --tags --force origin
+        
+        # Log tag information for debugging version detection
+        echo "========================================"
+        echo "Git Tag Verification"
+        echo "========================================"
+        echo "Current HEAD: $(git rev-parse HEAD)"
+        echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
+        echo ""
+        echo "Recent tags (pointing to commits near HEAD):"
+        git tag -l --sort=-creatordate | head -5
+        echo "setuptools_scm version detection:"
+        pip install -q setuptools_scm 2>/dev/null || true
+        python3 -c "import setuptools_scm; print('  Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo "  (setuptools_scm not available in this environment)"
+        echo "========================================"
+
+        # Download wheel artifacts from current build
+        echo "Downloading wheel artifacts from current build"
+        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+
+        # Download Docker image from S3 (too large for Buildkite artifacts)
+        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
+        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
+          echo "ERROR: rocm-docker-image-s3-path metadata not found"
+          echo "This should have been set by the build-rocm-base-wheels job"
+          exit 1
+        fi
+        echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
+        mkdir -p artifacts/rocm-docker-image
+        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
+
+        # Load base Docker image and capture the tag
+        echo "Loading base Docker image..."
+        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
+        echo "$${LOAD_OUTPUT}"
+        # Extract the actual loaded image tag from "Loaded image: <tag>" output
+        # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
+        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
+        if [ -z "$${BASE_IMAGE_TAG}" ]; then
+          echo "ERROR: Failed to extract image tag from docker load output"
+          echo "Load output was: $${LOAD_OUTPUT}"
+          exit 1
+        fi
+        echo "Loaded base image: $${BASE_IMAGE_TAG}"
+
+        # Prepare base wheels for Docker build context
+        mkdir -p docker/context/base-wheels
+        touch docker/context/base-wheels/.keep
+        cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
+        echo "Base wheels for vLLM build:"
+        ls -lh docker/context/base-wheels/
+
+        # Get GPU architectures from meta-data
+        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+        echo "========================================"
+        echo "Building vLLM wheel with:"
+        echo "  BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
+        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+        echo "  BASE_IMAGE: $${BASE_IMAGE_TAG}"
+        echo "========================================"
+
+        # Build vLLM wheel using local checkout (REMOTE_VLLM=0)
+        DOCKER_BUILDKIT=1 docker build \
+          --file docker/Dockerfile.rocm \
+          --target export_vllm_wheel_release \
+          --output type=local,dest=rocm-dist \
+          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
+          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+          --build-arg REMOTE_VLLM=0 \
+          --build-arg GIT_REPO_CHECK=1 \
+          --build-arg USE_SCCACHE=1 \
+          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+          --build-arg SCCACHE_REGION_NAME=us-west-2 \
+          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+          .
+
+        echo "Built vLLM wheel:"
+        ls -lh rocm-dist/*.whl
+
+        # Copy wheel to artifacts directory
+        mkdir -p artifacts/rocm-vllm-wheel
+        cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
+        echo "Final vLLM wheel:"
+        ls -lh artifacts/rocm-vllm-wheel/
+    artifact_paths:
+      - "artifacts/rocm-vllm-wheel/*.whl"
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 3: Upload Wheels to S3
+  - label: ":s3: Upload ROCm Wheels to S3"
+    id: upload-rocm-wheels
+    depends_on:
+      - step: build-rocm-vllm-wheel
+        allow_failure: false
+    agents:
+      queue: cpu_queue_postmerge
+    timeout_in_minutes: 60
+    commands:
+      # Download all wheel artifacts and run upload
+      - |
+        set -euo pipefail
+
+        # Check if upload is enabled (from env var, meta-data, or release branch)
+        ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
+        if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
+          # Try to get from meta-data (input form)
+          ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
+        fi
+
+        echo "========================================"
+        echo "Upload check:"
+        echo "  ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
+        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+        echo "========================================"
+
+        # Skip upload if not enabled
+        if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
+          echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
+          echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
+          exit 0
+        fi
+
+        echo "Upload enabled, proceeding..."
+
+        # Download artifacts from current build
+        echo "Downloading artifacts from current build"
+        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+        buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
+
+        # Run upload script
+        bash .buildkite/scripts/upload-rocm-wheels.sh
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 4: Annotate ROCm Wheel Release
+  - label: ":memo: Annotate ROCm wheel release"
+    id: annotate-rocm-release
+    depends_on:
+      - step: upload-rocm-wheels
+        allow_failure: true
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-rocm-release.sh"
+    env:
+      S3_BUCKET: "vllm-wheels"
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index df805e085080626ec9bc7daf9fbe8dfee231df7a..d178fb88841e26f4d7c6f770a3686f0bbf8de33b 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -32,6 +32,7 @@ To download and upload the image:
 \`\`\`
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
 
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -45,6 +46,12 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+docker push vllm/vllm-openai:latest-rocm
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+
 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
diff --git a/.buildkite/scripts/annotate-rocm-release.sh b/.buildkite/scripts/annotate-rocm-release.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fcc7c290ec043aa1daf4445ba116bb75e88d8bb5
--- /dev/null
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Generate Buildkite annotation for ROCm wheel release
+
+set -ex
+
+# Get build configuration from meta-data
+# Extract ROCm version dynamically from Dockerfile.rocm_base
+# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
+ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
+PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
+PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+
+# S3 URLs
+S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
+S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
+S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
+ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
+
+buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
+## :rocm: ROCm Wheel Release
+
+### Build Configuration
+| Setting | Value |
+|---------|-------|
+| **ROCm Version** | ${ROCM_VERSION} |
+| **Python Version** | ${PYTHON_VERSION} |
+| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
+| **Branch** | \`${BUILDKITE_BRANCH}\` |
+| **Commit** | \`${BUILDKITE_COMMIT}\` |
+
+### :package: Installation
+
+**Install from this build (by commit):**
+\`\`\`bash
+uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
+
+# Example:
+uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
+\`\`\`
+
+**Install from nightly (if published):**
+\`\`\`bash
+uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
+\`\`\`
+
+### :floppy_disk: Download Wheels Directly
+
+\`\`\`bash
+# List all ROCm wheels
+aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
+
+# Download specific wheels
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
+\`\`\`
+
+### :gear: Included Packages
+- **vllm**: vLLM with ROCm support
+- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
+- **triton_rocm**: Triton built for ROCm
+- **torchvision**: TorchVision for ROCm PyTorch
+- **amdsmi**: AMD SMI Python bindings
+
+### :warning: Notes
+- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
+- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
+- Platform: Linux x86_64 only
+EOF
diff --git a/.buildkite/scripts/cache-rocm-base-wheels.sh b/.buildkite/scripts/cache-rocm-base-wheels.sh
new file mode 100755
index 0000000000000000000000000000000000000000..be244725023da4640d41ffb9c11b3a6588e7a8a2
--- /dev/null
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Cache helper for ROCm base wheels
+#
+# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
+# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
+#
+# Usage:
+#   cache-rocm-base-wheels.sh check    - Check if cache exists, outputs "hit" or "miss"
+#   cache-rocm-base-wheels.sh upload   - Upload wheels to cache
+#   cache-rocm-base-wheels.sh download - Download wheels from cache
+#   cache-rocm-base-wheels.sh key      - Output the cache key
+#
+# Environment variables:
+#   S3_BUCKET          - S3 bucket name (default: vllm-wheels)
+#   PYTHON_VERSION     - Python version (affects cache key)
+#   PYTORCH_ROCM_ARCH  - GPU architectures (affects cache key)
+#
+# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
+#       so changes to ROCm version are captured by the Dockerfile hash.
+
+set -euo pipefail
+
+BUCKET="${S3_BUCKET:-vllm-wheels}"
+DOCKERFILE="docker/Dockerfile.rocm_base"
+CACHE_PREFIX="rocm/cache"
+
+# Generate hash from Dockerfile content + build args
+generate_cache_key() {
+    # Include Dockerfile content
+    if [[ ! -f "$DOCKERFILE" ]]; then
+        echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
+        exit 1
+    fi
+    local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
+
+    # Include key build args that affect the output
+    # These should match the ARGs in Dockerfile.rocm_base that change the build output
+    # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
+    local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
+    local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
+
+    echo "${dockerfile_hash}-${args_hash}"
+}
+
+CACHE_KEY=$(generate_cache_key)
+CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
+
+case "${1:-}" in
+    check)
+        echo "Checking cache for key: ${CACHE_KEY}" >&2
+        echo "Cache path: ${CACHE_PATH}" >&2
+        echo "Variables used in cache key:" >&2
+        echo "  PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
+        echo "  PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
+
+        # Check if cache exists by listing objects
+        # We look for at least one .whl file
+        echo "Running: aws s3 ls ${CACHE_PATH}" >&2
+        S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
+        echo "S3 ls output:" >&2
+        echo "$S3_OUTPUT" | head -5 >&2
+
+        if echo "$S3_OUTPUT" | grep -q "\.whl"; then
+            echo "hit"
+        else
+            echo "miss"
+        fi
+        ;;
+
+    upload)
+        echo "========================================"
+        echo "Uploading wheels to cache"
+        echo "========================================"
+        echo "Cache key: ${CACHE_KEY}"
+        echo "Cache path: ${CACHE_PATH}"
+        echo ""
+
+        if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
+            echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
+            exit 1
+        fi
+
+        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
+            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
+            exit 1
+        fi
+
+        echo "Uploading $WHEEL_COUNT wheels..."
+        aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
+
+        echo ""
+        echo "Cache upload complete!"
+        echo "========================================"
+        ;;
+
+    download)
+        echo "========================================"
+        echo "Downloading wheels from cache"
+        echo "========================================"
+        echo "Cache key: ${CACHE_KEY}"
+        echo "Cache path: ${CACHE_PATH}"
+        echo ""
+
+        mkdir -p artifacts/rocm-base-wheels
+        aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
+
+        echo ""
+        echo "Downloaded wheels:"
+        ls -lh artifacts/rocm-base-wheels/
+
+        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        echo ""
+        echo "Total: $WHEEL_COUNT wheels"
+        echo "========================================"
+        ;;
+
+    key)
+        echo "${CACHE_KEY}"
+        ;;
+
+    path)
+        echo "${CACHE_PATH}"
+        ;;
+
+    *)
+        echo "Usage: $0 {check|upload|download|key|path}" >&2
+        echo "" >&2
+        echo "Commands:" >&2
+        echo "  check    - Check if cache exists, outputs 'hit' or 'miss'" >&2
+        echo "  upload   - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
+        echo "  download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
+        echo "  key      - Output the cache key" >&2
+        echo "  path     - Output the full S3 cache path" >&2
+        exit 1
+        ;;
+esac
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
index d0965fbd56405ac50b470bbe2ed2ed922b1597d6..2eb4211402cc1e5cfb96cf1363181e3fa8a6adfe 100644
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -16,6 +16,18 @@ from urllib.parse import quote
 
 import regex as re
 
+
+def normalize_package_name(name: str) -> str:
+    """
+    Normalize package name according to PEP 503.
+    https://peps.python.org/pep-0503/#normalized-names
+
+    Replace runs of underscores, hyphens, and periods with a single hyphen,
+    and lowercase the result.
+    """
+    return re.sub(r"[-_.]+", "-", name).lower()
+
+
 if not sys.version_info >= (3, 12):
     raise RuntimeError("This script requires Python 3.12 or higher.")
 
@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
             version = version.removesuffix("." + variant)
     else:
         if "+" in version:
-            version, variant = version.split("+")
+            version_part, suffix = version.split("+", 1)
+            # Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
+            # Git hashes and other suffixes are NOT variants
+            if suffix.startswith(("rocm", "cu", "cpu")):
+                variant = suffix
+                version = version_part
+            # Otherwise keep the full version string (variant stays None)
 
     return WheelFileInfo(
         package_name=package_name,
@@ -206,6 +224,26 @@ def generate_index_and_metadata(
         print("No wheel files found, skipping index generation.")
         return
 
+    # For ROCm builds: inherit variant from vllm wheel
+    # All ROCm wheels should share the same variant as vllm
+    rocm_variant = None
+    for file in parsed_files:
+        if (
+            file.package_name == "vllm"
+            and file.variant
+            and file.variant.startswith("rocm")
+        ):
+            rocm_variant = file.variant
+            print(f"Detected ROCm variant from vllm: {rocm_variant}")
+            break
+
+    # Apply ROCm variant to all wheels without a variant
+    if rocm_variant:
+        for file in parsed_files:
+            if file.variant is None:
+                file.variant = rocm_variant
+                print(f"Inherited variant '{rocm_variant}' for {file.filename}")
+
     # Group by variant
     variant_to_files: dict[str, list[WheelFileInfo]] = {}
     for file in parsed_files:
@@ -256,8 +294,8 @@ def generate_index_and_metadata(
 
         variant_dir.mkdir(parents=True, exist_ok=True)
 
-        # gather all package names in this variant
-        packages = set(f.package_name for f in files)
+        # gather all package names in this variant (normalized per PEP 503)
+        packages = set(normalize_package_name(f.package_name) for f in files)
         if variant == "default":
             # these packages should also appear in the "project list"
             # generate after all variants are processed
@@ -269,8 +307,10 @@ def generate_index_and_metadata(
                 f.write(project_list_str)
 
         for package in packages:
-            # filter files belonging to this package only
-            package_files = [f for f in files if f.package_name == package]
+            # filter files belonging to this package only (compare normalized names)
+            package_files = [
+                f for f in files if normalize_package_name(f.package_name) == package
+            ]
             package_dir = variant_dir / package
             package_dir.mkdir(parents=True, exist_ok=True)
             index_str, metadata_str = generate_package_index_and_metadata(
@@ -291,6 +331,7 @@ if __name__ == "__main__":
     """
     Arguments:
         --version <version> : version string for the current build (e.g., commit hash)
+        --wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
         --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
         --output-dir <output_directory> : directory to store generated index files
         --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
@@ -318,6 +359,12 @@ if __name__ == "__main__":
         required=True,
         help="Directory to store generated index files",
     )
+    parser.add_argument(
+        "--wheel-dir",
+        type=str,
+        default=None,
+        help="Directory containing wheel files (default to be same as `version`)",
+    )
     parser.add_argument(
         "--alias-to-default",
         type=str,
@@ -334,8 +381,13 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     version = args.version
-    if "/" in version or "\\" in version:
-        raise ValueError("Version string must not contain slashes.")
+    # Allow rocm/ prefix, reject other slashes and all backslashes
+    if "\\" in version:
+        raise ValueError("Version string must not contain backslashes.")
+    if "/" in version and not version.startswith("rocm/"):
+        raise ValueError(
+            "Version string must not contain slashes (except for 'rocm/' prefix)."
+        )
     current_objects_path = Path(args.current_objects)
     output_dir = Path(args.output_dir)
     if not output_dir.exists():
@@ -372,7 +424,7 @@ if __name__ == "__main__":
 
     print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
 
-    # keep only "official" files for a non-nightly version (specifed by cli args)
+    # keep only "official" files for a non-nightly version (specified by cli args)
     PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
     if PY_VERSION_RE.match(version):
         # upload-wheels.sh ensures no "dev" is in args.version
@@ -384,9 +436,25 @@ if __name__ == "__main__":
         print("Nightly version detected, keeping all wheel files.")
 
     # Generate index and metadata, assuming wheels and indices are stored as:
-    # s3://vllm-wheels/{version}/<wheel files>
+    # s3://vllm-wheels/{wheel_dir}/<wheel files>
     # s3://vllm-wheels/<anything>/<index files>
-    wheel_base_dir = Path(output_dir).parent / version
+    #
+    # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
+    #   - rocm/{commit}/  (same as wheels)
+    #   - rocm/nightly/
+    #   - rocm/{version}/
+    # All these are under the "rocm/" prefix, so relative paths should be
+    # relative to "rocm/", not the bucket root.
+    if args.wheel_dir:
+        # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
+        wheel_dir = args.wheel_dir.strip().rstrip("/")
+    elif version.startswith("rocm/"):
+        # For rocm/commit, wheel_base_dir should be just the commit part
+        # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
+        wheel_dir = version.split("/", 1)[1]
+    else:
+        wheel_dir = version
+    wheel_base_dir = Path(output_dir).parent / wheel_dir
     index_base_dir = Path(output_dir)
 
     generate_index_and_metadata(
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 864eb470bb0a7e33cd9067ec912a30a3e2722ee3..484167f4619b37e0680dc8d6da10eb564e3ecc25 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
   --ignore=entrypoints/openai/test_audio.py \
   --ignore=entrypoints/openai/test_shutdown.py \
   --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
   --ignore=entrypoints/openai/test_models.py \
   --ignore=entrypoints/openai/test_lora_adapters.py \
   --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
@@ -210,12 +209,21 @@ if [[ $commands == *"--shard-id="* ]]; then
     wait "${pid}"
     STATUS+=($?)
   done
+  at_least_one_shard_with_tests=0
   for st in "${STATUS[@]}"; do
-    if [[ ${st} -ne 0 ]]; then
+    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
       echo "One of the processes failed with $st"
       exit "${st}"
+    elif [[ ${st} -eq 5 ]]; then
+      echo "Shard exited with status 5 (no tests collected) - treating as success"
+    else # This means st is 0
+      at_least_one_shard_with_tests=1
     fi
   done
+  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
+    echo "All shards reported no tests collected. Failing the build."
+    exit 1
+  fi
 else
   echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
   docker run \
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 438fe522c8702ec2e552d4a958d816d3808a2891..ee6510bf88e3e108e0c6f8bddc5de81ff62cb919 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -50,6 +50,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
     pytest -x -v -s tests/kernels/test_onednn.py"
 
   # Run basic model test
@@ -83,7 +84,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-    tests/lora/test_qwen2vl.py"
+    tests/lora/test_qwenvl.py"
 
   # online serving: tp+pp
   docker exec cpu-test-"$NUMA_NODE" bash -c '
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index cbb2527a4ff0aa30664f340c61f75833db469e2a..6959f81eab3732043741dab067fa4c6710fbf088 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
 
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index f022fa3672eeba2774d51e34b928ea25c7d4d6e6..eafc82b98439be027a28b4be8b9fc4899badbf5e 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
 
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index dfc9db512d1e9d01166f3bee1b8b3f9e2d66847f..85b554e5e86460ad21b4c2072d0e5aa73883d6c2 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -39,7 +39,7 @@ docker run \
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
     python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
     cd tests
     pytest -v -s v1/core
     pytest -v -s v1/engine
diff --git a/.buildkite/scripts/run-multi-node-test.sh b/.buildkite/scripts/run-multi-node-test.sh
index 49aebce786b9250887a6160f0f98d5581fd34344..c0911f17b660d21ca95e7e606654f0c82f4d8f2d 100755
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@@ -2,6 +2,17 @@
 
 set -euox pipefail
 
+# To detect ROCm
+# Check multiple indicators:
+if [ -e /dev/kfd ] || \
+    [ -d /opt/rocm ] || \
+    command -v rocm-smi &> /dev/null || \
+    [ -n "${ROCM_HOME:-}" ]; then
+    IS_ROCM=1
+else
+    IS_ROCM=0
+fi
+
 if [[ $# -lt 4 ]]; then
     echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
     exit 1
@@ -26,13 +37,18 @@ for command in "${COMMANDS[@]}"; do
     echo "$command"
 done
 
+
 start_network() {
     docker network create --subnet=192.168.10.0/24 docker-net
 }
 
 start_nodes() {
     for node in $(seq 0 $(($NUM_NODES-1))); do
-        GPU_DEVICES='"device='
+        if [ "$IS_ROCM" -eq 1 ]; then
+            GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
+        else
+            GPU_DEVICES='--gpus "device='
+        fi
         for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
             DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
             GPU_DEVICES+=$(($DEVICE_NUM))
@@ -40,7 +56,9 @@ start_nodes() {
                 GPU_DEVICES+=','
             fi
         done
-        GPU_DEVICES+='"'
+        if [ "$IS_ROCM" -eq 0 ]; then
+            GPU_DEVICES+='"'
+        fi
 
         # start the container in detached mode
         # things to note:
@@ -49,7 +67,7 @@ start_nodes() {
         # 3. map the huggingface cache directory to the container
         # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
         #    starting from 192.168.10.11)
-        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
+        docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
             -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
             --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
             /bin/bash -c "tail -f /dev/null"
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
index 6a1bef275d04705e26675add8078786ac362b2d8..d0921c5699d5d202bd0fed73e3ac0bb14860d4f0 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -44,10 +44,10 @@ trap cleanup EXIT
 
 for BACK in "${BACKENDS[@]}"; do
   VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
   vllm serve "$MODEL" \
     --enforce-eager \
     --enable-eplb \
+    --all2all-backend $BACK \
     --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
     --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
     --data-parallel-size ${DATA_PARALLEL_SIZE} \
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
index 937a43d1a32214ed97a6437710f28e4fc19c49ce..b3b65128e606244e2a444e0e514dbc73e99ca53f 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -43,12 +43,12 @@ trap cleanup EXIT
 
 for BACK in "${BACKENDS[@]}"; do
   VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
   vllm serve "$MODEL" \
     --enforce-eager \
     --tensor-parallel-size 4 \
     --enable-expert-parallel \
     --enable-eplb \
+    --all2all-backend $BACK \
     --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
     --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
     --trust-remote-code \
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-nightly-wheels.sh
similarity index 94%
rename from .buildkite/scripts/upload-wheels.sh
rename to .buildkite/scripts/upload-nightly-wheels.sh
index 3a218a4bb2e6daae133b060038859f7a6ddfe2ee..1af7f476ae74b725aa3c969256a49d5ebca0b411 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then
     echo "Re-generating indices for /$pure_version/"
     rm -rf "$INDICES_OUTPUT_DIR/*"
     mkdir -p "$INDICES_OUTPUT_DIR"
-    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
+    # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
     aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
 fi
diff --git a/.buildkite/scripts/upload-release-wheels.sh b/.buildkite/scripts/upload-release-wheels.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a4b246bf1b8531ba3ad13d83c3f0e7646b668594
--- /dev/null
+++ b/.buildkite/scripts/upload-release-wheels.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+set -e
+
+BUCKET="vllm-wheels"
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
+echo "Release version from Buildkite: $RELEASE_VERSION"
+GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
+if [ -z "$GIT_VERSION" ]; then
+    echo "[FATAL] Not on a git tag, cannot create release."
+    exit 1
+else
+    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
+fi
+# sanity check for version mismatch
+if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
+  if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
+    echo "[WARNING] Force release and ignore version mismatch"
+  else
+    echo "[FATAL] Release version from Buildkite does not match Git version."
+    exit 1
+  fi
+fi
+
+# check pypi token
+if [ -z "$PYPI_TOKEN" ]; then
+  echo "[FATAL] PYPI_TOKEN is not set."
+  exit 1
+else
+  export TWINE_USERNAME="__token__"
+  export TWINE_PASSWORD="$PYPI_TOKEN"
+fi
+
+# check github token
+if [ -z "$GITHUB_TOKEN" ]; then
+  echo "[FATAL] GITHUB_TOKEN is not set."
+  exit 1
+else
+  export GH_TOKEN="$GITHUB_TOKEN"
+fi
+
+set -x # avoid printing secrets above
+
+# download gh CLI from github
+# Get latest gh CLI version from GitHub API
+GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
+if [ -z "$GH_VERSION" ]; then
+  echo "[FATAL] Failed to get latest gh CLI version from GitHub"
+  exit 1
+fi
+echo "Downloading gh CLI version: $GH_VERSION"
+GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
+GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
+GH_INSTALL_DIR="/tmp/gh-install"
+mkdir -p "$GH_INSTALL_DIR"
+pushd "$GH_INSTALL_DIR"
+curl -L -o "$GH_TARBALL" "$GH_URL"
+tar -xzf "$GH_TARBALL"
+GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
+if [ -z "$GH_BIN" ]; then
+  echo "[FATAL] Failed to find gh CLI executable"
+  exit 1
+fi
+echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
+echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
+command "$GH_BIN" release list --limit 5
+popd
+
+# install twine from pypi
+python3 -m venv /tmp/vllm-release-env
+source /tmp/vllm-release-env/bin/activate
+pip install twine
+python3 -m twine --version
+
+# copy release wheels to local directory
+DIST_DIR=/tmp/vllm-release-dist
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+echo "Copying wheels to local directory"
+mkdir -p $DIST_DIR
+# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
+aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" $DIST_DIR
+echo "Wheels copied to local directory"
+# generate source tarball
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" $BUILDKITE_COMMIT
+ls -la $DIST_DIR
+
+
+# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
+PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
+if [ -z "$PYPI_WHEEL_FILES" ]; then
+  echo "No default variant wheels found, quitting..."
+  exit 1
+fi
+python3 -m twine check $PYPI_WHEEL_FILES
+python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
+echo "Wheels uploaded to PyPI"
+
+# create release on GitHub with the release version and all wheels
+command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
diff --git a/.buildkite/scripts/upload-rocm-wheels.sh b/.buildkite/scripts/upload-rocm-wheels.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bb555bc842925c13bd05f7aa44ebb8f4dabbf194
--- /dev/null
+++ b/.buildkite/scripts/upload-rocm-wheels.sh
@@ -0,0 +1,151 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Upload ROCm wheels to S3 with proper index generation
+#
+# Required environment variables:
+#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
+#   S3_BUCKET (default: vllm-wheels)
+#
+# S3 path structure:
+#   s3://vllm-wheels/rocm/{commit}/     - All wheels for this commit
+#   s3://vllm-wheels/rocm/nightly/      - Index pointing to latest nightly
+#   s3://vllm-wheels/rocm/{version}/    - Index for release versions
+
+set -ex
+
+# ======== Configuration ========
+BUCKET="${S3_BUCKET:-vllm-wheels}"
+ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
+S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
+INDICES_OUTPUT_DIR="rocm-indices"
+PYTHON="${PYTHON_PROG:-python3}"
+
+# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
+MANYLINUX_VERSION="manylinux_2_35"
+
+echo "========================================"
+echo "ROCm Wheel Upload Configuration"
+echo "========================================"
+echo "S3 Bucket: $BUCKET"
+echo "S3 Path: $ROCM_SUBPATH"
+echo "Commit: $BUILDKITE_COMMIT"
+echo "Branch: $BUILDKITE_BRANCH"
+echo "========================================"
+
+# ======== Part 0: Setup Python ========
+
+# Detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
+if [[ "$has_new_python" -eq 0 ]]; then
+    # Use new python from docker
+    # Use --user to ensure files are created with correct ownership (not root)
+    docker pull python:3-slim
+    PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ======== Part 1: Collect and prepare wheels ========
+
+# Collect all wheels
+mkdir -p all-rocm-wheels
+cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
+cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
+
+WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
+echo "Total wheels to upload: $WHEEL_COUNT"
+
+if [ "$WHEEL_COUNT" -eq 0 ]; then
+    echo "ERROR: No wheels found to upload!"
+    exit 1
+fi
+
+# Rename linux to manylinux in wheel filenames
+for wheel in all-rocm-wheels/*.whl; do
+    if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
+        new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
+        mv -- "$wheel" "$new_wheel"
+        echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
+    fi
+done
+
+echo ""
+echo "Wheels to upload:"
+ls -lh all-rocm-wheels/
+
+# ======== Part 2: Upload wheels to S3 ========
+
+echo ""
+echo "Uploading wheels to $S3_COMMIT_PREFIX"
+for wheel in all-rocm-wheels/*.whl; do
+    aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+done
+
+# ======== Part 3: Generate and upload indices ========
+
+# List existing wheels in commit directory
+echo ""
+echo "Generating indices..."
+obj_json="rocm-objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
+
+mkdir -p "$INDICES_OUTPUT_DIR"
+
+# Use the existing generate-nightly-index.py
+# HACK: Replace regex module with stdlib re (same as CUDA script)
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+
+$PYTHON .buildkite/scripts/generate-nightly-index.py \
+    --version "$ROCM_SUBPATH" \
+    --current-objects "$obj_json" \
+    --output-dir "$INDICES_OUTPUT_DIR" \
+    --comment "ROCm commit $BUILDKITE_COMMIT"
+
+# Upload indices to commit directory
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# Update rocm/nightly/ if on main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
+    echo "Updating rocm/nightly/ index..."
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
+fi
+
+# Extract version from vLLM wheel and update version-specific index
+VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
+if [ -n "$VLLM_WHEEL" ]; then
+    VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+    echo "Version in wheel: $VERSION"
+    PURE_VERSION="${VERSION%%+*}"
+    PURE_VERSION="${PURE_VERSION%%.rocm}"
+    echo "Pure version: $PURE_VERSION"
+
+    if [[ "$VERSION" != *"dev"* ]]; then
+        echo "Updating rocm/$PURE_VERSION/ index..."
+        aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
+    fi
+fi
+
+# ======== Part 4: Summary ========
+
+echo ""
+echo "========================================"
+echo "ROCm Wheel Upload Complete!"
+echo "========================================"
+echo ""
+echo "Wheels available at:"
+echo "  s3://$BUCKET/$ROCM_SUBPATH/"
+echo ""
+echo "Install command (by commit):"
+echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
+echo ""
+if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
+    echo "Install command (nightly):"
+    echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
+fi
+echo ""
+echo "Wheel count: $WHEEL_COUNT"
+echo "========================================"
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 3c9b8cbedcf065c1af6a686c72082ecb19138d95..044a82c9773f0e3bb7dfea936f1861e313af5d4f 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -128,7 +128,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration Test (LLM) # 30min
   timeout_in_minutes: 40
@@ -148,7 +148,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
   timeout_in_minutes: 130
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
@@ -162,10 +162,28 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
   - pytest -v -s entrypoints/test_chat_utils.py
 
+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s tool_use
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc
+
 - label: Entrypoints Integration Test (Pooling)
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
@@ -181,6 +199,21 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/pooling
 
+- label: Entrypoints Integration Test (Responses API)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai/responses
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/openai/responses
+
 - label: Distributed Tests (4 GPUs) # 35min
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
@@ -201,6 +234,9 @@ steps:
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
+  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+  # TODO: Remove when the bug is fixed in a future ROCm release
+  - export TORCH_NCCL_BLOCKING_WAIT=1
   # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with torchrun tp=2 and pp=2
@@ -249,9 +285,10 @@ steps:
   - vllm/v1/executor/uniproc_executor.py
   - vllm/v1/worker/gpu_worker.py
   commands:
-  # https://github.com/NVIDIA/nccl/issues/1838
-  #- export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and dp=4 with ep
+  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+  # TODO: Remove when the bug is fixed in a future ROCm release
+  - export TORCH_NCCL_BLOCKING_WAIT=1
   - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 
 - label: EPLB Algorithm Test # 5min
@@ -331,7 +368,9 @@ steps:
 - label: V1 Test e2e + engine # 65min
   timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_4
+  # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
+  # See discussion here: https://github.com/vllm-project/vllm/pull/31040
+  agent_pool: mi325_8
   # grade: Blocking
   source_file_dependencies:
     - vllm/
@@ -492,8 +531,7 @@ steps:
   - tests/samplers
   - tests/conftest.py
   commands:
-    - pytest -v -s samplers
-    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+    - pytest -v -s -m 'not skip_v1' samplers
 
 - label: LoRA Test %N # 20min each
   timeout_in_minutes: 30
@@ -707,7 +745,7 @@ steps:
 
 - label: Quantization Test # 70min
   timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -722,7 +760,7 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0
+  - uv pip install --system torchao==0.14.1
   - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
@@ -736,7 +774,7 @@ steps:
   - vllm/model_executor/layers/quantization
   autorun_on_main: true
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
 - label: OpenAI API correctness # 10min
   timeout_in_minutes: 15
@@ -747,21 +785,11 @@ steps:
   - csrc/
   - vllm/entrypoints/openai/
   - vllm/model_executor/models/whisper.py
+  - tools/
   commands: # LMEval+Transcription WER check
-  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
+  - bash ../tools/install_torchcodec_rocm.sh || exit 1
   - pytest -s entrypoints/openai/correctness/
 
-- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
 
 #####  models test  #####
 
@@ -854,6 +882,7 @@ steps:
     # Shard slow subset of standard language models tests. Only run when model
     # source is modified, or when specified test files are modified
     - pip freeze | grep -E 'torch'
+    - export TORCH_NCCL_BLOCKING_WAIT=1
     - pytest -v -s models/language -m 'core_model and slow_test' \
              --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
              --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -871,7 +900,7 @@ steps:
   commands:
     # Install fast path packages for testing against transformers
     # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
     - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
     # Shard hybrid language model tests
     - pytest -v -s models/language/generation \
@@ -892,7 +921,7 @@ steps:
   commands:
     # Install fast path packages for testing against transformers
     # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
     - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
     - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
@@ -957,7 +986,7 @@ steps:
     - pytest -v -s models/multimodal/processing
 
 - label: Multi-Modal Models Test (Standard) # 60min
-  timeout_in_minutes: 80
+  timeout_in_minutes: 100
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
@@ -966,13 +995,16 @@ steps:
   - vllm/
   - tests/models/multimodal
   commands:
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pip freeze | grep -E 'torch'
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
+    - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
-  timeout_in_minutes: 180
+- label: Multi-Modal Accuracy Eval (Small Models) # 5min
+  timeout_in_minutes: 10
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -982,7 +1014,9 @@ steps:
   - vllm/inputs/
   - vllm/v1/core/
   commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+  - export MIOPEN_DEBUG_CONV_DIRECT=0
+  - export MIOPEN_DEBUG_CONV_GEMM=0
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt
 
 - label: Multi-Modal Models Test (Extended) 1 # 60min
   timeout_in_minutes: 120
@@ -994,10 +1028,13 @@ steps:
   - vllm/
   - tests/models/multimodal
   commands:
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
 
-- label: Multi-Modal Models Test (Extended) 2
+- label: Multi-Modal Models Test (Extended) 2 #60min
+  timeout_in_minutes: 120
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
@@ -1006,6 +1043,8 @@ steps:
   - vllm/
   - tests/models/multimodal
   commands:
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
 
@@ -1019,6 +1058,8 @@ steps:
   - vllm/
   - tests/models/multimodal
   commands:
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 
@@ -1078,8 +1119,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py
@@ -1196,7 +1237,7 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
 
 #####  1 GPU test  #####
 #####  multi gpus test  #####
@@ -1236,13 +1277,13 @@ steps:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 68min
   timeout_in_minutes: 90
@@ -1268,6 +1309,9 @@ steps:
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
+  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+  # TODO: Remove when the bug is fixed in a future ROCm release
+  - export TORCH_NCCL_BLOCKING_WAIT=1
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -1417,8 +1461,22 @@ steps:
     - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
     - tests/v1/kv_connector/nixl_integration/
   commands:
-    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+    - VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+    - VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 ##### multi gpus test #####
 ##### A100 test #####
@@ -1490,7 +1548,7 @@ steps:
     - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
@@ -1514,7 +1572,7 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
 - label: LM Eval Large Models (4 Card)
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -1569,6 +1627,8 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
     - bash .buildkite/scripts/run-prime-rl-test.sh
+
+##### EPLB Accuracy Tests #####
 - label: DeepSeek V2-Lite Accuracy
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 9d0b3fdd3a02c4b5366aed2521ab84d436f7e3b0..1c7a5ca368867560f53a2cee6193a85c6016638c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -114,7 +114,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration Test (LLM) # 30min
   timeout_in_minutes: 40
@@ -132,7 +132,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
   timeout_in_minutes: 130
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
@@ -144,10 +144,26 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
   - pytest -v -s entrypoints/test_chat_utils.py
 
+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc
+  - pytest -v -s tool_use
+
 - label: Entrypoints Integration Test (Pooling)
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
@@ -161,6 +177,18 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/pooling
 
+- label: Entrypoints Integration Test (Responses API)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai/responses
+  commands:
+  - pytest -v -s entrypoints/openai/responses
+
 - label: Distributed Tests (4 GPUs) # 35min
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
@@ -303,7 +331,10 @@ steps:
     # TODO: accuracy does not match, whether setting
     # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
     - pytest -v -s v1/e2e
-    - pytest -v -s v1/engine
+    # Run this test standalone for now;
+    # need to untangle use (implicit) use of spawn/fork across the tests.
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
 
 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50
@@ -642,7 +673,7 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
   - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
@@ -654,7 +685,7 @@ steps:
   - vllm/model_executor/layers/quantization
   autorun_on_main: true
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
 - label: OpenAI API correctness # 22min
   timeout_in_minutes: 30
@@ -666,16 +697,6 @@ steps:
   commands: # LMEval+Transcription WER check
   - pytest -s entrypoints/openai/correctness/
 
-- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
-
 #####  models test  #####
 
 - label: Basic Models Tests (Initialization)
@@ -934,7 +955,6 @@ steps:
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
-  # optional: true
   source_file_dependencies:
   - csrc/quantization/fp4/
   - csrc/attention/mla/
@@ -946,8 +966,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py
@@ -1064,7 +1084,7 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
 
 #####  1 GPU test  #####
 #####  multi gpus test  #####
@@ -1096,17 +1116,18 @@ steps:
   - vllm/model_executor/models/
   - tests/distributed/
   - tests/examples/offline_inference/data_parallel.py
+  - .buildkite/scripts/run-multi-node-test.sh
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 68min
   timeout_in_minutes: 90
@@ -1258,8 +1279,19 @@ steps:
   commands:
     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
-- label: NixlConnector PD accuracy tests (Distributed) # 30min
-  timeout_in_minutes: 30
+- label: NixlConnector PD accuracy tests (Distributed) # 40min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
+  timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -1267,7 +1299,7 @@ steps:
     - tests/v1/kv_connector/nixl_integration/
   commands:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 
 ##### multi gpus test #####
@@ -1325,9 +1357,17 @@ steps:
     - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
+- label: LM Eval Large Models (H200) # optional
+  timeout_in_minutes: 60
+  gpu: h200
+  optional: true
+  num_gpus: 8
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
+
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
   gpu: b200
@@ -1350,6 +1390,7 @@ steps:
   - vllm/
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
+    - nvidia-smi
     - bash .buildkite/scripts/run-prime-rl-test.sh
 
 - label: DeepSeek V2-Lite Accuracy
@@ -1378,3 +1419,26 @@ steps:
   working_dir: "/vllm-workspace"
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+##### MoE Refactor (Temporary) Tests #####
+
+- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
+  
+- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
+
+- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 2cc90698d916ae0a7a231d5af4fa558491f42b4a..c88076bb528e8e1de0cd0f296561d9b15cd8b122 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -145,7 +145,7 @@ steps:
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
 - label: Distributed Tests (2 GPUs)(B200)
@@ -171,7 +171,7 @@ steps:
   - tests/distributed/
   - tests/examples/offline_inference/data_parallel.py
   commands:
-    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
 
 - label: Distributed NixlConnector PD accuracy (4 GPUs)
   timeout_in_minutes: 30
@@ -182,7 +182,7 @@ steps:
     - tests/v1/kv_connector/nixl_integration/
   commands:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: Pipeline + Context Parallelism (4 GPUs))
   timeout_in_minutes: 60
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
index 93d389815edacc6f3005ee594028c99b838eeeb5..2e0857986c3fa0b22369d2f9de49592a68f3fe6a 100644
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -32,6 +32,7 @@ steps:
 - label: Prime-RL Integration (2 GPUs)
   timeout_in_minutes: 30
   optional: true
+  soft_fail: true
   num_gpus: 2
   working_dir: "/vllm-workspace"
   source_file_dependencies:
@@ -39,21 +40,3 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
     - bash .buildkite/scripts/run-prime-rl-test.sh
-
-- label: DeepSeek V2-Lite Async EPLB Accuracy
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
-- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
index 0a789be943f374a82f323a450b0417f3544df352..8e02d9f60b4e9cbee9481b697a6c017a0f1e32d9 100644
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -10,7 +10,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration (LLM)
   timeout_in_minutes: 40
@@ -25,7 +25,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-- label: Entrypoints Integration (API Server)
+- label: Entrypoints Integration (API Server 1)
   timeout_in_minutes: 130
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -34,10 +34,24 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
   - pytest -v -s entrypoints/test_chat_utils.py
 
+- label: Entrypoints Integration (API Server 2)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/tool_use
+  - tests/entrypoints/sleep
+  - tests/entrypoints/instrumentator
+  - tests/entrypoints/rpc
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s tool_use
 
 - label: Entrypoints Integration (Pooling)
   timeout_in_minutes: 50
@@ -49,6 +63,14 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/pooling
 
+- label: Entrypoints Integration (Responses API)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai/responses
+  commands:
+  - pytest -v -s entrypoints/openai/responses
 
 - label: Entrypoints V1
   timeout_in_minutes: 50
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
index 7ca099516d64159fe1925407902443a2215769f3..cf4b646f349595b4759d0ac49d15d7d03c4a332f 100644
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -90,8 +90,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
index 9af43e0c375a8d77b9123b8409f471b4c702e7bb..e2498512bdef7cb59eaa493c5bca7bcf533c1822 100644
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -9,7 +9,7 @@ steps:
   - vllm/model_executor/layers/quantization
   autorun_on_main: true
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
 - label: LM Eval Large Models (4 GPUs)(A100)
   gpu: a100
@@ -43,4 +43,4 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
index 809b4138f44babebb6be1f76e17ed66ad230e261..59ade40cc8f520efee826709982840b34903f276 100644
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -22,6 +22,8 @@ steps:
     # FIXIT: find out which code initialize cuda before running the test
     # before the fix, we need to use spawn to test it
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # Alot of these tests are on the edge of OOMing
+    - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
     # There is some Tensor Parallelism related processing logic in LoRA that
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
index 39a5d51c48833556f2a08921a188059cbf7b8efb..2a86596a6d603ef093a4c991525098550ab97743 100644
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -9,6 +9,7 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/models/test_initialization.py
+  - tests/models/registry.py
   commands:
     # Run a subset of model initialization tests
     - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -20,6 +21,7 @@ steps:
   source_file_dependencies:
   - vllm/model_executor/models/
   - tests/models/test_initialization.py
+  - tests/models/registry.py
   commands:
     # Only when vLLM model source is modified - test initialization of a large
     # subset of supported models (the complement of the small subset in the above
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
index 703c82eb1a91bb4b887e57b2315dd20332705865..332d5202d83384be02ff273385f0cca253c5e4bb 100644
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -13,7 +13,9 @@ steps:
   # tests covered elsewhere.
   # Use `find` to launch multiple instances of pytest so that
   # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Fullgraph Smoke Test
   timeout_in_minutes: 30
diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml
deleted file mode 100644
index 69527a1214229e78b0c56aa6e79573125fbcd259..0000000000000000000000000000000000000000
--- a/.buildkite/test_areas/tool_use.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-group: Tool use
-depends_on: 
-  - image-build
-steps:
-- label: OpenAI-Compatible Tool Use
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d6447649cd89a18147eb099f276c6d9d6f5b86a7..c963be4cb8f92a5c7fcdbd1e2dffc781514280bd 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,7 +3,6 @@
 
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
@@ -15,6 +14,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
+/vllm/tool_parsers @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
 /vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
@@ -26,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -116,15 +117,15 @@ mkdocs.yaml @hmellor
 /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
 
 # Kernels
-/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/attention/ops/triton_unified_attention.py @tdoublep
+/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep
 
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
 /vllm/**/*rocm* @tjtanaa
 /docker/Dockerfile.rocm* @gshtras @tjtanaa
 /vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
 /vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
-/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
 /csrc/rocm @gshtras @tjtanaa
 /requirements/*rocm* @tjtanaa
@@ -152,7 +153,7 @@ mkdocs.yaml @hmellor
 /vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
-/vllm/model_executor/layers/pooler.py @noooop
+/vllm/model_executor/layers/pooler @noooop
 
 # Security guide and policies
 /docs/usage/security.md @russellb
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 3ad79f93bc7ad52fba91cd7ffb1925d25eef2107..a496dd302db507ae8a43e074ece96db51bd71939 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -222,10 +222,10 @@ pull_request_rules:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm
       - files~=^requirements/rocm.*\.txt
-      - files~=^vllm/attention/backends/rocm.*\.py
-      - files~=^vllm/attention/ops/rocm.*\.py
       - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+      - files~=^vllm/v1/attention/backends/rocm.*\.py
       - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+      - files~=^vllm/v1/attention/ops/rocm.*\.py
       - files~=^tests/kernels/.*_rocm.*\.py
       - files=vllm/platforms/rocm.py
       - title~=(?i)AMD
@@ -235,6 +235,20 @@ pull_request_rules:
       add:
         - rocm
 
+- name: label-cpu
+  description: Automatically apply cpu label
+  conditions:
+    - label != stale
+    - files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
+  actions:
+    label:
+      add:
+        - cpu
+    assign:
+      users:
+        - "fadara01"
+        - "aditew01"
+
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
@@ -335,6 +349,18 @@ pull_request_rules:
       add:
         - tool-calling
 
+- name: auto-rebase if approved, ready, and 40 commits behind main
+  conditions:
+    - base = main
+    - label=ready
+    - "#approved-reviews-by >= 1"
+    - "#commits-behind >= 40"
+    - -closed
+    - -draft
+    - -conflict
+  actions:
+    rebase: {}
+
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
     - label != stale
diff --git a/.gitignore b/.gitignore
index 7cda86478664fbc366454af101a132a8f06bd3e8..864542128c0508330efaeea3a99c4d380d4ba71d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -227,3 +227,8 @@ ep_kernels_workspace/
 
 # Allow tracked library source folders under submodules (e.g., benchmarks/lib)
 !vllm/benchmarks/lib/
+
+# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
+vllm/grpc/vllm_engine_pb2.py
+vllm/grpc/vllm_engine_pb2_grpc.py
+vllm/grpc/vllm_engine_pb2.pyi
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83c86f3591625c860af16f12073b3fdde7e407a0..98daf9aa7097c0cbfee802a5e1bf0ab18f148dad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -287,6 +287,7 @@ endif()
 set(VLLM_EXT_SRC
   "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
   "csrc/cache_kernels.cu"
+  "csrc/cache_kernels_fused.cu"
   "csrc/attention/paged_attention_v1.cu"
   "csrc/attention/paged_attention_v2.cu"
   "csrc/attention/merge_attn_states.cu"
@@ -365,6 +366,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # marlin arches for fp16 output
   cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # marlin has limited support for turing
+  cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
   cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   # marlin arches for fp8 input
@@ -372,8 +375,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
   # so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
   cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+  # marlin arches for other files
+  cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
 
-  if (MARLIN_ARCHS)
+  if (MARLIN_OTHER_ARCHS)
 
     #
     # For the Marlin kernels we automatically generate sources for various
@@ -414,25 +419,39 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       message(STATUS "Marlin generation script has not changed, skipping generation.")
     endif()
 
-    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    if (MARLIN_ARCHS)
+      file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+
+      file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
     endif()
-    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 
-    file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
-      CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    if (MARLIN_SM75_ARCHS) 
+      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
     endif()
-    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
 
     if (MARLIN_FP8_ARCHS) 
       file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
@@ -454,14 +473,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
     set_gencode_flags_for_srcs(
       SRCS "${MARLIN_SRCS}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
+      CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
     if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      set_source_files_properties(${MARLIN_SRCS}
         PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
     endif()
     list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
 
-    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
+    message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}")
   else()
     message(STATUS "Not building Marlin kernels as no compatible archs found"
                    " in CUDA target architectures")
@@ -789,24 +808,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   else()
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
-    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
-                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
-                     "if you intend on running FP8 quantized MoE models on Blackwell.")
-    else()
-      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-  endif()
 
   #
   # Machete kernels
@@ -989,12 +990,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # note that we always set `use_atomic_add=False` for moe marlin now,
   # so we don't need 9.0 for bf16 atomicAdd PTX
   cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # moe marlin has limited support for turing
+  cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # moe marlin arches for fp8 input
   # - sm80 doesn't support fp8 computation
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
   # so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
   cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
-  if (MARLIN_MOE_ARCHS)
+  # moe marlin arches for other files
+  cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
+  if (MARLIN_MOE_OTHER_ARCHS)
 
     #
     # For the Marlin MOE kernels we automatically generate sources for various
@@ -1035,16 +1040,29 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
     endif()
 
-    file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
-    list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_MOE_SRC}"
-      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_MOE_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    if (MARLIN_MOE_ARCHS)
+      file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
+    endif()
+
+    if (MARLIN_MOE_SM75_ARCHS) 
+      file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_SM75_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_SM75_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC})
     endif()
-    list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
 
     if (MARLIN_MOE_FP8_ARCHS)
       file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
@@ -1058,7 +1076,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
     endif()
 
-    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
+    set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_MOE_OTHER_SRC}"
+      CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_MOE_OTHER_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}")
+
+    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}")
   else()
     message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                    " in CUDA target architectures")
diff --git a/README_ORIGIN.md b/README_ORIGIN.md
index 7f08e9c16ab626bead5bd6e5621c81ca71bff5c8..4cab345f88d978678953222f76a87489e38cd0f0 100644
--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -14,51 +14,8 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
----
-Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
-
----
-
-*Latest News* 🔥
-
-- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
-- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
-- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
-- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
-- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
-- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
-- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
-- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
-- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
-- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
-
-<details>
-<summary>Previous News</summary>
-
-- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
-- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
-- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
-- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
-- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
-- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
-- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
-- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
-- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
-- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
-- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
-- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
-- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
-- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
-- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
-- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
-- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
-- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
-- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
-- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
-
-</details>
+🔥 We have built a vllm website to help you get started with vllm. Please visit [vllm.ai](https://vllm.ai) to learn more.
+For events, please visit [vllm.ai/events](https://vllm.ai/events) to join us.
 
 ---
 
@@ -118,50 +75,6 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 We welcome and value any contributions and collaborations.
 Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
 
-## Sponsors
-
-vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
-
-<!-- Note: Please sort them in alphabetical order. -->
-<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
-Cash Donations:
-
-- a16z
-- Dropbox
-- Sequoia Capital
-- Skywork AI
-- ZhenFund
-
-Compute Resources:
-
-- Alibaba Cloud
-- AMD
-- Anyscale
-- Arm
-- AWS
-- Crusoe Cloud
-- Databricks
-- DeepInfra
-- Google Cloud
-- IBM
-- Intel
-- Lambda Lab
-- Nebius
-- Novita AI
-- NVIDIA
-- Red Hat
-- Replicate
-- Roblox
-- RunPod
-- Trainy
-- UC Berkeley
-- UC San Diego
-- Volcengine
-
-Slack Sponsor: Anyscale
-
-We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
-
 ## Citation
 
 If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
@@ -182,7 +95,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
 - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
-- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
+- For collaborations and partnerships, please contact us at [collaboration@vllm.ai](mailto:collaboration@vllm.ai)
 <!-- --8<-- [end:contact-us] -->
 
 ## Media Kit
diff --git a/RELEASE.md b/RELEASE.md
index db0d51afc7be13b4da530430c33fa1c899956e41..dfd4fa1ae04d499663b4b315a9fc4988408cbfc9 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,47 +1,30 @@
 # Releasing vLLM
 
-vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
+vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via [PyPI](https://pypi.org/project/vllm). These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
 
-## Release Versioning
+## Release Cadence and Versioning
 
-vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
+We aim to have a regular release every 2 weeks. Since v0.12.0, regular releases increment the minor version rather than patch version. The list of past releases can be found [here](https://vllm.ai/releases).
 
-* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
-* _minor_ major features
-* _patch_ features and backwards-compatible bug fixes
-* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
+Our version numbers are expressed in the form `vX.Y.Z`, where `X` is the major version, `Y` is the minor version, and `Z` is the patch version. They are incremented according to the following rules:
 
-## Release Cadence
+* _Major_ releases are reserved for architectural milestones involving sweeping API changes, similar to PyTorch 2.0.
+* _Minor_ releases correspond to regular releases, which include new features, bug fixes and other backwards-compatible changes.
+* _Patch_ releases correspond to special releases for new models, as well as emergency patches for critical performance, functionality and security issues.
 
-Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
-Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
+This versioning scheme is similar to [SemVer](https://semver.org/) for compatibility purposes, except that backwards compatibility is only guaranteed for a limited number of minor releases (see our [deprecation policy](https://docs.vllm.ai/en/latest/contributing/deprecation_policy) for details).
 
-| Release Date | Patch release versions | Post Release versions |
-| --- | --- | --- |
-| Jan 2025 | 0.7.0 | --- |
-| Feb 2025 | 0.7.1, 0.7.2, 0.7.3  | --- |
-| Mar 2025 | 0.7.4, 0.7.5 | --- |
-| Apr 2025 | 0.7.6, 0.7.7 | --- |
-| May 2025 | 0.7.8, 0.7.9 | --- |
-| Jun 2025 | 0.7.10, 0.7.11 | --- |
-| Jul 2025 | 0.7.12, 0.7.13 | --- |
-| Aug 2025 | 0.7.14, 0.7.15 | --- |
-| Sep 2025 | 0.7.16, 0.7.17 | --- |
-| Oct 2025 | 0.7.18, 0.7.19 | --- |
-| Nov 2025 | 0.7.20, 0.7.21 | --- |
-| Dec 2025 | 0.7.22, 0.7.23 | --- |
-
-## Release branch
+## Release Branch
 
 Each release is built from a dedicated release branch.
 
-* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
-* For post releases, previously cut release branch is reused
-* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
-* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
-* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
+* For _major_ and _minor_ releases, the release branch cut is performed 1-2 days before release is live.
+* For _patch_ releases, previously cut release branch is reused.
+* Release builds are triggered via push to RC tag like `vX.Y.Z-rc1`. This enables us to build and test multiple RCs for each release.
+* Final tag: `vX.Y.Z` does not trigger the build but used for Release notes and assets.
+* After branch cut is created, we monitor the main branch for any reverts and apply these reverts to a release branch.
 
-## Release Cherry-Pick Criteria
+### Cherry-Pick Criteria
 
 After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
 
diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py
index b5c16c42de467abb26927a017003fc36c5c33e71..7473a41e51406dcb5b3e1a9a1ccfce41f10573fb 100755
--- a/benchmarks/benchmark_batch_invariance.py
+++ b/benchmarks/benchmark_batch_invariance.py
@@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant(
     random.seed(seed)
 
     # Set environment variables
-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
     if batch_invariant:
         os.environ["VLLM_BATCH_INVARIANT"] = "1"
     else:
@@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant(
             max_model_len=max_model_len,
             dtype="bfloat16",
             tensor_parallel_size=tp_size,
+            attention_config={"backend": backend},
             enable_prefix_caching=False,
         )
         init_time = time.perf_counter() - start_init
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index b5373d383b548eb97dad2ebb534b80096537776d..57a6c1aef5e78ee892a45d4267409c5d524ac4dd 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -135,7 +135,6 @@ def benchmark_batched_propose(args):
             block_sizes=[16],
         )
         dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
-        dummy_input_batch.spec_decode_unsupported_reqs = ()
         dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
         dummy_input_batch.token_ids_cpu = np.random.randint(
             0, 20, (args.num_req, args.num_token)
@@ -151,10 +150,8 @@ def benchmark_batched_propose(args):
             start = time.time()
             runner.drafter.propose(
                 sampled_token_ids,
-                dummy_input_batch.req_ids,
                 dummy_input_batch.num_tokens_no_spec,
                 dummy_input_batch.token_ids_cpu,
-                dummy_input_batch.spec_decode_unsupported_reqs,
             )
             end = time.time()
             print(f"Iteration time (s): {end - start}")
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index 67fccdf4fd07e3aaa0feb29c64c114f7fdbb738f..7720f15e45cc1535e3c195faf2752d618c42ee9d 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -343,7 +343,9 @@ def bench(
         return bench_int8(dtype, m, k, n, label, sub_label)
     if dtype == torch.float8_e4m3fn:
         return bench_fp8(dtype, m, k, n, label, sub_label)
-    raise ValueError("unsupported type")
+    raise ValueError(
+        f"Unsupported dtype {dtype}: should be one of torch.int8, torch.float8_e4m3fn."
+    )
 
 
 # runner
diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/bench_nvfp4_quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..7517376535925d6a4271b711e8b21cd3f806846b
--- /dev/null
+++ b/benchmarks/kernels/bench_nvfp4_quant.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import copy
+import itertools
+
+import torch
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+from vllm.triton_utils import triton
+from vllm.utils.flashinfer import flashinfer_fp4_quantize
+
+if not current_platform.has_device_capability(100):
+    raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")
+
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+PROVIDER_CFGS = {
+    "vllm": dict(backend="vllm", enabled=True),
+    "flashinfer": dict(backend="flashinfer", enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
+    """Compute global scale for FP4 quantization."""
+    amax = torch.abs(tensor).max().to(torch.float32)
+    return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="us (lower is better)",
+        plot_name="NVFP4 Input Quantization Latency (us)",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    # Create input tensor
+    a = torch.randn((M, K), device=device, dtype=dtype)
+
+    # Compute global scale for activation
+    a_global_scale = compute_global_scale(a)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    cfg = PROVIDER_CFGS[provider]
+
+    if cfg["backend"] == "vllm":
+        # vLLM's FP4 quantization
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: ops.scaled_fp4_quant(a, a_global_scale),
+            quantiles=quantiles,
+        )
+    elif cfg["backend"] == "flashinfer":
+        # FlashInfer's FP4 quantization
+        # Use is_sf_swizzled_layout=True to match vLLM's output format
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: flashinfer_fp4_quantize(
+                a, a_global_scale, is_sf_swizzled_layout=True
+            ),
+            quantiles=quantiles,
+        )
+
+    # Convert ms to us for better readability at small batch sizes
+    to_us = lambda t_ms: t_ms * 1000
+    return to_us(ms), to_us(max_ms), to_us(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
+    """Test accuracy between vLLM and FlashInfer FP4 quantization."""
+    # Create input tensor
+    a = torch.randn((M, K), device=device, dtype=dtype)
+
+    # Compute global scale
+    a_global_scale = compute_global_scale(a)
+
+    # vLLM quantization
+    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
+
+    # FlashInfer quantization (with swizzled layout to match vLLM's output)
+    flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
+        a, a_global_scale, is_sf_swizzled_layout=True
+    )
+    flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
+
+    # Compare outputs
+    torch.testing.assert_close(
+        vllm_fp4,
+        flashinfer_fp4,
+    )
+    print(f"M={M}, K={K}, dtype={dtype}: PASSED")
+
+
+def test_accuracy():
+    """Run accuracy tests across various shapes."""
+    print("\n" + "=" * 60)
+    print("Running accuracy tests: vLLM vs FlashInfer")
+    print("=" * 60)
+
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    # Test various batch sizes and hidden dimensions
+    Ms = [1, 1024]
+    Ks = [4096]
+
+    for M in Ms:
+        for K in Ks:
+            _test_accuracy_once(M, K, dtype, device)
+
+    print("\nAll accuracy tests passed!")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Benchmark NVFP4 quantization: vLLM vs FlashInfer"
+    )
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default=None,
+        help="Path to save benchmark results",
+    )
+    parser.add_argument(
+        "--accuracy",
+        action="store_true",
+        help="Run accuracy tests",
+    )
+    args = parser.parse_args()
+
+    if args.accuracy:
+        test_accuracy()
+
+    for K, N, model in prepare_shapes(args):
+        print(f"\n{model}, N={N} K={K}")
+        benchmark.run(
+            print_data=True,
+            save_path=args.save_path,
+            N=N,
+            K=K,
+        )
+
+    print("\nBenchmark finished!")
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
index 66268b71b3de644b424429aa66ad7dc6216b3c3a..fbe5f744148e3591e944ff8b89b85c891ee08aa6 100644
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -8,13 +8,12 @@ import torch
 
 import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
-from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+batch_size_range = [1, 16, 128]
+seq_len_range = [1, 16, 64, 1024, 4096]
 intermediate_size = [3072, 9728, 12288]
 configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
 
@@ -30,7 +29,7 @@ def benchmark_activation(
     device = "cuda"
     num_tokens = batch_size * seq_len
     dim = intermediate_size
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     torch.set_default_device(device)
 
     if func_name == "gelu_and_mul":
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
index e07d6c776bc00b838a0a8d99202fe85bc0e4935c..9c6edee7b26454c924a6686de7dd096c7b5621fa 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -6,15 +6,19 @@ kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
 but use different quantization strategies and backends.
 """
 
-import nvtx
 import torch
 
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.worker.workspace import init_workspace_manager
 
 # Weight shapes for different models: [num_experts, topk, hidden_size,
 # intermediate_size]
@@ -58,6 +62,7 @@ def bench_run(
     per_out_ch: bool,
     mkn: tuple[int, int, int],
 ):
+    init_workspace_manager(torch.cuda.current_device())
     (m, k, n) = mkn
 
     dtype = torch.half
@@ -120,85 +125,6 @@ def bench_run(
     # Force per-tensor quantization for all cases
     per_act_token = False
 
-    # Create stride tensors for CUTLASS
-    ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
-    ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
-    c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
-    c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
-
-    def run_triton_moe(
-        a: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        a1_scale: torch.Tensor,
-        a2_scale: torch.Tensor,
-        num_repeats: int,
-    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-            per_act_token_quant=per_act_token,
-            per_out_ch_quant=per_out_ch,
-        )
-
-        for _ in range(num_repeats):
-            fused_experts(
-                a,
-                w1,
-                w2,
-                topk_weights,
-                topk_ids,
-                quant_config=quant_config,
-            )
-
-    def run_cutlass_moe_fp8(
-        a: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        a1_scale: torch.Tensor,
-        a2_scale: torch.Tensor,
-        num_repeats: int,
-    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-            per_act_token_quant=per_act_token,
-            per_out_ch_quant=per_out_ch,
-        )
-
-        for _ in range(num_repeats):
-            with nvtx.annotate("cutlass_moe_fp8", color="blue"):
-                cutlass_moe_fp8(
-                    a=a,
-                    w1_q=w1,
-                    w2_q=w2,
-                    topk_weights=topk_weights,
-                    topk_ids=topk_ids,
-                    ab_strides1=ab_strides1,
-                    ab_strides2=ab_strides2,
-                    c_strides1=c_strides1,
-                    c_strides2=c_strides2,
-                    quant_config=quant_config,
-                    activation="silu",
-                    global_num_experts=num_experts,
-                )
-
     # Pre-create quantization config to avoid creating it inside CUDA graph
     quant_config = fp8_w8a8_moe_quant_config(
         w1_scale=w1_scale,
@@ -209,23 +135,30 @@ def bench_run(
         per_out_ch_quant=per_out_ch,
     )
 
+    fn = mk.FusedMoEModularKernel(
+        MoEPrepareAndFinalizeNoEP(),
+        CutlassExpertsFp8(
+            out_dtype=a.dtype,
+            e=num_experts,
+            n=n,
+            k=k,
+            quant_config=quant_config,
+            device=w1.device,
+        ),
+    )
+
     # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
     cutlass_stream = torch.cuda.Stream()
     cutlass_graph = torch.cuda.CUDAGraph()
     with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
         # Capture 10 invocations like benchmark_moe.py
         for _ in range(10):
-            cutlass_moe_fp8(
-                a=a,
-                w1_q=w1_fp8q_cutlass,
-                w2_q=w2_fp8q_cutlass,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                ab_strides1=ab_strides1,
-                ab_strides2=ab_strides2,
-                c_strides1=c_strides1,
-                c_strides2=c_strides2,
-                quant_config=quant_config,
+            fn(
+                a,
+                w1_fp8q_cutlass,
+                w2_fp8q_cutlass,
+                topk_weights,
+                topk_ids,
                 activation="silu",
                 global_num_experts=num_experts,
             )
@@ -297,6 +230,10 @@ def bench_run(
 
 
 def main(args):
+    # Initialize workspace manager (required for CUTLASS MoE kernels)
+    device = torch.device("cuda:0")
+    init_workspace_manager(device)
+
     print("Benchmarking models:")
     for i, model in enumerate(args.models):
         print(f"[{i}]  {model}")
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
similarity index 92%
rename from benchmarks/kernels/benchmark_cutlass_fp4_moe.py
rename to benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
index 7982cbb1422c5e4dda0f8aff07a3bad91f85ad0a..10a3e3eab5fd397ec6a148290397b97f3d1db17d 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -11,16 +11,23 @@ import nvtx
 import torch
 import torch.utils.benchmark as benchmark
 
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
     fp8_w8a8_moe_quant_config,
     nvfp4_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
+from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+    CutlassExpertsFp4,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.scalar_type import scalar_types
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.worker.workspace import init_workspace_manager
 
 WEIGHT_SHAPES_MOE = {
     "nvidia/DeepSeek-R1-FP4": [
@@ -187,19 +194,24 @@ def bench_run(
             g1_alphas=w1_gs,
             g2_alphas=w2_gs,
         )
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            CutlassExpertsFp4(
+                out_dtype=dtype,
+                max_experts_per_worker=e,
+                quant_config=quant_config,
+            ),
+        )
+
         for _ in range(num_repeats):
             with nvtx.annotate("cutlass_moe_fp4", color="green"):
-                cutlass_moe_fp4(
-                    a=a,
-                    w1_fp4=w1_fp4,
-                    w2_fp4=w2_fp4,
+                kernel(
+                    hidden_states=a,
+                    w1=w1_fp4,
+                    w2=w2_fp4,
                     topk_weights=topk_weights,
                     topk_ids=topk_ids,
-                    m=m,
-                    n=n,
-                    k=k,
-                    e=num_experts,
-                    quant_config=quant_config,
                 )
 
     def run_cutlass_from_graph(
@@ -229,20 +241,24 @@ def bench_run(
             g2_alphas=w2_gs,
         )
 
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            CutlassExpertsFp4(
+                out_dtype=dtype,
+                max_experts_per_worker=e,
+                quant_config=quant_config,
+            ),
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
-            return cutlass_moe_fp4(
-                a=a,
-                w1_fp4=w1_fp4,
-                w2_fp4=w2_fp4,
+            return kernel(
+                hidden_states=a,
+                w1=w1_fp4,
+                w2=w2_fp4,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                m=m,
-                n=n,
-                k=k,
-                e=num_experts,
-                quant_config=quant_config,
             )
 
     def run_triton_from_graph(
@@ -441,6 +457,10 @@ def bench_run(
 
 
 def main(args):
+    # Initialize workspace manager (required for CUTLASS MoE kernels)
+    device = torch.device("cuda:0")
+    init_workspace_manager(device)
+
     print("Benchmarking models:")
     for i, model in enumerate(args.models):
         print(f"[{i}]  {model}")
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
index b414efa6e330bca5f1f5e5604c60fa357d9803bc..7b453fe7b6809957fabd9bfb772ecec98ee55999 100644
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -293,7 +293,7 @@ class CommunicatorBenchmark:
                     graph = torch.cuda.CUDAGraph()
                     graph_pool = torch.cuda.graph_pool_handle()
                     set_graph_pool_id(graph_pool)
-                    with torch.cuda.graph(graph, pool=graph_pool):
+                    with torch.cuda.graph(graph, pool=graph_pool, stream=stream):
                         for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                             allreduce_fn(graph_input)
 
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index 9b426d8d5f778a483aab12fa82d451460b940274..b30a1263878bed39fef449fc4521053bb0a3136f 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -5,15 +5,20 @@ import torch
 import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES_MOE
 
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,
     fused_topk,
 )
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.worker.workspace import init_workspace_manager
 
 DEFAULT_MODELS = [
     "mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -44,6 +49,7 @@ def bench_run(
     per_out_ch: bool,
     mkn: tuple[int, int, int],
 ):
+    init_workspace_manager(torch.cuda.current_device())
     label = "Quant Matmul"
 
     sub_label = (
@@ -81,11 +87,6 @@ def bench_run(
         a, score, topk, renormalize=False
     )
 
-    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
-    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
-    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-
     def run_triton_moe(
         a: torch.Tensor,
         w1: torch.Tensor,
@@ -119,10 +120,6 @@ def bench_run(
         w2: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         per_act_token: bool,
@@ -134,31 +131,29 @@ def bench_run(
             per_act_token_quant=per_act_token,
         )
 
-        for _ in range(num_repeats):
-            cutlass_moe_fp8(
-                a,
-                w1,
-                w2,
-                topk_weights,
-                topk_ids,
-                ab_strides1,
-                ab_strides2,
-                c_strides1,
-                c_strides2,
+        fn = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            CutlassExpertsFp8(
+                out_dtype=a.dtype,
+                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+                e=w2.shape[0],
+                n=w2.shape[2],
+                k=w2.shape[1],
                 quant_config=quant_config,
-            )
+                device=w1.device,
+            ),
+        )
+
+        for _ in range(num_repeats):
+            fn(a, w1, w2, topk_weights, topk_ids)
 
     def run_cutlass_from_graph(
         a: torch.Tensor,
         a_scale: torch.Tensor,
-        w1_q: torch.Tensor,
-        w2_q: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
@@ -168,21 +163,23 @@ def bench_run(
             per_act_token_quant=per_act_token,
         )
 
+        fn = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            CutlassExpertsFp8(
+                out_dtype=a.dtype,
+                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+                e=w2.shape[0],
+                n=w2.shape[2],
+                k=w2.shape[1],
+                quant_config=quant_config,
+                device=w1.device,
+            ),
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
-            return cutlass_moe_fp8(
-                a,
-                w1_q,
-                w2_q,
-                topk_weights,
-                topk_ids,
-                ab_strides1,
-                ab_strides2,
-                c_strides1,
-                c_strides2,
-                quant_config=quant_config,
-            )
+            return fn(a, w1, w2, topk_weights, topk_ids)
 
     def run_triton_from_graph(
         a: torch.Tensor,
@@ -226,10 +223,6 @@ def bench_run(
             w2_q,
             w1_scale,
             w2_scale,
-            ab_strides1,
-            ab_strides2,
-            c_strides1,
-            c_strides2,
             topk_weights,
             topk_ids,
         )
@@ -267,10 +260,6 @@ def bench_run(
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
         "per_act_token": per_act_token,
-        "ab_strides1": ab_strides1,
-        "ab_strides2": ab_strides2,
-        "c_strides1": c_strides1,
-        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,
@@ -329,10 +318,6 @@ def bench_run(
         w2_q,
         w1_scale,
         w2_scale,
-        ab_strides1,
-        ab_strides2,
-        c_strides1,
-        c_strides2,
         topk_weights,
         topk_ids,
         per_act_token,
@@ -341,7 +326,7 @@ def bench_run(
 
     results.append(
         benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -364,6 +349,10 @@ def bench_run(
 
 
 def main(args):
+    # Initialize workspace manager (required for CUTLASS MoE kernels)
+    device = torch.device("cuda:0")
+    init_workspace_manager(device)
+
     print("Benchmarking models:")
     for i, model in enumerate(args.models):
         print(f"[{i}]  {model}")
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 6fa5c248670e32cdd672e45e016d3c47d9689577..2292d2f87288f267082e7106e20270f6f8e17bbf 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -6,9 +6,8 @@ import time
 import torch
 
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 
 
 @torch.inference_mode()
@@ -22,7 +21,7 @@ def main(
     num_warmup_iters: int = 5,
     num_iters: int = 100,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device("cuda")
 
     layer = RMSNorm(hidden_size).to(dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 67b033ca7b708fd0d364bcea7709e9d2dd38eaa6..35f00afbc4f9cf6acc03c32b69dcfb3d3ef7c1ee 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
+import gc
 import json
 import os
 import time
@@ -22,10 +23,49 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
 
 # 移除全局的 current_platform 导入，改为在需要时局部导入
 # FP8_DTYPE = current_platform.fp8_dtype()
 
+# Default interval for clearing Triton JIT cache during tuning
+# Set to 0 to disable automatic cache clearing
+_CACHE_CLEAR_INTERVAL_ENV = "VLLM_MOE_TUNE_CACHE_CLEAR_INTERVAL"
+TRITON_CACHE_CLEAR_INTERVAL = int(os.environ.get(_CACHE_CLEAR_INTERVAL_ENV, "50"))
+
+
+def clear_triton_cache():
+    """Clear Triton JIT compilation cache and Python/CUDA memory.
+
+    This helps prevent OOM during tuning with large models (many experts).
+    """
+    # Force Python garbage collection
+    gc.collect()
+
+    # Clear CUDA memory cache
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    # Try to clear Triton's runtime cache
+    try:
+        if (
+            hasattr(triton, "runtime")
+            and hasattr(triton.runtime, "cache")
+            and hasattr(triton.runtime.cache, "clear")
+        ):
+            triton.runtime.cache.clear()
+    except ImportError:
+        # Triton not installed, skip cache clearing
+        pass
+    except AttributeError:
+        # Triton version doesn't have expected cache API
+        pass
+    except Exception as e:
+        print(f"Warning: Failed to clear Triton cache: {e}")
+
+    # Additional garbage collection after clearing caches
+    gc.collect()
+
 
 def ensure_divisibility(numerator, denominator, text):
     """Ensure that numerator is divisible by the denominator."""
@@ -454,7 +494,8 @@ class BenchmarkWorker:
             pass
         else:
             torch.set_default_device("cuda:"+ str(device_id))
-        current_platform.seed_everything(seed)
+
+        set_random_seed(seed)
         self.seed = seed
         # Store the logical device ID for Ray
         self.device_id = device_id
@@ -475,7 +516,10 @@ class BenchmarkWorker:
     ) -> tuple[dict[str, int], float]:
         # 局部导入 current_platform
         from vllm.platforms import current_platform
-        current_platform.seed_everything(self.seed)
+        
+        from vllm.model_executor.layers.fused_moe.fused_moe import  get_moe_configs, get_default_config
+
+        set_random_seed(self.seed)
         dtype_str = _get_config_dtype_str(
             dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
         )
@@ -560,7 +604,7 @@ class BenchmarkWorker:
                 need_device_guard = True
 
         with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
-            for config in tqdm(search_space):
+            for idx, config in enumerate(tqdm(search_space)):
                 try:
                     kernel_time = benchmark_config(
                         config,
@@ -583,6 +627,19 @@ class BenchmarkWorker:
                 if kernel_time < best_time:
                     best_time = kernel_time
                     best_config = config
+
+                # Periodically clear Triton JIT cache to prevent OOM
+                # This is especially important for large models with many experts
+                if (
+                    TRITON_CACHE_CLEAR_INTERVAL > 0
+                    and idx > 0
+                    and idx % TRITON_CACHE_CLEAR_INTERVAL == 0
+                ):
+                    clear_triton_cache()
+
+        # Final cleanup after tuning completes
+        clear_triton_cache()
+
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
         assert best_config is not None
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index b8913a217c608c5b4109d4f2181dae20e72a4d0b..77b77a15b53af8cb63867bc3f88505a8f6f838de 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -18,6 +18,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
@@ -261,7 +262,7 @@ def benchmark_unpermute(
 class BenchmarkWorker:
     def __init__(self, seed: int) -> None:
         torch.set_default_device("cuda")
-        current_platform.seed_everything(seed)
+        set_random_seed(seed)
         self.seed = seed
         # Get the device ID to allocate tensors and kernels
         # on the respective GPU. This is required for Ray to work
@@ -279,7 +280,7 @@ class BenchmarkWorker:
         use_int8_w8a16: bool,
         use_customized_permute: bool = False,
     ) -> tuple[dict[str, int], float]:
-        current_platform.seed_everything(self.seed)
+        set_random_seed(self.seed)
 
         permute_time = benchmark_permute(
             num_tokens,
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index 09de5fa822f86802df996390852b4319e72f4598..3e03651357784bfd3c1d539a3eeafd76c54d311a 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -37,9 +37,9 @@ import numpy as np
 import torch
 
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -94,7 +94,7 @@ def benchmark_mrope(
     benchmark_iter: int = 100,
     csv_writer=None,
 ):
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     # the parameters to compute the q k v size based on tp_size
     mrope_helper_class = get_rope(
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 9affd4c182ff6ea134af30ba488fab5dd64d7c72..b844f1e29fd16156e099e536e8acd0db3b8e93d8 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -13,6 +13,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
     create_kv_caches_with_random,
+    set_random_seed,
 )
 import vllm.envs as envs
 
@@ -39,7 +40,7 @@ def main(
     device: str = "cuda",
     kv_cache_dtype: str | None = None,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     scale = float(1.0 / (head_size**0.5))
     query = torch.empty(
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 3c2ac9128947af0f33aad790b811bc3f56afae10..9a21cfe94e5be1d69114fe049a6f8167eaf36592 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -6,9 +6,8 @@ import time
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 
 
 @torch.inference_mode()
@@ -23,7 +22,7 @@ def main(
     num_warmup_iters: int = 5,
     num_iters: int = 100,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device("cuda")
 
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py
index 0d3aef0c630b284feb8a8b4e3e35cdfa62237f39..99067d8ac3710fc7f86dcd3017b3a8ea218426de 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -8,11 +8,11 @@ from tabulate import tabulate
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
     create_kv_caches_with_random,
+    set_random_seed,
 )
 
 logger = init_logger(__name__)
@@ -36,7 +36,7 @@ def run_benchmark(
     if kv_cache_dtype == "fp8" and head_size % 16:
         raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
 
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     torch.set_default_device(device)
 
     # create random key / value tensors [T, H, D].
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index 12f17ea575d9448b1cd7ad3e900e4a86a7594703..ef6be1f3c3597c9d4922b6bba8ad4128fecfbd0a 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -7,15 +7,15 @@ import torch
 from tabulate import tabulate
 
 from vllm import _custom_ops as ops
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
     create_kv_caches_with_random_flash,
+    set_random_seed,
+)
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
 )
 
 logger = init_logger(__name__)
@@ -49,7 +49,7 @@ def run_benchmark(
     if implementation == "triton" and kv_cache_layout == "HND":
         return float("nan")  # Triton does not support HND layout yet.
 
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     torch.set_default_device(device)
 
     # create random key / value tensors [T, H, D].
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index de01ff197eab7593be32056b2cca1c9a0a3fe060..da32bc30cb2ae3b385b79c852334f1594a4fe52d 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -23,9 +23,9 @@ import torch
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
     persistent_masked_m_silu_mul_quant,
 )
-from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+from vllm.utils.torch_utils import set_random_seed
 
 
 @triton.jit
@@ -207,7 +207,7 @@ def benchmark(
 ):
     def generate_data(seed_offset=0):
         """Generate input data with given seed offset"""
-        current_platform.seed_everything(42 + seed_offset)
+        set_random_seed(42 + seed_offset)
         y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
 
         if gen_strategy == "random_imbalanced":
diff --git a/benchmarks/kernels/cpu/benchmark_cpu_attn.py b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..30b86039537390c27a43e9f66b723b806fd06d21
--- /dev/null
+++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
@@ -0,0 +1,272 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import functools
+import time
+
+import numpy as np
+import torch
+
+from vllm._custom_ops import (
+    cpu_attention_with_kv_cache,
+    cpu_attn_get_scheduler_metadata,
+    cpu_attn_reshape_and_cache,
+)
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
+
+
+def get_attn_isa(
+    block_size: int | None = None,
+    dtype: torch.dtype | None = None,
+):
+    if block_size and dtype:
+        return _get_attn_isa(dtype, block_size)
+    else:
+        if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
+            return "neon"
+        elif torch._C._cpu._is_amx_tile_supported():
+            return "amx"
+        else:
+            return "vec"
+
+
+# rand number generation takes too much time, cache rand tensors
+@functools.lru_cache(maxsize=128, typed=False)
+def tensor_cache(
+    elem_num: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    tensor = torch.randn(elem_num, dtype=dtype)
+    return tensor
+
+
+@torch.inference_mode()
+def main(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int = None,
+    dtype: torch.dtype = torch.bfloat16,
+    block_size: int = 128,
+    num_blocks: int = 4096,
+    use_sink: bool = False,
+    enable_kv_split: bool = False,
+    isa: str | None = None,
+    seed: int = 0,
+    iters: int = 20,
+) -> None:
+    current_platform.seed_everything(seed)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
+    scale = head_size**-0.5
+    token_num = sum(query_lens)
+
+    if isa is None:
+        isa = get_attn_isa(block_size, dtype)
+
+    s_aux = (
+        15 * torch.rand((num_query_heads,), dtype=torch.bfloat16) if use_sink else None
+    )
+
+    query = tensor_cache(
+        elem_num=token_num * num_query_heads * head_size,
+        dtype=dtype,
+    )
+    query = query.view(
+        token_num,
+        num_query_heads,
+        head_size,
+    )
+
+    key_value = tensor_cache(
+        elem_num=2 * num_blocks * num_kv_heads * block_size * head_size,
+        dtype=dtype,
+    )
+    key_value = key_value.view(
+        2,
+        num_blocks,
+        block_size,
+        num_kv_heads,
+        head_size,
+    )
+    key_cache, value_cache = key_value.unbind(0)
+
+    # KV cache for CPU attention
+    packed_key_cache = torch.empty(
+        num_blocks, num_kv_heads, block_size, head_size, dtype=dtype
+    )
+    packed_value_cache = torch.empty_like(packed_key_cache)
+
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    # use reshape_and_cache to pack key_cache and value_cache
+    slot_mapping = torch.arange(0, num_blocks * block_size, dtype=torch.int64)
+    cpu_attn_reshape_and_cache(
+        key=key_cache.view(-1, num_kv_heads, head_size),
+        value=value_cache.view(-1, num_kv_heads, head_size),
+        key_cache=packed_key_cache,
+        value_cache=packed_value_cache,
+        slot_mapping=slot_mapping,
+        isa=isa,
+    )
+
+    metadata = cpu_attn_get_scheduler_metadata(
+        num_reqs=num_seqs,
+        num_heads=num_query_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_size,
+        seq_lens=kv_lens_tensor,
+        dtype=dtype,
+        query_start_loc=cu_query_lens,
+        causal=True,
+        sliding_window_size=sliding_window if sliding_window is not None else -1,
+        isa=isa,
+        enable_kv_split=enable_kv_split,
+    )
+
+    out_with_split = torch.empty_like(query)
+
+    def run_benchmark(iters: int) -> list[float]:
+        times = []
+        for _ in range(iters):
+            start_time = time.perf_counter_ns()
+            cpu_attention_with_kv_cache(
+                query=query,
+                key_cache=packed_key_cache,
+                value_cache=packed_value_cache,
+                output=out_with_split,
+                query_start_loc=cu_query_lens,
+                seq_lens=kv_lens_tensor,
+                scale=scale,
+                causal=True,
+                alibi_slopes=None,
+                sliding_window=window_size,
+                block_table=block_tables,
+                softcap=0,
+                scheduler_metadata=metadata,
+                s_aux=s_aux,
+            )
+            end_time = time.perf_counter_ns()
+            times.append((end_time - start_time) / 1e6)
+        return times
+
+    # warmup
+    run_benchmark(5)
+    # benchmark
+    times = run_benchmark(iters)
+
+    time_min = min(times)
+    time_max = max(times)
+    time_mean = np.mean(times)
+    time_std = np.std(times)
+
+    print("\tmin (ms) = ", time_min)
+    print("\tmax (ms) = ", time_max)
+    print("\tmean (ms) = ", time_mean)
+    print("\tstd = ", time_std)
+    print("\tmedian (ms) = ", np.median(times))
+
+
+def generate_seq_lens(
+    batch_size: int,
+    q_len_min: int,
+    q_len_max: int,
+    kv_len_min: int,
+    kv_len_max: int,
+    seed: int = 0,
+) -> list[tuple[int, int]]:
+    assert 1 <= q_len_min <= q_len_max
+    assert 1 <= kv_len_min <= kv_len_max
+    assert kv_len_max >= q_len_min
+
+    g = torch.Generator(device="cpu").manual_seed(seed)
+
+    def rint(lo: int, hi: int) -> int:
+        return torch.randint(lo, hi + 1, (1,), generator=g).item()
+
+    seq_lens: list[tuple[int, int]] = []
+    for _ in range(batch_size):
+        # ensure q <= kv
+        kv = rint(max(kv_len_min, q_len_min), kv_len_max)
+        q = rint(q_len_min, min(q_len_max, kv))
+        seq_lens.append((q, kv))
+
+    return seq_lens
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.")
+    parser.add_argument("--batch-size", type=int, default=64)
+    parser.add_argument("--q-len-min", type=int, default=512)
+    parser.add_argument("--q-len-max", type=int, default=512)
+    parser.add_argument("--kv-len-min", type=int, default=512)
+    parser.add_argument("--kv-len-max", type=int, default=512)
+    parser.add_argument("--num-blocks", type=int, default=4096)
+
+    parser.add_argument("--sliding-window", type=int, default=None)
+    parser.add_argument("--num-query-heads", type=int, default=32)
+    parser.add_argument("--num-kv-heads", type=int, default=8)
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=CPUAttentionBackend.get_supported_head_sizes(),
+        default=128,
+    )
+    parser.add_argument("--enable-kv-split", action="store_true")
+    parser.add_argument("--block-size", type=int, choices=[32, 64, 128], default=128)
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+    )
+    parser.add_argument("--use-sink", action="store_true")
+    parser.add_argument(
+        "--isa", type=str, choices=["vec", "neon", "amx", "vec16"], default=None
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--iters", type=int, default=20)
+
+    args = parser.parse_args()
+    print(args)
+
+    seq_lens = generate_seq_lens(
+        args.batch_size,
+        args.q_len_min,
+        args.q_len_max,
+        args.kv_len_min,
+        args.kv_len_max,
+        args.seed,
+    )
+
+    print("batch (query len, kv len) = ", seq_lens)
+
+    main(
+        seq_lens=seq_lens,
+        num_heads=(args.num_query_heads, args.num_kv_heads),
+        head_size=args.head_size,
+        sliding_window=args.sliding_window,
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        block_size=args.block_size,
+        num_blocks=args.num_blocks,
+        use_sink=args.use_sink,
+        enable_kv_split=args.enable_kv_split,
+        isa=args.isa
+        if args.isa is not None
+        else get_attn_isa(args.block_size, STR_DTYPE_TO_TORCH_DTYPE[args.dtype]),
+        seed=args.seed,
+        iters=args.iters,
+    )
diff --git a/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..186b79ede0861bde51578d0704b419d4b50389ad
--- /dev/null
+++ b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import sys
+import time
+
+import numpy as np
+import torch
+
+from vllm.platforms import current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+# Check if CPU MoE operations are available
+try:
+    from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
+except (ImportError, AttributeError) as e:
+    print("ERROR: CPU fused MoE operations are not available on this platform.")
+    print("This benchmark requires x86 CPU with proper vLLM CPU extensions compiled.")
+    print(
+        "The cpu_fused_moe kernel is typically available on Linux x86_64 "
+        "with AVX2/AVX512."
+    )
+    print(f"Import error: {e}")
+    sys.exit(1)
+
+# ISA selection following test_cpu_fused_moe.py pattern
+ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
+
+
+@torch.inference_mode()
+def main(
+    batch_size: int,
+    expert_num: int,
+    hidden_size: int,
+    intermediate_size: int,
+    topk_num: int,
+    use_bias: bool = False,
+    dtype: torch.dtype = torch.bfloat16,
+    activation: str = "silu",
+    isa: str = "vec",
+    seed: int = 0,
+    iters: int = 20,
+) -> None:
+    current_platform.seed_everything(seed)
+    # up_dim = 2 * intermediate_size for gate + up projection
+    up_dim = 2 * intermediate_size
+
+    input_tensor = torch.randn((batch_size, hidden_size), dtype=dtype) / (
+        0.5 * hidden_size**0.5
+    )
+
+    w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
+        0.5 * hidden_size**0.5
+    )
+    w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
+        0.5 * intermediate_size**0.5
+    )
+
+    w13_bias = None
+    w2_bias = None
+    if use_bias:
+        w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5)
+        w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
+            0.5 * hidden_size**0.5
+        )
+
+    router_logits = torch.randn((batch_size, expert_num), dtype=dtype)
+    score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
+    topk_weights, topk_ids = torch.topk(score, topk_num)
+    topk_ids = topk_ids.to(torch.int32)
+
+    packed_w13 = cpu_prepack_moe_weight(w13, isa)
+    packed_w2 = cpu_prepack_moe_weight(w2, isa)
+
+    def run_benchmark(iters: int) -> list[float]:
+        times = []
+        for _ in range(iters):
+            start_time = time.perf_counter_ns()
+            _ = cpu_fused_moe(
+                input_tensor,
+                packed_w13,
+                packed_w2,
+                w13_bias,
+                w2_bias,
+                topk_weights,
+                topk_ids,
+                activation,
+                isa,
+            )
+            end_time = time.perf_counter_ns()
+            times.append((end_time - start_time) / 1e6)
+        return times
+
+    # warmup
+    run_benchmark(5)
+    # benchmark
+    times = run_benchmark(iters)
+
+    if not times:
+        print("No iterations to measure. Set --iters > 0.")
+        return
+
+    time_min = min(times)
+    time_max = max(times)
+    time_mean = np.mean(times)
+    time_std = np.std(times)
+
+    print("\tmin (ms) = ", time_min)
+    print("\tmax (ms) = ", time_max)
+    print("\tmean (ms) = ", time_mean)
+    print("\tstd = ", time_std)
+    print("\tmedian (ms) = ", np.median(times))
+
+    # Calculate throughput metrics
+    # FLOPs estimation: 2 * batch * topk * (hidden * up_dim + intermediate * hidden)
+    flops_per_token = (
+        2 * topk_num * (hidden_size * up_dim + intermediate_size * hidden_size)
+    )
+    total_flops = batch_size * flops_per_token
+    tflops = total_flops / (time_mean * 1e-3) / 1e12
+    print(f"\tthroughput (TFLOP/s) = {tflops:.4f}")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the CPU fused MoE kernel.")
+    parser.add_argument("--batch-size", type=int, default=64)
+    parser.add_argument("--expert-num", type=int, default=8)
+    parser.add_argument("--hidden-size", type=int, default=2880)
+    parser.add_argument("--intermediate-size", type=int, default=2880)
+    parser.add_argument(
+        "--topk-num",
+        type=int,
+        default=None,
+        help="Number of experts to route each token to (default: expert_num // 2)",
+    )
+    parser.add_argument("--use-bias", action="store_true")
+    parser.add_argument(
+        "--activation",
+        type=str,
+        choices=["silu", "swigluoai"],
+        default="silu",
+        help="Activation function",
+    )
+    parser.add_argument(
+        "--isa",
+        type=str,
+        choices=ISA_CHOICES,
+        default=ISA_CHOICES[0],
+        help=f"ISA to use (available: {ISA_CHOICES})",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--iters", type=int, default=20)
+
+    args = parser.parse_args()
+
+    # Default topk_num to expert_num // 2, minimum 1
+    topk_num = (
+        args.topk_num if args.topk_num is not None else max(args.expert_num // 2, 1)
+    )
+
+    print(args)
+
+    main(
+        batch_size=args.batch_size,
+        expert_num=args.expert_num,
+        hidden_size=args.hidden_size,
+        intermediate_size=args.intermediate_size,
+        topk_num=topk_num,
+        use_bias=args.use_bias,
+        dtype=torch.bfloat16,  # Following test_cpu_fused_moe.py
+        activation=args.activation,
+        isa=args.isa,
+        seed=args.seed,
+        iters=args.iters,
+    )
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 85b286f8d8d0a18575592a41e8f93b6653eaa397..0af87fd7f0b5338df2bc3ce6e84d18cc55d24de9 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -330,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
         PUBLIC ${oneDNN_BINARY_DIR}/include
         PRIVATE ${oneDNN_SOURCE_DIR}/src
     )
-    target_link_libraries(dnnl_ext dnnl)
+    target_link_libraries(dnnl_ext dnnl torch)
     target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
     list(APPEND LIBS dnnl_ext)
     set(USE_ONEDNN ON)
@@ -358,13 +358,13 @@ set(VLLM_EXT_SRC
     "csrc/cpu/pos_encoding.cpp"
     "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
     "csrc/cpu/cpu_attn.cpp"
-    "csrc/cpu/scratchpad_manager.cpp"
     "csrc/cpu/torch_bindings.cpp")
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     set(VLLM_EXT_SRC
         "csrc/cpu/shm.cpp"
         "csrc/cpu/cpu_wna16.cpp"
+        "csrc/cpu/cpu_fused_moe.cpp"
         ${VLLM_EXT_SRC})
     if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
         set(VLLM_EXT_SRC
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index 2cf3c1a755d3c0b86623121eefb481644280c8a3..0d4f9b7aa07c8912697b9c8b482699cc446aa0b1 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # sm90a
 
 set(SUPPORT_ARCHS)
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
-    list(APPEND SUPPORT_ARCHS 9.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+    list(APPEND SUPPORT_ARCHS "9.0a")
 endif()
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
-    list(APPEND SUPPORT_ARCHS 10.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
+    # CUDA 12.9 has introduced "Family-Specific Architecture Features"
+    # this supports all compute_10x family
+    list(APPEND SUPPORT_ARCHS "10.0f")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    list(APPEND SUPPORT_ARCHS "10.0a")
 endif()
 
 
 cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
 if(FLASH_MLA_ARCHS)
+    message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
     set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
     list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
 
@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
         $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
         $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 else()
-    # Create empty targets for setup.py when not targeting sm90a systems
+    message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
+    # Create empty targets for setup.py on unsupported systems
     add_custom_target(_flashmla_C)
     add_custom_target(_flashmla_extension_C)
 endif()
diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake
index 5a59a409999ad5d673eea5b3a67648bba2c43c5e..84bb1b00c1bba0fecb96ad2193587d9e52967040 100644
--- a/cmake/external_projects/qutlass.cmake
+++ b/cmake/external_projects/qutlass.cmake
@@ -31,10 +31,15 @@ if(NOT qutlass_SOURCE_DIR)
 endif()
 message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
 
-cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a;10.3a" "${CUDA_ARCHS}")
+endif()
+
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND QUTLASS_ARCHS)
 
-  if(QUTLASS_ARCHS MATCHES "10\\.0a")
+  if(QUTLASS_ARCHS MATCHES "10\\.(0a|3a|0f)")
     set(QUTLASS_TARGET_CC 100)
   elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
     set(QUTLASS_TARGET_CC 120)
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index ff687e0af7b44d2caa1a83a98958a176f907dbf3..b51934a3ab29ac76b000ee5c5ba48b56ac86364e 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
+          GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 0ff4d6a4cd709e5b96175ce2c828d2f4c475f941..f30ec0c08a89ffe9a0e02b1ebfb11cbcb8f1bf89 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -15,19 +15,61 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
                                             const scalar_t& y) {
   return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
 }
-// Activation and gating kernel template.
 
+// Check if all pointers are 16-byte aligned for int4 vectorized access
+__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
+  return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
+}
+
+// Activation and gating kernel template.
 template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
           bool act_first>
 __global__ void act_and_mul_kernel(
     scalar_t* __restrict__ out,          // [..., d]
     const scalar_t* __restrict__ input,  // [..., 2, d]
     const int d) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
   const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
+  const scalar_t* x_ptr = input + token_idx * 2 * d;
+  const scalar_t* y_ptr = x_ptr + d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access.
+  // All three pointers must be 16-byte aligned for safe int4 operations.
+  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+                       is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
+    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+      auto* xp = reinterpret_cast<scalar_t*>(&x);
+      auto* yp = reinterpret_cast<scalar_t*>(&y);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
+                                                        VLLM_LDG(&y_ptr[i]));
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
+    }
   }
 }
 
@@ -120,50 +162,115 @@ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
 __global__ void act_and_mul_kernel_with_param(
     scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
     const float param) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
   const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x, param) * y;
+  const scalar_t* x_ptr = input + token_idx * 2 * d;
+  const scalar_t* y_ptr = x_ptr + d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access
+  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+                       is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
+    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+      auto* xp = reinterpret_cast<scalar_t*>(&x);
+      auto* yp = reinterpret_cast<scalar_t*>(&y);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = ACT_FN(xp[j], param) * yp[j];
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] = ACT_FN(x, param) * y;
+    }
   }
 }
 
 template <typename T>
 __device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
                                                float alpha, float limit) {
-  // clamp gate: min=None, max=limit
-  const float gate_f = (float)gate;
-  const float clamped_gate = gate_f > limit ? limit : gate_f;
-
-  // clamp up: min=-limit, max=limit
-  const float up_f = (float)up;
-  const float clamped_up =
-      up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
-
-  // glu = gate * sigmoid(gate * alpha)
-  const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
-  const float glu = clamped_gate * sigmoid_val;
-
-  // (up + 1) * glu
-  return (T)((clamped_up + 1.0f) * glu);
+  // Clamp gate to (-inf, limit] and up to [-limit, limit]
+  const float g = fminf((float)gate, limit);
+  const float u = fmaxf(fminf((float)up, limit), -limit);
+  // glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu
+  return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha)));
 }
 
+// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...].
 template <typename scalar_t,
           scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
                              const float)>
 __global__ void swigluoai_and_mul_kernel(
     scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const scalar_t* __restrict__ input,  // [..., 2 * d] (interleaved)
     const int d, const float alpha, const float limit) {
+  // For interleaved data: input has 2*d elements per token (gate/up pairs)
+  // output has d elements per token
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
+  constexpr int PAIRS = VEC_SIZE / 2;  // Number of gate/up pairs per int4 load
   const int64_t token_idx = blockIdx.x;
-  // TODO: Vectorize loads and stores.
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    // gate = x[..., ::2]  (even indices)
-    const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
-    // up = x[..., 1::2]   (odd indices)
-    const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
-
-    out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
+  const scalar_t* in_ptr = input + token_idx * 2 * d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access on input.
+  // For output we use int2 (64-bit) which has 8-byte alignment requirement.
+  const bool in_aligned = is_16byte_aligned(in_ptr);
+  const bool out_aligned =
+      (reinterpret_cast<uintptr_t>(out_ptr) & 7) == 0;  // 8-byte for int2
+
+  if (in_aligned && out_aligned && d >= PAIRS) {
+    // Fast path: vectorized loop
+    // Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs
+    // Each int2 store writes PAIRS output elements
+    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
+    int2* out_vec = reinterpret_cast<int2*>(out_ptr);
+    const int num_vecs = d / PAIRS;
+    const int vec_end = num_vecs * PAIRS;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 v = VLLM_LDG(&in_vec[i]);
+      int2 r;
+      auto* vp = reinterpret_cast<scalar_t*>(&v);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < PAIRS; j++) {
+        rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]),
+                          VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit);
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      // gate = x[..., ::2]  (even indices)
+      const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]);
+      // up = x[..., 1::2]   (odd indices)
+      const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]);
+      out_ptr[idx] = ACT_FN(gate, up, alpha, limit);
+    }
   }
 }
 
@@ -217,10 +324,41 @@ __global__ void activation_kernel(
     scalar_t* __restrict__ out,          // [..., d]
     const scalar_t* __restrict__ input,  // [..., d]
     const int d) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
   const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x);
+  const scalar_t* in_ptr = input + token_idx * d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access
+  const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 v = VLLM_LDG(&in_vec[i]), r;
+      auto* vp = reinterpret_cast<scalar_t*>(&v);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = ACT_FN(vp[j]);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&in_ptr[idx]);
+      out_ptr[idx] = ACT_FN(x);
+    }
   }
 }
 
diff --git a/csrc/cache.h b/csrc/cache.h
index ee96a33ec8d95ada81ea72c79d107d43296cc8b2..bf007dabf6c170775f38ba8152495fcac81a0d28 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -9,16 +9,6 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                  const torch::Tensor& block_mapping);
 
-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping);
-
-void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
-                     const torch::Tensor& block_mapping);
-
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,
@@ -43,6 +33,13 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
                           const std::string& kv_cache_dtype,
                           torch::Tensor& scale);
 
+// NOTE: k_pe and kv_c order is flipped compared to concat_and_cache_mla
+void concat_and_cache_mla_rope_fused(
+    torch::Tensor& positions, torch::Tensor& q_pe, torch::Tensor& k_pe,
+    torch::Tensor& kv_c, torch::Tensor& rope_cos_sin_cache, bool rope_is_neox,
+    torch::Tensor& kv_cache_slot_mapping, torch::Tensor& kv_cache,
+    const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale);
+
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                  const double scale, const std::string& kv_cache_dtype);
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 0112cea6a65bb657bd133a241f2a93e6cf44da0e..e4b95c2795fa78518952b6527f3b40e3302fc340 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -124,94 +124,6 @@ __global__ void copy_blocks_mla_kernel(
 
 }  // namespace vllm
 
-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping) {
-  int num_layers = key_caches.size();
-  TORCH_CHECK(num_layers == value_caches.size());
-  if (num_layers == 0) {
-    return;
-  }
-  torch::Device cache_device = key_caches[0].device();
-  TORCH_CHECK(cache_device.is_cuda());
-
-  // Create data structures for the kernel.
-  // Create an array of pointers to the key and value caches.
-  int64_t key_cache_ptrs[num_layers];
-  int64_t value_cache_ptrs[num_layers];
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    key_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
-    value_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
-  }
-
-  // block_mapping is a 2D tensor with shape (num_pairs, 2).
-  int num_pairs = block_mapping.size(0);
-
-  // Move the data structures to the GPU.
-  // NOTE: This synchronizes the CPU and GPU.
-  torch::Tensor key_cache_ptrs_tensor =
-      torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
-          .to(cache_device);
-  torch::Tensor value_cache_ptrs_tensor =
-      torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
-          .to(cache_device);
-
-  // Launch the kernel.
-  const int numel_per_block = key_caches[0][0].numel();
-  dim3 grid(num_layers, num_pairs);
-  dim3 block(std::min(1024, numel_per_block));
-  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
-        vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            key_cache_ptrs_tensor.data_ptr<int64_t>(),
-            value_cache_ptrs_tensor.data_ptr<int64_t>(),
-            block_mapping.data_ptr<int64_t>(), numel_per_block);
-      }));
-}
-
-// copy blocks kernel for MLA (assumes a joint KV-cache)
-void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
-                     const torch::Tensor& block_mapping) {
-  int num_layers = kv_caches.size();
-  if (num_layers == 0) {
-    return;
-  }
-  torch::Device cache_device = kv_caches[0].device();
-  TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
-
-  std::vector<int64_t> cache_ptrs(num_layers);
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
-  }
-  torch::Tensor cache_ptrs_tensor =
-      torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
-          .to(cache_device);
-
-  int num_pairs = block_mapping.size(0);
-  // We use the stride instead of numel in case the cache is padded for memory
-  // alignment reasons, we assume the blocks data (inclusive of any padding)
-  // is contiguous in memory
-  int mem_footprint_per_block = kv_caches[0].stride(0);
-  dim3 grid(num_layers, num_pairs);
-  dim3 block(std::min(1024, mem_footprint_per_block));
-  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
-        vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            cache_ptrs_tensor.data_ptr<int64_t>(),
-            block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
-      }));
-}
-
 namespace vllm {
 
 // Used to copy/convert one element
@@ -770,9 +682,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
   for (int i = 0; i < VEC_SIZE; i++) {
     amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
   }
-#ifndef USE_ROCM
-  __syncwarp();
-#endif
 
   // Reduced amax
   for (int mask = 16; mask > 0; mask /= 2) {
@@ -782,9 +691,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
     amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
 #endif
   }
-#ifndef USE_ROCM
-  __syncwarp();
-#endif
+
 #if defined(__gfx942__)
   float scale = fmaxf(amax, 1e-4) / 224.0f;
 #else
diff --git a/csrc/cache_kernels_fused.cu b/csrc/cache_kernels_fused.cu
new file mode 100644
index 0000000000000000000000000000000000000000..be037b2fdec2be66a67c4a61144a0e45b009fc7a
--- /dev/null
+++ b/csrc/cache_kernels_fused.cu
@@ -0,0 +1,279 @@
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+
+#include "quantization/w8a8/fp8/common.cuh"
+#ifdef USE_ROCM
+  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
+#else
+  #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
+#endif
+
+#ifdef USE_ROCM
+  #include <hip/hip_bf16.h>
+typedef __hip_bfloat16 __nv_bfloat16;
+#endif
+
+namespace vllm {
+
+// NOTE Be EXTRA careful with raw_kv_scalar_t, for __half and __nv_bfloat16 it's
+// using u16 as the backing type.
+template <typename qk_t, bool IS_NEOX, typename raw_kv_scalar_t,
+          typename cache_t, Fp8KVCacheDataType kv_dt>
+__global__ void concat_and_cache_mla_rope_fused_kernel(
+    const int64_t* __restrict__ positions,  // [num_tokens]
+    qk_t* __restrict__ q_pe,        // [num_tokens, num_q_heads, rot_dim]
+    qk_t* __restrict__ k_pe,        // [num_tokens, rot_dim]
+    const qk_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
+    const qk_t* __restrict__ rope_cos_sin_cache,  // [max_position, 2,
+                                                  // rot_dim // 2]
+    const int rot_dim, const int64_t q_pe_stride_token,
+    const int64_t q_pe_stride_head, const int64_t k_pe_stride,
+    const int64_t kv_c_stride, const int num_q_heads,
+    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank +
+                                     // rot_dim)]
+    const int64_t* __restrict__ kv_cache_slot_mapping,  // [num_tokens]
+    const int block_stride, const int entry_stride, const int kv_lora_rank,
+    const int block_size, const float* kv_cache_quant_scale) {
+  // Each thread block is responsible for one token.
+  const int64_t token_idx = blockIdx.x;
+  const int64_t pos = positions[token_idx];
+
+  const qk_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
+
+  const int embed_dim = rot_dim / 2;
+
+  // Q ROPE
+  const int nq = num_q_heads * embed_dim;
+  for (int i = threadIdx.x; i < nq; i += blockDim.x) {
+    int head_idx = i / embed_dim;
+    int pair_idx = i % embed_dim;
+
+    // NOTE: Would be nice to have interleaved sin/cos so we could just load
+    // both at the same time.
+    qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
+    qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+
+    qk_t* q_pe_head_ptr =
+        q_pe + token_idx * q_pe_stride_token + head_idx * q_pe_stride_head;
+
+    int pair_idx_x, pair_idx_y;
+    if constexpr (IS_NEOX) {
+      // GPT-NeoX style rotary embedding.
+      pair_idx_x = pair_idx;
+      pair_idx_y = embed_dim + pair_idx;
+    } else {
+      // GPT-J style rotary embedding.
+      pair_idx_x = pair_idx * 2;
+      pair_idx_y = pair_idx * 2 + 1;
+    }
+
+    qk_t x_src = q_pe_head_ptr[pair_idx_x];
+    qk_t y_src = q_pe_head_ptr[pair_idx_y];
+
+    qk_t x_dst = x_src * cos - y_src * sin;
+    qk_t y_dst = y_src * cos + x_src * sin;
+
+    q_pe_head_ptr[pair_idx_x] = x_dst;
+    q_pe_head_ptr[pair_idx_y] = y_dst;
+  }
+
+  const int64_t slot_idx = kv_cache_slot_mapping[token_idx];
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t entry_idx = slot_idx % block_size;
+
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0) {
+    return;
+  }
+
+  // K with 1 HEAD
+  for (int i = threadIdx.x; i < embed_dim; i += blockDim.x) {
+    int pair_idx = i;
+
+    qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
+    qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+
+    qk_t* k_pe_head_ptr = k_pe + token_idx * k_pe_stride;
+
+    int pair_idx_x, pair_idx_y;
+    if constexpr (IS_NEOX) {
+      // GPT-NeoX style rotary embedding.
+      pair_idx_x = pair_idx;
+      pair_idx_y = embed_dim + pair_idx;
+    } else {
+      // GPT-J style rotary embedding.
+      pair_idx_x = pair_idx * 2;
+      pair_idx_y = pair_idx * 2 + 1;
+    }
+
+    qk_t x_src = k_pe_head_ptr[pair_idx_x];
+    qk_t y_src = k_pe_head_ptr[pair_idx_y];
+
+    qk_t x_dst = x_src * cos - y_src * sin;
+    qk_t y_dst = y_src * cos + x_src * sin;
+
+    k_pe_head_ptr[pair_idx_x] = x_dst;
+    k_pe_head_ptr[pair_idx_y] = y_dst;
+
+    // NOTE Why is this monster necessary?
+    // When K is of type float16, the actual template replacement for
+    // raw_kv_scalar_t with be u16. That's why it's used at the last moment
+    // otherwise CUDA ALU would break.
+    const raw_kv_scalar_t raw_x_value =
+        *reinterpret_cast<const raw_kv_scalar_t*>(&x_dst);
+    const raw_kv_scalar_t raw_y_value =
+        *reinterpret_cast<const raw_kv_scalar_t*>(&y_dst);
+
+    cache_t* kv_cache_ptr = kv_cache + block_idx * block_stride +
+                            entry_idx * entry_stride + kv_lora_rank;
+
+    // MLA Cache Store
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      kv_cache_ptr[pair_idx_x] = raw_x_value;
+      kv_cache_ptr[pair_idx_y] = raw_y_value;
+    } else {
+      kv_cache_ptr[pair_idx_x] =
+          fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
+              raw_x_value, *kv_cache_quant_scale);
+      kv_cache_ptr[pair_idx_y] =
+          fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
+              raw_y_value, *kv_cache_quant_scale);
+    }
+  }
+
+  // NOPE
+  for (int i = threadIdx.x; i < kv_lora_rank; i += blockDim.x) {
+    const qk_t* src_ptr = kv_c + token_idx * kv_c_stride + i;
+    const raw_kv_scalar_t src_value =
+        *reinterpret_cast<const raw_kv_scalar_t*>(src_ptr);
+
+    cache_t* kv_cache_ptr =
+        kv_cache + block_idx * block_stride + entry_idx * entry_stride;
+
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      kv_cache_ptr[i] = src_value;
+    } else {
+      kv_cache_ptr[i] = fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
+          src_value, *kv_cache_quant_scale);
+    }
+  }
+}
+
+}  // namespace vllm
+
+#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE)      \
+  do {                                                                         \
+    VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] {   \
+      using qk_t = scalar_t;                                                   \
+      if (rope_is_neox) {                                                      \
+        vllm::concat_and_cache_mla_rope_fused_kernel<qk_t, true, RAW_KV_T,     \
+                                                     CACHE_T, KV_DTYPE>        \
+            <<<grid, block, 0, stream>>>(                                      \
+                positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),          \
+                k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),                  \
+                rope_cos_sin_cache.data_ptr<qk_t>(), rot_dim,                  \
+                q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
+                num_q_heads, reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),  \
+                kv_cache_slot_mapping.data_ptr<int64_t>(), block_stride,       \
+                entry_stride, kv_lora_rank, block_size,                        \
+                kv_cache_quant_scale.data_ptr<float>());                       \
+      } else {                                                                 \
+        vllm::concat_and_cache_mla_rope_fused_kernel<qk_t, false, RAW_KV_T,    \
+                                                     CACHE_T, KV_DTYPE>        \
+            <<<grid, block, 0, stream>>>(                                      \
+                positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),          \
+                k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),                  \
+                rope_cos_sin_cache.data_ptr<qk_t>(), rot_dim,                  \
+                q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
+                num_q_heads, reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),  \
+                kv_cache_slot_mapping.data_ptr<int64_t>(), block_stride,       \
+                entry_stride, kv_lora_rank, block_size,                        \
+                kv_cache_quant_scale.data_ptr<float>());                       \
+      }                                                                        \
+    });                                                                        \
+  } while (false)
+
+// Executes RoPE on q_pe and k_pe, then writes k_pe and kv_c in the kv cache.
+// q_pe and k_pe are modified in place.
+// Replaces DeepseekScalingRotaryEmbedding.self.rotary_emb and
+// concat_and_cache_mla.
+void concat_and_cache_mla_rope_fused(
+    torch::Tensor& positions,           // [num_tokens]
+    torch::Tensor& q_pe,                // [num_tokens, num_q_heads, rot_dim]
+    torch::Tensor& k_pe,                // [num_tokens, rot_dim]
+    torch::Tensor& kv_c,                // [num_tokens, kv_lora_rank]
+    torch::Tensor& rope_cos_sin_cache,  // [max_position, rot_dim]
+    bool rope_is_neox,
+    torch::Tensor&
+        kv_cache_slot_mapping,  // [num_tokens] or [num_actual_tokens]
+    torch::Tensor&
+        kv_cache,  // [num_blocks, block_size, (kv_lora_rank + rot_dim)]
+    const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale) {
+  const int64_t num_tokens = q_pe.size(0);
+
+  const int num_q_heads = q_pe.size(1);
+  const int rot_dim = q_pe.size(2);
+  const int kv_lora_rank = kv_c.size(1);
+
+  TORCH_CHECK(positions.size(0) >=
+              num_tokens);  // CUDA Graphs might pad this for us
+  TORCH_CHECK_EQ(positions.dim(), 1);
+  TORCH_CHECK_EQ(positions.scalar_type(), c10::ScalarType::Long);
+
+  TORCH_CHECK_EQ(q_pe.size(0), num_tokens);
+  TORCH_CHECK_EQ(q_pe.size(1), num_q_heads);
+  TORCH_CHECK_EQ(q_pe.size(2), rot_dim);
+  TORCH_CHECK_EQ(q_pe.dim(), 3);
+
+  TORCH_CHECK_EQ(k_pe.size(0), num_tokens);
+  TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
+  TORCH_CHECK_EQ(k_pe.dim(), 2);
+  TORCH_CHECK_EQ(k_pe.scalar_type(), q_pe.scalar_type());
+
+  TORCH_CHECK_EQ(kv_c.size(0), num_tokens);
+  TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank);
+  TORCH_CHECK_EQ(kv_c.dim(), 2);
+  TORCH_CHECK_EQ(kv_c.scalar_type(), q_pe.scalar_type());
+  TORCH_CHECK_EQ(kv_c.dtype(), q_pe.dtype());
+
+  TORCH_CHECK_EQ(rope_cos_sin_cache.size(1), rot_dim);
+  TORCH_CHECK_EQ(rope_cos_sin_cache.scalar_type(), q_pe.scalar_type());
+
+  TORCH_CHECK_EQ(kv_cache_slot_mapping.size(0), num_tokens);
+  TORCH_CHECK_EQ(kv_cache_slot_mapping.scalar_type(), c10::ScalarType::Long);
+
+  TORCH_CHECK_EQ(kv_cache.size(2), kv_lora_rank + rot_dim);
+  TORCH_CHECK_EQ(kv_cache.dim(), 3);
+
+  TORCH_CHECK_EQ(kv_cache_quant_scale.numel(), 1);
+  TORCH_CHECK_EQ(kv_cache_quant_scale.scalar_type(), c10::ScalarType::Float);
+
+  int64_t q_pe_stride_token = q_pe.stride(0);
+  int64_t q_pe_stride_head = q_pe.stride(1);
+
+  int64_t k_pe_stride = k_pe.stride(0);
+  int64_t kv_c_stride = kv_c.stride(0);
+
+  int block_size = kv_cache.size(1);
+
+  int block_stride = kv_cache.stride(0);
+  int entry_stride = kv_cache.stride(1);
+
+  int rope_block_size = std::min(num_q_heads * rot_dim / 2, 512);
+  int mla_block_size = kv_lora_rank;
+  int thread_block_size =
+      std::min(std::max(rope_block_size, mla_block_size), 512);
+
+  dim3 grid(num_tokens, 1, 1);
+  dim3 block(thread_block_size, 1, 1);
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(positions));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                             CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED);
+}
diff --git a/csrc/cpu/cpu_attn_macros.h b/csrc/cpu/cpu_arch_macros.h
similarity index 97%
rename from csrc/cpu/cpu_attn_macros.h
rename to csrc/cpu/cpu_arch_macros.h
index 35716a0790ab354fa77b3b04f5581d10c0f7f566..c73b62ecdec901f4cf543bc12176aa6645b2a7dc 100644
--- a/csrc/cpu/cpu_attn_macros.h
+++ b/csrc/cpu/cpu_arch_macros.h
@@ -1,5 +1,5 @@
-#ifndef CPU_ATTN_MACROS_H
-#define CPU_ATTN_MACROS_H
+#ifndef CPU_ARCH_MACROS_H
+#define CPU_ARCH_MACROS_H
 
 // x86_64
 #ifdef __x86_64__
@@ -26,7 +26,7 @@
           _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));                  \
       const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);                   \
       const int n_mantissa_bits = 23;                                          \
-      auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__((              \
+      auto fast_exp = [&](const vec_op::FP32Vec16& vec) __attribute__((        \
                           always_inline)) {                                    \
         __m512 values = vec.reg;                                               \
         auto less_ln_flt_min_mask =                                            \
@@ -98,7 +98,7 @@
       poly = vbslq_f32(hi_mask, inf, poly);                                    \
       return vbslq_f32(lo_mask, zero, poly);                                   \
     };                                                                         \
-    auto fast_exp = [&](vec_op::FP32Vec16& vec)                                \
+    auto fast_exp = [&](const vec_op::FP32Vec16& vec)                          \
                         __attribute__((always_inline)) {                       \
                           float32x4x4_t result;                                \
                           result.val[0] = neon_expf(vec.reg.val[0]);           \
@@ -110,4 +110,4 @@
 
 #endif  // __aarch64__
 
-#endif
\ No newline at end of file
+#endif
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index 02c722ba031a4c2168740c96794c4a040b0a259e..374fc2ee6ddcc756d5d9ededdabd56ad8571e62f 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -15,6 +15,7 @@
 
 #ifdef __aarch64__
   #include "cpu_attn_neon.hpp"
+  // NEON requires head_dim to be a multiple of 32
   #define NEON_DISPATCH(...)                                                   \
     case cpu_attention::ISA::NEON: {                                           \
       using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
@@ -36,7 +37,9 @@
     switch (HEAD_DIM) {                                         \
       CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                   \
       CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__)                   \
       CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__)                  \
       CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                  \
       CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                  \
       CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                  \
diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp
index 8da458b99119c31667ff875eeb947e5979f65968..78be05e8dc8261adf5c3bfb090cfe9dc2a9adbf0 100644
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
       const int32_t q_heads_per_kv, const int64_t q_num_stride,
       const int64_t q_head_stride, const float scale) {
     constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
-    static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
+    // static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
     constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
     constexpr int64_t head_elem_num_pre_block =
         AMX_TILE_ROW_BYTES / sizeof(scalar_t);
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index e3e077b845f4f992e48af87fcc42943225a53d53..08d208e05a62c89353b72e5555435e786bc0c288 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -8,10 +8,8 @@
   #include <sys/sysctl.h>
 #endif
 
-#include "cpu_types.hpp"
-#include "scratchpad_manager.h"
-#include "cpu_attn_macros.h"
-#include "utils.hpp"
+#include "cpu/cpu_arch_macros.h"
+#include "cpu/utils.hpp"
 
 namespace cpu_attention {
 enum class ISA { AMX, VEC, VEC16, NEON };
@@ -378,12 +376,13 @@ class AttentionScheduler {
 
   static constexpr int32_t MaxQTileIterNum = 128;
 
-  AttentionScheduler() : available_cache_size_(get_available_l2_size()) {}
+  AttentionScheduler()
+      : available_cache_size_(cpu_utils::get_available_l2_size()) {}
 
   torch::Tensor schedule(const ScheduleInput& input) const {
     const bool casual = input.casual;
     const int32_t thread_num = omp_get_max_threads();
-    const int64_t cache_size = get_available_l2_size();
+    const int64_t cache_size = cpu_utils::get_available_l2_size();
     const int32_t max_num_q_per_iter = input.max_num_q_per_iter;
     const int32_t kv_len_alignment = input.kv_block_alignment;
     int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv;
@@ -659,7 +658,7 @@ class AttentionScheduler {
             metadata_ptr->thread_num +
         metadata_ptr->reduction_scratchpad_size_per_kv_head *
             (use_gqa ? input.num_heads_kv : input.num_heads_q);
-    DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
+    cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(
         scratchpad_size);
 
     // metadata_ptr->print();
@@ -667,7 +666,7 @@ class AttentionScheduler {
     // test out of boundary access
     // {
     //     float* cache_ptr =
-    //     DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<float>();
+    //     cpu_utils::ScratchPadManager::getl_scratchpad_manager()->get_data<float>();
     //     for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) {
     //         cache_ptr[i] = std::numeric_limits<float>::quiet_NaN();
     //     }
@@ -749,27 +748,6 @@ class AttentionScheduler {
     return std::max(rounded_tile_size, round_size);
   }
 
-  static int64_t get_available_l2_size() {
-    static int64_t size = []() {
-#if defined(__APPLE__)
-      // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
-      int64_t l2_cache_size = 0;
-      size_t len = sizeof(l2_cache_size);
-      if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
-          l2_cache_size > 0) {
-        return l2_cache_size >> 1;  // use 50% of L2 cache
-      }
-      // Fallback if sysctlbyname fails
-      return 128LL * 1024 >> 1;  // use 50% of 128KB
-#else
-      long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
-      TORCH_CHECK_NE(l2_cache_size, -1);
-      return l2_cache_size >> 1;  // use 50% of L2 cache
-#endif
-    }();
-    return size;
-  }
-
  private:
   int64_t available_cache_size_;
 };
@@ -1402,7 +1380,7 @@ class AttentionMainLoop {
 
       // init buffers
       void* scratchpad_ptr =
-          DNNLScratchPadManager::get_dnnl_scratchpad_manager()
+          cpu_utils::ScratchPadManager::get_scratchpad_manager()
               ->get_data<void>();
       AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr);
 
@@ -1422,8 +1400,7 @@ class AttentionMainLoop {
         }
       }
 
-      const int64_t available_cache_size =
-          AttentionScheduler::get_available_l2_size();
+      const int64_t available_cache_size = cpu_utils::get_available_l2_size();
       const int32_t default_tile_size =
           AttentionScheduler::calcu_default_tile_size(
               available_cache_size, head_dim, sizeof(kv_cache_t),
diff --git a/csrc/cpu/cpu_attn_neon.hpp b/csrc/cpu/cpu_attn_neon.hpp
index 827f0cfbc718ed97ce6d3ac2ed765d4ef9ee0f80..e9ecd1d32904e34b7dbaf4e85d17dfed245088ab 100644
--- a/csrc/cpu/cpu_attn_neon.hpp
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -264,7 +264,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
   constexpr static ISA ISAType = ISA::NEON;
   constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
 
-  static_assert(HeadDim % HeadDimAlignment == 0);
+  //  static_assert(HeadDim % HeadDimAlignment == 0);
   // the gemm micro kernel is Mx8
   static_assert(HeadDimAlignment % 8 == 0);
   static_assert(BlockSizeAlignment % 8 == 0);
diff --git a/csrc/cpu/cpu_fused_moe.cpp b/csrc/cpu/cpu_fused_moe.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..090e2d4cd4b56504602af6594003f767344ed993
--- /dev/null
+++ b/csrc/cpu/cpu_fused_moe.cpp
@@ -0,0 +1,727 @@
+#include "cpu/cpu_types.hpp"
+#include "cpu/utils.hpp"
+#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp"
+#include "cpu/cpu_arch_macros.h"
+
+#ifdef CPU_CAPABILITY_AMXBF16
+  #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
+  #define AMX_DISPATCH(...)                                                    \
+    case cpu_utils::ISA::AMX: {                                                \
+      using gemm_t = cpu_micro_gemm::MicroGemm<cpu_utils::ISA::AMX, scalar_t>; \
+      return __VA_ARGS__();                                                    \
+    }
+#else
+  #define AMX_DISPATCH(...) case cpu_utils::ISA::AMX:
+#endif
+
+#define CPU_ISA_DISPATCH_IMPL(ISA_TYPE, ...)                          \
+  [&] {                                                               \
+    switch (ISA_TYPE) {                                               \
+      AMX_DISPATCH(__VA_ARGS__)                                       \
+      case cpu_utils::ISA::VEC: {                                     \
+        using gemm_t =                                                \
+            cpu_micro_gemm::MicroGemm<cpu_utils::ISA::VEC, scalar_t>; \
+        return __VA_ARGS__();                                         \
+      }                                                               \
+      default: {                                                      \
+        TORCH_CHECK(false, "Invalid CPU ISA type.");                  \
+      }                                                               \
+    }                                                                 \
+  }()
+
+namespace {
+enum class FusedMOEAct { SiluAndMul, SwigluOAIAndMul };
+
+FusedMOEAct get_act_type(const std::string& act) {
+  if (act == "silu") {
+    return FusedMOEAct::SiluAndMul;
+  } else if (act == "swigluoai") {
+    return FusedMOEAct::SwigluOAIAndMul;
+  } else {
+    TORCH_CHECK(false, "Invalid act type: " + act);
+  }
+}
+
+template <typename scalar_t>
+void swigluoai_and_mul(float* __restrict__ input, scalar_t* __restrict__ output,
+                       const int32_t m_size, const int32_t n_size,
+                       const int32_t input_stride,
+                       const int32_t output_stride) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  // For GPT-OSS interleaved gate-up weights
+  alignas(64) static int32_t index[16] = {0,  2,  4,  6,  8,  10, 12, 14,
+                                          16, 18, 20, 22, 24, 26, 28, 30};
+  vec_op::INT32Vec16 index_vec(index);
+  vec_op::FP32Vec16 gate_up_max_vec(7.0);
+  vec_op::FP32Vec16 up_min_vec(-7.0);
+  vec_op::FP32Vec16 alpha_vec(1.702);
+  vec_op::FP32Vec16 one_vec(1.0);
+
+  DEFINE_FAST_EXP
+
+  for (int32_t m = 0; m < m_size; ++m) {
+    for (int32_t n = 0; n < n_size; n += 32) {
+      vec_op::FP32Vec16 gate_vec(input + n, index_vec);
+      vec_op::FP32Vec16 up_vec(input + n + 1, index_vec);
+      gate_vec = gate_vec.min(gate_up_max_vec);
+      up_vec = up_vec.clamp(up_min_vec, gate_up_max_vec);
+      auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec * alpha_vec));
+      auto glu = gate_vec * sigmoid_vec;
+      auto gated_output_fp32 = (one_vec + up_vec) * glu;
+      scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32);
+      gated_output.save(output + n / 2);
+    }
+    input += input_stride;
+    output += output_stride;
+  }
+}
+
+template <typename scalar_t>
+void silu_and_mul(float* __restrict__ input, scalar_t* __restrict__ output,
+                  const int32_t m_size, const int32_t n_size,
+                  const int32_t input_stride, const int32_t output_stride) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  const int32_t dim = n_size / 2;
+  float* __restrict__ gate = input;
+  float* __restrict__ up = input + dim;
+  vec_op::FP32Vec16 one_vec(1.0);
+
+  DEFINE_FAST_EXP
+
+  for (int32_t m = 0; m < m_size; ++m) {
+    for (int32_t n = 0; n < dim; n += 16) {
+      vec_op::FP32Vec16 gate_vec(gate + n);
+      vec_op::FP32Vec16 up_vec(up + n);
+      auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec));
+      auto silu = gate_vec * sigmoid_vec;
+      auto gated_output_fp32 = up_vec * silu;
+      scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32);
+      gated_output.save(output + n);
+    }
+    gate += input_stride;
+    up += input_stride;
+    output += output_stride;
+  }
+}
+
+template <typename scalar_t>
+FORCE_INLINE void apply_gated_act(const FusedMOEAct act,
+                                  float* __restrict__ input,
+                                  scalar_t* __restrict__ output,
+                                  const int32_t m, const int32_t n,
+                                  const int32_t input_stride,
+                                  const int32_t output_stride) {
+  switch (act) {
+    case FusedMOEAct::SwigluOAIAndMul:
+      swigluoai_and_mul(input, output, m, n, input_stride, output_stride);
+      return;
+    case FusedMOEAct::SiluAndMul:
+      silu_and_mul(input, output, m, n, input_stride, output_stride);
+      return;
+    default:
+      TORCH_CHECK(false, "Unsupported act type.");
+  }
+}
+
+template <typename scalar_t, typename gemm_t>
+void prepack_moe_weight_impl(scalar_t* __restrict__ weight_ptr,
+                             scalar_t* __restrict__ packed_weight_ptr,
+                             const int32_t expert_num,
+                             const int32_t output_size,
+                             const int32_t input_size,
+                             const int64_t expert_stride) {
+#pragma omp parallel for
+  for (int32_t e_idx = 0; e_idx < expert_num; ++e_idx) {
+    gemm_t::pack_weight(weight_ptr + expert_stride * e_idx,
+                        packed_weight_ptr + expert_stride * e_idx, output_size,
+                        input_size);
+  }
+}
+
+template <typename scalar_t, typename w_t, typename gemm_t>
+void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input,
+                    w_t* __restrict__ w13, w_t* __restrict__ w2,
+                    w_t* __restrict__ w13_bias, w_t* __restrict__ w2_bias,
+                    float* __restrict__ topk_weights,
+                    int32_t* __restrict__ topk_id, FusedMOEAct act_type,
+                    const int32_t token_num, const int32_t expert_num,
+                    const int32_t topk_num, const int32_t input_size_13,
+                    const int32_t output_size_13, const int32_t input_size_2,
+                    const int32_t output_size_2) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  constexpr int32_t gemm_n_tile_size = gemm_t::NSize;
+  constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize;
+  constexpr int32_t min_w13_n_tile_size = 2 * gemm_n_tile_size;
+  static_assert(gemm_n_tile_size % 16 == 0);
+
+  TORCH_CHECK_EQ(output_size_13 % min_w13_n_tile_size, 0);
+  TORCH_CHECK_EQ(output_size_2 % gemm_n_tile_size, 0);
+  TORCH_CHECK_EQ(output_size_13 / 2, input_size_2);
+
+  const int32_t thread_num = omp_get_max_threads();
+
+  const int32_t w13_input_buffer_size = cpu_utils::round_up<64>(
+      gemm_m_tile_size * input_size_13 * sizeof(scalar_t));
+
+  const int32_t w13_n_tile_size = [&]() {
+    const int64_t cache_size = cpu_utils::get_available_l2_size();
+    // input buffer + output buffer + weight
+    const int32_t n_size_cache_limit =
+        (cache_size - w13_input_buffer_size) /
+        (gemm_m_tile_size * sizeof(float) + input_size_13 * sizeof(scalar_t));
+    const int32_t n_size_thread_limit =
+        output_size_13 / std::max(1, thread_num / topk_num);
+    const int32_t n_size = cpu_utils::round_down<min_w13_n_tile_size>(
+        std::min(n_size_cache_limit, n_size_thread_limit));
+    return std::max(n_size, min_w13_n_tile_size);
+  }();
+
+  const int32_t w2_input_tile_size = cpu_utils::round_up<64>(
+      gemm_m_tile_size * input_size_2 * sizeof(scalar_t));
+
+  const int32_t w2_n_tile_size = [&]() {
+    const int64_t cache_size = cpu_utils::get_available_l2_size();
+    // input tile + weight
+    const int32_t n_size_cache_limit =
+        (cache_size - w2_input_tile_size) / (input_size_2 * sizeof(scalar_t));
+    const int32_t n_size_thread_limit =
+        output_size_2 / std::max(1, thread_num / topk_num);
+    const int32_t n_size = cpu_utils::round_down<gemm_n_tile_size>(
+        std::min(n_size_cache_limit, n_size_thread_limit));
+    return std::max(n_size, gemm_n_tile_size);
+  }();
+
+  // allocate buffers
+  int32_t common_buffer_offset = 0;
+  int32_t w13_thread_buffer_offset = 0;
+  int32_t ws_thread_buffer_offset = 0;
+
+  // common buffers
+  const int32_t token_num_per_group_buffer_size =
+      cpu_utils::round_up<64>(expert_num * sizeof(int32_t));
+  const int32_t token_num_per_group_buffer_offset = common_buffer_offset;
+  common_buffer_offset += token_num_per_group_buffer_size;
+
+  const int32_t cu_token_num_per_group_buffer_size =
+      cpu_utils::round_up<64>((expert_num + 1) * sizeof(int32_t));
+  const int32_t cu_token_num_per_group_buffer_offset = common_buffer_offset;
+  common_buffer_offset += cu_token_num_per_group_buffer_size;
+
+  const int32_t expand_token_id_buffer_size =
+      cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t));
+  const int32_t expand_token_id_buffer_offset = common_buffer_offset;
+  common_buffer_offset += expand_token_id_buffer_size;
+
+  const int32_t expand_token_id_index_buffer_size =
+      cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t));
+  const int32_t expand_token_id_index_buffer_offset = common_buffer_offset;
+  common_buffer_offset += expand_token_id_index_buffer_size;
+
+  const int32_t w13_gemm_output_buffer_size = cpu_utils::round_up<64>(
+      token_num * topk_num * (output_size_13 / 2) * sizeof(scalar_t));
+  const int32_t w13_gemm_output_buffer_offset = common_buffer_offset;
+  common_buffer_offset += w13_gemm_output_buffer_size;
+
+  const int32_t w2_gemm_output_buffer_size = cpu_utils::round_up<64>(
+      token_num * topk_num * output_size_2 * sizeof(float));
+  const int32_t w2_gemm_output_buffer_offset = common_buffer_offset;
+  common_buffer_offset += w2_gemm_output_buffer_size;
+
+  // w13 GEMM thread buffers
+  const int32_t w13_input_buffer_offset = w13_thread_buffer_offset;
+  w13_thread_buffer_offset += w13_input_buffer_size;
+
+  const int32_t w13_output_buffer_size = cpu_utils::round_up<64>(
+      gemm_m_tile_size * w13_n_tile_size * sizeof(float));
+  const int32_t w13_output_buffer_offset = w13_thread_buffer_offset;
+  w13_thread_buffer_offset += w13_output_buffer_size;
+
+  // Weighted sum thread buffer
+  const int32_t ws_output_buffer_size =
+      cpu_utils::round_up<64>(output_size_2 * sizeof(float));
+  const int32_t ws_output_buffer_offset = ws_thread_buffer_offset;
+  ws_thread_buffer_offset += ws_output_buffer_size;
+
+  const int32_t buffer_size =
+      common_buffer_offset +
+      std::max(w13_thread_buffer_offset, ws_thread_buffer_offset) * thread_num;
+  cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size);
+  uint8_t* common_buffer_start =
+      cpu_utils::ScratchPadManager::get_scratchpad_manager()
+          ->get_data<uint8_t>();
+  uint8_t* thread_buffer_start = common_buffer_start + common_buffer_offset;
+
+  int32_t* __restrict__ token_num_per_group_buffer = reinterpret_cast<int32_t*>(
+      common_buffer_start + token_num_per_group_buffer_offset);
+  int32_t* __restrict__ cu_token_num_per_group_buffer =
+      reinterpret_cast<int32_t*>(common_buffer_start +
+                                 cu_token_num_per_group_buffer_offset);
+  int32_t* __restrict__ expand_token_id_buffer = reinterpret_cast<int32_t*>(
+      common_buffer_start + expand_token_id_buffer_offset);
+  int32_t* __restrict__ expand_token_id_index_buffer =
+      reinterpret_cast<int32_t*>(common_buffer_start +
+                                 expand_token_id_index_buffer_offset);
+
+  // prepare token-expert mappings
+  {
+    std::memset(token_num_per_group_buffer, 0, expert_num * sizeof(int32_t));
+    for (int32_t i = 0; i < token_num * topk_num; ++i) {
+      int32_t curr_expert_id = topk_id[i];
+      ++token_num_per_group_buffer[curr_expert_id];
+    }
+
+    int32_t token_num_sum = 0;
+    cu_token_num_per_group_buffer[0] = 0;
+    int32_t* token_index_buffer = cu_token_num_per_group_buffer + 1;
+    for (int32_t i = 0; i < expert_num; ++i) {
+      token_index_buffer[i] = token_num_sum;
+      token_num_sum += token_num_per_group_buffer[i];
+    }
+
+    for (int32_t i = 0; i < token_num; ++i) {
+      int32_t* curr_topk_id = topk_id + i * topk_num;
+      int32_t* curr_index_buffer = expand_token_id_index_buffer + i * topk_num;
+      for (int32_t j = 0; j < topk_num; ++j) {
+        int32_t curr_expert_id = curr_topk_id[j];
+        int32_t curr_index = token_index_buffer[curr_expert_id];
+        ++token_index_buffer[curr_expert_id];
+        expand_token_id_buffer[curr_index] = i;
+        curr_index_buffer[j] = curr_index;
+      }
+    }
+  }
+
+  // w13 GEMM + act
+  {
+    alignas(64) cpu_utils::Counter counter;
+    cpu_utils::Counter* counter_ptr = &counter;
+
+#pragma omp parallel for schedule(static, 1)
+    for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
+      const int32_t task_num_per_expert =
+          (output_size_13 + w13_n_tile_size - 1) / w13_n_tile_size;
+      const int32_t task_num = task_num_per_expert * expert_num;
+
+      uint8_t* __restrict__ thread_buffer =
+          thread_buffer_start + thread_id * w13_thread_buffer_offset;
+      scalar_t* __restrict__ w13_input_buffer =
+          reinterpret_cast<scalar_t*>(thread_buffer + w13_input_buffer_offset);
+      float* __restrict__ w13_output_buffer =
+          reinterpret_cast<float*>(thread_buffer + w13_output_buffer_offset);
+      scalar_t* __restrict__ w13_gemm_output_buffer =
+          reinterpret_cast<scalar_t*>(common_buffer_start +
+                                      w13_gemm_output_buffer_offset);
+
+      gemm_t gemm;
+
+      const int32_t input_size_13_bytes = input_size_13 * sizeof(scalar_t);
+      const int32_t w13_n_group_stride = 16 * input_size_13;
+      const int32_t w13_n_tile_stride = gemm_n_tile_size * input_size_13;
+
+      for (;;) {
+        int32_t task_id = counter_ptr->acquire_counter();
+        if (task_id >= task_num) {
+          break;
+        }
+
+        const int32_t curr_expert_id = task_id / task_num_per_expert;
+        const int32_t curr_output_group_id = task_id % task_num_per_expert;
+        const int32_t curr_token_num =
+            token_num_per_group_buffer[curr_expert_id];
+        if (curr_token_num == 0) {
+          continue;
+        }
+
+        const int32_t actual_n_tile_size =
+            std::min(w13_n_tile_size,
+                     output_size_13 - curr_output_group_id * w13_n_tile_size);
+        const int32_t* __restrict__ curr_expand_token_id_buffer =
+            expand_token_id_buffer +
+            cu_token_num_per_group_buffer[curr_expert_id];
+        scalar_t* __restrict__ curr_w13_gemm_output_buffer =
+            w13_gemm_output_buffer +
+            cu_token_num_per_group_buffer[curr_expert_id] *
+                (output_size_13 / 2) +
+            curr_output_group_id * w13_n_tile_size / 2;
+
+        w_t* __restrict__ w13_weight_ptr_0 = nullptr;
+        w_t* __restrict__ w13_weight_ptr_1 = nullptr;
+        w_t* __restrict__ w13_bias_ptr_0 = nullptr;
+        w_t* __restrict__ w13_bias_ptr_1 = nullptr;
+        if (act_type == FusedMOEAct::SwigluOAIAndMul) {
+          // For SwigluOAIAndMul, up and down weights are interleaved
+          w13_weight_ptr_0 =
+              w13 + curr_expert_id * input_size_13 * output_size_13 +
+              curr_output_group_id * w13_n_tile_size * input_size_13;
+          w13_weight_ptr_1 =
+              w13_weight_ptr_0 + actual_n_tile_size / 2 * input_size_13;
+          if (w13_bias != nullptr) {
+            w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 +
+                             curr_output_group_id * w13_n_tile_size;
+            w13_bias_ptr_1 = w13_bias_ptr_0 + actual_n_tile_size / 2;
+          }
+        } else {
+          w13_weight_ptr_0 =
+              w13 + curr_expert_id * input_size_13 * output_size_13 +
+              curr_output_group_id * (w13_n_tile_size / 2) * input_size_13;
+          w13_weight_ptr_1 =
+              w13_weight_ptr_0 + output_size_13 / 2 * input_size_13;
+          if (w13_bias != nullptr) {
+            w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 +
+                             curr_output_group_id * (w13_n_tile_size / 2);
+            w13_bias_ptr_1 = w13_bias_ptr_0 + output_size_13 / 2;
+          }
+        }
+
+        scalar_t* __restrict__ curr_w13_input_buffer = w13_input_buffer;
+        for (int32_t token_idx = 0; token_idx < curr_token_num;
+             token_idx += gemm_m_tile_size) {
+          const int32_t actual_token_num =
+              std::min(gemm_m_tile_size, curr_token_num - token_idx);
+          // copy inputs
+          {
+            scalar_t* __restrict__ curr_w13_input_buffer_iter =
+                curr_w13_input_buffer;
+            for (int32_t i = 0; i < actual_token_num; ++i) {
+              const int32_t curr_token_id = curr_expand_token_id_buffer[i];
+              int8_t* __restrict__ curr_input_iter = reinterpret_cast<int8_t*>(
+                  input + curr_token_id * input_size_13);
+              int8_t* __restrict__ curr_output_iter =
+                  reinterpret_cast<int8_t*>(curr_w13_input_buffer_iter);
+              int32_t j = 0;
+              for (; j < input_size_13_bytes - 64; j += 64) {
+                vec_op::INT8Vec64 vec(curr_input_iter);
+                vec.save(curr_output_iter);
+                curr_input_iter += 64;
+                curr_output_iter += 64;
+              }
+              vec_op::INT8Vec64 vec(curr_input_iter);
+              vec.save(curr_output_iter, input_size_13_bytes - j);
+
+              // update
+              curr_w13_input_buffer_iter += input_size_13;
+            }
+            // update
+            curr_expand_token_id_buffer += actual_token_num;
+          }
+
+          // gemm + act
+          {
+            scalar_t* __restrict__ w13_weight_ptr_0_iter = w13_weight_ptr_0;
+            scalar_t* __restrict__ w13_weight_ptr_1_iter = w13_weight_ptr_1;
+            scalar_t* __restrict__ w13_bias_ptr_0_iter = w13_bias_ptr_0;
+            scalar_t* __restrict__ w13_bias_ptr_1_iter = w13_bias_ptr_1;
+            scalar_t* __restrict__ curr_w13_input_buffer_iter =
+                curr_w13_input_buffer;
+            float* __restrict__ w13_output_buffer_0_iter = w13_output_buffer;
+            float* __restrict__ w13_output_buffer_1_iter =
+                w13_output_buffer + actual_n_tile_size / 2;
+            for (int32_t i = 0; i < actual_n_tile_size;
+                 i += min_w13_n_tile_size) {
+              gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_0_iter,
+                        w13_output_buffer_0_iter, actual_token_num,
+                        input_size_13, input_size_13, w13_n_group_stride,
+                        actual_n_tile_size, false);
+
+              if (w13_bias != nullptr) {
+                cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
+                    w13_output_buffer_0_iter, w13_output_buffer_0_iter,
+                    w13_bias_ptr_0_iter, actual_token_num, actual_n_tile_size,
+                    actual_n_tile_size);
+                w13_bias_ptr_0_iter += gemm_n_tile_size;
+              }
+
+              gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_1_iter,
+                        w13_output_buffer_1_iter, actual_token_num,
+                        input_size_13, input_size_13, w13_n_group_stride,
+                        actual_n_tile_size, false);
+
+              if (w13_bias != nullptr) {
+                cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
+                    w13_output_buffer_1_iter, w13_output_buffer_1_iter,
+                    w13_bias_ptr_1_iter, actual_token_num, actual_n_tile_size,
+                    actual_n_tile_size);
+                w13_bias_ptr_1_iter += gemm_n_tile_size;
+              }
+
+              // update
+              w13_weight_ptr_0_iter += w13_n_tile_stride;
+              w13_weight_ptr_1_iter += w13_n_tile_stride;
+              w13_output_buffer_0_iter += gemm_n_tile_size;
+              w13_output_buffer_1_iter += gemm_n_tile_size;
+            }
+
+            apply_gated_act(act_type, w13_output_buffer,
+                            curr_w13_gemm_output_buffer, actual_token_num,
+                            actual_n_tile_size, actual_n_tile_size,
+                            output_size_13 / 2);
+
+            // update
+            curr_w13_gemm_output_buffer +=
+                gemm_m_tile_size * (output_size_13 / 2);
+          }
+        }
+      }
+    }
+  }
+
+  // w2 GEMM
+  {
+    alignas(64) cpu_utils::Counter counter;
+    cpu_utils::Counter* counter_ptr = &counter;
+
+#pragma omp parallel for schedule(static, 1)
+    for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
+      const int32_t task_num_per_expert =
+          (output_size_2 + w2_n_tile_size - 1) / w2_n_tile_size;
+      const int32_t task_num = task_num_per_expert * expert_num;
+      scalar_t* __restrict__ w13_gemm_output_buffer =
+          reinterpret_cast<scalar_t*>(common_buffer_start +
+                                      w13_gemm_output_buffer_offset);
+      float* __restrict__ w2_gemm_output_buffer = reinterpret_cast<float*>(
+          common_buffer_start + w2_gemm_output_buffer_offset);
+
+      gemm_t gemm;
+
+      const int32_t w2_n_tile_stride = gemm_n_tile_size * input_size_2;
+      const int32_t w2_n_group_stride = 16 * input_size_2;
+
+      for (;;) {
+        int32_t task_id = counter_ptr->acquire_counter();
+        if (task_id >= task_num) {
+          break;
+        }
+
+        const int32_t curr_expert_id = task_id / task_num_per_expert;
+        const int32_t curr_output_group_id = task_id % task_num_per_expert;
+        const int32_t curr_token_num =
+            token_num_per_group_buffer[curr_expert_id];
+        if (curr_token_num == 0) {
+          continue;
+        }
+
+        const int32_t actual_n_tile_size =
+            std::min(w2_n_tile_size,
+                     output_size_2 - curr_output_group_id * w2_n_tile_size);
+        scalar_t* __restrict__ curr_w13_gemm_output_buffer =
+            w13_gemm_output_buffer +
+            cu_token_num_per_group_buffer[curr_expert_id] * input_size_2;
+        float* __restrict__ curr_w2_gemm_output_buffer =
+            w2_gemm_output_buffer +
+            cu_token_num_per_group_buffer[curr_expert_id] * output_size_2 +
+            curr_output_group_id * w2_n_tile_size;
+        scalar_t* __restrict__ w2_weight_ptr =
+            w2 + curr_expert_id * output_size_2 * input_size_2 +
+            curr_output_group_id * w2_n_tile_size * input_size_2;
+        scalar_t* __restrict__ w2_bias_ptr = nullptr;
+        if (w2_bias != nullptr) {
+          w2_bias_ptr = w2_bias + curr_expert_id * output_size_2 +
+                        curr_output_group_id * w2_n_tile_size;
+        }
+
+        for (int32_t token_idx = 0; token_idx < curr_token_num;
+             token_idx += gemm_m_tile_size) {
+          const int32_t actual_token_num =
+              std::min(gemm_m_tile_size, curr_token_num - token_idx);
+
+          scalar_t* __restrict__ w2_weight_ptr_iter = w2_weight_ptr;
+          scalar_t* __restrict__ w2_bias_ptr_iter = w2_bias_ptr;
+          float* __restrict__ curr_w2_gemm_output_buffer_iter =
+              curr_w2_gemm_output_buffer;
+          for (int32_t i = 0; i < actual_n_tile_size; i += gemm_n_tile_size) {
+            gemm.gemm(curr_w13_gemm_output_buffer, w2_weight_ptr_iter,
+                      curr_w2_gemm_output_buffer_iter, actual_token_num,
+                      input_size_2, input_size_2, w2_n_group_stride,
+                      output_size_2, false);
+
+            if (w2_bias != nullptr) {
+              cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
+                  curr_w2_gemm_output_buffer_iter,
+                  curr_w2_gemm_output_buffer_iter, w2_bias_ptr_iter,
+                  actual_token_num, output_size_2, output_size_2);
+              w2_bias_ptr_iter += gemm_n_tile_size;
+            }
+
+            w2_weight_ptr_iter += w2_n_tile_stride;
+            curr_w2_gemm_output_buffer_iter += gemm_n_tile_size;
+          }
+
+          // update
+          curr_w13_gemm_output_buffer += gemm_m_tile_size * input_size_2;
+          curr_w2_gemm_output_buffer += gemm_m_tile_size * output_size_2;
+        }
+      }
+    }
+  }
+
+  // weighted sum
+  {
+    alignas(64) cpu_utils::Counter counter;
+    cpu_utils::Counter* counter_ptr = &counter;
+
+#pragma omp parallel for schedule(static, 1)
+    for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
+      const int32_t task_num = token_num;
+      uint8_t* __restrict__ thread_buffer =
+          thread_buffer_start + thread_id * ws_thread_buffer_offset;
+      float* __restrict__ ws_output_buffer =
+          reinterpret_cast<float*>(thread_buffer + ws_output_buffer_offset);
+      float* __restrict__ w2_gemm_output_buffer = reinterpret_cast<float*>(
+          common_buffer_start + w2_gemm_output_buffer_offset);
+
+      for (;;) {
+        int32_t task_id = counter_ptr->acquire_counter();
+        if (task_id >= task_num) {
+          break;
+        }
+
+        int32_t token_id = task_id;
+        int32_t* __restrict__ curr_expand_token_id_index_buffer =
+            expand_token_id_index_buffer + token_id * topk_num;
+        float* __restrict__ curr_weight = topk_weights + token_id * topk_num;
+        scalar_t* __restrict__ curr_output_buffer =
+            output + token_id * output_size_2;
+
+        if (topk_num > 1) {
+          {
+            int32_t w2_output_idx = curr_expand_token_id_index_buffer[0];
+            float* __restrict__ w2_output_iter =
+                w2_gemm_output_buffer + w2_output_idx * output_size_2;
+            float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
+            vec_op::FP32Vec16 weight_vec(curr_weight[0]);
+            for (int32_t i = 0; i < output_size_2; i += 16) {
+              vec_op::FP32Vec16 vec(w2_output_iter);
+              vec = vec * weight_vec;
+              vec.save(ws_output_buffer_iter);
+
+              // update
+              w2_output_iter += 16;
+              ws_output_buffer_iter += 16;
+            }
+          }
+
+          {
+            for (int32_t idx = 1; idx < topk_num - 1; ++idx) {
+              int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx];
+              float* __restrict__ w2_output_iter =
+                  w2_gemm_output_buffer + w2_output_idx * output_size_2;
+              float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
+              vec_op::FP32Vec16 weight_vec(curr_weight[idx]);
+              for (int32_t i = 0; i < output_size_2; i += 16) {
+                vec_op::FP32Vec16 vec(w2_output_iter);
+                vec_op::FP32Vec16 sum(ws_output_buffer_iter);
+                sum = sum + vec * weight_vec;
+                sum.save(ws_output_buffer_iter);
+
+                // update
+                w2_output_iter += 16;
+                ws_output_buffer_iter += 16;
+              }
+            }
+          }
+
+          {
+            int32_t idx = topk_num - 1;
+            int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx];
+            float* __restrict__ w2_output_iter =
+                w2_gemm_output_buffer + w2_output_idx * output_size_2;
+            float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
+            scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
+            vec_op::FP32Vec16 weight_vec(curr_weight[idx]);
+            for (int32_t i = 0; i < output_size_2; i += 16) {
+              vec_op::FP32Vec16 vec(w2_output_iter);
+              vec_op::FP32Vec16 sum(ws_output_buffer_iter);
+              sum = sum + vec * weight_vec;
+              scalar_vec_t out_vec(sum);
+              out_vec.save(curr_output_buffer_iter);
+
+              // update
+              w2_output_iter += 16;
+              ws_output_buffer_iter += 16;
+              curr_output_buffer_iter += 16;
+            }
+          }
+        } else {
+          int32_t w2_output_idx = curr_expand_token_id_index_buffer[0];
+          float* __restrict__ w2_output_iter =
+              w2_gemm_output_buffer + w2_output_idx * output_size_2;
+          scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
+          vec_op::FP32Vec16 weight_vec(curr_weight[0]);
+          for (int32_t i = 0; i < output_size_2; i += 16) {
+            vec_op::FP32Vec16 vec(w2_output_iter);
+            vec = vec * weight_vec;
+            scalar_vec_t out_vec(vec);
+            out_vec.save(curr_output_buffer_iter);
+
+            // update
+            w2_output_iter += 16;
+            curr_output_buffer_iter += 16;
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace
+
+void prepack_moe_weight(
+    const torch::Tensor& weight,  // [expert_num, output_size, input_size]
+    torch::Tensor& packed_weight, const std::string& isa) {
+  TORCH_CHECK(weight.is_contiguous());
+  const int32_t expert_num = weight.size(0);
+  const int32_t output_size = weight.size(1);
+  const int32_t input_size = weight.size(2);
+  TORCH_CHECK_EQ(output_size % 32, 0);
+  const int64_t expert_stride = weight.stride(0);
+  cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      weight.scalar_type(), "prepack_moe_weight", [&]() {
+        CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
+          scalar_t* weight_ptr = weight.data_ptr<scalar_t>();
+          scalar_t* packed_weight_ptr = packed_weight.data_ptr<scalar_t>();
+          prepack_moe_weight_impl<scalar_t, gemm_t>(
+              weight_ptr, packed_weight_ptr, expert_num, output_size,
+              input_size, expert_stride);
+        });
+      });
+}
+
+void cpu_fused_moe(
+    torch::Tensor& output,       // [token_num, output_size_2]
+    const torch::Tensor& input,  // [token_num, input_size_13]
+    const torch::Tensor&
+        w13,  // [expert_num, output_size_13, input_size_13], packed
+    const torch::Tensor&
+        w2,  // [expert_num, output_size_2, input_size_2], packed
+    const std::optional<torch::Tensor>&
+        w13_bias,  // [expert_num, output_size_13]
+    const std::optional<torch::Tensor>& w2_bias,  // [expert_num, output_size_2]
+    const torch::Tensor& topk_weights,            // [token_num, k], float32
+    const torch::Tensor& topk_id,                 // [token_num, k], int32
+    const std::string& act, const std::string& isa) {
+  const int32_t token_num = input.size(0);
+  const int32_t input_size_13 = input.size(1);
+  const int64_t input_stride = input.stride(0);
+  TORCH_CHECK_EQ(input_stride, input_size_13);
+  const int32_t expert_num = w13.size(0);
+  const int32_t output_size_13 = w13.size(1);
+  const int32_t input_size_2 = w2.size(2);
+  const int32_t output_size_2 = w2.size(1);
+  const int32_t topk_num = topk_id.size(1);
+  const FusedMOEAct act_type = get_act_type(act);
+  cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);
+
+  VLLM_DISPATCH_FLOATING_TYPES(w13.scalar_type(), "cpu_fused_moe", [&]() {
+    CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
+      fused_moe_impl<scalar_t, scalar_t, gemm_t>(
+          output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+          w13.data_ptr<scalar_t>(), w2.data_ptr<scalar_t>(),
+          w13_bias.has_value() ? w13_bias->data_ptr<scalar_t>() : nullptr,
+          w2_bias.has_value() ? w2_bias->data_ptr<scalar_t>() : nullptr,
+          topk_weights.data_ptr<float>(), topk_id.data_ptr<int32_t>(), act_type,
+          token_num, expert_num, topk_num, input_size_13, output_size_13,
+          input_size_2, output_size_2);
+    });
+  });
+}
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index 6f51277f784402a29fa45ffa9e4bd58f04547194..d94af338ac1c949f42352dc1418908eba3e965d2 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -352,6 +352,10 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   explicit FP32Vec16(bool, void* ptr)
       : reg((__m512)_mm512_stream_load_si512(ptr)) {}
 
+  // strided load
+  explicit FP32Vec16(const float* ptr, INT32Vec16 idx)
+      : reg(_mm512_i32gather_ps(idx.reg, ptr, 4)) {}
+
   explicit FP32Vec16(__m512 data) : reg(data) {}
 
   // de-pack 4 bit values
@@ -408,6 +412,10 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return FP32Vec16(_mm512_sub_ps(reg, b.reg));
   }
 
+  FP32Vec16 operator-() const {
+    return FP32Vec16(_mm512_xor_ps(reg, _mm512_set1_ps(-0.0f)));
+  }
+
   FP32Vec16 operator/(const FP32Vec16& b) const {
     return FP32Vec16(_mm512_div_ps(reg, b.reg));
   }
diff --git a/csrc/cpu/cpu_wna16.cpp b/csrc/cpu/cpu_wna16.cpp
index 816d195506e5285cbe66ddcd3b1526908d312b8a..88d48f3db8772e8494439ce7d2aff555db38b690 100644
--- a/csrc/cpu/cpu_wna16.cpp
+++ b/csrc/cpu/cpu_wna16.cpp
@@ -1,6 +1,5 @@
-#include "cpu_types.hpp"
-#include "scratchpad_manager.h"
-#include "utils.hpp"
+#include "cpu/cpu_types.hpp"
+#include "cpu/utils.hpp"
 
 #ifdef CPU_CAPABILITY_AMXBF16
   #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
@@ -158,7 +157,7 @@ void cpu_gemm_wna16_impl(
   // a simple schedule policy, just to hold more B tiles in L2 and make sure
   // each thread has tasks
   const int32_t n_partition_size = [&]() {
-    const int64_t cache_size = cpu_utils::get_l2_size();
+    const int64_t cache_size = cpu_utils::get_available_l2_size();
     int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t));
     int64_t ps_thread_limit = n_size / thread_num;
     ps_cache_limit =
@@ -179,8 +178,8 @@ void cpu_gemm_wna16_impl(
   const int64_t b_buffer_offset = 0;
   const int64_t c_buffer_offset = b_buffer_size;
   const int64_t buffer_size = b_buffer_size + c_buffer_size;
-  DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size *
-                                                                thread_num);
+  cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size *
+                                                                  thread_num);
 
   alignas(64) cpu_utils::Counter counter;
   cpu_utils::Counter* counter_ptr = &counter;
@@ -190,9 +189,10 @@ void cpu_gemm_wna16_impl(
     scalar_t* __restrict__ b_buffer = nullptr;
     float* __restrict__ c_buffer = nullptr;
     {
-      uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager()
-                                ->get_data<uint8_t>() +
-                            thread_id * buffer_size;
+      uint8_t* buffer_ptr =
+          cpu_utils::ScratchPadManager::get_scratchpad_manager()
+              ->get_data<uint8_t>() +
+          thread_id * buffer_size;
       b_buffer = reinterpret_cast<scalar_t*>(buffer_ptr + b_buffer_offset);
       c_buffer = reinterpret_cast<float*>(buffer_ptr + c_buffer_offset);
     }
diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp
index cfb6e78cba9a110d323ca996c5515a9af93de9d9..e337e10e1cf7b4ebfd97413f922d9688add2f4db 100644
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -4,8 +4,8 @@
 #include "common/memory_desc.hpp"
 #include "common/memory.hpp"
 
-#include "dnnl_helper.h"
-#include "scratchpad_manager.h"
+#include "cpu/utils.hpp"
+#include "cpu/dnnl_helper.h"
 
 static dnnl::engine& default_engine() {
   static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
@@ -274,7 +274,7 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
 
   auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
   scratchpad_storage->set_data_handle(
-      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
+      cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data<void>());
 
   matmul.execute(default_stream(), memory_cache_);
   default_stream().wait();
@@ -294,7 +294,7 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
 
   return m_size_cache_->get_or_create(key, [&]() {
     dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
-    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+    auto manager = cpu_utils::ScratchPadManager::get_scratchpad_manager();
     manager->realloc(desc.scratchpad_desc().get_size());
     return dnnl::matmul(desc);
   });
@@ -470,7 +470,7 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
 
   auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
   scratchpad_storage->set_data_handle(
-      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
+      cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data<void>());
 
   matmul.execute(default_stream(), memory_cache_);
   default_stream().wait();
@@ -486,7 +486,7 @@ dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
   }
   return m_size_cache_->get_or_create(key, [&]() {
     dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
-    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+    auto manager = cpu_utils::ScratchPadManager::get_scratchpad_manager();
     manager->realloc(desc.scratchpad_desc().get_size());
     return dnnl::matmul(desc);
   });
diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp
index 87a019773a8951be4ed4ea188935286939066882..357c7cf1d7844b4e1244e05a8dd362c618163a18 100644
--- a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp
@@ -235,6 +235,39 @@ class MicroGemm<cpu_utils::ISA::AMX, scalar_t> {
     }
   }
 
+  static void pack_weight(const scalar_t* __restrict__ weight,
+                          scalar_t* __restrict__ packed_weight,
+                          const int32_t output_size, const int32_t input_size) {
+    constexpr int32_t elem_num_per_group = 4 / sizeof(scalar_t);
+    TORCH_CHECK_EQ(output_size % 16, 0);
+    TORCH_CHECK_EQ(input_size % (16 * elem_num_per_group), 0);
+
+    const int32_t output_group_num = output_size / 16;
+    const int32_t input_32b_num = input_size / elem_num_per_group;
+    for (int32_t output_group_idx = 0; output_group_idx < output_group_num;
+         ++output_group_idx) {
+      const int32_t* __restrict__ weight_32b =
+          reinterpret_cast<const int32_t*>(weight);
+      int32_t* __restrict__ packed_weight_32b =
+          reinterpret_cast<int32_t*>(packed_weight);
+      for (int32_t output_idx = 0; output_idx < 16; ++output_idx) {
+        for (int32_t weight_offset = 0, packed_offset = 0;
+             weight_offset < input_32b_num;
+             ++weight_offset, packed_offset += 16) {
+          packed_weight_32b[packed_offset] = weight_32b[weight_offset];
+        }
+
+        // update
+        weight_32b += input_32b_num;
+        packed_weight_32b += 1;
+      }
+
+      // update
+      weight += 16 * input_size;
+      packed_weight += 16 * input_size;
+    }
+  }
+
  private:
   alignas(64) __tilecfg amx_tile_config_;
   int32_t curr_m_;
diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp
index 784da55a420e5cd1a754437ce9b0cc495bb8e0dd..23e78a681b5fed790a83567a162b3c13c8645a4b 100644
--- a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp
@@ -13,6 +13,9 @@ namespace cpu_micro_gemm {
 #define CPU_MICRO_GEMM_PARAMS \
   a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c
 
+// Note: weights for MicroGemm should be packed as (output_size / 16) contiguous
+// blocks, means the logical shape of blocks is [16, input_size]. And the actual
+// layout of blocks can be ISA-specific.
 template <cpu_utils::ISA isa, typename scalar_t>
 class MicroGemm {
  public:
@@ -86,6 +89,41 @@ FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr,
     curr_d += ldd;
   }
 }
+
+template <int32_t n_size, typename scalar_t>
+FORCE_INLINE void add_bias_epilogue(float* c_ptr, float* d_ptr,
+                                    scalar_t* __restrict__ bias_ptr,
+                                    const int32_t m, const int64_t ldc,
+                                    const int64_t ldd) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  static_assert(n_size % 16 == 0);
+  constexpr int32_t n_group_num = n_size / 16;
+  static_assert(n_group_num <= 16);
+
+  vec_op::FP32Vec16 bias_vecs[n_group_num];
+  scalar_t* __restrict__ curr_bias = bias_ptr;
+  vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t i) {
+    scalar_vec_t vec(curr_bias);
+    bias_vecs[i] = vec_op::FP32Vec16(vec);
+    curr_bias += 16;
+  });
+
+  float* curr_c = c_ptr;
+  float* curr_d = d_ptr;
+  for (int32_t i = 0; i < m; ++i) {
+    float* curr_c_iter = curr_c;
+    float* curr_d_iter = curr_d;
+    vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t n_g_idx) {
+      vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
+      c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx];
+      c_vec_fp32.save(curr_d_iter);
+      curr_c_iter += 16;
+      curr_d_iter += 16;
+    });
+    curr_c += ldc;
+    curr_d += ldd;
+  }
+}
 }  // namespace cpu_micro_gemm
 
 #endif
diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp
index 3985c2f2e5fe47a45381669756a9c3e1f5b7f5df..bdd3e85a1c522dfd3f28b78f7d9fcf43dbafeed2 100644
--- a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp
@@ -109,6 +109,25 @@ class MicroGemm<cpu_utils::ISA::VEC, scalar_t> {
   void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
     TileGemm82<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
   }
+
+  // Note: pack contiguous weight [output_size, input_size] as contiguous
+  // packed weight [output_size / 16, input_size, 16]
+  static void pack_weight(const scalar_t* __restrict__ weight,
+                          scalar_t* __restrict__ packed_weight,
+                          const int32_t output_size, const int32_t input_size) {
+    TORCH_CHECK_EQ(output_size % 16, 0);
+    for (int32_t o_idx = 0; o_idx < output_size; ++o_idx) {
+      const scalar_t* __restrict__ curr_weight = weight + o_idx * input_size;
+      scalar_t* __restrict__ curr_packed_weight =
+          packed_weight + (o_idx / 16) * (16 * input_size) + o_idx % 16;
+      for (int32_t i_idx = 0; i_idx < input_size; ++i_idx) {
+        *curr_packed_weight = *curr_weight;
+
+        curr_packed_weight += 16;
+        ++curr_weight;
+      }
+    }
+  }
 };
 }  // namespace cpu_micro_gemm
 
diff --git a/csrc/cpu/scratchpad_manager.cpp b/csrc/cpu/scratchpad_manager.cpp
deleted file mode 100644
index 05cd435f34b7a644176ca99cc518cf0f03dc5294..0000000000000000000000000000000000000000
--- a/csrc/cpu/scratchpad_manager.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <cstdlib>
-
-#include "scratchpad_manager.h"
-
-DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
-  this->realloc(allocation_unit * 128);
-}
-
-void DNNLScratchPadManager::realloc(size_t new_size) {
-  new_size = round(new_size);
-  if (new_size > size_) {
-    if (ptr_ != nullptr) {
-      std::free(ptr_);
-    }
-    ptr_ = std::aligned_alloc(64, new_size);
-    size_ = new_size;
-  }
-}
-
-DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
-  static DNNLScratchPadManager manager;
-  return &manager;
-}
diff --git a/csrc/cpu/scratchpad_manager.h b/csrc/cpu/scratchpad_manager.h
deleted file mode 100644
index 0ecf59192f84532770783abea95dc25586abe049..0000000000000000000000000000000000000000
--- a/csrc/cpu/scratchpad_manager.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef SCRATCHPAD_MANAGER_H
-#define SCRATCHPAD_MANAGER_H
-
-#include <cstddef>
-#include <cstdio>
-
-class DNNLScratchPadManager {
- public:
-  static constexpr size_t allocation_unit = 4 * 1024;  // 4KB
-
-  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
-
-  DNNLScratchPadManager();
-
-  template <typename T>
-  T* get_data() {
-    return reinterpret_cast<T*>(ptr_);
-  }
-
-  static size_t round(size_t size) {
-    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
-  }
-
-  void realloc(size_t new_size);
-
- private:
-  size_t size_;
-  void* ptr_;
-};
-
-#endif
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index d01349d59bc6625d5a3c9a755b605a8e9924e3fc..c98f970e41f078b75b0568d39125a150098b0ddd 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -110,6 +110,17 @@ void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight,
                     const std::optional<torch::Tensor>& bias,
                     const int64_t pack_factor, const std::string& isa_hint);
 
+void prepack_moe_weight(const torch::Tensor& weight,
+                        torch::Tensor& packed_weight, const std::string& isa);
+
+void cpu_fused_moe(torch::Tensor& output, const torch::Tensor& input,
+                   const torch::Tensor& w13, const torch::Tensor& w2,
+                   const std::optional<torch::Tensor>& w13_bias,
+                   const std::optional<torch::Tensor>& w2_bias,
+                   const torch::Tensor& topk_weights,
+                   const torch::Tensor& topk_id, const std::string& act,
+                   const std::string& isa);
+
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
 
@@ -296,6 +307,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "pack_factor, str isa_hint) -> ()");
   ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16);
 #endif
+
+  // fused moe
+#if defined(__AVX512F__)
+  ops.def(
+      "prepack_moe_weight(Tensor weight, Tensor(a1!) packed_weight, str isa) "
+      "-> ()");
+  ops.impl("prepack_moe_weight", torch::kCPU, &prepack_moe_weight);
+  ops.def(
+      "cpu_fused_moe(Tensor(a0!) output, Tensor input, Tensor w13, Tensor w2, "
+      "Tensor? w13_bias, Tensor? w2_bias, Tensor topk_weights, Tensor topk_id, "
+      "str act, str isa) -> ()");
+  ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe);
+#endif
 }
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp
index 3dacfc7b2b7a3bfbcc0179ca9949b875b7e60d5f..f2085b73b6a48a4dbea3dd516eb1c39f3b36e2fb 100644
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -10,7 +10,7 @@
   #define gettid() syscall(SYS_gettid)
 #endif
 
-#include "cpu_types.hpp"
+#include "cpu/utils.hpp"
 
 #ifdef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
@@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 #ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
   bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
+  TORCH_CHECK(omp_cpu_mask != nullptr,
+              "Failed to parse CPU string: " + cpu_ids);
   TORCH_CHECK(omp_cpu_mask->size > 0);
   std::vector<int> omp_cpu_ids;
   omp_cpu_ids.reserve(omp_cpu_mask->size);
@@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 
   // Memory node binding
   if (numa_available() != -1) {
-    int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
     std::set<int> node_ids;
     for (const auto& cpu_id : omp_cpu_ids) {
       int node_id = numa_node_of_cpu(cpu_id);
       if (node_id != -1) {
         node_ids.insert(node_id);
       }
-      if (node_id != mem_node_id) {
-        TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
-                   omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
-                   ". All CPUs should be on the same NUMA node for optimal "
-                   "performance. Memory will be bound to NUMA node ",
-                   mem_node_id, ".");
-      }
     }
     // Concatenate all node_ids into a single comma-separated string
     if (!node_ids.empty()) {
@@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
       }
 
       bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
-      bitmask* src_mask = numa_get_membind();
+      bitmask* src_mask = numa_get_mems_allowed();
 
       int pid = getpid();
 
@@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
                      std::to_string(errno));
         }
 
-        // restrict memory allocation node.
-        numa_set_membind(mask);
+        // Restrict memory allocation to the selected NUMA node(s).
+        // Enhances memory locality for the threads bound to those NUMA CPUs.
+        if (node_ids.size() > 1) {
+          errno = 0;
+          numa_set_interleave_mask(mask);
+          if (errno != 0) {
+            TORCH_WARN("numa_set_interleave_mask failed. errno: " +
+                       std::to_string(errno));
+          } else {
+            TORCH_WARN(
+                "NUMA binding: Using INTERLEAVE policy for memory "
+                "allocation across multiple NUMA nodes (nodes: " +
+                node_ids_str +
+                "). Memory allocations will be "
+                "interleaved across the specified NUMA nodes.");
+          }
+        } else {
+          errno = 0;
+          numa_set_membind(mask);
+          if (errno != 0) {
+            TORCH_WARN("numa_set_membind failed. errno: " +
+                       std::to_string(errno));
+          } else {
+            TORCH_WARN(
+                "NUMA binding: Using MEMBIND policy for memory "
+                "allocation on the NUMA nodes (" +
+                node_ids_str +
+                "). Memory allocations will be "
+                "strictly bound to these NUMA nodes.");
+          }
+        }
+
         numa_set_strict(1);
 
         numa_free_nodemask(mask);
         numa_free_nodemask(src_mask);
       } else {
-        TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
-                   std::to_string(errno));
+        TORCH_WARN(
+            "numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
+            std::to_string(errno));
       }
     }
   }
@@ -138,4 +163,26 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 
   return ss.str();
 }
-#endif
+#endif  // VLLM_NUMA_DISABLED
+
+namespace cpu_utils {
+ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
+  this->realloc(allocation_unit * 128);
+}
+
+void ScratchPadManager::realloc(size_t new_size) {
+  new_size = round(new_size);
+  if (new_size > size_) {
+    if (ptr_ != nullptr) {
+      std::free(ptr_);
+    }
+    ptr_ = std::aligned_alloc(64, new_size);
+    size_ = new_size;
+  }
+}
+
+ScratchPadManager* ScratchPadManager::get_scratchpad_manager() {
+  static ScratchPadManager manager;
+  return &manager;
+}
+}  // namespace cpu_utils
diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp
index d3def306b806918c33236377faef71c7d1b0b566..682751d67b1cdbd196f89aa4a7b120f8e3e89113 100644
--- a/csrc/cpu/utils.hpp
+++ b/csrc/cpu/utils.hpp
@@ -2,19 +2,24 @@
 #define UTILS_HPP
 
 #include <atomic>
-#include <cassert>
-#include <cstdint>
 #include <unistd.h>
+#include <ATen/cpu/Utils.h>
 
-#if defined(__APPLE__)
-  #include <sys/sysctl.h>
-#endif
-
-#include "cpu_types.hpp"
+#include "cpu/cpu_types.hpp"
 
 namespace cpu_utils {
 enum class ISA { AMX, VEC };
 
+inline ISA get_isa(const std::string& isa) {
+  if (isa == "amx") {
+    return ISA::AMX;
+  } else if (isa == "vec") {
+    return ISA::VEC;
+  } else {
+    TORCH_CHECK(false, "Invalid isa type: " + isa);
+  }
+}
+
 template <typename T>
 struct VecTypeTrait {
   using vec_t = void;
@@ -32,10 +37,12 @@ struct VecTypeTrait<c10::BFloat16> {
 };
 #endif
 
+#if !defined(__powerpc__)
 template <>
 struct VecTypeTrait<c10::Half> {
   using vec_t = vec_op::FP16Vec16;
 };
+#endif
 
 struct Counter {
   std::atomic<int64_t> counter;
@@ -48,26 +55,66 @@ struct Counter {
   int64_t acquire_counter() { return counter++; }
 };
 
-inline int64_t get_l2_size() {
+inline int64_t get_available_l2_size() {
   static int64_t size = []() {
-#if defined(__APPLE__)
-    // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
-    int64_t l2_cache_size = 0;
-    size_t len = sizeof(l2_cache_size);
-    if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
-        l2_cache_size > 0) {
-      return l2_cache_size >> 1;  // use 50% of L2 cache
-    }
-    // Fallback if sysctlbyname fails
-    return 128LL * 1024 >> 1;  // use 50% of 128KB
-#else
-    long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
-    assert(l2_cache_size != -1);
+    const uint32_t l2_cache_size = at::cpu::L2_cache_size();
     return l2_cache_size >> 1;  // use 50% of L2 cache
-#endif
   }();
   return size;
 }
+
+template <int32_t alignment_v, typename T>
+inline T round_up(T size) {
+  T alignment = alignment_v;
+  return (((size + alignment - 1) / alignment) * alignment);
+}
+
+template <int32_t alignment_v, typename T>
+inline T round_down(T size) {
+  T alignment = alignment_v;
+  return (size / alignment) * alignment;
+}
+
+template <typename T>
+inline void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
+                         int32_t stride) {
+  std::stringstream ss;
+  ss << std::fixed << std::setprecision(5) << name << ": [\n";
+  auto* curr_logits_buffer = ptr;
+  for (int32_t m = 0; m < row; ++m) {
+    for (int32_t n = 0; n < col; ++n) {
+      ss << curr_logits_buffer[n] << ", ";
+    }
+    ss << "\n";
+    curr_logits_buffer += stride;
+  }
+  ss << "]\n";
+  std::printf("%s", ss.str().c_str());
+}
+
+class ScratchPadManager {
+ public:
+  static constexpr size_t allocation_unit = 4 * 1024;  // 4KB
+
+  static ScratchPadManager* get_scratchpad_manager();
+
+  ScratchPadManager();
+
+  template <typename T>
+  T* get_data() {
+    return reinterpret_cast<T*>(ptr_);
+  }
+
+  static size_t round(size_t size) {
+    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
+  }
+
+  void realloc(size_t new_size);
+
+ private:
+  size_t size_;
+  void* ptr_;
+};
 }  // namespace cpu_utils
 
 #endif
diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index 78dc840a98b677de9555dbd542b22f36d6ed68fd..6c2c18a6602d28ea09fa2f0cddfe826f8c993c49 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -107,6 +107,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
   prop.location.id = device;
   prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;
 
+#ifndef USE_ROCM
+  int flag = 0;
+  CUDA_CHECK(cuDeviceGetAttribute(
+      &flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+      device));
+  if (flag) {  // support GPUDirect RDMA if possible
+    prop.allocFlags.gpuDirectRDMACapable = 1;
+  }
+#endif
+
 #ifndef USE_ROCM
   // Allocate memory using cuMemCreate
   CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu
index baff8363162efa2864a8ea30ae987da78628d281..a51e1a347e1d4cc028bdbb02cff8afcab8d46348 100644
--- a/csrc/fused_qknorm_rope_kernel.cu
+++ b/csrc/fused_qknorm_rope_kernel.cu
@@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel(
     void const* k_weight_void,       // RMSNorm weights for key
     void const* cos_sin_cache_void,  // Pre-computed cos/sin cache
     int64_t const* position_ids,     // Position IDs for RoPE
-    int const num_tokens             // Number of tokens
+    int const num_tokens,            // Number of tokens
+    int const rotary_dim             // Dimension for RoPE
 ) {
 #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
   if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
@@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel(
 
     // Calculate cache pointer for this position - similar to
     // pos_encoding_kernels.cu
-    T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim;
-    int const embed_dim = head_dim / 2;
+    T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim;
+    int const embed_dim = rotary_dim / 2;
     T_cache const* cos_ptr = cache_ptr;
     T_cache const* sin_ptr = cache_ptr + embed_dim;
-
-    if constexpr (interleave) {
-      // Perform interleaving. Use pre-computed cos/sin values.
+    int const rotary_lanes = rotary_dim / numElemsPerThread;  // rotary range
+    if (laneId < rotary_lanes) {
+      if constexpr (interleave) {
+        // Perform interleaving. Use pre-computed cos/sin values.
 #pragma unroll
-      for (int i = 0; i < numElemsPerThread / 2; ++i) {
-        int const idx0 = 2 * i;
-        int const idx1 = 2 * i + 1;
-
-        float const val0 = elements[idx0];
-        float const val1 = elements[idx1];
-
-        int const dim_idx = laneId * numElemsPerThread + idx0;
-        int const half_dim = dim_idx / 2;
-        float const cos_val =
-            CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
-        float const sin_val =
-            CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
-
-        elements[idx0] = val0 * cos_val - val1 * sin_val;
-        elements[idx1] = val0 * sin_val + val1 * cos_val;
-      }
-    } else {
-      // Before data exchange with in warp, we need to sync.
-      __syncwarp();
-      // Get the data from the other half of the warp. Use pre-computed cos/sin
-      // values.
-#pragma unroll
-      for (int i = 0; i < numElemsPerThread; i++) {
-        elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
-        if (laneId < 16) {
-          elements2[i] = -elements2[i];
+        for (int i = 0; i < numElemsPerThread / 2; ++i) {
+          int const idx0 = 2 * i;
+          int const idx1 = 2 * i + 1;
+          // Global dimension index in the head
+          int const dim_idx = laneId * numElemsPerThread + idx0;
+
+          float const val0 = elements[idx0];
+          float const val1 = elements[idx1];
+
+          int const half_dim = dim_idx / 2;
+          float const cos_val =
+              CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
+          float const sin_val =
+              CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+
+          elements[idx0] = val0 * cos_val - val1 * sin_val;
+          elements[idx1] = val0 * sin_val + val1 * cos_val;
         }
+      } else {
+        // Before data exchange with in warp, we need to sync.
+        __syncwarp();
+        int pairOffset = (rotary_dim / 2) / numElemsPerThread;
+        // Get the data from the other half of the warp. Use pre-computed
+        // cos/sin values.
+#pragma unroll
+        for (int i = 0; i < numElemsPerThread; i++) {
+          elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset);
 
-        int dim_idx = laneId * numElemsPerThread + i;
-        dim_idx = (dim_idx * 2) % head_dim;
-        int half_dim = dim_idx / 2;
-        // Use pre-computed cos/sin from cache
-        float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
-        float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+          if (laneId < pairOffset) {
+            elements2[i] = -elements2[i];
+          }
+          int dim_idx = laneId * numElemsPerThread + i;
 
-        elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
+          dim_idx = (dim_idx * 2) % rotary_dim;
+          int half_dim = dim_idx / 2;
+          float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
+          float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+
+          elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
+        }
+        // __shfl_xor_sync does not provide memfence. Need to sync again.
+        __syncwarp();
       }
-      // __shfl_xor_sync does not provide memfence. Need to sync again.
-      __syncwarp();
     }
-
     // Store.
     {
       vec_T vec;
@@ -312,10 +316,10 @@ template <typename scalar_t_in, typename scalar_t_cache>
 void launchFusedQKNormRope(void* qkv, int const num_tokens,
                            int const num_heads_q, int const num_heads_k,
                            int const num_heads_v, int const head_dim,
-                           float const eps, void const* q_weight,
-                           void const* k_weight, void const* cos_sin_cache,
-                           bool const interleave, int64_t const* position_ids,
-                           cudaStream_t stream) {
+                           int const rotary_dim, float const eps,
+                           void const* q_weight, void const* k_weight,
+                           void const* cos_sin_cache, bool const interleave,
+                           int64_t const* position_ids, cudaStream_t stream) {
   constexpr int blockSize = 256;
 
   int const warpsPerBlock = blockSize / 32;
@@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
         fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
             <<<gridDim, blockDim, 0, stream>>>(
                 qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
-                k_weight, cos_sin_cache, position_ids, num_tokens);
+                k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
       });
       break;
     case 128:
@@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
         fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
             <<<gridDim, blockDim, 0, stream>>>(
                 qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
-                k_weight, cos_sin_cache, position_ids, num_tokens);
+                k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
       });
       break;
     case 256:
@@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
         fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
             <<<gridDim, blockDim, 0, stream>>>(
                 qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
-                k_weight, cos_sin_cache, position_ids, num_tokens);
+                k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
       });
       break;
     default:
@@ -392,8 +396,11 @@ void fused_qk_norm_rope(
               "Query weights size must match head dimension");
   TORCH_CHECK(k_weight.size(0) == head_dim,
               "Key weights size must match head dimension");
-  TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
-              "Cos/sin cache dimension must match head_dim");
+
+  TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even");
+  TORCH_CHECK(cos_sin_cache.size(1) <= head_dim,
+              "rotary_dim must be less than or equal to head_dim");
+
   TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
                   qkv.scalar_type() == k_weight.scalar_type(),
               "qkv, q_weight and k_weight must have the same dtype");
@@ -419,7 +426,8 @@ void fused_qk_norm_rope(
               qkv.data_ptr(), static_cast<int>(num_tokens),
               static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
               static_cast<int>(num_heads_v), static_cast<int>(head_dim),
-              static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(),
+              static_cast<int>(cos_sin_cache.size(1)), static_cast<float>(eps),
+              q_weight.data_ptr(), k_weight.data_ptr(),
               cos_sin_cache.data_ptr(), !is_neox,
               reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
               stream);
diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
index 5fa367abd96f56250b25d312e1efbea4d83a3ce7..27e646bcd56fa8aac1a1e897d764a4517c633a8b 100644
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -446,15 +446,19 @@ __device__ inline T apply_sigmoid(T val) {
 
 template <ScoringFunc SF, typename T>
 __device__ inline T apply_scoring(T val) {
-  if constexpr (SF == SCORING_SIGMOID) {
+  if constexpr (SF == SCORING_NONE) {
+    return val;
+  } else if constexpr (SF == SCORING_SIGMOID) {
     return apply_sigmoid(val);
   } else {
+    static_assert(SF == SCORING_NONE || SF == SCORING_SIGMOID,
+                  "Unsupported ScoringFunc in apply_scoring");
     return val;
   }
 }
 
-template <typename T, ScoringFunc SF>
-__device__ void topk_with_k2(T* output, T const* input, T const* bias,
+template <typename T, typename BiasT, ScoringFunc SF>
+__device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
                              cg::thread_block_tile<32> const& tile,
                              int32_t const lane_id,
                              int const num_experts_per_group) {
@@ -465,7 +469,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
   if (num_experts_per_group > WARP_SIZE) {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
       T value = apply_scoring<SF>(input[i]);
-      value = value + bias[i];
+      value = value + static_cast<T>(bias[i]);
 
       if (value > largest) {
         second_largest = largest;
@@ -477,7 +481,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
   } else {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
       T value = apply_scoring<SF>(input[i]);
-      value = value + bias[i];
+      value = value + static_cast<T>(bias[i]);
       largest = value;
     }
   }
@@ -499,8 +503,8 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
   }
 }
 
-template <typename T, ScoringFunc SF>
-__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
+template <typename T, typename BiasT, ScoringFunc SF>
+__global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
                                     int64_t const num_tokens,
                                     int64_t const num_cases,
                                     int64_t const n_group,
@@ -513,7 +517,7 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
     input += case_id * num_experts_per_group;
     // bias is per expert group, offset to current group
     int32_t group_id = case_id % n_group;
-    T const* group_bias = bias + group_id * num_experts_per_group;
+    BiasT const* group_bias = bias + group_id * num_experts_per_group;
     output += case_id;
 
     cg::thread_block block = cg::this_thread_block();
@@ -522,18 +526,19 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     asm volatile("griddepcontrol.wait;");
 #endif
-    topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
-                        num_experts_per_group);
+    topk_with_k2<T, BiasT, SF>(output, input, group_bias, tile, lane_id,
+                               num_experts_per_group);
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
   asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
 
-template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
+template <typename T, typename BiasT, typename IdxT, ScoringFunc SF,
+          int NGroup = -1>
 __global__ void group_idx_and_topk_idx_kernel(
     T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
-    T const* bias, int64_t const num_tokens, int64_t const n_group,
+    BiasT const* bias, int64_t const num_tokens, int64_t const n_group,
     int64_t const topk_group, int64_t const topk, int64_t const num_experts,
     int64_t const num_experts_per_group, bool renormalize,
     double routed_scaling_factor) {
@@ -619,7 +624,7 @@ __global__ void group_idx_and_topk_idx_kernel(
             T input = scores[offset + i];
             if (is_finite(input)) {
               T score = apply_scoring<SF>(input);
-              candidates = score + bias[offset + i];
+              candidates = score + static_cast<T>(bias[offset + i]);
             }
           }
           queue.add(candidates, offset + i);
@@ -670,10 +675,13 @@ __global__ void group_idx_and_topk_idx_kernel(
 
   if (case_id < num_tokens) {
     if (if_proceed_next_topk) {
+      float scale = routed_scaling_factor;
+      if (renormalize) {
+        scale /= topk_sum;
+      }
       for (int i = lane_id; i < topk; i += WARP_SIZE) {
         float base = cuda_cast<float, T>(s_topk_value[i]);
-        float value = renormalize ? (base / topk_sum * routed_scaling_factor)
-                                  : (base * routed_scaling_factor);
+        float value = base * scale;
         topk_indices[i] = s_topk_idx[i];
         topk_values[i] = value;
       }
@@ -691,10 +699,10 @@ __global__ void group_idx_and_topk_idx_kernel(
 #endif
 }
 
-template <typename T, typename IdxT, ScoringFunc SF>
+template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
 inline void launch_group_idx_and_topk_kernel(
     cudaLaunchConfig_t const& config, T* scores, T* group_scores,
-    float* topk_values, IdxT* topk_indices, T const* bias,
+    float* topk_values, IdxT* topk_indices, BiasT const* bias,
     int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
     int64_t const topk, int64_t const num_experts,
     int64_t const num_experts_per_group, bool const renormalize,
@@ -708,36 +716,36 @@ inline void launch_group_idx_and_topk_kernel(
 
   switch (n_group) {
     case 4: {
-      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
+      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 4>);
       break;
     }
     case 8: {
-      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
+      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 8>);
       break;
     }
     case 16: {
-      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
+      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 16>);
       break;
     }
     case 32: {
-      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
+      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 32>);
       break;
     }
     default: {
-      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
+      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF>);
       break;
     }
   }
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename BiasT, typename IdxT>
 void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
-                   IdxT* topk_indices, T const* bias, int64_t const num_tokens,
-                   int64_t const num_experts, int64_t const n_group,
-                   int64_t const topk_group, int64_t const topk,
-                   bool const renormalize, double const routed_scaling_factor,
-                   int const scoring_func, bool enable_pdl = false,
-                   cudaStream_t const stream = 0) {
+                   IdxT* topk_indices, BiasT const* bias,
+                   int64_t const num_tokens, int64_t const num_experts,
+                   int64_t const n_group, int64_t const topk_group,
+                   int64_t const topk, bool const renormalize,
+                   double const routed_scaling_factor, int const scoring_func,
+                   bool enable_pdl = false, cudaStream_t const stream = 0) {
   int64_t num_cases = num_tokens * n_group;
   int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
   cudaLaunchConfig_t config;
@@ -758,12 +766,12 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
   };
   switch (sf) {
     case SCORING_NONE: {
-      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
+      auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_NONE>;
       launch_topk_with_k2(kernel_instance1);
       break;
     }
     case SCORING_SIGMOID: {
-      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
+      auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_SIGMOID>;
       launch_topk_with_k2(kernel_instance1);
       break;
     }
@@ -787,14 +795,14 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
   config.attrs = attrs;
   switch (sf) {
     case SCORING_NONE: {
-      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
+      launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_NONE>(
           config, scores, group_scores, topk_values, topk_indices, bias,
           num_tokens, n_group, topk_group, topk, num_experts,
           num_experts_per_group, renormalize, routed_scaling_factor);
       break;
     }
     case SCORING_SIGMOID: {
-      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
+      launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_SIGMOID>(
           config, scores, group_scores, topk_values, topk_indices, bias,
           num_tokens, n_group, topk_group, topk, num_experts,
           num_experts_per_group, renormalize, routed_scaling_factor);
@@ -805,17 +813,23 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
   }
 }
 
-#define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
-  template void invokeNoAuxTc<T, IdxT>(                                     \
-      T * scores, T * group_scores, float* topk_values, IdxT* topk_indices, \
-      T const* bias, int64_t const num_tokens, int64_t const num_experts,   \
-      int64_t const n_group, int64_t const topk_group, int64_t const topk,  \
-      bool const renormalize, double const routed_scaling_factor,           \
+#define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT)                                  \
+  template void invokeNoAuxTc<T, BiasT, IdxT>(                                \
+      T * scores, T * group_scores, float* topk_values, IdxT* topk_indices,   \
+      BiasT const* bias, int64_t const num_tokens, int64_t const num_experts, \
+      int64_t const n_group, int64_t const topk_group, int64_t const topk,    \
+      bool const renormalize, double const routed_scaling_factor,             \
       int const scoring_func, bool enable_pdl, cudaStream_t const stream);
 
-INSTANTIATE_NOAUX_TC(float, int32_t);
-INSTANTIATE_NOAUX_TC(half, int32_t);
-INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
+INSTANTIATE_NOAUX_TC(float, float, int32_t);
+INSTANTIATE_NOAUX_TC(float, half, int32_t);
+INSTANTIATE_NOAUX_TC(float, __nv_bfloat16, int32_t);
+INSTANTIATE_NOAUX_TC(half, float, int32_t);
+INSTANTIATE_NOAUX_TC(half, half, int32_t);
+INSTANTIATE_NOAUX_TC(half, __nv_bfloat16, int32_t);
+INSTANTIATE_NOAUX_TC(__nv_bfloat16, float, int32_t);
+INSTANTIATE_NOAUX_TC(__nv_bfloat16, half, int32_t);
+INSTANTIATE_NOAUX_TC(__nv_bfloat16, __nv_bfloat16, int32_t);
 }  // end namespace moe
 }  // namespace vllm
 
@@ -824,6 +838,7 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
     int64_t topk, bool renormalize, double routed_scaling_factor,
     torch::Tensor const& bias, int64_t scoring_func = 0) {
   auto data_type = scores.scalar_type();
+  auto bias_type = bias.scalar_type();
   auto input_size = scores.sizes();
   int64_t num_tokens = input_size[0];
   int64_t num_experts = input_size[1];
@@ -847,39 +862,62 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
 
   auto stream = c10::cuda::getCurrentCUDAStream(scores.get_device());
 
+#define LAUNCH_KERNEL(T, IdxT)                                               \
+  do {                                                                       \
+    switch (bias_type) {                                                     \
+      case torch::kFloat16:                                                  \
+        vllm::moe::invokeNoAuxTc<T, half, IdxT>(                             \
+            reinterpret_cast<T*>(scores.mutable_data_ptr()),                 \
+            reinterpret_cast<T*>(group_scores.mutable_data_ptr()),           \
+            reinterpret_cast<float*>(topk_values.mutable_data_ptr()),        \
+            reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()),        \
+            reinterpret_cast<half const*>(bias.data_ptr()), num_tokens,      \
+            num_experts, n_group, topk_group, topk, renormalize,             \
+            routed_scaling_factor, static_cast<int>(scoring_func), false,    \
+            stream);                                                         \
+        break;                                                               \
+      case torch::kFloat32:                                                  \
+        vllm::moe::invokeNoAuxTc<T, float, IdxT>(                            \
+            reinterpret_cast<T*>(scores.mutable_data_ptr()),                 \
+            reinterpret_cast<T*>(group_scores.mutable_data_ptr()),           \
+            reinterpret_cast<float*>(topk_values.mutable_data_ptr()),        \
+            reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()),        \
+            reinterpret_cast<float const*>(bias.data_ptr()), num_tokens,     \
+            num_experts, n_group, topk_group, topk, renormalize,             \
+            routed_scaling_factor, static_cast<int>(scoring_func), false,    \
+            stream);                                                         \
+        break;                                                               \
+      case torch::kBFloat16:                                                 \
+        vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>(                    \
+            reinterpret_cast<T*>(scores.mutable_data_ptr()),                 \
+            reinterpret_cast<T*>(group_scores.mutable_data_ptr()),           \
+            reinterpret_cast<float*>(topk_values.mutable_data_ptr()),        \
+            reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()),        \
+            reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()),         \
+            num_tokens, num_experts, n_group, topk_group, topk, renormalize, \
+            routed_scaling_factor, static_cast<int>(scoring_func), false,    \
+            stream);                                                         \
+        break;                                                               \
+      default:                                                               \
+        throw std::invalid_argument(                                         \
+            "Invalid bias dtype, only supports float16, float32, and "       \
+            "bfloat16");                                                     \
+        break;                                                               \
+    }                                                                        \
+  } while (0)
+
   switch (data_type) {
     case torch::kFloat16:
       // Handle Float16
-      vllm::moe::invokeNoAuxTc<half, int32_t>(
-          reinterpret_cast<half*>(scores.mutable_data_ptr()),
-          reinterpret_cast<half*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
-          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<half const*>(bias.data_ptr()), num_tokens,
-          num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
+      LAUNCH_KERNEL(half, int32_t);
       break;
     case torch::kFloat32:
       // Handle Float32
-      vllm::moe::invokeNoAuxTc<float, int32_t>(
-          reinterpret_cast<float*>(scores.mutable_data_ptr()),
-          reinterpret_cast<float*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
-          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<float const*>(bias.data_ptr()), num_tokens,
-          num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
+      LAUNCH_KERNEL(float, int32_t);
       break;
     case torch::kBFloat16:
       // Handle BFloat16
-      vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
-          reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
-          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), num_tokens,
-          num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
+      LAUNCH_KERNEL(__nv_bfloat16, int32_t);
       break;
     default:
       // Handle other data types
@@ -887,5 +925,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
           "Invalid dtype, only supports float16, float32, and bfloat16");
       break;
   }
+#undef LAUNCH_KERNEL
   return {topk_values, topk_indices};
 }
diff --git a/csrc/moe/marlin_moe_wna16/.gitignore b/csrc/moe/marlin_moe_wna16/.gitignore
index ba805f9250ecea707563336138067efe13b7b8b5..7dc482a8946605d91d192f43b2fffae518397f59 100644
--- a/csrc/moe/marlin_moe_wna16/.gitignore
+++ b/csrc/moe/marlin_moe_wna16/.gitignore
@@ -1,2 +1,3 @@
 sm*_kernel_*.cu
 kernel_selector.h
+kernel_*.cu
diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py
index 88f1055337fd5332bef486a141025e90f937181f..9db03ea149d0c38083f63fa542b92cddc6dae9fc 100644
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -10,6 +10,8 @@ import jinja2
 
 ARCHS = []
 SUPPORT_FP8 = False
+SUPPORT_SM75 = False
+SUPPORT_SM80 = False
 for arch in sys.argv[1].split(","):
     arch = arch[: arch.index(".") + 2].replace(".", "")
     arch = int(arch)
@@ -19,6 +21,10 @@ for arch in sys.argv[1].split(","):
     # with FP16 MMA, so it cannot achieve any acceleration.
     if arch in [89, 120]:
         SUPPORT_FP8 = True
+    if arch >= 80:
+        SUPPORT_SM80 = True
+    if arch == 75:
+        SUPPORT_SM75 = True
 
 FILE_HEAD_COMMENT = """
 // auto generated by generate_kernels.py
@@ -157,6 +163,7 @@ def remove_old_kernels():
 
 def generate_new_kernels():
     result_dict = {}
+    sm_75_result_dict = {}
 
     for quant_config in QUANT_CONFIGS:
         c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
@@ -174,6 +181,8 @@ def generate_new_kernels():
             s_type = quant_config.get("s_type", c_type)
             if (a_type, b_type, c_type) not in result_dict:
                 result_dict[(a_type, b_type, c_type)] = []
+                if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16":
+                    sm_75_result_dict[(a_type, b_type, c_type)] = []
 
             for group_blocks, m_blocks, thread_configs in itertools.product(
                 all_group_blocks, all_m_blocks, all_thread_configs
@@ -197,78 +206,89 @@ def generate_new_kernels():
                     "thread_k_blocks": thread_k // 16,
                     "thread_n_blocks": thread_n // 16,
                     "m_block_size_8": "true" if m_blocks == 0.5 else "false",
-                    "stages": "pipe_stages",
+                    "stages": 4,
                     "group_blocks": group_blocks,
                     "is_zp_float": "false",
                 }
 
-                result_dict[(a_type, b_type, c_type)].append(config)
+                if SUPPORT_SM80:
+                    result_dict[(a_type, b_type, c_type)].append(config)
+                if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75:
+                    config_sm75 = config.copy()
+                    config_sm75["stages"] = 2
+                    sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75)
 
     kernel_selector_str = FILE_HEAD_COMMENT
 
-    for (a_type, b_type, c_type), config_list in result_dict.items():
-        all_template_str_list = []
-        for config in config_list:
-            s_type = config["s_type"]
-            template_str = jinja2.Template(TEMPLATE).render(
-                a_type_id=f"vllm::{a_type}.id()",
-                b_type_id=f"vllm::{b_type}.id()",
-                c_type_id=f"vllm::{c_type}.id()",
-                s_type_id=f"vllm::{s_type}.id()",
-                **config,
-            )
-            all_template_str_list.append(template_str)
-
-            conditions = [
-                f"a_type == vllm::{a_type}",
-                f"b_type == vllm::{b_type}",
-                f"c_type == vllm::{c_type}",
-                f"s_type == vllm::{s_type}",
-                f"threads == {config['threads']}",
-                f"thread_m_blocks == {config['thread_m_blocks']}",
-                f"thread_n_blocks == {config['thread_n_blocks']}",
-                f"thread_k_blocks == {config['thread_k_blocks']}",
-                f"m_block_size_8 == {config['m_block_size_8']}",
-                f"group_blocks == {config['group_blocks']}",
-                f"is_zp_float == {config['is_zp_float']}",
-            ]
-            conditions = " && ".join(conditions)
-
-            if kernel_selector_str == FILE_HEAD_COMMENT:
-                kernel_selector_str += f"if ({conditions})\n  kernel = "
-            else:
-                kernel_selector_str += f"else if ({conditions})\n  kernel = "
-
-            kernel_template2 = (
-                "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
-                "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
-                "{{thread_n_blocks}}, {{thread_k_blocks}}, "
-                "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
-                "{{is_zp_float}}>;"
-            )
-
-            kernel_selector_str += (
-                jinja2.Template(kernel_template2).render(
+    for result_dict_tmp in [result_dict, sm_75_result_dict]:
+        for (a_type, b_type, c_type), config_list in result_dict_tmp.items():
+            all_template_str_list = []
+            if not config_list:
+                continue
+            for config in config_list:
+                s_type = config["s_type"]
+                template_str = jinja2.Template(TEMPLATE).render(
                     a_type_id=f"vllm::{a_type}.id()",
                     b_type_id=f"vllm::{b_type}.id()",
                     c_type_id=f"vllm::{c_type}.id()",
                     s_type_id=f"vllm::{s_type}.id()",
                     **config,
                 )
-                + "\n"
-            )
+                all_template_str_list.append(template_str)
+
+                conditions = [
+                    f"a_type == vllm::{a_type}",
+                    f"b_type == vllm::{b_type}",
+                    f"c_type == vllm::{c_type}",
+                    f"s_type == vllm::{s_type}",
+                    f"threads == {config['threads']}",
+                    f"thread_m_blocks == {config['thread_m_blocks']}",
+                    f"thread_n_blocks == {config['thread_n_blocks']}",
+                    f"thread_k_blocks == {config['thread_k_blocks']}",
+                    f"m_block_size_8 == {config['m_block_size_8']}",
+                    f"stages == {config['stages']}",
+                    f"group_blocks == {config['group_blocks']}",
+                    f"is_zp_float == {config['is_zp_float']}",
+                ]
+                conditions = " && ".join(conditions)
+
+                if kernel_selector_str == FILE_HEAD_COMMENT:
+                    kernel_selector_str += f"if ({conditions})\n  kernel = "
+                else:
+                    kernel_selector_str += f"else if ({conditions})\n  kernel = "
+
+                kernel_template2 = (
+                    "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
+                    "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
+                    "{{thread_n_blocks}}, {{thread_k_blocks}}, "
+                    "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
+                    "{{is_zp_float}}>;"
+                )
 
-        file_content = FILE_HEAD + "\n\n"
-        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
-        if a_type == "kFE4M3fn":
-            filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
-        else:
-            filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+                kernel_selector_str += (
+                    jinja2.Template(kernel_template2).render(
+                        a_type_id=f"vllm::{a_type}.id()",
+                        b_type_id=f"vllm::{b_type}.id()",
+                        c_type_id=f"vllm::{c_type}.id()",
+                        s_type_id=f"vllm::{s_type}.id()",
+                        **config,
+                    )
+                    + "\n"
+                )
+
+            file_content = FILE_HEAD + "\n\n"
+            file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+            if a_type == "kFE4M3fn":
+                filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+            elif result_dict_tmp is sm_75_result_dict:
+                filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+            else:
+                filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
 
-        filename = filename.lower()
+            filename = filename.lower()
 
-        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
-            f.write(file_content)
+            with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+                f.write(file_content)
 
     if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
         kernel_selector_str += (
diff --git a/csrc/moe/marlin_moe_wna16/kernel.h b/csrc/moe/marlin_moe_wna16/kernel.h
index 57f5a17932d4437e45a48901582039d2b96d327d..eb83df22cde7243f37c09db38f5b2eccb47983a3 100644
--- a/csrc/moe/marlin_moe_wna16/kernel.h
+++ b/csrc/moe/marlin_moe_wna16/kernel.h
@@ -7,20 +7,20 @@
 #include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"
 
-#define MARLIN_KERNEL_PARAMS                                                  \
-  const int4 *__restrict__ A, const int4 *__restrict__ B,                     \
-      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                         \
-      const int4 *__restrict__ b_bias_ptr,                                    \
-      const float *__restrict__ a_scales_ptr,                                 \
-      const int4 *__restrict__ scales_ptr,                                    \
-      const uint16_t *__restrict__ global_scale_ptr,                          \
-      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,         \
-      const int32_t *__restrict__ sorted_token_ids_ptr,                       \
-      const int32_t *__restrict__ expert_ids_ptr,                             \
-      const int32_t *__restrict__ num_tokens_past_padded_ptr,                 \
-      const float *__restrict__ topk_weights_ptr, int top_k,                  \
-      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,          \
-      int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
+#define MARLIN_KERNEL_PARAMS                                          \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,             \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                 \
+      const int4 *__restrict__ b_bias_ptr,                            \
+      const float *__restrict__ a_scales_ptr,                         \
+      const int4 *__restrict__ scales_ptr,                            \
+      const uint16_t *__restrict__ global_scale_ptr,                  \
+      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
+      const int32_t *__restrict__ sorted_token_ids_ptr,               \
+      const int32_t *__restrict__ expert_ids_ptr,                     \
+      const int32_t *__restrict__ num_tokens_past_padded_ptr,         \
+      const float *__restrict__ topk_weights_ptr, int top_k,          \
+      bool mul_topk_weights, int num_groups, int prob_m, int prob_n,  \
+      int prob_k, int *locks, bool has_bias, bool use_atomic_add,     \
       bool use_fp32_reduce
 
 namespace MARLIN_NAMESPACE_NAME {
diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h
index 5b6b2456b4111fd6cacd5e8a209dd5f6cb01612c..5aac69b5c7d5a90d25b4863041d1df667b8693f9 100644
--- a/csrc/moe/marlin_moe_wna16/marlin_template.h
+++ b/csrc/moe/marlin_moe_wna16/marlin_template.h
@@ -26,6 +26,7 @@
 #include "quantization/gptq_marlin/marlin.cuh"
 #include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "quantization/gptq_marlin/dequant.h"
+#include "quantization/gptq_marlin/marlin_mma.h"
 #include "core/scalar_type.hpp"
 
 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
@@ -35,7 +36,7 @@
 
 namespace MARLIN_NAMESPACE_NAME {
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
 
 template <typename scalar_t,  // compute dtype, half or nv_float16
           const vllm::ScalarTypeId b_type_id,  // weight MarlinScalarType id
@@ -70,7 +71,6 @@ __global__ void Marlin(
     const float* __restrict__ topk_weights_ptr,              // moe top weights
     int top_k,              // num of experts per token
     bool mul_topk_weights,  // mul topk weights or not
-    bool is_ep,             // expert parallelism
     int num_groups,         // number of scale groups per output channel
     int prob_m,             // batch dimension m
     int prob_n,             // output dimension n
@@ -84,146 +84,6 @@ __global__ void Marlin(
 
 #else
 
-// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
-// output/accumulation.
-template <vllm::ScalarTypeId type_id, int k_size = 16>
-__device__ inline void mma(
-    const typename MarlinScalarType<type_id>::FragA& a_frag,
-    const typename MarlinScalarType<type_id>::FragB& frag_b,
-    typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
-  if constexpr (k_size == 16) {
-    if constexpr (std::is_same<scalar_t, half>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
-            "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
-            "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  } else if (k_size == 32) {
-    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  }
-}
-
-template <vllm::ScalarTypeId type_id, int k_size = 16>
-__device__ inline void mma_trans(
-    const typename MarlinScalarType<type_id>::FragA& a_frag,
-    const typename MarlinScalarType<type_id>::FragB& frag_b,
-    const typename MarlinScalarType<type_id>::FragB& frag_b2,
-    typename MarlinScalarType<type_id>::FragC& frag_c) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
-  float* c = reinterpret_cast<float*>(&frag_c);
-  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
-  if constexpr (k_size == 16) {
-    if constexpr (std::is_same<scalar_t, half>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
-            "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
-            "r"(c[3]));
-    }
-  } else {
-    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1200
-      asm volatile(
-          "mma.sync.aligned.kind::f8f6f4.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-  #else
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-  #endif
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  }
-}
-
 // Instruction for loading a full 16x16 matrix fragment of operand A from shared
 // memory, directly in tensor core layout.
 template <int count, vllm::ScalarTypeId type_id>
@@ -412,7 +272,6 @@ __global__ void Marlin(
     const float* __restrict__ topk_weights_ptr,              // moe top weights
     int top_k,              // num of experts per token
     bool mul_topk_weights,  // mul topk weights or not
-    bool is_ep,             // expert parallelism
     int num_groups,         // number of scale groups per output channel
     int prob_m,             // batch dimension m
     int prob_n,             // output dimension n
@@ -439,9 +298,20 @@ __global__ void Marlin(
   if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
   #endif
 
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+  // Turing TensorCore only supports fp16 and int8
+  if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id())
+    return;
+  #endif
+
   int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
   constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
 
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+  constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id();
+  #else
+  constexpr bool use_fp16_accum = false;
+  #endif
   using Adtype = MarlinScalarType<a_type_id>;
   using Cdtype = MarlinScalarType<c_type_id>;
 
@@ -504,14 +374,6 @@ __global__ void Marlin(
 
   // parallel: num valid moe blocks
   int parallel = num_tokens_past_padded / moe_block_size;
-  int num_valid_blocks = parallel;
-  if (is_ep) {
-    for (int i = 0; i < parallel; i++) {
-      if (expert_ids_ptr[i] == -1) num_valid_blocks--;
-    }
-  }
-  int num_invalid_blocks = parallel - num_valid_blocks;
-  parallel = num_valid_blocks;
 
   int k_tiles = prob_k / 16 / thread_k_blocks;
   int n_tiles = prob_n / 16 / thread_n_blocks;
@@ -618,7 +480,22 @@ __global__ void Marlin(
         }
       }
 
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+
+      if constexpr (moe_block_size >= 16)
+        local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 16);
+      if constexpr (moe_block_size >= 8)
+        local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 8);
+      if constexpr (moe_block_size >= 4)
+        local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 4);
+      if constexpr (moe_block_size >= 2)
+        local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 2);
+
+      local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 1);
+      block_num_valid_tokens = local_count;
+  #else
       block_num_valid_tokens = __reduce_add_sync(0xffffffff, local_count);
+  #endif
 
       if (lane_id == 0)
         reinterpret_cast<int*>(sh_new)[0] = block_num_valid_tokens;
@@ -651,22 +528,8 @@ __global__ void Marlin(
     if (par_id >= parallel) return;
 
     old_expert_id = expert_id;
-    if (num_invalid_blocks > 0) {
-      int skip_count = par_id;
-      for (int i = 0; i < num_tokens_past_padded / moe_block_size; i++) {
-        expert_id = expert_ids_ptr[i];
-        if (expert_id != -1) {
-          if (skip_count == 0) {
-            block_id = i;
-            break;
-          };
-          skip_count--;
-        };
-      }
-    } else {
-      block_id = par_id;
-      expert_id = expert_ids_ptr[block_id];
-    }
+    block_id = par_id;
+    expert_id = expert_ids_ptr[block_id];
 
     if constexpr (b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
       uint16_t val = global_scale_ptr[expert_id];
@@ -1018,10 +881,6 @@ __global__ void Marlin(
   constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
                                           : (stages * s_sh_stage);
   int4* sh_s = sh_zp + (stages * zp_sh_stage);
-  // shared memory reused by reduction should be smaller than
-  // shared memory used by weight.
-  static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
-                stages * b_sh_stage);
   int4* sh_a = sh_s + sh_s_size;
 
   // Register storage for double buffer of shared memory reads.
@@ -1545,11 +1404,13 @@ __global__ void Marlin(
   #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
         if constexpr (m_block_size_8) {
-          mma_trans<a_type_id>(frag_a[k2][i], frag_b0, frag_b1,
-                               frag_c[i][j][0]);
+          mma_trans<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0, frag_b1,
+                                               frag_c[i][j][0]);
         } else {
-          mma<a_type_id>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
-          mma<a_type_id>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
+          mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0,
+                                         frag_c[i][j][0]);
+          mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b1,
+                                         frag_c[i][j][1]);
         }
       }
     }
@@ -1583,10 +1444,12 @@ __global__ void Marlin(
 
   #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
-        mma<a_type_id, 32>(frag_a[k2][i], frag_b[0],
-                           (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
-        mma<a_type_id, 32>(frag_a[k2][i], frag_b[1],
-                           (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
+        mma<a_type_id, false, 32>(
+            frag_a[k2][i], frag_b[0],
+            (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
+        mma<a_type_id, false, 32>(
+            frag_a[k2][i], frag_b[1],
+            (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
       }
 
       if constexpr (group_blocks != -1) {
@@ -2132,6 +1995,21 @@ __global__ void Marlin(
     // While this pattern may not be the most readable, other ways of writing
     // the loop seemed to noticeably worse performance after compilation.
     if (slice_iters == 0) {
+      // convert fp16 accum to fp32 for reduction
+      if constexpr (use_fp16_accum) {
+  #pragma unroll
+        for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 2 : 4) * 2); i++) {
+          float* frag_c_part_float = reinterpret_cast<float*>(frag_c) + i * 4;
+          scalar_t* frag_c_part_half =
+              reinterpret_cast<scalar_t*>(frag_c_part_float);
+
+  #pragma unroll
+          for (int i = 3; i >= 0; i--) {
+            frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]);
+          }
+        }
+      }
+
       if constexpr (is_a_8bit) {
         float frag_a_s[2 * thread_m_blocks];
 
diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu
index 4fd8fc5c542023edbc61609bcc811e8c406d3cb8..00b17f075af680fbfe7263c4941f85b3cf997761 100644
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@@ -142,7 +142,7 @@ typedef struct {
 
 int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
                           int prob_n, int prob_k, int num_bits, int group_size,
-                          bool has_act_order, bool is_k_full) {
+                          bool has_act_order, bool is_k_full, int stages) {
   bool cache_scales_chunk = has_act_order && !is_k_full;
 
   int tb_n = th_config.thread_n;
@@ -160,13 +160,13 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
 
   if (cache_scales_chunk) {
     int load_groups =
-        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
+        tb_groups * stages * 2;          // Chunk size is 2x pipeline over dim K
     load_groups = max(load_groups, 32);  // We load at least 32 scale groups
     return load_groups * tb_n * 2;
   } else {
     int tb_scales = tb_groups * tb_n * 2;
 
-    return tb_scales * pipe_stages;
+    return tb_scales * stages;
   }
 }
 
@@ -174,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
                           int thread_m_blocks, int prob_m, int prob_n,
                           int prob_k, int num_bits, int group_size,
                           bool has_act_order, bool is_k_full, int has_zp,
-                          int is_zp_float, bool is_a_8bit) {
+                          int is_zp_float, bool is_a_8bit, int stages) {
   int pack_factor = 32 / num_bits;
 
   // Get B size
@@ -185,8 +185,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
   // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
   // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
   int sh_block_meta_size = tb_m * 16;
-  int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
-  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
+  int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
+  int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4;
   int sh_red_size = tb_m * (tb_n + 8) * 2;
   int sh_bias_size = tb_n * 2;
   int tmp_size =
@@ -195,8 +195,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
 
   int sh_s_size =
       get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
-                            group_size, has_act_order, is_k_full);
-  int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
+                            group_size, has_act_order, is_k_full, stages);
+  int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0;
   int sh_zp_size = 0;
   if (has_zp) {
     if (is_zp_float)
@@ -217,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
                      int thread_m_blocks, int prob_m, int prob_n, int prob_k,
                      int num_bits, int group_size, bool has_act_order,
                      bool is_k_full, int has_zp, int is_zp_float,
-                     int max_shared_mem, bool is_a_8bit) {
+                     bool is_a_8bit, int stages, int max_shared_mem) {
   // Sanity
   if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
       th_config.num_threads == -1) {
@@ -243,7 +243,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
   int cache_size =
       get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
                             prob_n, prob_k, num_bits, group_size, has_act_order,
-                            is_k_full, has_zp, is_zp_float, is_a_8bit);
+                            is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
   return cache_size <= max_shared_mem;
 }
 
@@ -252,7 +252,7 @@ MarlinFuncPtr get_marlin_kernel(
     const vllm::ScalarType c_type, const vllm::ScalarType s_type,
     int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
     bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
-    int threads, bool is_zp_float) {
+    int threads, bool is_zp_float, int stages) {
   int num_bits = b_type.size_bits();
   auto kernel = MarlinDefault;
 
@@ -266,8 +266,8 @@ exec_config_t determine_exec_config(
     const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
     int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
     bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
-    bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
-    bool is_a_8bit) {
+    bool is_k_full, bool has_zp, bool is_zp_float, bool is_a_8bit, int stages,
+    int max_shared_mem, int sms) {
   exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
   thread_config_t* thread_configs = thread_m_blocks > 1
                                         ? large_batch_thread_configs
@@ -284,15 +284,15 @@ exec_config_t determine_exec_config(
 
     if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
                          prob_n, prob_k, num_bits, group_size, has_act_order,
-                         is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
-                         is_a_8bit)) {
+                         is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
+                         max_shared_mem - 512)) {
       continue;
     }
 
     int cache_size = get_kernel_cache_size(
         th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
         num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
-        is_a_8bit);
+        is_a_8bit, stages);
 
     int group_blocks = 0;
     if (!has_act_order) {
@@ -303,7 +303,7 @@ exec_config_t determine_exec_config(
         get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
                           th_config.thread_n / 16, th_config.thread_k / 16,
                           m_block_size_8, has_act_order, has_zp, group_blocks,
-                          th_config.num_threads, is_zp_float);
+                          th_config.num_threads, is_zp_float, stages);
 
     if (kernel == MarlinDefault) continue;
 
@@ -336,14 +336,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
                void* perm, void* a_tmp, void* sorted_token_ids,
                void* expert_ids, void* num_tokens_past_padded,
                void* topk_weights, int moe_block_size, int num_experts,
-               int top_k, bool mul_topk_weights, bool is_ep, int prob_m,
-               int prob_n, int prob_k, void* workspace,
-               vllm::ScalarType const& a_type, vllm::ScalarType const& b_type,
-               vllm::ScalarType const& c_type, vllm::ScalarType const& s_type,
-               bool has_bias, bool has_act_order, bool is_k_full, bool has_zp,
-               int num_groups, int group_size, int dev, cudaStream_t stream,
-               int thread_k, int thread_n, int sms, int blocks_per_sm,
-               bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
+               int top_k, bool mul_topk_weights, int prob_m, int prob_n,
+               int prob_k, void* workspace, vllm::ScalarType const& a_type,
+               vllm::ScalarType const& b_type, vllm::ScalarType const& c_type,
+               vllm::ScalarType const& s_type, bool has_bias,
+               bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
+               int group_size, int dev, cudaStream_t stream, int thread_k,
+               int thread_n, int sms, int blocks_per_sm, bool use_atomic_add,
+               bool use_fp32_reduce, bool is_zp_float) {
   int thread_m_blocks = div_ceil(moe_block_size, 16);
   bool m_block_size_8 = moe_block_size == 8;
   bool is_a_8bit = a_type.size_bits() == 8;
@@ -433,8 +433,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
                          dev);
   cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
                          dev);
-  TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
-              "marlin kernel only support Ampere or newer GPUs.");
+  TORCH_CHECK(major_capability * 10 + minor_capability >= 75,
+              "marlin kernel only support Turing or newer GPUs.");
+  int stages = 4;
+  if (major_capability == 7 && minor_capability == 5) {
+    stages = 2;
+    TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8,
+                "Turing only support FP16 or INT8 activation.");
+  }
   if (a_type == vllm::kFE4M3fn) {
     TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
                 "FP8 only support Ada Lovelace or newer GPUs.");
@@ -461,8 +467,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
     exec_cfg = determine_exec_config(
         a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
         top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
-        has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
-        is_a_8bit);
+        has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
+        max_shared_mem, sms);
     thread_tfg = exec_cfg.tb_cfg;
   }
 
@@ -479,7 +485,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
   TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
                               prob_m, prob_n, prob_k, num_bits, group_size,
                               has_act_order, is_k_full, has_zp, is_zp_float,
-                              max_shared_mem, is_a_8bit),
+                              is_a_8bit, stages, max_shared_mem),
               "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
               ", thread_k = ", thread_tfg.thread_k,
               ", thread_n = ", thread_tfg.thread_n,
@@ -493,12 +499,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
   int sh_cache_size =
       get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
                             prob_n, prob_k, num_bits, group_size, has_act_order,
-                            is_k_full, has_zp, is_zp_float, is_a_8bit);
+                            is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
 
   auto kernel = get_marlin_kernel(
       a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
       thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
-      num_threads, is_zp_float);
+      num_threads, is_zp_float, stages);
 
   if (kernel == MarlinDefault) {
     TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
@@ -517,7 +523,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
   kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
       A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr, g_idx_ptr,
       sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
-      topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
+      topk_weights_ptr, top_k, mul_topk_weights, num_groups, prob_m,
       prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce);
   // clang-format on
 }
@@ -535,7 +541,7 @@ torch::Tensor moe_wna16_marlin_gemm(
     std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
     torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
     torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
-    int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
+    int64_t moe_block_size, int64_t top_k, bool mul_topk_weights,
     vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
     int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
     bool is_zp_float, int64_t thread_k, int64_t thread_n,
@@ -849,9 +855,9 @@ torch::Tensor moe_wna16_marlin_gemm(
       perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(),
       expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
       topk_weights.data_ptr(), moe_block_size, num_experts, top_k,
-      mul_topk_weights, is_ep, size_m, size_n, size_k, workspace.data_ptr(),
-      a_type, b_type, c_type, s_type, has_bias, has_act_order, is_k_full,
-      has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
+      mul_topk_weights, size_m, size_n, size_k, workspace.data_ptr(), a_type,
+      b_type, c_type, s_type, has_bias, has_act_order, is_k_full, has_zp,
+      num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
       thread_k, thread_n, sms, blocks_per_sm, use_atomic_add, use_fp32_reduce,
       is_zp_float);
 
@@ -860,4 +866,4 @@ torch::Tensor moe_wna16_marlin_gemm(
 
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
   m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
-}
\ No newline at end of file
+}
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 20a2d85821133c62bb2aacfb3b686e2bd2b37a2d..cad4249d4697e12b41f6f6dfee51aff0201bbb42 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -80,7 +80,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "Tensor sorted_token_ids,"
       "Tensor! expert_ids, Tensor! num_tokens_past_padded,"
       "Tensor! topk_weights, int moe_block_size, int top_k, "
-      "bool mul_topk_weights, bool is_ep, int b_type_id,"
+      "bool mul_topk_weights, int b_type_id,"
       "int size_m, int size_n, int size_k,"
       "bool is_full_k, bool use_atomic_add,"
       "bool use_fp32_reduce, bool is_zp_float,"
diff --git a/csrc/ops.h b/csrc/ops.h
index e08ca40090404e778f65725bb90f59b6ebada5ca..f51a651f54a3bc9c192da4dbb55403a34a60f15c 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -2,6 +2,7 @@
 
 #include <optional>
 #include <torch/library.h>
+#include <tuple>
 
 #include "core/scalar_type.hpp"
 
@@ -280,6 +281,11 @@ void get_cutlass_moe_mm_problem_sizes(
     const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
     std::optional<bool> force_swap_ab = std::nullopt);
 
+void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
+    const torch::Tensor& expert_first_token_offset,
+    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
+    const int64_t n, const int64_t k, const bool swap_ab);
+
 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                   torch::Tensor& problem_sizes1,
                                   torch::Tensor& problem_sizes2,
@@ -316,6 +322,12 @@ void scaled_fp4_experts_quant(
     torch::Tensor const& input_offset_by_experts,
     torch::Tensor const& output_scale_offset_by_experts);
 
+void silu_and_mul_scaled_fp4_experts_quant(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts);
+
 void per_token_group_quant_fp8(const torch::Tensor& input,
                                torch::Tensor& output_q, torch::Tensor& output_s,
                                int64_t group_size, double eps, double fp8_min,
@@ -350,8 +362,9 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
 
 // void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
 
-// void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
-//                              torch::Tensor const& scale);
+// void static_scaled_fp8_quant(
+//     torch::Tensor& out, torch::Tensor const& input, torch::Tensor const& scale,
+//     std::optional<std::tuple<int64_t, int64_t>> group_shape = std::nullopt);
 
 // void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
 //                               torch::Tensor& scale);
diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
index 7539f836ecf379e50efbe9602eb78207bcd13dee..2ea229c47d7ec9d2fb82423b49f253957e5fabdf 100644
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -31,37 +31,6 @@
 
 namespace vllm {
 
-// silu in float32
-__device__ __forceinline__ float silu(float x) {
-  return __fdividef(x, (1.f + __expf(-x)));
-}
-
-__device__ __forceinline__ float2 silu2(float2 x) {
-  return make_float2(silu(x.x), silu(x.y));
-}
-
-template <class Type>
-__inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
-                                                       PackedVec<Type>& vec2) {
-  PackedVec<Type> result;
-  using packed_type = typename TypeConverter<Type>::Type;
-
-#pragma unroll
-  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
-    // silu_mul in float32
-    if constexpr (std::is_same_v<Type, half>) {
-      float2 silu_vec = silu2(__half22float2(vec.elts[i]));
-      result.elts[i] =
-          __float22half2_rn(__fmul2_rn(silu_vec, __half22float2(vec2.elts[i])));
-    } else {
-      float2 silu_vec = silu2(__bfloat1622float2(vec.elts[i]));
-      result.elts[i] = __float22bfloat162_rn(
-          __fmul2_rn(silu_vec, __bfloat1622float2(vec2.elts[i])));
-    }
-  }
-  return result;
-}
-
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
@@ -74,6 +43,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  // Precompute SF layout parameter (constant for entire kernel).
+  int32_t const numKTiles = (numCols + 63) / 64;
+
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
@@ -101,7 +73,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
       auto sf_out =
           cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                              CVT_FP4_NUM_THREADS_PER_SF>(
-              rowIdx, colIdx, numCols, SFout);
+              rowIdx, colIdx, numKTiles, SFout);
 
       out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
                                                      sf_out);
diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
index 674440278383294414438a2572acde32e76eccf7..ae8ef1bf99d6480eb35cd72f31de71d66bf3cde2 100644
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@@ -62,7 +62,9 @@ __global__ void __get_group_gemm_starts(
     ElementSF* a_scales_base_as_int, ElementSF* b_scales_base_as_int,
     ElementAccumulator* alphas_base_as_int, const int32_t* expert_offsets,
     const int32_t* sf_offsets, const int32_t* problem_sizes_as_shapes,
-    const int K, const int N) {
+    int64_t* a_strides, int64_t* b_strides, int64_t* c_strides,
+    const int64_t a_stride_val, const int64_t b_stride_val,
+    const int64_t c_stride_val, const int K, const int N) {
   int64_t expert_id = threadIdx.x;
   if (expert_id >= gridDim.x * blockDim.x) {
     return;
@@ -103,6 +105,11 @@ __global__ void __get_group_gemm_starts(
   // Shape of alpha = [E]
   alpha_offsets[expert_id] = alphas_base_as_int + expert_id;
 
+  // Initialize strides (constant across all experts, avoids separate kernels)
+  a_strides[expert_id] = a_stride_val;
+  b_strides[expert_id] = b_stride_val;
+  c_strides[expert_id] = c_stride_val;
+
   LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
   LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
 
@@ -135,7 +142,11 @@ __global__ void __get_group_gemm_starts(
             static_cast<float*>(alphas.data_ptr()),                           \
             static_cast<int32_t*>(expert_offsets.data_ptr()),                 \
             static_cast<int32_t*>(sf_offsets.data_ptr()),                     \
-            static_cast<int32_t*>(problem_sizes.data_ptr()), K, N);           \
+            static_cast<int32_t*>(problem_sizes.data_ptr()),                  \
+            static_cast<int64_t*>(a_strides.data_ptr()),                      \
+            static_cast<int64_t*>(b_strides.data_ptr()),                      \
+            static_cast<int64_t*>(c_strides.data_ptr()), a_stride_val,        \
+            b_stride_val, c_stride_val, K, N);                                \
   }
 
 template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
@@ -144,6 +155,9 @@ void run_get_group_gemm_starts(
     const torch::Tensor& out_starts, const torch::Tensor& a_scales_starts,
     const torch::Tensor& b_scales_starts, const torch::Tensor& alpha_starts,
     const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
+    const torch::Tensor& a_strides, const torch::Tensor& b_strides,
+    const torch::Tensor& c_strides, int64_t a_stride_val, int64_t b_stride_val,
+    int64_t c_stride_val,
     /*these are used for their base addresses*/
     torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
     torch::Tensor const& out_tensors, torch::Tensor const& a_scales,
@@ -269,17 +283,16 @@ void run_fp4_blockwise_scaled_group_mm_sm100(
   torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
   torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
   torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
-  torch::Tensor c_strides1 =
-      torch::full({num_experts}, output.stride(0), options_int);
-  torch::Tensor a_strides1 =
-      torch::full({num_experts}, a.stride(0) * 2, options_int);
-  torch::Tensor b_strides1 =
-      torch::full({num_experts}, b.stride(1) * 2, options_int);
+  torch::Tensor a_strides1 = torch::empty(num_experts, options_int);
+  torch::Tensor b_strides1 = torch::empty(num_experts, options_int);
+  torch::Tensor c_strides1 = torch::empty(num_experts, options_int);
 
   run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
       a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
-      layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
-      expert_offsets, sf_offsets, problem_sizes, M, N, K);
+      layout_sfa, layout_sfb, a_strides1, b_strides1, c_strides1,
+      a.stride(0) * 2, b.stride(1) * 2, output.stride(0), a, b, output,
+      a_blockscale, b_blockscales, alphas, expert_offsets, sf_offsets,
+      problem_sizes, M, N, K);
 
   // Create an instance of the GEMM
   Gemm gemm_op;
@@ -444,17 +457,16 @@ void run_fp4_blockwise_scaled_group_mm_sm120(
   torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
   torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
   torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
-  torch::Tensor c_strides1 =
-      torch::full({num_experts}, output.stride(0), options_int);
-  torch::Tensor a_strides1 =
-      torch::full({num_experts}, a.stride(0) * 2, options_int);
-  torch::Tensor b_strides1 =
-      torch::full({num_experts}, b.stride(1) * 2, options_int);
+  torch::Tensor a_strides1 = torch::empty(num_experts, options_int);
+  torch::Tensor b_strides1 = torch::empty(num_experts, options_int);
+  torch::Tensor c_strides1 = torch::empty(num_experts, options_int);
 
   run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
       a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
-      layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
-      expert_offsets, sf_offsets, problem_sizes, M, N, K);
+      layout_sfa, layout_sfb, a_strides1, b_strides1, c_strides1,
+      a.stride(0) * 2, b.stride(1) * 2, output.stride(0), a, b, output,
+      a_blockscale, b_blockscales, alphas, expert_offsets, sf_offsets,
+      problem_sizes, M, N, K);
 
   // Create an instance of the GEMM
   Gemm gemm_op;
diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu
index 82c53c2375a31e91dd58e1cefe335088e51e601e..aa573c007b3dfde325a2db6bd52c92fca4c153ed 100644
--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@@ -25,13 +25,18 @@
 #include <cuda_fp8.h>
 #include "dispatch_utils.h"
 
+#include "cuda_utils.h"
 #include "nvfp4_utils.cuh"
 #include "launch_bounds_utils.h"
 
 namespace vllm {
 
+// NVFP4 quantization kernel for experts (low-latency path).
+// When FUSE_SILU_MUL=true, expects input with gate||up layout and fuses
+// SiLU(gate)*up before quantization.
 // Use UE4M3 by default.
-template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
+template <class Type, bool FUSE_SILU_MUL = false, bool UE8M0_SF = false,
+          bool SMALL_NUM_EXPERTS = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
     cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                     float const* SFScale, uint32_t* out, uint32_t* SFout,
@@ -44,8 +49,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  // Precompute SF layout parameter (constant for entire kernel).
+  int32_t const numKTiles = (numCols + 63) / 64;
+
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
+  // When fusing SiLU+Mul, input has gate || up layout (doubled width)
+  int inColsPerRow = FUSE_SILU_MUL ? colsPerRow * 2 : colsPerRow;
 
   // Each global thread processes one element
   for (int globalIdx = tid; globalIdx < numRows * colsPerRow;
@@ -54,13 +64,6 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
     int rowIdx = globalIdx / colsPerRow;
     int colIdx = globalIdx % colsPerRow;
 
-    int64_t inOffset = rowIdx * colsPerRow + colIdx;
-    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-    // Get the output tensor offset.
-    // Same as inOffset because 8 elements are packed into one uint32_t.
-    int64_t outOffset = inOffset;
-    auto& out_pos = out[outOffset];
-
     // Find index within the experts using different strategies based on expert
     // count
     int rowIdx_in_expert = 0;
@@ -107,29 +110,46 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
       }
     }
 
+    // Load input and optionally apply fused SiLU+Mul
+    int64_t inOffset = rowIdx * inColsPerRow + colIdx;
+    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+    PackedVec quant_input;
+    if constexpr (FUSE_SILU_MUL) {
+      PackedVec in_vec_up =
+          reinterpret_cast<PackedVec const*>(in)[inOffset + colsPerRow];
+      quant_input = compute_silu_mul(in_vec, in_vec_up);
+    } else {
+      quant_input = in_vec;
+    }
+
+    // Get the output tensor offset.
+    // Same as inOffset because 8 elements are packed into one uint32_t.
+    int64_t outOffset = rowIdx * colsPerRow + colIdx;
+    auto& out_pos = out[outOffset];
+
     // Get the global scaling factor, which will be applied to the SF.
     // Note SFScale is the same as next GEMM's alpha, which is
     // (448.f / (Alpha_A / 6.f)).
     float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
 
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    // The actual output_scales dim is computed from the padded numCols.
-    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
-    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
     uint32_t* SFout_in_expert =
-        SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
+        SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
 
     auto sf_out =
         cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                            CVT_FP4_NUM_THREADS_PER_SF>(
-            rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+            rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
 
-    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+    out_pos =
+        cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
   }
 }
 
-// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
-template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
+// NVFP4 quantization kernel for LARGE_M_TOPK = true (large m_topk optimized
+// version). When FUSE_SILU_MUL=true, expects input with gate||up layout and
+// fuses SiLU(gate)*up before quantization.
+template <class Type, bool FUSE_SILU_MUL = false, bool UE8M0_SF = false,
+          bool SMALL_NUM_EXPERTS = false>
 __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
     cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                     float const* SFScale, uint32_t* out, uint32_t* SFout,
@@ -140,6 +160,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
+
+  // Precompute SF layout parameter (constant for entire kernel).
+  int32_t const numKTiles = (numCols + 63) / 64;
+
   extern __shared__ uint32_t shared_input_offsets[];
 
   // Load input offsets into shared memory.
@@ -163,6 +187,8 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
 
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
+  // When fusing SiLU+Mul, input has gate || up layout (doubled width)
+  int inColsPerRow = FUSE_SILU_MUL ? colsPerRow * 2 : colsPerRow;
 
   // Each global thread processes one element
   for (int globalIdx = tid; globalIdx < numRows * colsPerRow;
@@ -171,11 +197,6 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
     int rowIdx = globalIdx / colsPerRow;
     int colIdx = globalIdx % colsPerRow;
 
-    int64_t inOffset = rowIdx * colsPerRow + colIdx;
-    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-    int64_t outOffset = inOffset;
-    auto& out_pos = out[outOffset];
-
     // Find expert using binary search for better performance with large m_topk
     int rowIdx_in_expert = 0;
     int expert_idx = 0;
@@ -200,34 +221,43 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
       }
     }
 
+    // Load input and optionally apply fused SiLU+Mul
+    int64_t inOffset = rowIdx * inColsPerRow + colIdx;
+    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+    PackedVec quant_input;
+    if constexpr (FUSE_SILU_MUL) {
+      PackedVec in_vec_up =
+          reinterpret_cast<PackedVec const*>(in)[inOffset + colsPerRow];
+      quant_input = compute_silu_mul(in_vec, in_vec_up);
+    } else {
+      quant_input = in_vec;
+    }
+
+    int64_t outOffset = rowIdx * colsPerRow + colIdx;
+    auto& out_pos = out[outOffset];
+
     float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
 
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
-    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
     uint32_t* SFout_in_expert =
-        SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
+        SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
 
     auto sf_out =
         cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                            CVT_FP4_NUM_THREADS_PER_SF>(
-            rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+            rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
 
-    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+    out_pos =
+        cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
   }
 }
 
-template <typename T>
+template <typename T, bool FUSE_SILU_MUL = false>
 void quant_impl(void* output, void* output_scale, void* input,
                 void* input_global_scale, void* input_offset_by_experts,
                 void* output_scale_offset_by_experts, int m_topk, int k,
                 int n_experts, cudaStream_t stream) {
-  // TODO: this multiProcessorCount should be cached.
-  int device;
-  cudaGetDevice(&device);
-  int multiProcessorCount;
-  cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount,
-                         device);
+  int multiProcessorCount =
+      get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
 
   // Grid, Block size.
   // Each thread converts 8 values.
@@ -249,7 +279,7 @@ void quant_impl(void* output, void* output_scale, void* input,
   if (blockRepeat > 1) {
     size_t shared_mem_size = (n_experts + 1) * sizeof(uint32_t);
     if (n_experts >= 4) {
-      cvt_fp16_to_fp4<T, false, false>
+      cvt_fp16_to_fp4<T, FUSE_SILU_MUL, false, false>
           <<<grid, block, shared_mem_size, stream>>>(
               m_topk, k, reinterpret_cast<T*>(input),
               reinterpret_cast<float*>(input_global_scale),
@@ -259,34 +289,37 @@ void quant_impl(void* output, void* output_scale, void* input,
               reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
               n_experts);
     } else {
-      cvt_fp16_to_fp4<T, false, true><<<grid, block, shared_mem_size, stream>>>(
-          m_topk, k, reinterpret_cast<T*>(input),
-          reinterpret_cast<float*>(input_global_scale),
-          reinterpret_cast<uint32_t*>(output),
-          reinterpret_cast<uint32_t*>(output_scale),
-          reinterpret_cast<uint32_t*>(input_offset_by_experts),
-          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
-          n_experts);
+      cvt_fp16_to_fp4<T, FUSE_SILU_MUL, false, true>
+          <<<grid, block, shared_mem_size, stream>>>(
+              m_topk, k, reinterpret_cast<T*>(input),
+              reinterpret_cast<float*>(input_global_scale),
+              reinterpret_cast<uint32_t*>(output),
+              reinterpret_cast<uint32_t*>(output_scale),
+              reinterpret_cast<uint32_t*>(input_offset_by_experts),
+              reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+              n_experts);
     }
   } else {
     if (n_experts >= 16) {
-      cvt_fp16_to_fp4<T, false, false><<<grid, block, 0, stream>>>(
-          m_topk, k, reinterpret_cast<T*>(input),
-          reinterpret_cast<float*>(input_global_scale),
-          reinterpret_cast<uint32_t*>(output),
-          reinterpret_cast<uint32_t*>(output_scale),
-          reinterpret_cast<uint32_t*>(input_offset_by_experts),
-          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
-          n_experts, /* bool low_latency */ true);
+      cvt_fp16_to_fp4<T, FUSE_SILU_MUL, false, false>
+          <<<grid, block, 0, stream>>>(
+              m_topk, k, reinterpret_cast<T*>(input),
+              reinterpret_cast<float*>(input_global_scale),
+              reinterpret_cast<uint32_t*>(output),
+              reinterpret_cast<uint32_t*>(output_scale),
+              reinterpret_cast<uint32_t*>(input_offset_by_experts),
+              reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+              n_experts, /* bool low_latency */ true);
     } else {
-      cvt_fp16_to_fp4<T, false, true><<<grid, block, 0, stream>>>(
-          m_topk, k, reinterpret_cast<T*>(input),
-          reinterpret_cast<float*>(input_global_scale),
-          reinterpret_cast<uint32_t*>(output),
-          reinterpret_cast<uint32_t*>(output_scale),
-          reinterpret_cast<uint32_t*>(input_offset_by_experts),
-          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
-          n_experts, /* bool low_latency */ true);
+      cvt_fp16_to_fp4<T, FUSE_SILU_MUL, false, true>
+          <<<grid, block, 0, stream>>>(
+              m_topk, k, reinterpret_cast<T*>(input),
+              reinterpret_cast<float*>(input_global_scale),
+              reinterpret_cast<uint32_t*>(output),
+              reinterpret_cast<uint32_t*>(output_scale),
+              reinterpret_cast<uint32_t*>(input_offset_by_experts),
+              reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+              n_experts, /* bool low_latency */ true);
     }
   }
 }
@@ -307,19 +340,19 @@ constexpr auto FLOAT = at::ScalarType::Float;
 constexpr auto INT = at::ScalarType::Int;
 constexpr auto UINT8 = at::ScalarType::Byte;
 
-void scaled_fp4_experts_quant_sm1xxa(
-    torch::Tensor& output, torch::Tensor& output_scale,
+// Common validation for fp4 experts quantization entry points.
+static void validate_fp4_experts_quant_inputs(
+    torch::Tensor const& output, torch::Tensor const& output_scale,
     torch::Tensor const& input, torch::Tensor const& input_global_scale,
     torch::Tensor const& input_offset_by_experts,
-    torch::Tensor const& output_scale_offset_by_experts) {
-  CHECK_INPUT(output, "output must be a CUDA tensor");
-  CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
-  CHECK_INPUT(input, "input must be a CUDA tensor");
-  CHECK_INPUT(input_global_scale, "input_global_scale must be a CUDA tensor");
-  CHECK_INPUT(input_offset_by_experts,
-              "input_offset_by_experts must be a CUDA tensor");
-  CHECK_INPUT(output_scale_offset_by_experts,
-              "output_scale_offset_by_experts must be a CUDA tensor");
+    torch::Tensor const& output_scale_offset_by_experts, int64_t m_topk,
+    int64_t k) {
+  CHECK_INPUT(output, "output");
+  CHECK_INPUT(output_scale, "output_scale");
+  CHECK_INPUT(input, "input");
+  CHECK_INPUT(input_global_scale, "input_global_scale");
+  CHECK_INPUT(input_offset_by_experts, "input_offset_by_experts");
+  CHECK_INPUT(output_scale_offset_by_experts, "output_scale_offset_by_experts");
 
   TORCH_CHECK(output.dim() == 2);
   TORCH_CHECK(output_scale.dim() == 2);
@@ -338,8 +371,6 @@ void scaled_fp4_experts_quant_sm1xxa(
   TORCH_CHECK(output_scale.scalar_type() == INT);
 
   const int BLOCK_SIZE = 16;
-  auto m_topk = input.size(0);
-  auto k = input.size(1);
   TORCH_CHECK(k % BLOCK_SIZE == 0, "k must be a multiple of 16");
   auto n_experts = input_global_scale.size(0);
   TORCH_CHECK(input_offset_by_experts.size(0) == n_experts + 1);
@@ -351,7 +382,21 @@ void scaled_fp4_experts_quant_sm1xxa(
   int padded_k = (scales_k + (4 - 1)) / 4 * 4;
   // 4 means 4 fp8 values are packed into one int32
   TORCH_CHECK(output_scale.size(1) * 4 == padded_k);
+}
 
+void scaled_fp4_experts_quant_sm1xxa(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+  auto m_topk = input.size(0);
+  auto k = input.size(1);
+
+  validate_fp4_experts_quant_inputs(output, output_scale, input,
+                                    input_global_scale, input_offset_by_experts,
+                                    output_scale_offset_by_experts, m_topk, k);
+
+  auto n_experts = input_global_scale.size(0);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream =
       at::cuda::getCurrentCUDAStream(input.get_device());
@@ -359,7 +404,38 @@ void scaled_fp4_experts_quant_sm1xxa(
   VLLM_DISPATCH_HALF_TYPES(
       input.scalar_type(), "nvfp4_experts_quant_kernel", [&] {
         using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
-        vllm::quant_impl<cuda_type>(
+        vllm::quant_impl<cuda_type, /*FUSE_SILU_MUL=*/false>(
+            output.data_ptr(), output_scale.data_ptr(), input.data_ptr(),
+            input_global_scale.data_ptr(), input_offset_by_experts.data_ptr(),
+            output_scale_offset_by_experts.data_ptr(), m_topk, k, n_experts,
+            stream);
+      });
+}
+
+void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+  auto m_topk = input.size(0);
+  // Input has gate || up layout, so k = input.size(1) / 2
+  auto k_times_2 = input.size(1);
+  TORCH_CHECK(k_times_2 % 2 == 0, "input width must be even (gate || up)");
+  auto k = k_times_2 / 2;
+
+  validate_fp4_experts_quant_inputs(output, output_scale, input,
+                                    input_global_scale, input_offset_by_experts,
+                                    output_scale_offset_by_experts, m_topk, k);
+
+  auto n_experts = input_global_scale.size(0);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream =
+      at::cuda::getCurrentCUDAStream(input.get_device());
+
+  VLLM_DISPATCH_HALF_TYPES(
+      input.scalar_type(), "silu_mul_nvfp4_experts_quant_kernel", [&] {
+        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+        vllm::quant_impl<cuda_type, /*FUSE_SILU_MUL=*/true>(
             output.data_ptr(), output_scale.data_ptr(), input.data_ptr(),
             input_global_scale.data_ptr(), input_offset_by_experts.data_ptr(),
             output_scale_offset_by_experts.data_ptr(), m_topk, k, n_experts,
diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu
index fb6d22f035b99d8fef3e4bfa54d080fc258b0d8c..25e0ba8486c7e6098b7d5e71558bd3d4c4601c1f 100644
--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -41,6 +41,15 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,
                                      torch::Tensor& input_sf);
 #endif
 
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts);
+#endif
+
 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                       torch::Tensor& output_sf, torch::Tensor const& input_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
@@ -74,3 +83,18 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf,
   TORCH_CHECK_NOT_IMPLEMENTED(
       false, "No compiled silu_and_mul nvfp4 quantization kernel");
 }
+
+void silu_and_mul_scaled_fp4_experts_quant(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  return silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
+      output, output_scale, input, input_global_scale, input_offset_by_experts,
+      output_scale_offset_by_experts);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "No compiled silu_and_mul nvfp4 experts quantization kernel");
+}
diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
index 6d69852bb4e4f9c639fd6ad1ffad197555d9e54c..8e38deeb6607fb8da8da323aee03817c502c2cad 100644
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -35,7 +35,13 @@ template <typename Int>
 __host__ __device__ inline Int round_up(Int x, Int y) {
   static_assert(std::is_integral_v<Int>,
                 "round_up argument must be integral type");
-  return (x + y - 1) / y * y;
+  return ((x + y - 1) / y) * y;
+}
+
+// Compute effective rows for grid configuration with swizzled SF layouts.
+inline int computeEffectiveRows(int m) {
+  constexpr int ROW_TILE = 128;
+  return round_up(m, ROW_TILE);
 }
 
 // Use UE4M3 by default.
@@ -49,81 +55,57 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  // Precompute SF layout parameter (constant for entire kernel).
+  int32_t const numKTiles = (numCols + 63) / 64;
+
   int sf_m = round_up<int>(numRows, 128);
   int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
   int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
-  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
-    // Each thread writes 4 uint32_t elements.
-    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
-         col += blockDim.x * 4) {
-      SFout[row * sf_n_int + col] = 0x00;
-    }
-  }
+  int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
 
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
   float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
 
-  // Input tensor row/col loops.
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
+  // Iterate over all rows and cols including padded ones -
+  //  ensures we visit every single scale factor address to initialize it.
+  for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
+    for (int colIdx = threadIdx.x;
+         colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
          colIdx += blockDim.x) {
+      int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
+
+      PackedVec in_vec;
       int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-      // Get the output tensor offset.
-      // Same as inOffset because 8 elements are packed into one uint32_t.
-      int64_t outOffset = inOffset;
-      auto& out_pos = out[outOffset];
+
+      // If we are outside valid rows OR outside valid columns -> Use Zeros
+      if (rowIdx >= numRows || elem_idx >= numCols) {
+        memset(&in_vec, 0, sizeof(PackedVec));
+
+      } else {
+        // Valid Region: Load actual data
+        in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      }
 
       auto sf_out =
           cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                              CVT_FP4_NUM_THREADS_PER_SF>(
-              rowIdx, colIdx, numCols, SFout);
+              rowIdx, colIdx, numKTiles, SFout);
 
-      out_pos =
+      auto out_val =
           cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
-    }
-  }
-}
 
-template <typename T>
-void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
-                           int64_t* output, int32_t* SFOuput, bool useUE8M0,
-                           int multiProcessorCount, cudaStream_t stream) {
-  // Grid, Block size.
-  // Each thread converts 8 values.
-  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
-  // Get number of blocks per SM
-  int const numBlocksPerSM =
-      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
-  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
-
-  // Launch the cvt kernel.
-  if (useUE8M0) {
-    cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
-        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
-        reinterpret_cast<uint32_t*>(SFOuput));
-  } else {
-    cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
-        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
-        reinterpret_cast<uint32_t*>(SFOuput));
+      // We do NOT write output for padding because the 'out' tensor is not
+      // padded.
+      if (rowIdx < numRows && elem_idx < numCols) {
+        // Same as inOffset because 8 elements are packed into one uint32_t.
+        out[inOffset] = out_val;
+      }
+    }
   }
 }
 
-// Instantiate the function.
-template void invokeFP4Quantization(int m, int n, half const* input,
-                                    float const* SFScale, int64_t* output,
-                                    int32_t* SFOuput, bool useUE8M0,
-                                    int multiProcessorCount,
-                                    cudaStream_t stream);
-
-template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
-                                    float const* SFScale, int64_t* output,
-                                    int32_t* SFOuput, bool useUE8M0,
-                                    int multiProcessorCount,
-                                    cudaStream_t stream);
-
 }  // namespace vllm
 
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
@@ -147,13 +129,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
 
-  // We don't support e8m0 scales at this moment.
-  bool useUE8M0 = false;
+  // Grid, Block size. Each thread converts 8 values.
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
+  int const numBlocksPerSM =
+      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  int effectiveRows = vllm::computeEffectiveRows(m);
+  dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
 
   VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
     using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
     auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
-    vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr,
-                                sf_out, useUE8M0, multiProcessorCount, stream);
+    // NOTE: We don't support e8m0 scales at this moment.
+    vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
+        m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
+        reinterpret_cast<uint32_t*>(sf_out));
   });
-}
+}
\ No newline at end of file
diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh
index 48e4959de979378e09eba59924e8bc3ff2ad8e3b..7082ad684bc3e651d38948c8827de109439b367c 100644
--- a/csrc/quantization/fp4/nvfp4_utils.cuh
+++ b/csrc/quantization/fp4/nvfp4_utils.cuh
@@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
   return b;
 }
 
+// Compute SF output offset for swizzled tensor core layout.
+// SF layout: [numMTiles, numKTiles, 32, 4, 4]
+// Caller must precompute: numKTiles = (numCols + 63) / 64
 template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
-                                                       int numCols,
-                                                       SFType* SFout) {
+__device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(
+    int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) {
   static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
                 CVT_FP4_NUM_THREADS_PER_SF == 2);
 
   // One pair of threads write one SF to global memory.
   // TODO: stage through smem for packed STG.32
   // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    // SF vector index (16 elements share one SF in the K dimension).
-    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
-    int32_t mIdx = rowIdx;
-
-    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
-    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
-
-    int32_t mTileIdx = mIdx / (32 * 4);
-    // SF vector size 16.
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numKTiles = (numCols + factor - 1) / factor;
-    int64_t mTileStride = numKTiles * 32 * 4 * 4;
-
-    int32_t kTileIdx = (kIdx / 4);
-    int64_t kTileStride = 32 * 4 * 4;
-
-    // M tile layout [32, 4] is column-major.
-    int32_t outerMIdx = (mIdx % 32);
-    int64_t outerMStride = 4 * 4;
-
-    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
-    int64_t innerMStride = 4;
-
-    int32_t innerKIdx = (kIdx % 4);
-    int64_t innerKStride = 1;
-
-    // Compute the global offset.
-    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
-                       outerMIdx * outerMStride + innerMIdx * innerMStride +
-                       innerKIdx * innerKStride;
-
-    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
+  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) {
+    return nullptr;
   }
-  return nullptr;
+
+  // SF vector index (16 elements share one SF in the K dimension).
+  int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+  int32_t mIdx = rowIdx;
+
+  // Decompose indices using bitwise ops (all divisors are powers of 2).
+  // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
+  int32_t mTileIdx = mIdx >> 7;         // mIdx / 128
+  int32_t outerMIdx = mIdx & 31;        // mIdx % 32
+  int32_t innerMIdx = (mIdx >> 5) & 3;  // (mIdx / 32) % 4
+  int32_t kTileIdx = kIdx >> 2;         // kIdx / 4
+  int32_t innerKIdx = kIdx & 3;         // kIdx % 4
+
+  // Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 +
+  //                           outerMIdx * 16 + innerMIdx * 4 + innerKIdx
+  // Use bitwise OR for non-overlapping lower bits.
+  int64_t SFOffset = (static_cast<int64_t>(mTileIdx) * numKTiles + kTileIdx)
+                         << 9 |
+                     (outerMIdx << 4) | (innerMIdx << 2) | innerKIdx;
+
+  return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
 }
 
 // Quantizes the provided PackedVec into the uint32_t output
@@ -248,4 +239,34 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
   return e2m1Vec;
 }
 
+// silu in float32
+__device__ __forceinline__ float silu(float x) {
+  return __fdividef(x, (1.f + __expf(-x)));
+}
+
+__device__ __forceinline__ float2 silu2(float2 x) {
+  return make_float2(silu(x.x), silu(x.y));
+}
+
+template <class Type>
+__inline__ __device__ PackedVec<Type> compute_silu_mul(
+    const PackedVec<Type>& x_vec, const PackedVec<Type>& y_vec) {
+  PackedVec<Type> result;
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
+    // silu_mul in float32
+    if constexpr (std::is_same_v<Type, half>) {
+      float2 silu_vec = silu2(__half22float2(x_vec.elts[i]));
+      result.elts[i] = __float22half2_rn(
+          __fmul2_rn(silu_vec, __half22float2(y_vec.elts[i])));
+    } else {
+      float2 silu_vec = silu2(__bfloat1622float2(x_vec.elts[i]));
+      result.elts[i] = __float22bfloat162_rn(
+          __fmul2_rn(silu_vec, __bfloat1622float2(y_vec.elts[i])));
+    }
+  }
+  return result;
+}
+
 }  // namespace vllm
diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu
index 2f6bc7903b4546c2d0e26af01813ae2c8b65554d..5ce6229bbd617a2c908eda7a0e9dcb48722a9222 100644
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -235,11 +235,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
   // Zero output
   if (n >= size_n) return;
 
-  if (blockIdx.z == 0) {
-    for (int m = 0; m < m_count; m++)
-      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
-  }
-
   __syncthreads();
 
   // Find initial group
@@ -374,11 +369,6 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
   // Zero output
   if (n >= size_n) return;
 
-  if (blockIdx.z == 0) {
-    for (int m = 0; m < m_count; m++)
-      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
-  }
-
   __syncthreads();
 
   // Find initial group
@@ -496,11 +486,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
   // Zero output
   if (n >= size_n) return;
 
-  if (blockIdx.z == 0) {
-    for (int m = 0; m < m_count; m++)
-      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
-  }
-
   __syncthreads();
 
   // Find initial group
@@ -625,11 +610,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
   // Zero output
   if (n >= size_n) return;
 
-  if (blockIdx.z == 0) {
-    for (int m = 0; m < m_count; m++)
-      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
-  }
-
   __syncthreads();
 
   // Find initial group
@@ -1226,9 +1206,6 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
         __halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
   }
 
-  if (blockIdx.z == 0) {
-    for (int m = 0; m < b_end; m++) mul[(b + m) * width + w] = __int2half_rn(0);
-  }
   __syncthreads();
 
   int i = width * h + w;
@@ -1321,9 +1298,6 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
     }
   }
 
-  if (blockIdx.z == 0) {
-    for (int m = 0; m < b_end; m++) mul[(b + m) * width + w] = __int2half_rn(0);
-  }
   __syncthreads();
 
   int i = width * h + w;
@@ -1860,7 +1834,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         bool use_exllama, bool use_v2_format, int64_t bit) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
   auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
-  at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
+  at::Tensor c = torch::zeros({a.size(0), b_q_weight.size(1)}, options);
   at::Tensor temp_dq = torch::empty(
       {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);
 
diff --git a/csrc/quantization/gptq_marlin/.gitignore b/csrc/quantization/gptq_marlin/.gitignore
index ba805f9250ecea707563336138067efe13b7b8b5..7dc482a8946605d91d192f43b2fffae518397f59 100644
--- a/csrc/quantization/gptq_marlin/.gitignore
+++ b/csrc/quantization/gptq_marlin/.gitignore
@@ -1,2 +1,3 @@
 sm*_kernel_*.cu
 kernel_selector.h
+kernel_*.cu
diff --git a/csrc/quantization/gptq_marlin/dequant.h b/csrc/quantization/gptq_marlin/dequant.h
index 26b8d40368aa959af3a78013c56e869c50f86f60..edd97dbfcd8e58b4d6dee3f695792816331510e5 100644
--- a/csrc/quantization/gptq_marlin/dequant.h
+++ b/csrc/quantization/gptq_marlin/dequant.h
@@ -67,7 +67,7 @@ where `scale_factor * multiplier` can be computed at weight loading.
 
 namespace MARLIN_NAMESPACE_NAME {
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 750
 // Lookup-table based 3-input logical operation; explicitly used for
 // dequantization as the compiler does not seem to automatically recognize it in
 // all cases.
diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py
index 27ef7271ba41cfbcdbaf9d5b44b5d25a9779a570..24866fc5cd5463e16cfc2460628d2327f6ff99a1 100644
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@@ -10,6 +10,8 @@ import jinja2
 
 ARCHS = []
 SUPPORT_FP8 = False
+SUPPORT_SM75 = False
+SUPPORT_SM80 = False
 for arch in sys.argv[1].split(","):
     arch = arch[: arch.index(".") + 2].replace(".", "")
     arch = int(arch)
@@ -19,6 +21,10 @@ for arch in sys.argv[1].split(","):
     # with FP16 MMA, so it cannot achieve any acceleration.
     if arch in [89, 120]:
         SUPPORT_FP8 = True
+    if arch >= 80:
+        SUPPORT_SM80 = True
+    if arch == 75:
+        SUPPORT_SM75 = True
 
 FILE_HEAD_COMMENT = """
 // auto generated by generate_kernels.py
@@ -166,6 +172,7 @@ def remove_old_kernels():
 
 def generate_new_kernels():
     result_dict = {}
+    sm_75_result_dict = {}
 
     for quant_config in QUANT_CONFIGS:
         c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
@@ -184,6 +191,8 @@ def generate_new_kernels():
             s_type = quant_config.get("s_type", c_type)
             if (a_type, b_type, c_type) not in result_dict:
                 result_dict[(a_type, b_type, c_type)] = []
+                if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16":
+                    sm_75_result_dict[(a_type, b_type, c_type)] = []
 
             for group_blocks, m_blocks, thread_configs in itertools.product(
                 all_group_blocks, all_m_blocks, all_thread_configs
@@ -207,78 +216,89 @@ def generate_new_kernels():
                     "thread_k_blocks": thread_k // 16,
                     "thread_n_blocks": thread_n // 16,
                     "m_block_size_8": "true" if m_blocks == 0.5 else "false",
-                    "stages": "pipe_stages",
+                    "stages": 4,
                     "group_blocks": group_blocks,
                     "is_zp_float": "true" if is_zp_float else "false",
                 }
 
-                result_dict[(a_type, b_type, c_type)].append(config)
+                if SUPPORT_SM80:
+                    result_dict[(a_type, b_type, c_type)].append(config)
+                if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75:
+                    config_sm75 = config.copy()
+                    config_sm75["stages"] = 2
+                    sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75)
 
     kernel_selector_str = FILE_HEAD_COMMENT
 
-    for (a_type, b_type, c_type), config_list in result_dict.items():
-        all_template_str_list = []
-        for config in config_list:
-            s_type = config["s_type"]
-            template_str = jinja2.Template(TEMPLATE).render(
-                a_type_id=f"vllm::{a_type}.id()",
-                b_type_id=f"vllm::{b_type}.id()",
-                c_type_id=f"vllm::{c_type}.id()",
-                s_type_id=f"vllm::{s_type}.id()",
-                **config,
-            )
-            all_template_str_list.append(template_str)
-
-            conditions = [
-                f"a_type == vllm::{a_type}",
-                f"b_type == vllm::{b_type}",
-                f"c_type == vllm::{c_type}",
-                f"s_type == vllm::{s_type}",
-                f"threads == {config['threads']}",
-                f"thread_m_blocks == {config['thread_m_blocks']}",
-                f"thread_n_blocks == {config['thread_n_blocks']}",
-                f"thread_k_blocks == {config['thread_k_blocks']}",
-                f"m_block_size_8 == {config['m_block_size_8']}",
-                f"group_blocks == {config['group_blocks']}",
-                f"is_zp_float == {config['is_zp_float']}",
-            ]
-            conditions = " && ".join(conditions)
-
-            if kernel_selector_str == FILE_HEAD_COMMENT:
-                kernel_selector_str += f"if ({conditions})\n  kernel = "
-            else:
-                kernel_selector_str += f"else if ({conditions})\n  kernel = "
-
-            kernel_template2 = (
-                "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
-                "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
-                "{{thread_n_blocks}}, {{thread_k_blocks}}, "
-                "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
-                "{{is_zp_float}}>;"
-            )
-
-            kernel_selector_str += (
-                jinja2.Template(kernel_template2).render(
+    for result_dict_tmp in [result_dict, sm_75_result_dict]:
+        for (a_type, b_type, c_type), config_list in result_dict_tmp.items():
+            all_template_str_list = []
+            if not config_list:
+                continue
+            for config in config_list:
+                s_type = config["s_type"]
+                template_str = jinja2.Template(TEMPLATE).render(
                     a_type_id=f"vllm::{a_type}.id()",
                     b_type_id=f"vllm::{b_type}.id()",
                     c_type_id=f"vllm::{c_type}.id()",
                     s_type_id=f"vllm::{s_type}.id()",
                     **config,
                 )
-                + "\n"
-            )
+                all_template_str_list.append(template_str)
+
+                conditions = [
+                    f"a_type == vllm::{a_type}",
+                    f"b_type == vllm::{b_type}",
+                    f"c_type == vllm::{c_type}",
+                    f"s_type == vllm::{s_type}",
+                    f"threads == {config['threads']}",
+                    f"thread_m_blocks == {config['thread_m_blocks']}",
+                    f"thread_n_blocks == {config['thread_n_blocks']}",
+                    f"thread_k_blocks == {config['thread_k_blocks']}",
+                    f"m_block_size_8 == {config['m_block_size_8']}",
+                    f"stages == {config['stages']}",
+                    f"group_blocks == {config['group_blocks']}",
+                    f"is_zp_float == {config['is_zp_float']}",
+                ]
+                conditions = " && ".join(conditions)
+
+                if kernel_selector_str == FILE_HEAD_COMMENT:
+                    kernel_selector_str += f"if ({conditions})\n  kernel = "
+                else:
+                    kernel_selector_str += f"else if ({conditions})\n  kernel = "
+
+                kernel_template2 = (
+                    "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
+                    "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
+                    "{{thread_n_blocks}}, {{thread_k_blocks}}, "
+                    "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
+                    "{{is_zp_float}}>;"
+                )
 
-        file_content = FILE_HEAD + "\n\n"
-        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
-        if a_type == "kFE4M3fn":
-            filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
-        else:
-            filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+                kernel_selector_str += (
+                    jinja2.Template(kernel_template2).render(
+                        a_type_id=f"vllm::{a_type}.id()",
+                        b_type_id=f"vllm::{b_type}.id()",
+                        c_type_id=f"vllm::{c_type}.id()",
+                        s_type_id=f"vllm::{s_type}.id()",
+                        **config,
+                    )
+                    + "\n"
+                )
+
+            file_content = FILE_HEAD + "\n\n"
+            file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+            if a_type == "kFE4M3fn":
+                filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+            elif result_dict_tmp is sm_75_result_dict:
+                filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+            else:
+                filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
 
-        filename = filename.lower()
+            filename = filename.lower()
 
-        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
-            f.write(file_content)
+            with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+                f.write(file_content)
 
     if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
         kernel_selector_str += (
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
index 28ff06559a98a254c325a14ccbd6b550fb6134f5..77f319d53bc52ec0f8bc84327db06147494b6e37 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -37,7 +37,7 @@ __global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
 
 using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
 
 __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                     int const* __restrict__ perm_int_ptr,
@@ -148,7 +148,7 @@ typedef struct {
 
 int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
                           int prob_n, int prob_k, int num_bits, int group_size,
-                          bool has_act_order, bool is_k_full) {
+                          bool has_act_order, bool is_k_full, int stages) {
   bool cache_scales_chunk = has_act_order && !is_k_full;
 
   int tb_n = th_config.thread_n;
@@ -166,28 +166,29 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
 
   if (cache_scales_chunk) {
     int load_groups =
-        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
+        tb_groups * stages * 2;          // Chunk size is 2x pipeline over dim K
     load_groups = max(load_groups, 32);  // We load at least 32 scale groups
     return load_groups * tb_n * 2;
   } else {
     int tb_scales = tb_groups * tb_n * 2;
 
-    return tb_scales * pipe_stages;
+    return tb_scales * stages;
   }
 }
 
 int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
                           int prob_m, int prob_n, int prob_k, int num_bits,
                           int group_size, bool has_act_order, bool is_k_full,
-                          int has_zp, int is_zp_float) {
+                          int has_zp, bool is_zp_float, bool is_a_8bit,
+                          int stages) {
   int pack_factor = 32 / num_bits;
 
   // Get B size
   int tb_k = th_config.thread_k;
   int tb_n = th_config.thread_n;
   int tb_m = thread_m_blocks * 16;
-  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
-  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
+  int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
+  int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4;
   int sh_red_size = tb_m * (tb_n + 8) * 2;
   int sh_bias_size = tb_n * 2;
   int tmp_size =
@@ -196,8 +197,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
 
   int sh_s_size =
       get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
-                            group_size, has_act_order, is_k_full);
-  int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
+                            group_size, has_act_order, is_k_full, stages);
+  int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0;
   int sh_zp_size = 0;
   if (has_zp) {
     if (is_zp_float)
@@ -217,7 +218,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
 bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
                      int prob_m, int prob_n, int prob_k, int num_bits,
                      int group_size, bool has_act_order, bool is_k_full,
-                     int has_zp, int is_zp_float, int max_shared_mem) {
+                     int has_zp, bool is_zp_float, bool is_a_8bit, int stages,
+                     int max_shared_mem) {
   // Sanity
   if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
       th_config.num_threads == -1) {
@@ -242,7 +244,7 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
   // Check that pipeline fits into cache
   int cache_size = get_kernel_cache_size(
       th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
-      has_act_order, is_k_full, has_zp, is_zp_float);
+      has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
   return cache_size <= max_shared_mem;
 }
 
@@ -251,7 +253,7 @@ MarlinFuncPtr get_marlin_kernel(
     const vllm::ScalarType c_type, const vllm::ScalarType s_type,
     int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
     bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
-    int threads, bool is_zp_float) {
+    int threads, bool is_zp_float, int stages) {
   int num_bits = b_type.size_bits();
   auto kernel = MarlinDefault;
 
@@ -265,7 +267,8 @@ exec_config_t determine_exec_config(
     const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
     int prob_n, int prob_k, int thread_m_blocks, bool m_block_size_8,
     int num_bits, int group_size, bool has_act_order, bool is_k_full,
-    bool has_zp, bool is_zp_float, int max_shared_mem, int sms) {
+    bool has_zp, bool is_zp_float, int is_a_8bit, int stages,
+    int max_shared_mem, int sms) {
   exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
   thread_config_t* thread_configs = thread_m_blocks > 1
                                         ? large_batch_thread_configs
@@ -280,13 +283,15 @@ exec_config_t determine_exec_config(
 
     if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k,
                          num_bits, group_size, has_act_order, is_k_full, has_zp,
-                         is_zp_float, max_shared_mem - 512)) {
+                         is_zp_float, is_a_8bit, stages,
+                         max_shared_mem - 512)) {
       continue;
     }
 
-    int cache_size = get_kernel_cache_size(
-        th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits,
-        group_size, has_act_order, is_k_full, has_zp, is_zp_float);
+    int cache_size = get_kernel_cache_size(th_config, thread_m_blocks, prob_m,
+                                           prob_n, prob_k, num_bits, group_size,
+                                           has_act_order, is_k_full, has_zp,
+                                           is_zp_float, is_a_8bit, stages);
 
     int group_blocks = 0;
     if (!has_act_order) {
@@ -297,14 +302,10 @@ exec_config_t determine_exec_config(
         get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
                           th_config.thread_n / 16, th_config.thread_k / 16,
                           m_block_size_8, has_act_order, has_zp, group_blocks,
-                          th_config.num_threads, is_zp_float);
+                          th_config.num_threads, is_zp_float, stages);
 
     if (kernel == MarlinDefault) continue;
 
-    // int m_tiles = div_ceil(prob_m, thread_m_blocks * 16);
-    // int n_tiles = prob_n / th_config.thread_n;
-    // int k_tiles = prob_k / th_config.thread_k;
-
     return {1, th_config};
   }
 
@@ -321,6 +322,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
                int group_size, int dev, cudaStream_t stream, int thread_k_init,
                int thread_n_init, int sms, bool use_atomic_add,
                bool use_fp32_reduce, bool is_zp_float) {
+  bool is_a_8bit = a_type.size_bits() == 8;
   TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
               ", ", prob_n, ", ", prob_k, "]");
 
@@ -389,8 +391,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
                          dev);
   cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
                          dev);
-  TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
-              "marlin kernel only support Ampere or newer GPUs.");
+  TORCH_CHECK(major_capability * 10 + minor_capability >= 75,
+              "marlin kernel only support Turing or newer GPUs.");
+  int stages = 4;
+  if (major_capability == 7 && minor_capability == 5) {
+    stages = 2;
+    TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8,
+                "Turing only support FP16 or INT8 activation.");
+  }
   if (a_type == vllm::kFE4M3fn) {
     TORCH_CHECK(
         major_capability * 10 + minor_capability == 89 ||
@@ -431,7 +439,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
       exec_cfg = determine_exec_config(
           a_type, b_type, c_type, s_type, prob_m_split, prob_n, prob_k,
           thread_m_blocks, m_block_size_8, num_bits, group_size, has_act_order,
-          is_k_full, has_zp, is_zp_float, max_shared_mem, sms);
+          is_k_full, has_zp, is_zp_float, is_a_8bit, stages, max_shared_mem,
+          sms);
       thread_tfg = exec_cfg.tb_cfg;
       if (thread_tfg.thread_n != -1) {
         if (prob_n / thread_tfg.thread_n *
@@ -440,7 +449,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
           if (is_valid_config({128, 64, 128}, thread_m_blocks, prob_m_split,
                               prob_n, prob_k, num_bits, group_size,
                               has_act_order, is_k_full, has_zp, is_zp_float,
-                              max_shared_mem_new)) {
+                              is_a_8bit, stages, max_shared_mem_new)) {
             thread_tfg = {128, 64, 128};
             exec_cfg = {1, thread_tfg};
           }
@@ -466,7 +475,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
     TORCH_CHECK(
         is_valid_config(thread_tfg, thread_m_blocks, prob_m_split, prob_n,
                         prob_k, num_bits, group_size, has_act_order, is_k_full,
-                        has_zp, is_zp_float, max_shared_mem_new),
+                        has_zp, is_zp_float, is_a_8bit, stages,
+                        max_shared_mem_new),
         "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
         ", thread_k = ", thread_tfg.thread_k,
         ", thread_n = ", thread_tfg.thread_n,
@@ -475,12 +485,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
         ", prob_m_split = ", prob_m_split, ", group_size = ", group_size,
         ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
         ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
-        ", max_shared_mem_new = ", max_shared_mem_new);
+        ", stages = ", stages, ", max_shared_mem_new = ", max_shared_mem_new);
 
     auto kernel = get_marlin_kernel(
         a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
         thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
-        num_threads, is_zp_float);
+        num_threads, is_zp_float, stages);
 
     if (kernel == MarlinDefault) {
       TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
diff --git a/csrc/quantization/gptq_marlin/marlin.cuh b/csrc/quantization/gptq_marlin/marlin.cuh
index 2505e221322dde60ad749d37dd070ac25250e648..33fe52f605b4238205996352cceba2455022629a 100644
--- a/csrc/quantization/gptq_marlin/marlin.cuh
+++ b/csrc/quantization/gptq_marlin/marlin.cuh
@@ -1,17 +1,19 @@
 #pragma once
 
-#include <torch/all.h>
+#ifndef _marlin_cuh
+  #define _marlin_cuh
+  #include <torch/all.h>
 
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#include <iostream>
+  #include <ATen/cuda/CUDAContext.h>
+  #include <c10/cuda/CUDAGuard.h>
+  #include <cuda.h>
+  #include <cuda_fp16.h>
+  #include <cuda_runtime.h>
+  #include <iostream>
 
-#ifndef MARLIN_NAMESPACE_NAME
-  #define MARLIN_NAMESPACE_NAME marlin
-#endif
+  #ifndef MARLIN_NAMESPACE_NAME
+    #define MARLIN_NAMESPACE_NAME marlin
+  #endif
 
 namespace MARLIN_NAMESPACE_NAME {
 
@@ -51,9 +53,51 @@ using I4 = Vec<int, 4>;
 
 constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-// No support for async
-#else
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+__device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  if (pred) {
+    reinterpret_cast<int32_t*>(smem_ptr)[0] =
+        reinterpret_cast<const int32_t*>(glob_ptr)[0];
+  }
+}
+
+__device__ inline void cp_async2_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  if (pred) {
+    reinterpret_cast<int64_t*>(smem_ptr)[0] =
+        reinterpret_cast<const int64_t*>(glob_ptr)[0];
+  }
+}
+
+__device__ inline void cp_async4_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  if (pred) {
+    reinterpret_cast<int4*>(smem_ptr)[0] =
+        reinterpret_cast<const int4*>(glob_ptr)[0];
+  }
+}
+
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {
+  if (pred) {
+    reinterpret_cast<int4*>(smem_ptr)[0] =
+        reinterpret_cast<const int4*>(glob_ptr)[0];
+  }
+}
+
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  reinterpret_cast<int4*>(smem_ptr)[0] =
+      reinterpret_cast<const int4*>(glob_ptr)[0];
+}
+
+__device__ inline void cp_async_fence() {}
+
+template <int n>
+__device__ inline void cp_async_wait() {}
+
+  #else
 
 __device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
                                          bool pred = true) {
@@ -126,6 +170,8 @@ __device__ inline void cp_async_wait() {
   asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
 }
 
-#endif
+  #endif
 
 }  // namespace MARLIN_NAMESPACE_NAME
+
+#endif
\ No newline at end of file
diff --git a/csrc/quantization/gptq_marlin/marlin_mma.h b/csrc/quantization/gptq_marlin/marlin_mma.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ec2aaafc4392c3bd70827b37188efced0763eec
--- /dev/null
+++ b/csrc/quantization/gptq_marlin/marlin_mma.h
@@ -0,0 +1,269 @@
+
+#include "marlin_dtypes.cuh"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
+// output/accumulation.
+template <vllm::ScalarTypeId type_id, bool use_fp16_accum, int k_size = 16>
+__device__ inline void mma(
+    const typename MarlinScalarType<type_id>::FragA& a_frag,
+    const typename MarlinScalarType<type_id>::FragB& frag_b,
+    typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
+  if constexpr (!std::is_same<scalar_t, half>::value || k_size != 16) {
+    static_assert(!use_fp16_accum);
+  }
+
+  if constexpr (k_size == 16) {
+    if constexpr (std::is_same<scalar_t, half>::value && !use_fp16_accum) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(b[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[2]), "r"(a[3]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+#else
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+#endif
+    } else if constexpr (std::is_same<scalar_t, half>::value &&
+                         use_fp16_accum) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[0]), "r"(a[1]), "r"(b[0]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[2]), "r"(a[3]), "r"(b[1]), "r"(c[0]), "r"(c[1]));
+#else
+      uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "r"(c[0]), "r"(c[1]));
+#endif
+    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
+            "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
+      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
+            "r"(c[1]), "r"(c[2]), "r"(c[3]));
+    }
+  } else if (k_size == 32) {
+    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
+      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[0]), "r"(b[0]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[2]), "=r"(c[3])
+          : "r"(a[1]), "r"(b[0]), "r"(c[2]), "r"(c[3]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[2]), "r"(b[1]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[2]), "=r"(c[3])
+          : "r"(a[3]), "r"(b[1]), "r"(c[2]), "r"(c[3]));
+#else
+      asm volatile(
+          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
+#endif
+    }
+  }
+}
+
+template <vllm::ScalarTypeId type_id, bool use_fp16_accum, int k_size = 16>
+__device__ inline void mma_trans(
+    const typename MarlinScalarType<type_id>::FragA& a_frag,
+    const typename MarlinScalarType<type_id>::FragB& frag_b,
+    const typename MarlinScalarType<type_id>::FragB& frag_b2,
+    typename MarlinScalarType<type_id>::FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
+  float* c = reinterpret_cast<float*>(&frag_c);
+  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
+  if constexpr (!std::is_same<scalar_t, half>::value || k_size != 16) {
+    static_assert(!use_fp16_accum);
+  }
+
+  if constexpr (k_size == 16) {
+    if constexpr (std::is_same<scalar_t, half>::value && !use_fp16_accum) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[1]), "r"(b2[1]), "r"(a[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+#else
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+#endif
+    } else if constexpr (std::is_same<scalar_t, half>::value &&
+                         use_fp16_accum) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[1]), "r"(b2[1]), "r"(a[1]), "r"(c[0]), "r"(c[1]));
+#else
+      uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "r"(c[0]), "r"(c[1]));
+#endif
+    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
+      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
+            "r"(c[3]));
+    }
+  } else {
+    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
+      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[2]), "=r"(c[3])
+          : "r"(b2[1]), "r"(a[0]), "r"(c[2]), "r"(c[3]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[0]), "r"(a[1]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[2]), "=r"(c[3])
+          : "r"(b2[1]), "r"(a[1]), "r"(c[2]), "r"(c[3]));
+#else
+      asm volatile(
+          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
+#endif
+    }
+  }
+}
+
+}  // namespace MARLIN_NAMESPACE_NAME
\ No newline at end of file
diff --git a/csrc/quantization/gptq_marlin/marlin_template.h b/csrc/quantization/gptq_marlin/marlin_template.h
index 22bb71e482ce8775920fc4e864002b5e0a74729f..c7b53696c12237967effe3db72ecbbd1375cf4d6 100644
--- a/csrc/quantization/gptq_marlin/marlin_template.h
+++ b/csrc/quantization/gptq_marlin/marlin_template.h
@@ -26,6 +26,7 @@
 #include "marlin.cuh"
 #include "marlin_dtypes.cuh"
 #include "dequant.h"
+#include "marlin_mma.h"
 #include "core/scalar_type.hpp"
 
 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
@@ -35,7 +36,7 @@
 
 namespace MARLIN_NAMESPACE_NAME {
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
 
 template <typename scalar_t,  // compute dtype, half or nv_float16
           const vllm::ScalarTypeId b_type_id,  // weight MarlinScalarType id
@@ -75,137 +76,6 @@ __global__ void Marlin(
 
 #else
 
-// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
-// output/accumulation.
-template <vllm::ScalarTypeId type_id, int k_size = 16>
-__device__ inline void mma(
-    const typename MarlinScalarType<type_id>::FragA& a_frag,
-    const typename MarlinScalarType<type_id>::FragB& frag_b,
-    typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
-  if constexpr (k_size == 16) {
-    if constexpr (std::is_same<scalar_t, half>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
-            "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
-            "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  } else if (k_size == 32) {
-    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  }
-}
-
-template <vllm::ScalarTypeId type_id, int k_size = 16>
-__device__ inline void mma_trans(
-    const typename MarlinScalarType<type_id>::FragA& a_frag,
-    const typename MarlinScalarType<type_id>::FragB& frag_b,
-    const typename MarlinScalarType<type_id>::FragB& frag_b2,
-    typename MarlinScalarType<type_id>::FragC& frag_c) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
-  float* c = reinterpret_cast<float*>(&frag_c);
-  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
-  if constexpr (k_size == 16) {
-    if constexpr (std::is_same<scalar_t, half>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
-            "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
-            "r"(c[3]));
-    }
-  } else {
-    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  }
-}
-
 // Instruction for loading a full 16x16 matrix fragment of operand A from shared
 // memory, directly in tensor core layout.
 template <int count, vllm::ScalarTypeId type_id>
@@ -415,6 +285,17 @@ __global__ void Marlin(
   if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
   #endif
 
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+  // Turing TensorCore only supports fp16 and int8
+  if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id())
+    return;
+  #endif
+
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+  constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id();
+  #else
+  constexpr bool use_fp16_accum = false;
+  #endif
   using Adtype = MarlinScalarType<a_type_id>;
   using Cdtype = MarlinScalarType<c_type_id>;
   const int4* A = A0;
@@ -873,10 +754,6 @@ __global__ void Marlin(
   constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
                                           : (stages * s_sh_stage);
   int4* sh_s = sh_zp + (stages * zp_sh_stage);
-  // shared memory reused by reduction should be smaller than
-  // shared memory used by weight.
-  static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
-                stages * b_sh_stage);
   int4* sh_a = sh_s + sh_s_size;
 
   // Register storage for double buffer of shared memory reads.
@@ -1395,11 +1272,13 @@ __global__ void Marlin(
   #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
         if constexpr (m_block_size_8) {
-          mma_trans<a_type_id>(frag_a[k2][i], frag_b0, frag_b1,
-                               frag_c[i][j][0]);
+          mma_trans<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0, frag_b1,
+                                               frag_c[i][j][0]);
         } else {
-          mma<a_type_id>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
-          mma<a_type_id>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
+          mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0,
+                                         frag_c[i][j][0]);
+          mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b1,
+                                         frag_c[i][j][1]);
         }
       }
     }
@@ -1433,10 +1312,12 @@ __global__ void Marlin(
 
   #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
-        mma<a_type_id, 32>(frag_a[k2][i], frag_b[0],
-                           (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
-        mma<a_type_id, 32>(frag_a[k2][i], frag_b[1],
-                           (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
+        mma<a_type_id, false, 32>(
+            frag_a[k2][i], frag_b[0],
+            (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
+        mma<a_type_id, false, 32>(
+            frag_a[k2][i], frag_b[1],
+            (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
       }
 
       if constexpr (group_blocks != -1) {
@@ -1956,6 +1837,21 @@ __global__ void Marlin(
     // While this pattern may not be the most readable, other ways of writing
     // the loop seemed to noticeably worse performance after compilation.
     if (slice_iters == 0) {
+      // convert fp16 accum to fp32 for reduction
+      if constexpr (use_fp16_accum) {
+  #pragma unroll
+        for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 2 : 4) * 2); i++) {
+          float* frag_c_part_float = reinterpret_cast<float*>(frag_c) + i * 4;
+          scalar_t* frag_c_part_half =
+              reinterpret_cast<scalar_t*>(frag_c_part_float);
+
+  #pragma unroll
+          for (int i = 3; i >= 0; i--) {
+            frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]);
+          }
+        }
+      }
+
       if constexpr (is_a_8bit) {
         float frag_a_s[2 * thread_m_blocks];
 
diff --git a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
deleted file mode 100644
index 6c8f6309ef43f1f10e00384233436616181342d8..0000000000000000000000000000000000000000
--- a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
+++ /dev/null
@@ -1,373 +0,0 @@
-#include "core/registration.h"
-
-#include <torch/all.h>
-#include <cutlass/arch/arch.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/cuda/CUDAStream.h>
-
-#include "cute/tensor.hpp"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/epilogue/collective/default_epilogue.hpp"
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/group_array_problem_shape.hpp"
-#include "cutlass/gemm/collective/collective_builder.hpp"
-#include "cutlass/epilogue/collective/collective_builder.hpp"
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-
-#include "cutlass/util/command_line.h"
-#include "cutlass/util/distribution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/packed_stride.hpp"
-#include "cutlass/util/tensor_view_io.h"
-#include "cutlass/util/reference/device/gemm.h"
-#include "cutlass/util/reference/device/tensor_compare.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-#include "cutlass/util/reference/host/gett.hpp"
-#include "cutlass/util/reference/host/tensor_norm.h"
-#include "cutlass/util/reference/host/tensor_compare.h"
-#include <cassert>
-
-using namespace cute;
-
-template <typename ElementAB, typename ElementC, typename ElementAccumulator,
-          typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
-__global__ void get_ggemm_starts(
-    int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
-    ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
-    ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
-    ElementAB* b_base_as_int, ElementC* out_base_as_int,
-    ElementAccumulator* a_scale_base_as_int,
-    ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
-    LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
-  int expert_id = threadIdx.x;
-
-  if (expert_id >= gridDim.x * blockDim.x) {
-    return;
-  }
-
-  int m = problem_sizes[expert_id * 3];
-  int n = problem_sizes[expert_id * 3 + 1];
-  int k = problem_sizes[expert_id * 3 + 2];
-
-  int32_t expert_offset = expert_offsets[expert_id];
-  int a_stride = expert_offset * k;
-  int b_stride = expert_id * k * n;
-  int a_scale_stride = expert_offset * k / 128;
-  int b_scale_stride = expert_id * k * n / 128 / 128;
-
-  a_offsets[expert_id] = a_base_as_int + a_stride;
-  b_offsets[expert_id] = b_base_as_int + b_stride;
-  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
-  a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
-  b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
-
-  LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
-  LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
-
-  *layout_sfa_ptr =
-      ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
-  *layout_sfb_ptr =
-      ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
-}
-
-#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
-                                 ScaleConfig)                                 \
-  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                            \
-    get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA,         \
-                     LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>(  \
-        static_cast<int32_t*>(expert_offsets.data_ptr()),                     \
-        static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),              \
-        static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()),              \
-        static_cast<C_TYPE**>(out_ptrs.data_ptr()),                           \
-        static_cast<float**>(a_scales_ptrs.data_ptr()),                       \
-        static_cast<float**>(b_scales_ptrs.data_ptr()),                       \
-        static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),            \
-        static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()),            \
-        static_cast<C_TYPE*>(out_tensors.data_ptr()),                         \
-        static_cast<float*>(a_scales.data_ptr()),                             \
-        static_cast<float*>(b_scales.data_ptr()),                             \
-        reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),                  \
-        reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()),                  \
-        static_cast<int*>(problem_sizes.data_ptr()));                         \
-  }
-
-template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
-void run_get_ggemm_starts(
-    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
-    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
-    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
-    torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
-    torch::Tensor out_tensors, torch::Tensor const& a_scales,
-    torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
-    torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
-  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
-  TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
-
-  int num_experts = (int)expert_offsets.size(0);
-  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
-
-  if (false) {
-  }
-  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
-                           LayoutSFB, ScaleConfig)
-  __CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
-                           LayoutSFB, ScaleConfig)
-  else {
-    TORCH_CHECK(false, "Unsupported output tensor type");
-  }
-}
-
-template <typename OutType, typename ScheduleConfig, typename LayoutD>
-void run_blockwise_scaled_group_mm(
-    torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
-    const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
-    const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
-    const torch::Tensor& stride_b, const torch::Tensor& stride_c,
-    const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
-    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
-  using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
-
-  // Types
-  using ElementA = cutlass::float_e4m3_t;
-  using ElementB = cutlass::float_e4m3_t;
-  using ElementC = OutType;
-  using ElementD = ElementC;
-  using ElementAccumulator = float;
-  using LayoutA = cutlass::layout::RowMajor;
-  using LayoutB = cutlass::layout::ColumnMajor;
-  using LayoutC = LayoutD;
-
-  // Alignments
-  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
-  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
-  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
-
-  using ArchTag = cutlass::arch::Sm100;
-  using OperatorClass = cutlass::arch::OpClassTensorOp;
-
-  using CollectiveEpilogue =
-      typename cutlass::epilogue::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
-          typename ScheduleConfig::ClusterShape,
-          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
-          ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
-          AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
-
-  using CollectiveMainloop =
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, ElementA,
-          cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
-          AlignmentA, ElementB,
-          cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
-          AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
-          typename ScheduleConfig::ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
-              sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          typename ScheduleConfig::KernelSchedule>::CollectiveOp;
-
-  using GemmKernel =
-      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
-                                           CollectiveEpilogue, void>;
-
-  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
-  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
-  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
-  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
-  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
-
-  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
-  int num_experts = (int)expert_offsets.size(0);
-
-  Gemm gemm_op;
-
-  // Mainloop Arguments
-  typename GemmKernel::MainloopArguments mainloop_args{
-      static_cast<const ElementA**>(a_ptrs.data_ptr()),
-      static_cast<StrideA*>(stride_a.data_ptr()),
-      static_cast<const ElementB**>(b_ptrs.data_ptr()),
-      static_cast<StrideB*>(stride_b.data_ptr()),
-      static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
-      reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
-          layout_sfa.data_ptr()),
-      static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
-      reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
-          layout_sfb.data_ptr())};
-
-  int device_id = a_ptrs.device().index();
-  static const cutlass::KernelHardwareInfo hw_info{
-      device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-                     device_id)};
-
-  // Epilogue Arguments
-  typename GemmKernel::EpilogueArguments epilogue_args{
-      {},  // epilogue.thread
-      nullptr,
-      static_cast<StrideC*>(stride_c.data_ptr()),
-      static_cast<ElementD**>(out_ptrs.data_ptr()),
-      static_cast<StrideC*>(stride_c.data_ptr())};
-
-  UnderlyingProblemShape* problem_sizes_as_shapes =
-      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
-
-  // Gemm Arguments
-  typename GemmKernel::Arguments args{
-      cutlass::gemm::GemmUniversalMode::kGrouped,
-      {num_experts, problem_sizes_as_shapes, nullptr},
-      mainloop_args,
-      epilogue_args,
-      hw_info};
-
-  at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
-  const cudaStream_t stream =
-      at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
-
-  auto can_implement_status = gemm_op.can_implement(args);
-  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
-              "Failed to implement GEMM");
-
-  size_t workspace_size = gemm_op.get_workspace_size(args);
-  auto const workspace_options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
-  auto workspace = torch::empty(workspace_size, workspace_options);
-
-  auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
-  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
-
-  status = gemm_op.run(stream);
-  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
-}
-
-template <typename OutType>
-void blockwise_scaled_group_mm_dispatch_shape(
-    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
-    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
-    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
-  struct MmaConfig {
-    using ElementA = cutlass::float_e4m3_t;
-    using KernelSchedule =
-        cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
-    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
-    using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
-        1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
-    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
-    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
-    using LayoutC = cutlass::layout::RowMajor;
-    using MmaTileShape = Shape<_128, _128, _128>;
-    using ClusterShape = Shape<_1, _1, _1>;
-  };
-
-  int num_experts = (int)expert_offsets.size(0);
-
-  auto a_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto b_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto out_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto a_scales_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto b_scales_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-
-  auto layout_sfa = torch::empty(
-      {num_experts, 5},
-      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
-  auto layout_sfb = torch::empty(
-      {num_experts, 5},
-      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
-
-  auto stride_a = torch::full(
-      {num_experts}, a.size(1),
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto stride_b = torch::full(
-      {num_experts}, a.size(1),
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto stride_c = torch::full(
-      {num_experts}, output.size(1),
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-
-  torch::TensorOptions options_int =
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device());
-
-  run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
-                       typename MmaConfig::LayoutSFB,
-                       typename MmaConfig::ScaleConfig>(
-      expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
-      b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
-
-  run_blockwise_scaled_group_mm<OutType, MmaConfig,
-                                typename MmaConfig::LayoutC>(
-      out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
-      stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
-      expert_offsets);
-}
-
-void cutlass_blockwise_scaled_grouped_mm(
-    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
-    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
-    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
-  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
-  TORCH_CHECK(problem_sizes.size(1) == 3,
-              "problem_sizes must have shape (num_experts, 3)");
-  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
-              "Number of experts in problem_sizes must match expert_offsets");
-  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
-              "problem_sizes must be int32");
-  TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
-              "a must be kFloat8_e4m3fn");
-  TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
-              "b must be kFloat8_e4m3fn");
-  TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
-                  output.scalar_type() == torch::kHalf,
-              "output must be bfloat16 or half");
-  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
-              "scales_a must be float32");
-  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
-              "scales_b must be float32");
-  TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
-              "expert_offsets must be int32");
-
-  TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
-  TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
-  TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
-  TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
-  TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
-  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
-  TORCH_CHECK(problem_sizes.size(1) == 3,
-              "problem_sizes must have shape (num_experts, 3)");
-  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
-              "Number of experts in problem_sizes must match expert_offsets");
-  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
-              "problem_sizes must be int32");
-  TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
-
-#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
-  if (output.scalar_type() == torch::kBFloat16) {
-    blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
-        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
-  } else if (output.scalar_type() == torch::kFloat16) {
-    blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
-        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
-  } else {
-    TORCH_CHECK(false, "Unsupported output tensor type");
-  }
-#endif
-}
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("cutlass_blockwise_scaled_grouped_mm",
-         &cutlass_blockwise_scaled_grouped_mm);
-}
diff --git a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
index 99fec8fd6febc81ec796ada7b09de225272e29e0..28af2e7d4d80fc75b5e723dc07e5262203bf1c03 100644
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -3,6 +3,8 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 
+#include "dispatch_utils.h"
+
 #include <iostream>
 
 constexpr uint64_t THREADS_PER_EXPERT = 512;
@@ -114,22 +116,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
                                          const bool swap_ab) {
   int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
 
-  const int32_t* topk_ptr = static_cast<const int32_t*>(topk_ids.data_ptr());
-  int32_t* ps1_ptr = static_cast<int32_t*>(problem_sizes1.data_ptr());
-  int32_t* ps2_ptr = static_cast<int32_t*>(problem_sizes2.data_ptr());
-  int32_t* atomic_ptr = static_cast<int32_t*>(atomic_buffer.data_ptr());
+  auto const* topk_ptr = topk_ids.data_ptr<int32_t>();
+  auto* ps1_ptr = problem_sizes1.data_ptr<int32_t>();
+  auto* ps2_ptr = problem_sizes2.data_ptr<int32_t>();
+  auto* atomic_ptr = atomic_buffer.data_ptr<int32_t>();
 
-  if (swap_ab) {
-    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
+  VLLM_DISPATCH_BOOL(swap_ab, SwapAB, [&] {
+    compute_problem_sizes<SwapAB><<<num_experts, num_threads, 0, stream>>>(
         topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
         static_cast<int>(topk_ids.numel()), static_cast<int>(n),
         static_cast<int>(k));
-  } else {
-    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
-        topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
-        static_cast<int>(topk_ids.numel()), static_cast<int>(n),
-        static_cast<int>(k));
-  }
+  });
 }
 }  // namespace
 
@@ -153,6 +150,93 @@ void get_cutlass_moe_mm_problem_sizes_caller(
                                may_swap_ab);
 }
 
+template <bool SWAP_AB>
+__global__ void compute_problem_sizes_from_expert_offsets(
+    const int64_t* __restrict__ expert_first_token_offset,
+    int32_t* __restrict__ problem_sizes1, int32_t* __restrict__ problem_sizes2,
+    const int num_experts, const int n, const int k) {
+  int const expert_id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (expert_id >= num_experts) {
+    return;
+  }
+
+  int64_t const m64 = expert_first_token_offset[expert_id + 1] -
+                      expert_first_token_offset[expert_id];
+  int32_t const m = static_cast<int32_t>(m64);
+
+  int32_t* ps1 = problem_sizes1 + expert_id * 3;
+  int32_t* ps2 = problem_sizes2 + expert_id * 3;
+
+  if constexpr (!SWAP_AB) {
+    // [M, 2*N, K]
+    ps1[0] = m;
+    ps1[1] = 2 * n;
+    ps1[2] = k;
+    // [M, K, N]
+    ps2[0] = m;
+    ps2[1] = k;
+    ps2[2] = n;
+  } else {
+    // swap logical M/N in the problem shape
+    // [2*N, M, K]
+    ps1[0] = 2 * n;
+    ps1[1] = m;
+    ps1[2] = k;
+    // [K, M, N]
+    ps2[0] = k;
+    ps2[1] = m;
+    ps2[2] = n;
+  }
+}
+
+void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
+    const torch::Tensor& expert_first_token_offset,
+    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
+    const int64_t n, const int64_t k, const bool swap_ab) {
+  TORCH_CHECK(expert_first_token_offset.is_cuda(),
+              "expert_first_token_offset must be a CUDA tensor");
+  TORCH_CHECK(expert_first_token_offset.dtype() == torch::kInt64,
+              "expert_first_token_offset must be int64");
+
+  TORCH_CHECK(problem_sizes1.is_cuda() && problem_sizes2.is_cuda(),
+              "problem_sizes must be CUDA tensors");
+  TORCH_CHECK(problem_sizes1.dtype() == torch::kInt32 &&
+                  problem_sizes2.dtype() == torch::kInt32,
+              "problem_sizes must be int32");
+  TORCH_CHECK(problem_sizes1.is_contiguous() && problem_sizes2.is_contiguous(),
+              "problem_sizes must be contiguous");
+  TORCH_CHECK(problem_sizes1.dim() == 2 && problem_sizes2.dim() == 2,
+              "problem_sizes must be 2D tensors");
+  TORCH_CHECK(problem_sizes1.size(1) == 3 && problem_sizes2.size(1) == 3,
+              "problem_sizes second dim must be 3");
+  TORCH_CHECK(problem_sizes1.sizes() == problem_sizes2.sizes(),
+              "problem_sizes1 and problem_sizes2 must have same shape");
+
+  int64_t const num_experts64 = problem_sizes1.size(0);
+  TORCH_CHECK(expert_first_token_offset.numel() == num_experts64 + 1,
+              "expert_first_token_offset must have num_experts + 1 elements");
+  TORCH_CHECK(num_experts64 <= INT32_MAX, "num_experts must fit in int32");
+  TORCH_CHECK(n <= INT32_MAX && k <= INT32_MAX, "n and k must fit in int32");
+
+  int const num_experts = static_cast<int>(num_experts64);
+  auto stream = at::cuda::getCurrentCUDAStream(
+      expert_first_token_offset.device().index());
+
+  int const threads = (num_experts < 256) ? num_experts : 256;
+  int const blocks = (num_experts + threads - 1) / threads;
+
+  auto const* offsets_ptr = expert_first_token_offset.data_ptr<int64_t>();
+  auto* ps1_ptr = problem_sizes1.data_ptr<int32_t>();
+  auto* ps2_ptr = problem_sizes2.data_ptr<int32_t>();
+
+  VLLM_DISPATCH_BOOL(swap_ab, SwapAB, [&] {
+    compute_problem_sizes_from_expert_offsets<SwapAB>
+        <<<blocks, threads, 0, stream>>>(offsets_ptr, ps1_ptr, ps2_ptr,
+                                         num_experts, static_cast<int>(n),
+                                         static_cast<int>(k));
+  });
+}
+
 void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
index 5de21cfbbaafb4ee71d879ff906e26633bd5284f..077966a1d92a0c08182f79b4a493a64eea50208f 100644
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -83,6 +83,11 @@ void get_cutlass_moe_mm_problem_sizes_caller(
     const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
     std::optional<bool> force_swap_ab = std::nullopt);
 
+void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
+    const torch::Tensor& expert_first_token_offset,
+    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
+    const int64_t n, const int64_t k, const bool swap_ab);
+
 void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
                                          torch::Tensor& problem_sizes1,
                                          torch::Tensor& problem_sizes2,
@@ -322,6 +327,25 @@ void get_cutlass_moe_mm_problem_sizes(
       version_num, ". Required capability: 90, 100, or 120");
 }
 
+void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
+    const torch::Tensor& expert_first_token_offset,
+    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
+    const int64_t n, const int64_t k, const bool swap_ab) {
+  int32_t version_num = get_sm_version_num();
+#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
+    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
+    (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
+  get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
+      expert_first_token_offset, problem_sizes1, problem_sizes2, n, k, swap_ab);
+  return;
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled get_cutlass_moe_mm_problem_sizes_from_expert_offsets: "
+      "no cutlass_scaled_mm kernel for CUDA device capability: ",
+      version_num, ". Required capability: 90, 100, or 120");
+}
+
 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                   torch::Tensor& problem_sizes1,
                                   torch::Tensor& problem_sizes2,
diff --git a/csrc/quantization/w8a8/fp8/common.cu b/csrc/quantization/w8a8/fp8/common.cu
index 7a822fb8fb8aa8b8492906e4666f6c9402079837..d07cdd571fedd18b3893d5dbbd7b70d22c9f4ec7 100644
--- a/csrc/quantization/w8a8/fp8/common.cu
+++ b/csrc/quantization/w8a8/fp8/common.cu
@@ -4,28 +4,77 @@
 #include "quantization/vectorization_utils.cuh"
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/Exceptions.h>
+#include <tuple>
 
 namespace vllm {
 
-template <typename scalar_t, typename fp8_type>
-__global__ void scaled_fp8_quant_kernel_strided(
+// STRIDE_I_ZERO: true if scale_stride_i == 0 (per-tensor or per-channel)
+// STRIDE_J_ZERO: true if scale_stride_j == 0 (per-tensor or per-token)
+template <typename scalar_t, typename fp8_type, bool STRIDE_I_ZERO,
+          bool STRIDE_J_ZERO>
+__global__ void scaled_fp8_quant_kernel_strided_group_shape(
     fp8_type* __restrict__ out, const scalar_t* __restrict__ input,
     const float* __restrict__ scale, int hidden_size, int64_t in_row_stride,
-    int64_t out_row_stride) {
-  const int64_t token_idx = blockIdx.x;  // one token per block
+    int64_t out_row_stride, int group_m, int group_n, int64_t scale_stride_i,
+    int64_t scale_stride_j) {
+  const int64_t token_idx = blockIdx.x;
   const int tid = threadIdx.x;
 
   const scalar_t* token_in = input + token_idx * in_row_stride;
   fp8_type* token_out = out + token_idx * out_row_stride;
 
-  const float inv_scale = 1.0f / (*scale);
-
-  vectorize_with_alignment<16>(
-      token_in, token_out, hidden_size, tid, blockDim.x,
-      [=] __device__(fp8_type & dst, const scalar_t& src) {
-        dst = scaled_fp8_conversion<true, fp8_type>(static_cast<float>(src),
-                                                    inv_scale);
-      });
+  // Precompute row-level base offset for scale access (compile-time eliminated
+  // when STRIDE_I_ZERO)
+  const int64_t scale_row_base =
+      STRIDE_I_ZERO ? 0
+                    : static_cast<int>(token_idx) / group_m * scale_stride_i;
+
+  auto get_inv_scale = [&](int gj) {
+    return 1.0f / scale[scale_row_base + gj * scale_stride_j];
+  };
+
+  int cached_gj = -1;
+  float cached_inv_scale = 0.0f;
+  auto get_inv_scale_cached = [&](int gj) {
+    if (gj != cached_gj) {
+      cached_inv_scale = 1.0f / scale[scale_row_base + gj * scale_stride_j];
+      cached_gj = gj;
+    }
+    return cached_inv_scale;
+  };
+
+  constexpr int VEC_SIZE = 16;  // FP8 so vectorize to 128 bits
+  auto scaled_fp8_conversion_vectorized = [&](const scalar_t* in, fp8_type* out,
+                                              int size, float inv_scale) {
+    vectorize_with_alignment<VEC_SIZE>(
+        in, out, size, tid, blockDim.x,
+        [=] __device__(fp8_type & dst, const scalar_t& src) {
+          dst = scaled_fp8_conversion<true, fp8_type>(static_cast<float>(src),
+                                                      inv_scale);
+        });
+  };
+
+  if (STRIDE_J_ZERO && hidden_size % VEC_SIZE == 0) {
+    // Per-tensor or per-token: single scale per row, vectorize full row
+    scaled_fp8_conversion_vectorized(token_in, token_out, hidden_size,
+                                     get_inv_scale(0));
+  } else if (group_n % VEC_SIZE == 0) {
+    // Multiple column groups with vectorization
+    const int num_groups_n = hidden_size / group_n;
+
+    for (int gj = 0; gj < num_groups_n; gj++) {
+      scaled_fp8_conversion_vectorized(token_in + gj * group_n,
+                                       token_out + gj * group_n, group_n,
+                                       get_inv_scale(gj));
+    }
+  } else {
+    // Scalar path for small column groups (group_n < VEC_SIZE)
+    for (int n = tid; n < hidden_size; n += blockDim.x) {
+      const int gj = n / group_n;
+      token_out[n] = scaled_fp8_conversion<true, fp8_type>(
+          static_cast<float>(token_in[n]), get_inv_scale_cached(gj));
+    }
+  }
 }
 
 template <typename scalar_t, typename fp8_type>
@@ -133,17 +182,116 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
 
 }  // namespace vllm
 
-void static_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
-                             torch::Tensor const& input,  // [..., d]
-                             torch::Tensor const& scale)  // [1]
+void static_scaled_fp8_quant(
+    torch::Tensor& out,          // [..., d]
+    torch::Tensor const& input,  // [..., d]
+    torch::Tensor const& scale,  // various shapes
+    std::optional<std::tuple<int64_t, int64_t>>
+        opt_group_shape)  // optional explicit (group_m, group_n)
 {
   TORCH_CHECK(input.stride(-1) == 1,
               "last dimension of input must be contiguous");
   TORCH_CHECK(out.stride(-1) == 1,
               "last dimension of output must be contiguous");
 
-  const int hidden_size = input.size(-1);
-  const int num_tokens = input.numel() / hidden_size;
+  const int hidden_size = input.size(-1);              // N (columns)
+  const int num_tokens = input.numel() / hidden_size;  // M (rows)
+
+  // Determine group_m, group_n, and scale strides from scale shape
+  // Scale indexing: scale[gi * scale_stride_j + gj * scale_stride_i]
+  // where gi = m / group_m, gj = n / group_n
+  int group_m, group_n;
+  int64_t scale_stride_i, scale_stride_j;
+
+  if (scale.dim() == 0 || scale.numel() == 1) {
+    // Per-tensor: one scale for the entire tensor
+    group_m = num_tokens;
+    group_n = hidden_size;
+    scale_stride_i = 0;
+    scale_stride_j = 0;
+  } else if (scale.dim() == 1) {
+    // 1D scale: require explicit group_shape to disambiguate per-channel vs
+    // per-token (avoids edge case where num_tokens == hidden_size)
+    TORCH_CHECK(opt_group_shape.has_value(),
+                "1D scale requires explicit group_shape to disambiguate "
+                "per-channel vs per-token quantization. "
+                "Use group_shape=(-1, 1) for per-channel or group_shape=(1, "
+                "-1) for per-token.");
+
+    const auto& [opt_group_m, opt_group_n] = opt_group_shape.value();
+    group_m = opt_group_m == -1 ? num_tokens : static_cast<int>(opt_group_m);
+    group_n = opt_group_n == -1 ? hidden_size : static_cast<int>(opt_group_n);
+
+    // Validate the explicit group shape matches the 1D scale
+    const int64_t scale_len = scale.numel();
+    const int64_t expected_scale_m = num_tokens / group_m;
+    const int64_t expected_scale_n = hidden_size / group_n;
+    const int64_t expected_scale_numel = expected_scale_m * expected_scale_n;
+
+    TORCH_CHECK(scale_len == expected_scale_numel, "1D scale length (",
+                scale_len, ") does not match expected size (",
+                expected_scale_numel, ") for group_shape (", opt_group_m, ", ",
+                opt_group_n, ") with input shape (", num_tokens, ", ",
+                hidden_size, ")");
+
+    // For 1D scale, determine strides based on which dim is trivial
+    // Scale indexing: scale[gi * scale_stride_i + gj * scale_stride_j]
+    // where gi = m / group_m (row group), gj = n / group_n (col group)
+    if (expected_scale_m == 1) {
+      // Per-channel style: one scale in M dim, scale varies along N
+      // gi = 0 always, gj varies, so stride_1 traverses the scale
+      scale_stride_i = 0;
+      scale_stride_j = scale.stride(0);
+    } else if (expected_scale_n == 1) {
+      // Per-token style: one scale in N dim, scale varies along M
+      // gj = 0 always, gi varies, so stride_0 traverses the scale
+      scale_stride_i = scale.stride(0);
+      scale_stride_j = 0;
+    } else {
+      TORCH_CHECK(
+          false,
+          "1D scale can only be used when one of the scale dimensions is 1. "
+          "For 2D group scaling, use a 2D scale tensor.");
+    }
+  } else if (scale.dim() == 2) {
+    // 2D scale: infer group sizes from scale dimensions (or use explicit if
+    // provided)
+    const int64_t scale_size_0 = scale.size(0);
+    const int64_t scale_size_1 = scale.size(1);
+
+    TORCH_CHECK(num_tokens % scale_size_0 == 0, "num_tokens (", num_tokens,
+                ") must be divisible by scale.size(0) (", scale_size_0, ")");
+    TORCH_CHECK(hidden_size % scale_size_1 == 0, "hidden_size (", hidden_size,
+                ") must be divisible by scale.size(1) (", scale_size_1, ")");
+
+    // Infer from 2D scale shape
+    int inferred_group_m = num_tokens / scale_size_0;
+    int inferred_group_n = hidden_size / scale_size_1;
+
+    // Use explicit if provided, otherwise use inferred
+    if (opt_group_shape.has_value()) {
+      const auto& [opt_group_m, opt_group_n] = opt_group_shape.value();
+      group_m = opt_group_m == -1 ? num_tokens : static_cast<int>(opt_group_m);
+      group_n = opt_group_n == -1 ? hidden_size : static_cast<int>(opt_group_n);
+
+      // Validate explicit matches inferred
+      TORCH_CHECK(group_m == inferred_group_m && group_n == inferred_group_n,
+                  "Explicit group_shape (", opt_group_m, ", ", opt_group_n,
+                  ") does not match inferred group shape (", inferred_group_m,
+                  ", ", inferred_group_n, ") from 2D scale tensor shape (",
+                  scale_size_0, ", ", scale_size_1, ")");
+    } else {
+      group_m = inferred_group_m;
+      group_n = inferred_group_n;
+    }
+
+    scale_stride_i = scale.stride(0);
+    scale_stride_j = scale.stride(1);
+  } else {
+    TORCH_CHECK(false, "scale must be 0D, 1D, or 2D tensor, but got ",
+                scale.dim(), "D");
+  }
+
   const int block_size = 256;
   dim3 grid(num_tokens);
   dim3 block(block_size);
@@ -153,15 +301,23 @@ void static_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // Dispatch to template-specialized kernel based on stride pattern
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] {
         VLLM_DISPATCH_FP8_TYPES(
             out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] {
-              vllm::scaled_fp8_quant_kernel_strided<scalar_t, fp8_t>
-                  <<<grid, block, 0, stream>>>(
-                      out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
-                      scale.data_ptr<float>(), hidden_size, in_row_stride,
-                      out_row_stride);
+              VLLM_DISPATCH_BOOL(scale_stride_i == 0, S0_ZERO, [&] {
+                VLLM_DISPATCH_BOOL(scale_stride_j == 0, S1_ZERO, [&] {
+                  vllm::scaled_fp8_quant_kernel_strided_group_shape<
+                      scalar_t, fp8_t, S0_ZERO, S1_ZERO>
+                      <<<grid, block, 0, stream>>>(
+                          out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
+                          scale.data_ptr<float>(), hidden_size, in_row_stride,
+                          out_row_stride, group_m, group_n, scale_stride_i,
+                          scale_stride_j);
+                });
+              });
             });
       });
 }
diff --git a/csrc/sampler.cu b/csrc/sampler.cu
index fc2154beff9e0ffbc9361a9a87c102c6ad910903..f7c091f1d4ee40ac24332f60067f25c54cd27a61 100644
--- a/csrc/sampler.cu
+++ b/csrc/sampler.cu
@@ -1,3 +1,4 @@
+#include "cuda_compat.h"
 #include "dispatch_utils.h"
 
 #include <torch/cuda.h>
@@ -97,7 +98,9 @@ static inline __device__ bool isPartialMatch(float x, uint32_t pattern) {
 template <typename T, typename idxT, typename Func>
 __device__ void vectorized_process(size_t thread_rank, size_t num_threads,
                                    const T* in, idxT len, Func f) {
-  constexpr int WARP_SIZE = 32;
+  // Use dynamic WARP_SIZE from cuda_compat.h to support both
+  // Wave64 (MI300X/gfx942) and Wave32 (Strix Halo/gfx1151) architectures
+  constexpr int kWarpSize = WARP_SIZE;
   using WideT = float4;
   if constexpr (sizeof(T) >= sizeof(WideT)) {
     for (idxT i = thread_rank; i < len; i += num_threads) {
@@ -132,8 +135,8 @@ __device__ void vectorized_process(size_t thread_rank, size_t num_threads,
       }
     }
 
-    static_assert(WARP_SIZE >= items_per_scalar);
-    // and because items_per_scalar > skip_cnt, WARP_SIZE > skip_cnt
+    static_assert(kWarpSize >= items_per_scalar);
+    // and because items_per_scalar > skip_cnt, kWarpSize > skip_cnt
     // no need to use loop
     if (thread_rank < skip_cnt) {
       f(in[thread_rank], thread_rank);
@@ -142,7 +145,7 @@ __device__ void vectorized_process(size_t thread_rank, size_t num_threads,
     // len_cast * items_per_scalar + items_per_scalar > len - skip_cnt;
     // and so
     // len - (skip_cnt + len_cast * items_per_scalar) < items_per_scalar <=
-    // WARP_SIZE no need to use loop
+    // kWarpSize no need to use loop
     const idxT remain_i = skip_cnt + len_cast * items_per_scalar + thread_rank;
     if (remain_i < len) {
       f(in[remain_i], remain_i);
@@ -550,8 +553,8 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill(
   int rowEnd = rowEnds[rowIdx];
 
   // Local pointers to this block
-  outIndices += rowIdx * topK;
-  logits += rowIdx * stride0;
+  outIndices += static_cast<int64_t>(rowIdx) * topK;
+  logits += static_cast<int64_t>(rowIdx) * stride0;
 
   topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort>(
       nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK);
@@ -576,19 +579,21 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
 
   // Local pointers to this block
   if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
-    outIndices += rowIdx * topK;
+    outIndices += static_cast<int64_t>(rowIdx) * topK;
   } else if constexpr (multipleBlocksPerRow) {
     const auto blockSize = rowEnd / gridDim.y;  // 16384 / 2 = 8192
     rowStart = blockSize * blockIdx.y;          // 8192 * 1 = 8192
     rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize;
-    outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK;
-    outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK;
+    outIndices +=
+        static_cast<int64_t>(rowIdx) * gridDim.y * topK + blockIdx.y * topK;
+    outLogits +=
+        static_cast<int64_t>(rowIdx) * gridDim.y * topK + blockIdx.y * topK;
   } else if constexpr (mergeBlocks) {
     rowEnd = numBlocksToMerge * topK;
-    indices += rowIdx * numBlocksToMerge * topK;
-    outIndices += rowIdx * topK;
+    indices += static_cast<int64_t>(rowIdx) * numBlocksToMerge * topK;
+    outIndices += static_cast<int64_t>(rowIdx) * topK;
   }
-  logits += rowIdx * stride0;
+  logits += static_cast<int64_t>(rowIdx) * stride0;
 
   topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort,
                 multipleBlocksPerRow, mergeBlocks>(
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index b50bb6ffbed4831e8741208ace5c2deb876b0a7b..45e8d8d980562f222837c7bd306d688602e3e45f 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -444,13 +444,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "                      Tensor alpha) -> ()");
   ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
 
-  // cutlass blockwise scaledgroup GEMM
-  ops.def(
-      "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
-      "Tensor scales_a, Tensor scales_b, "
-      "Tensor problem_sizes, Tensor expert_offsets) -> ()");
-  // conditionally compiled so impl registration is in source file
-
   // cutlass nvfp4 block scaled group GEMM
   ops.def(
       "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"
@@ -522,6 +515,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
            &get_cutlass_moe_mm_problem_sizes);
 
+  // compute per-expert problem sizes from expert_first_token_offset
+  // produced by vLLM's moe_permute kernel
+  ops.def(
+      "get_cutlass_moe_mm_problem_sizes_from_expert_offsets("
+      "    Tensor expert_first_token_offset, "
+      "    Tensor! problem_sizes1, "
+      "    Tensor! problem_sizes2, "
+      "    int n, int k, bool swap_ab) -> ()");
+  ops.impl("get_cutlass_moe_mm_problem_sizes_from_expert_offsets", torch::kCUDA,
+           &get_cutlass_moe_mm_problem_sizes_from_expert_offsets);
+
   // A function that computes data required to run fused MoE with w8a8 grouped
   // GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
   // as an input, and computes expert_offsets (token start indices of each
@@ -593,6 +597,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor output_scale_offset_by_experts) -> ()");
   ops.impl("scaled_fp4_experts_quant", torch::kCUDA, &scaled_fp4_experts_quant);
 
+  // Fused SiLU+Mul+NVFP4 experts quantization.
+  ops.def(
+      "silu_and_mul_scaled_fp4_experts_quant(Tensor! output, Tensor! "
+      "output_scale,"
+      "Tensor input, Tensor input_global_scale, Tensor input_offset_by_experts,"
+      "Tensor output_scale_offset_by_experts) -> ()");
+  ops.impl("silu_and_mul_scaled_fp4_experts_quant", torch::kCUDA,
+           &silu_and_mul_scaled_fp4_experts_quant);
+
   // Check if cutlass_scaled_mm_fp4 is supported for CUDA devices
   // of the given capability
   ops.def("cutlass_scaled_mm_supports_fp4(int cuda_device_capability) -> bool");
@@ -615,19 +628,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 //   ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
 
   // Compute FP8 quantized tensor for given scaling factor.
+  // Supports per-tensor, per-channel, per-token, and arbitrary 2D group
+  // scaling. Optional group_m/group_n specify the group shape explicitly;
+  // required for 1D scales to disambiguate per-channel vs per-token.
 //   ops.def(
-//       "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> "
-//       "()");
+//       "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale, "
+//       "(int, int)? group_shape=None) -> ()");
 //   ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);
 
-//   // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
+  // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
 //   ops.def(
 //       "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
 //       "-> "
 //       "()");
 //   ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
 
-//   // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
+  // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
 //   ops.def(
 //       "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
 //       "Tensor! scale, Tensor? scale_ub) -> "
@@ -721,16 +737,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
   cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
 
-  // Copy the cache blocks from src to dst.
-  cache_ops.def(
-      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
-      "Tensor block_mapping) -> ()");
-  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
-
-  cache_ops.def(
-      "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
-  cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
-
   // Reshape the key and value tensors and cache them.
   cache_ops.def(
       "reshape_and_cache(Tensor key, Tensor value,"
@@ -785,6 +791,22 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "                     Tensor scale) -> ()");
   cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);
 
+  // Rotate Q and K, then write to kv cache for MLA
+  cache_ops.def(
+      "concat_and_cache_mla_rope_fused("
+      "                     Tensor positions,"
+      "                     Tensor! q_pe,"
+      "                     Tensor! k_pe,"
+      "                     Tensor kv_c,"
+      "                     Tensor cos_sin_cache,"
+      "                     bool is_neox,"
+      "                     Tensor slot_mapping,"
+      "                     Tensor! kv_cache,"
+      "                     str kv_cache_dtype,"
+      "                     Tensor kv_cache_scale) -> ()");
+  cache_ops.impl("concat_and_cache_mla_rope_fused", torch::kCUDA,
+                 &concat_and_cache_mla_rope_fused);
+
   // Convert the key and value cache to fp8 data type.
   cache_ops.def(
       "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 0d50d97e54c6c3787c5b3939f2194d31374dd10f..ec6bfc5dfc30746f42a2bc41bcfbc257c3d01151 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -32,7 +32,7 @@ ARG DEADSNAKES_GPGKEY_URL
 
 # The PyPA get-pip.py script is a self contained script+zip file, that provides
 # both the installer script and the pip base85-encoded zip archive. This allows
-# bootstrapping pip in environment where a dsitribution package does not exist.
+# bootstrapping pip in environment where a distribution package does not exist.
 #
 # By parameterizing the URL for get-pip.py installation script, we allow
 # third-party to use their own copy of the script stored in a private mirror.
@@ -73,15 +73,13 @@ ARG INSTALL_KV_CONNECTORS=false
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM ${BUILD_BASE_IMAGE} AS base
+
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
-ARG TARGETPLATFORM
-ARG INSTALL_KV_CONNECTORS=false
-ENV DEBIAN_FRONTEND=noninteractive
 
-ARG GET_PIP_URL
+ENV DEBIAN_FRONTEND=noninteractive
 
-# Install system dependencies and uv, then create Python virtual environment
+# Install system dependencies including build tools
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
@@ -107,32 +105,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && ln -s /opt/venv/bin/pip /usr/bin/pip \
     && python3 --version && python3 -m pip --version
 
-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-ARG PYTORCH_CUDA_INDEX_BASE_URL
-ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
-
 # Activate virtual environment and add uv to PATH
 ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
 ENV VIRTUAL_ENV="/opt/venv"
 
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
+# Environment for uv
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
 
-RUN <<EOF
-gcc --version
-EOF
+# Verify GCC version
+RUN gcc --version
 
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
+# Workaround for triton/pytorch issues
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
+# ============================================================
+# SLOW-CHANGING DEPENDENCIES BELOW
+# These are the expensive layers that we want to cache
+# ============================================================
+
+# Install PyTorch and core CUDA dependencies
+# This is ~2GB and rarely changes
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
 WORKDIR /workspace
 
 # install build and runtime dependencies
@@ -142,13 +138,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# cuda arch list used by torch
-# can be useful for both `dev` and `test`
-# explicitly set the list to avoid issues with torch 2.2
-# see https://github.com/pytorch/pytorch/pull/123243
+# CUDA arch list used by torch
+# Explicitly set the list to avoid issues with torch 2.2
+# See https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-#################### BASE BUILD IMAGE ####################
+#################### BUILD BASE IMAGE ####################
 
 #################### CSRC BUILD IMAGE ####################
 FROM base AS csrc-build
@@ -188,7 +183,7 @@ ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
 
 ARG USE_SCCACHE
-ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
+ARG SCCACHE_DOWNLOAD_URL
 ARG SCCACHE_ENDPOINT
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
@@ -206,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
 RUN --mount=type=cache,target=/root/.cache/uv \
     if [ "$USE_SCCACHE" = "1" ]; then \
         echo "Installing sccache..." \
+        && case "${TARGETPLATFORM}" in \
+          linux/arm64) SCCACHE_ARCH="aarch64" ;; \
+          linux/amd64) SCCACHE_ARCH="x86_64" ;; \
+          *) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
+        esac \
+        && export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
         && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
         && tar -xzf sccache.tar.gz \
-        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
-        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
         && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
         && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
         && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
@@ -241,6 +242,50 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     fi
 #################### CSRC BUILD IMAGE ####################
 
+#################### EXTENSIONS BUILD IMAGE ####################
+# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
+# This stage is independent and doesn't affect csrc cache
+FROM base AS extensions-build
+ARG CUDA_VERSION
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE=copy
+
+WORKDIR /workspace
+
+# Build DeepGEMM wheel
+ARG DEEPGEMM_GIT_REF
+COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    mkdir -p /tmp/deepgemm/dist && \
+    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh \
+        --cuda-version "${CUDA_VERSION}" \
+        ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
+        --wheel-dir /tmp/deepgemm/dist || \
+    echo "DeepGEMM build skipped (CUDA version requirement not met)"
+
+# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
+RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
+
+# Build pplx-kernels and DeepEP wheels
+COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
+ARG PPLX_COMMIT_HASH
+ARG DEEPEP_COMMIT_HASH
+ARG NVSHMEM_VER
+RUN --mount=type=cache,target=/root/.cache/uv \
+    mkdir -p /tmp/ep_kernels_workspace/dist && \
+    export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
+    /tmp/install_python_libraries.sh \
+        --workspace /tmp/ep_kernels_workspace \
+        --mode wheel \
+        ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
+        ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \
+        ${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \
+    find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
+#################### EXTENSIONS BUILD IMAGE ####################
+
 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build
 ARG TARGETPLATFORM
@@ -265,6 +310,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 WORKDIR /workspace
 
+# Copy pre-built csrc wheel directly
 COPY --from=csrc-build /workspace/dist /precompiled-wheels
 
 COPY . .
@@ -286,27 +332,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     fi && \
     python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
 
-# Install DeepGEMM from source
-ARG DEEPGEMM_GIT_REF
-COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
-RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
-
-# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
-RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
-
-COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
-# Install EP kernels(pplx-kernels and DeepEP)
-ARG PPLX_COMMIT_HASH
-ARG DEEPEP_COMMIT_HASH
-RUN --mount=type=cache,target=/root/.cache/uv \
-    export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
-    /tmp/install_python_libraries.sh \
-        --workspace /tmp/ep_kernels_workspace \
-        --mode wheel \
-        ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
-        ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
-    find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
+# Copy extension wheels from extensions-build stage for later use
+COPY --from=extensions-build /tmp/deepgemm/dist /tmp/deepgemm/dist
+COPY --from=extensions-build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels_workspace/dist
 
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
@@ -344,32 +372,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################
-
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 FROM ${FINAL_BASE_IMAGE} AS vllm-base
+
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
-ARG INSTALL_KV_CONNECTORS=false
-WORKDIR /vllm-workspace
-ENV DEBIAN_FRONTEND=noninteractive
-ARG TARGETPLATFORM
-
-# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
-ARG GDRCOPY_CUDA_VERSION=12.8
-# Keep in line with FINAL_BASE_IMAGE
-ARG GDRCOPY_OS_VERSION=Ubuntu22_04
-
-SHELL ["/bin/bash", "-c"]
-
 ARG DEADSNAKES_MIRROR_URL
 ARG DEADSNAKES_GPGKEY_URL
 ARG GET_PIP_URL
 
+ENV DEBIAN_FRONTEND=noninteractive
+WORKDIR /vllm-workspace
+
+
+# Python version string for paths (e.g., "312" for 3.12)
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
 
-# Install Python and other dependencies
+# Install Python and system dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
@@ -408,62 +429,103 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version
 
-# Install CUDA development tools and build essentials for runtime JIT compilation
+# Install CUDA development tools for runtime JIT compilation
 # (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
 RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
     apt-get update -y && \
     apt-get install -y --no-install-recommends \
-    cuda-nvcc-${CUDA_VERSION_DASH} \
-    cuda-cudart-${CUDA_VERSION_DASH} \
-    cuda-nvrtc-${CUDA_VERSION_DASH} \
-    cuda-cuobjdump-${CUDA_VERSION_DASH} \
-    # https://github.com/vllm-project/vllm/issues/29590
-    libcurand-dev-${CUDA_VERSION_DASH} \
-    libcublas-${CUDA_VERSION_DASH} \
-    # Fixes nccl_allocator requiring nccl.h at runtime
-    # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
-    libnccl-dev && \
+        cuda-nvcc-${CUDA_VERSION_DASH} \
+        cuda-cudart-${CUDA_VERSION_DASH} \
+        cuda-nvrtc-${CUDA_VERSION_DASH} \
+        cuda-cuobjdump-${CUDA_VERSION_DASH} \
+        libcurand-dev-${CUDA_VERSION_DASH} \
+        libcublas-${CUDA_VERSION_DASH} \
+        # Fixes nccl_allocator requiring nccl.h at runtime
+        # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
+        libnccl-dev && \
     rm -rf /var/lib/apt/lists/*
 
-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-ARG PYTORCH_CUDA_INDEX_BASE_URL
-ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
-
 # Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
+RUN python3 -m pip install uv
 
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
+# Environment for uv
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
 
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
+# Workaround for triton/pytorch issues
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
-# Install vllm wheel first, so that torch etc will be installed.
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
-        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+# ============================================================
+# SLOW-CHANGING DEPENDENCIES BELOW
+# These are the expensive layers that we want to cache
+# ============================================================
+
+# Install PyTorch and core CUDA dependencies
+# This is ~2GB and rarely changes
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+COPY requirements/common.txt /tmp/common.txt
+COPY requirements/cuda.txt /tmp/requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r /tmp/requirements-cuda.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
+    rm /tmp/requirements-cuda.txt /tmp/common.txt
 
 # Install FlashInfer pre-compiled kernel cache and binaries
+# This is ~1.1GB and only changes when FlashInfer version bumps
 # https://docs.flashinfer.ai/installation.html
+ARG FLASHINFER_VERSION=0.5.3
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.3 \
-    && uv pip install --system flashinfer-jit-cache==0.5.3 \
+    uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
+    && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config
 
-COPY examples examples
-COPY benchmarks benchmarks
-COPY ./vllm/collect_env.py .
+# ============================================================
+# OPENAI API SERVER DEPENDENCIES
+# Pre-install these to avoid reinstalling on every vLLM wheel rebuild
+# ============================================================
+
+# Install gdrcopy (saves ~6s per build)
+# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
+ARG GDRCOPY_CUDA_VERSION=12.8
+ARG GDRCOPY_OS_VERSION=Ubuntu22_04
+ARG TARGETPLATFORM
+COPY tools/install_gdrcopy.sh /tmp/install_gdrcopy.sh
+RUN set -eux; \
+    case "${TARGETPLATFORM}" in \
+      linux/arm64) UUARCH="aarch64" ;; \
+      linux/amd64) UUARCH="x64" ;; \
+      *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
+    esac; \
+    /tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}" && \
+    rm /tmp/install_gdrcopy.sh
+
+# Install vllm-openai dependencies (saves ~2.6s per build)
+# These are stable packages that don't depend on vLLM itself
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        BITSANDBYTES_VERSION="0.42.0"; \
+    else \
+        BITSANDBYTES_VERSION="0.46.1"; \
+    fi; \
+    uv pip install --system accelerate hf_transfer modelscope \
+        "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
+
+# ============================================================
+# VLLM INSTALLATION (depends on build stage)
+# ============================================================
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
+
+# Install vllm wheel first, so that torch etc will be installed.
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system dist/*.whl --verbose \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
@@ -478,7 +540,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
               echo "No DeepGEMM wheels to install; skipping."; \
            fi'
 
-# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
+# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
 ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 
 # Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
@@ -487,23 +549,17 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm
     uv pip install --system ep_kernels/dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
-    set -eux; \
-    case "${TARGETPLATFORM}" in \
-      linux/arm64) UUARCH="aarch64" ;; \
-      linux/amd64) UUARCH="x64" ;; \
-      *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
-    esac; \
-    /tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
-
 # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
 # return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
 # consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
 # Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
 
+# Copy examples and benchmarks at the end to minimize cache invalidation
+COPY examples examples
+COPY benchmarks benchmarks
+COPY ./vllm/collect_env.py .
 #################### vLLM installation IMAGE ####################
-
 #################### TEST IMAGE ####################
 # image to run unit testing suite
 # note that this uses vllm installed by `pip`
@@ -561,6 +617,7 @@ RUN mv vllm src/vllm
 FROM vllm-base AS vllm-openai-base
 ARG TARGETPLATFORM
 ARG INSTALL_KV_CONNECTORS=false
+ARG CUDA_VERSION
 
 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@@ -569,18 +626,32 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 
-# install additional dependencies for openai api server
+# install kv_connectors if requested
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
+    CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \
+    CUDA_HOME=/usr/local/cuda; \
+    # lmcache requires explicit specifying CUDA_HOME
+    BUILD_PKGS="libcusparse-dev-${CUDA_VERSION_DASH} \
+                libcublas-dev-${CUDA_VERSION_DASH} \
+                libcusolver-dev-${CUDA_VERSION_DASH}"; \
     if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
-        uv pip install --system -r /tmp/kv_connectors.txt; \
-    fi; \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        BITSANDBYTES_VERSION="0.42.0"; \
-    else \
-        BITSANDBYTES_VERSION="0.46.1"; \
-    fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
+        if [ "$CUDA_MAJOR" -ge 13 ]; then \
+            uv pip install --system nixl-cu13; \
+        fi; \
+        uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \
+            # if the above fails, install from source
+            apt-get update -y && \
+            apt-get install -y --no-install-recommends ${BUILD_PKGS} && \
+            uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \
+            apt-get purge -y ${BUILD_PKGS} && \
+            # clean up -dev packages, keep runtime libraries
+            rm -rf /var/lib/apt/lists/* \
+        ); \
+    fi
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 8d55ecfba3e52eabccc24e2d0dc9ea0b0687b7c0..2caf1ad144178f0872ddc0e99c3d60fefcbcb545 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -17,7 +17,7 @@
 #   VLLM_CPU_DISABLE_AVX512=false (default)|true
 #   VLLM_CPU_AVX512BF16=false (default)|true
 #   VLLM_CPU_AVX512VNNI=false (default)|true
-#   VLLM_CPU_AMXBF16=false (default)|true
+#   VLLM_CPU_AMXBF16=false |true (default)
 #
 
 ######################### COMMON BASE IMAGE #########################
@@ -95,7 +95,7 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
 ARG VLLM_CPU_AVX512VNNI=0
 ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
 # Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
-ARG VLLM_CPU_AMXBF16=0
+ARG VLLM_CPU_AMXBF16=1
 ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
 
 WORKDIR /workspace/vllm
@@ -147,7 +147,9 @@ WORKDIR /workspace/vllm
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl xz-utils
+    apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14
+
+RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le
index b16bea3607d2f62a2fcd3154277aeeaf1b2ffce4..07b64a509a4b43efd66541ab2e393127dd496792 100644
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@@ -22,13 +22,13 @@ RUN microdnf install -y dnf && dnf install -y gcc-toolset-14 make wget unzip \
 ###############################################################
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder
 RUN  microdnf install -y dnf && \ 
-     dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
-        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
+     dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-26.el9.noarch.rpm \
+        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-26.el9.noarch.rpm \
         https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
         dnf config-manager --set-enabled crb
 
-RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel && \
-    dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-24.el9.noarch 
+RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel yajl-devel && \
+    dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-26.el9.noarch 
 
 
 ###############################################################
@@ -346,4 +346,4 @@ WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
-ENTRYPOINT ["vllm", "serve"]
\ No newline at end of file
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 1b6bdabc7a539c28b8bd0819a2bd40edd53e9e0b..2744117af9519dbd551028de2843fe5c9d66b595 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
 ARG BASE_IMAGE=rocm/vllm-dev:base
 
+# Sccache configuration (only used in release pipeline)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
+ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
+
 FROM ${BASE_IMAGE} AS base
 
 ARG ARG_PYTORCH_ROCM_ARCH
@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
 RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl
-# Remove sccache
 RUN python3 -m pip install --upgrade pip
-RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
+ARG USE_SCCACHE
+RUN if [ "$USE_SCCACHE" != "1" ]; then \
+        apt-get purge -y sccache || true; \
+        python3 -m pip uninstall -y sccache || true; \
+        rm -f "$(which sccache)" || true; \
+    fi
 
 # Install UV
 RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
 
+# Install sccache if USE_SCCACHE is enabled (for release builds)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME
+ARG SCCACHE_REGION_NAME
+ARG SCCACHE_S3_NO_CREDENTIALS
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        if command -v sccache >/dev/null 2>&1; then \
+            echo "sccache already installed, skipping installation"; \
+            sccache --version; \
+        else \
+            echo "Installing sccache..." \
+            && SCCACHE_ARCH="x86_64" \
+            && SCCACHE_VERSION="v0.8.1" \
+            && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
+            && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
+            && tar -xzf /tmp/sccache.tar.gz -C /tmp \
+            && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+            && chmod +x /usr/bin/sccache \
+            && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
+            && sccache --version; \
+        fi; \
+    fi
+
+# Set sccache environment variables only when USE_SCCACHE=1
+# This prevents S3 config from leaking into images when sccache is not used
+ARG USE_SCCACHE
+ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
+ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
+ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
+ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
+
 ARG COMMON_WORKDIR
 WORKDIR ${COMMON_WORKDIR}
 
@@ -39,6 +85,8 @@ ONBUILD COPY ./ vllm/
 FROM base AS fetch_vllm_1
 ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
 ARG VLLM_BRANCH="main"
+ENV VLLM_REPO=${VLLM_REPO}
+ENV VLLM_BRANCH=${VLLM_BRANCH}
 ONBUILD RUN git clone ${VLLM_REPO} \
 	    && cd vllm \
 	    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
@@ -51,7 +99,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 # -----------------------
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
-# Build vLLM
+# Build vLLM (setup.py auto-detects sccache in PATH)
 RUN cd vllm \
     && python3 -m pip install -r requirements/rocm.txt \
     && python3 setup.py clean --all  \
@@ -67,6 +115,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
 
+# RIXL/UCX build stages
+FROM base AS build_rixl
+ARG RIXL_BRANCH="f33a5599"
+ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
+ARG UCX_BRANCH="da3fac2a"
+ARG UCX_REPO="https://github.com/ROCm/ucx.git"
+ENV ROCM_PATH=/opt/rocm
+ENV UCX_HOME=/usr/local/ucx
+ENV RIXL_HOME=/usr/local/rixl
+ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
+
+# RIXL build system dependences and RDMA support
+RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
+    libgrpc-dev \
+    libgrpc++-dev \
+    libprotobuf-dev \
+    protobuf-compiler-grpc \
+    libcpprest-dev \
+    libaio-dev \
+    librdmacm1 \
+    librdmacm-dev \
+    libibverbs1 \
+    libibverbs-dev \
+    ibverbs-utils \
+    rdmacm-utils \
+    ibverbs-providers \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN uv pip install --system meson auditwheel patchelf tomlkit
+
+RUN cd /usr/local/src && \
+    git clone ${UCX_REPO} &&  \
+    cd ucx  && \
+    git checkout ${UCX_BRANCH} && \
+    ./autogen.sh && \
+    mkdir build && cd build && \
+    ../configure \
+        --prefix=/usr/local/ucx \
+        --enable-shared \
+        --disable-static \
+        --disable-doxygen-doc \
+        --enable-optimizations \
+        --enable-devel-headers \
+        --with-rocm=/opt/rocm \
+        --with-verbs \
+        --with-dm \
+        --enable-mt && \
+    make -j && \
+    make install
+
+ENV PATH=/usr/local/ucx/bin:$PATH
+ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
+
+RUN git clone ${RIXL_REPO} /opt/rixl && \
+    cd /opt/rixl && \
+    git checkout ${RIXL_BRANCH} && \
+    meson setup build --prefix=${RIXL_HOME} \
+                     -Ducx_path=${UCX_HOME} \
+                     -Drocm_path=${ROCM_PATH} && \
+    cd build && \
+    ninja && \
+    ninja install
+
+# Generate RIXL wheel
+RUN cd /opt/rixl && mkdir -p /app/install && \
+    ./contrib/build-wheel.sh \
+        --output-dir /app/install \
+        --rocm-dir ${ROCM_PATH} \
+        --ucx-plugins-dir ${UCX_HOME}/lib/ucx \
+        --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
+
+
+# -----------------------
+# vLLM wheel release build stage (for building distributable wheels)
+# This stage pins dependencies to custom ROCm wheel versions and handles version detection
+FROM fetch_vllm AS build_vllm_wheel_release
+
+ARG COMMON_WORKDIR
+
+# Create /install directory for custom wheels
+RUN mkdir -p /install
+
+# Copy custom ROCm wheels from docker/context if they exist
+# COPY ensures Docker cache is invalidated when wheels change
+# .keep file ensures directory always exists for COPY to work
+COPY docker/context/base-wheels/ /tmp/base-wheels/
+# This is how we know if we are building for a wheel release or not.
+# If there are not wheels found there, we are not building for a wheel release. 
+# So we exit with an error. To skip this stage.
+RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
+        echo "Found custom wheels - copying to /install"; \
+        cp /tmp/base-wheels/*.whl /install/ && \
+        echo "Copied custom wheels:"; \
+        ls -lh /install/; \
+    else \
+        echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
+        echo "Wheel releases require pre-built ROCm wheels."; \
+        exit 1; \
+    fi
+
+# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
+# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
+ARG GIT_REPO_CHECK=0
+RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
+        echo "Running repository checks..."; \
+        cd vllm && bash tools/check_repo.sh; \
+    fi
+
+# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
+# This ensures setuptools_scm sees clean repo state for version detection
+RUN --mount=type=bind,source=.git,target=vllm/.git \
+    cd vllm \
+    && pip install setuptools_scm \
+    && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
+    && echo "Detected vLLM version: ${VLLM_VERSION}" \
+    && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
+
+# Fail if git-based package dependencies are found in requirements files
+# (uv doesn't handle git+ URLs well, and packages should be distributed on PyPI)
+# Extra notes: pip install is able to handle git+ URLs, but uv doesn't.
+RUN echo "Checking for git-based packages in requirements files..." \
+    && echo "Checking common.txt for git-based packages:" \
+    && if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \
+         echo "ERROR: Git-based packages found in common.txt:"; \
+         grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \
+         echo "Please publish these packages to PyPI instead of using git dependencies."; \
+         exit 1; \
+       else \
+         echo "  ✓ No git-based packages found in common.txt"; \
+       fi \
+    && echo "Checking rocm.txt for git-based packages:" \
+    && if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \
+         echo "ERROR: Git-based packages found in rocm.txt:"; \
+         grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \
+         echo "Please publish these packages to PyPI instead of using git dependencies."; \
+         exit 1; \
+       else \
+         echo "  ✓ No git-based packages found in rocm.txt"; \
+       fi \
+    && echo "All requirements files are clean - no git-based packages found"
+
+# Pin vLLM dependencies to exact versions of custom ROCm wheels
+# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
+COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
+RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
+    && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
+
+# Install dependencies using custom wheels from /install
+RUN cd vllm \
+    && echo "Building vLLM with custom wheels from /install" \
+    && python3 -m pip install --find-links /install -r requirements/rocm.txt \
+    && python3 setup.py clean --all
+
+# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
+# (setup.py auto-detects sccache in PATH)
+RUN --mount=type=bind,source=.git,target=vllm/.git \
+    cd vllm \
+    && export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
+    && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+
+FROM scratch AS export_vllm_wheel_release
+ARG COMMON_WORKDIR
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
+
 # -----------------------
 # Test vLLM image
 FROM base AS test
@@ -83,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     && pip uninstall -y vllm \
     && uv pip install --system *.whl
 
+# Install RIXL wheel
+RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
+    uv pip install --system /rixl_install/*.whl
+
 WORKDIR /vllm-workspace
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
@@ -97,6 +321,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
+# install audio decode package `torchcodec` from source (required due to 
+# ROCm and torch version mismatch) for tests with datasets package
+COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
+RUN bash /tmp/install_torchcodec.sh \
+    && rm /tmp/install_torchcodec.sh \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy in the v1 package (for python-only install test group)
 COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
 
@@ -130,6 +362,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     && uv pip install --system *.whl
 
 ARG COMMON_WORKDIR
+ARG BASE_IMAGE
 
 # Copy over the benchmark scripts as well
 COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
@@ -144,4 +377,13 @@ ENV SAFETENSORS_FAST_GPU=1
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1
 
+# Workaround for ROCm profiler limits
+RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
+ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
+RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
+
 CMD ["/bin/bash"]
+
+#Set entrypoint for vllm-openai official images
+FROM final As vllm-openai
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index a57ee728d924367bacdf43bb377eb3756a9b1edf..6f8c7222fdcea78f5bfbb6a3fee12a51eabaf58e 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -1,16 +1,26 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
 ARG TRITON_BRANCH="57c693b6"
 ARG TRITON_REPO="https://github.com/ROCm/triton.git"
-ARG PYTORCH_BRANCH="1c57644d"
-ARG PYTORCH_VISION_BRANCH="v0.23.0"
+ARG PYTORCH_BRANCH="89075173"
 ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
+ARG PYTORCH_VISION_BRANCH="v0.24.1"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
 ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="59bd8ff2"
+ARG AITER_BRANCH="6af8b687"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
+ARG MORI_BRANCH="2d02c6a9"
+ARG MORI_REPO="https://github.com/ROCm/mori.git"
+
+# Sccache configuration (only used in release pipeline)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
+ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
 
 FROM ${BASE_IMAGE} AS base
 
@@ -20,6 +30,7 @@ ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
 ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
 ENV AITER_ROCM_ARCH=gfx942;gfx950
+ENV MORI_GPU_ARCHS=gfx942;gfx950
 
 # Required for RCCL in ROCm7.1
 ENV HSA_NO_SCRATCH_RECLAIM=1
@@ -33,7 +44,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
+    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \
     && for i in 1 2 3; do \
         add-apt-repository -y ppa:deadsnakes/ppa && break || \
         { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
@@ -50,6 +61,53 @@ RUN apt-get update -y \
 RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
 RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
 
+# Install sccache if USE_SCCACHE is enabled (for release builds)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME
+ARG SCCACHE_REGION_NAME
+ARG SCCACHE_S3_NO_CREDENTIALS
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Installing sccache..." \
+        && SCCACHE_ARCH="x86_64" \
+        && SCCACHE_VERSION="v0.8.1" \
+        && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
+        && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
+        && tar -xzf /tmp/sccache.tar.gz -C /tmp \
+        && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+        && chmod +x /usr/bin/sccache \
+        && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
+        && sccache --version; \
+    fi
+
+# Setup sccache for HIP compilation via HIP_CLANG_PATH
+# This creates wrapper scripts in a separate directory and points HIP to use them
+# This avoids modifying the original ROCm binaries which can break detection
+# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
+# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Setting up sccache wrappers for HIP compilation..." \
+        && mkdir -p /opt/sccache-wrappers \
+        && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
+        && chmod +x /opt/sccache-wrappers/clang++ \
+        && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
+        && chmod +x /opt/sccache-wrappers/clang \
+        && echo "sccache wrappers created in /opt/sccache-wrappers"; \
+    fi
+
+# Set sccache environment variables only when USE_SCCACHE=1
+# This prevents S3 config from leaking into images when sccache is not used
+ARG USE_SCCACHE
+ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
+ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
+ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
+ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
+
+
+###
+### Triton Build
+###
 FROM base AS build_triton
 ARG TRITON_BRANCH
 ARG TRITON_REPO
@@ -62,11 +120,19 @@ RUN cd triton \
 RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \
     && python3 -m build --wheel && cp dist/*.whl /app/install; fi
 
+
+###
+### AMD SMI Build
+###
 FROM base AS build_amdsmi
 RUN cd /opt/rocm/share/amd_smi \
     && pip wheel . --wheel-dir=dist
 RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
 
+
+###
+### Pytorch build
+###
 FROM base AS build_pytorch
 ARG PYTORCH_BRANCH
 ARG PYTORCH_VISION_BRANCH
@@ -74,42 +140,93 @@ ARG PYTORCH_AUDIO_BRANCH
 ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG PYTORCH_AUDIO_REPO
+ARG USE_SCCACHE
 
 RUN git clone ${PYTORCH_REPO} pytorch
 RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
     && pip install -r requirements.txt && git submodule update --init --recursive \
     && python3 tools/amd_build/build_amd.py \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
+           && sccache --show-stats; \
+       fi \
     && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
     && pip install dist/*.whl
 RUN git clone ${PYTORCH_VISION_REPO} vision
 RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
+       fi \
     && python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
     && pip install dist/*.whl
 RUN git clone ${PYTORCH_AUDIO_REPO} audio
 RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
     && git submodule update --init --recursive \
     && pip install -r requirements.txt \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
+       fi \
     && python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
     && pip install dist/*.whl
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
     && cp /app/vision/dist/*.whl /app/install \
     && cp /app/audio/dist/*.whl /app/install
 
+
+###
+### MORI Build
+###
+FROM base AS build_mori
+ARG MORI_BRANCH
+ARG MORI_REPO
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    pip install /install/*.whl
+RUN git clone ${MORI_REPO}
+RUN cd mori \
+    && git checkout ${MORI_BRANCH} \
+    && git submodule update --init --recursive \
+    && python3 setup.py bdist_wheel --dist-dir=dist && ls /app/mori/dist/*.whl
+RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
+
+
+###
+### FlashAttention Build
+###
 FROM base AS build_fa
 ARG FA_BRANCH
 ARG FA_REPO
+ARG USE_SCCACHE
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
     pip install /install/*.whl
 RUN git clone ${FA_REPO}
 RUN cd flash-attention \
     && git checkout ${FA_BRANCH} \
     && git submodule update --init \
-    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && sccache --show-stats; \
+       fi \
+    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
 RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
 
+
+###
+### AITER Build
+###
 FROM base AS build_aiter
 ARG AITER_BRANCH
 ARG AITER_REPO
+ARG USE_SCCACHE
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
     pip install /install/*.whl
 RUN git clone --recursive ${AITER_REPO}
@@ -117,9 +234,37 @@ RUN cd aiter \
     && git checkout ${AITER_BRANCH} \
     && git submodule update --init --recursive \
     && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
+RUN pip install pyyaml && cd aiter \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && sccache --show-stats; \
+       fi \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
+    && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
 
+
+###
+### Final Build
+###
+
+# Wheel release stage - 
+# only includes dependencies used by wheel release pipeline
+FROM base AS debs_wheel_release
+RUN mkdir /app/debs
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+
+# Full debs stage - includes Mori (used by Docker releases)
 FROM base AS debs
 RUN mkdir /app/debs
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
@@ -132,6 +277,8 @@ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
     cp /install/*.whl /app/debs
 RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
     cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
 
 FROM base AS final
 RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
@@ -150,6 +297,8 @@ ARG FA_BRANCH
 ARG FA_REPO
 ARG AITER_BRANCH
 ARG AITER_REPO
+ARG MORI_BRANCH
+ARG MORI_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
     && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
@@ -162,4 +311,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
     && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
     && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
\ No newline at end of file
+    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
+    && echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
+    && echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 72d2053102c22426316df924f1bf15838b18f0ac..f63ce2c5037fbe4afb55ea9b53c7149b80be8b18 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
-    add-apt-repository -y ppa:kobuk-team/intel-graphics
+    add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
 
 RUN apt clean && apt-get update -y && \
     apt-get install -y --no-install-recommends --fix-missing \
@@ -28,10 +28,14 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
 RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
 
 # This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
-RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh
-RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \
+ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.6_offline.sh"
+RUN wget "https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/${ONECCL_INSTALLER}" && \
+    bash "${ONECCL_INSTALLER}" -a --silent --eula accept && \
+    rm "${ONECCL_INSTALLER}" && \
     echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
     echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
+RUN rm -f /opt/intel/oneapi/ccl/latest && \
+    ln -s /opt/intel/oneapi/ccl/2021.15 /opt/intel/oneapi/ccl/latest
 
 SHELL ["bash", "-c"]
 CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
@@ -47,6 +51,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --no-cache-dir \
     -r requirements/xpu.txt
 
+# arctic-inference is built from source which needs torch-xpu properly installed
+# used for suffix method speculative decoding
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --no-cache-dir arctic-inference==0.1.1
+
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
 
 COPY . .
diff --git a/docker/docker-bake.hcl b/docker/docker-bake.hcl
new file mode 100644
index 0000000000000000000000000000000000000000..daf0d62a683d9d04346227d3c410886283349a75
--- /dev/null
+++ b/docker/docker-bake.hcl
@@ -0,0 +1,76 @@
+# docker-bake.hcl - vLLM Docker build configuration
+#
+# This file lives in vLLM repo at docker/docker-bake.hcl
+#
+# Usage:
+#   cd docker && docker buildx bake        # Build default target (openai)
+#   cd docker && docker buildx bake test   # Build test target
+#   docker buildx bake --print             # Show resolved config
+#
+# Reference: https://docs.docker.com/build/bake/reference/
+
+# Build configuration
+
+variable "MAX_JOBS" {
+  default = 16
+}
+
+variable "NVCC_THREADS" {
+  default = 8
+}
+
+variable "TORCH_CUDA_ARCH_LIST" {
+  default = "8.0 8.9 9.0 10.0"
+}
+
+variable "COMMIT" {
+  default = ""
+}
+
+# Groups
+
+group "default" {
+  targets = ["openai"]
+}
+
+# Base targets
+
+target "_common" {
+  dockerfile = "docker/Dockerfile"
+  context    = "."
+  args = {
+    max_jobs             = MAX_JOBS
+    nvcc_threads         = NVCC_THREADS
+    torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST
+  }
+}
+
+target "_labels" {
+  labels = {
+    "org.opencontainers.image.source"      = "https://github.com/vllm-project/vllm"
+    "org.opencontainers.image.vendor"      = "vLLM"
+    "org.opencontainers.image.title"       = "vLLM"
+    "org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs"
+    "org.opencontainers.image.licenses"    = "Apache-2.0"
+    "org.opencontainers.image.revision"    = COMMIT
+  }
+  annotations = [
+      "index,manifest:org.opencontainers.image.revision=${COMMIT}",
+  ]
+}
+
+# Build targets
+
+target "test" {
+  inherits = ["_common", "_labels"]
+  target   = "test"
+  tags     = ["vllm:test"]
+  output   = ["type=docker"]
+}
+
+target "openai" {
+  inherits = ["_common", "_labels"]
+  target   = "vllm-openai"
+  tags     = ["vllm:openai"]
+  output   = ["type=docker"]
+}
diff --git a/docs/README.md b/docs/README.md
index 0c279c19f96ca7da550364899b312f475c264cf7..4b480c463abb730503d09d957f7eb3fa9f27a7e6 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -62,7 +62,7 @@ vLLM is flexible and easy to use with:
 
 For more information, check out the following:
 
-- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
+- [vLLM announcing blog post](https://blog.vllm.ai/2023/06/20/vllm.html) (intro to PagedAttention)
 - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
 - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
 - [vLLM Meetups](community/meetups.md)
diff --git a/docs/api/README.md b/docs/api/README.md
index d51329ec2faa3d360d8a7bbdc6dda3d26a2092e2..14780c803c75eb69c2ff8f0a6b16476ca930c501 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -72,7 +72,6 @@ Internal data structures.
 - [vllm.multimodal.inputs.MultiModalFieldConfig][]
 - [vllm.multimodal.inputs.MultiModalKwargsItem][]
 - [vllm.multimodal.inputs.MultiModalKwargsItems][]
-- [vllm.multimodal.inputs.MultiModalKwargs][]
 - [vllm.multimodal.inputs.MultiModalInputs][]
 
 ### Data Parsing
diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png
index 7420ca4d89441e6dd320657092aaf3e1c0491e9c..c8839eb93de95fa5ffd6b3338b38ce270ea0e1c7 100644
Binary files a/docs/assets/contributing/dockerfile-stages-dependency.png and b/docs/assets/contributing/dockerfile-stages-dependency.png differ
diff --git a/docs/assets/deployment/claude-code-example.png b/docs/assets/deployment/claude-code-example.png
new file mode 100644
index 0000000000000000000000000000000000000000..c6f14419666bec396bb60123a92a0e8f5835abc9
Binary files /dev/null and b/docs/assets/deployment/claude-code-example.png differ
diff --git a/docs/benchmarking/dashboard.md b/docs/benchmarking/dashboard.md
index 7cc4d23250df978e17df54fe750bd1e777db6b16..701fb16ae2cf10b0e43883e4131406d1fd0a7ca5 100644
--- a/docs/benchmarking/dashboard.md
+++ b/docs/benchmarking/dashboard.md
@@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar
 ## Manually Trigger the benchmark
 
 Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
-For CPU environment, please use the image with "-cpu" postfix.
+For x86 CPU environment, please use the image with "-cpu" postfix. For AArch64 CPU environment, please use the image with "-arm64-cpu" postfix.
 
-Here is an example for docker run command for CPU.
+Here is an example for docker run command for CPU. For GPUs skip setting the `ON_CPU` env var.
 
 ```bash
-docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface  -e HF_TOKEN=''  --shm-size=16g --name vllm-cpu-ci  public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
+export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch
+export HF_TOKEN=<valid Hugging Face token>
+if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then
+  IMG_SUFFIX="arm64-cpu"
+else
+  IMG_SUFFIX="cpu"
+fi
+docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_ARM64_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX}
 ```
 
 Then, run below command inside the docker instance.
@@ -26,14 +33,65 @@ When run, benchmark script generates results under **benchmark/results** folder,
 
 ### Runtime environment variables
 
-- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
 
-For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
+### Visualization
+
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table with real benchmarking results.
+You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
+If you do not see the table, please wait till the benchmark finish running.
+The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
+
+#### Performance Results Comparison
+
+The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
+When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
+If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
+
+Here is an example using the script to compare result_a and result_b with max concurrency and qps for same Model, Dataset name, input/output length.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
+
+***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
+
+|    | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
+|----|------|-----|-----------|----------|----------|
+| 0  | 12 | inf | 24.98   | 186.03 |  7.45 |
+| 1  | 16 | inf|  25.49  | 246.92 | 9.69 |
+| 2  | 24 | inf| 27.74  | 293.34 |  10.57 |
+| 3  | 32 | inf| 28.61  |306.69 | 10.72 |
+
+***compare-json-results.py – Command-Line Parameters***  
+
+compare-json-results.py provides configurable parameters to compare one or more benchmark_results.json files and generate summary tables and plots.  
+In most cases, users only need to specify --file to parse the desired benchmark results.
+
+| Parameter              | Type               | Default Value           | Description                                                                                           |
+| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- |
+| `--file`               | `str` (appendable) | *None*                  | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs.     |
+| `--debug`              | `bool`             | `False`                 | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. |
+| `--plot` / `--no-plot` | `bool`             | `True`                  | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation.        |
+| `--xaxis`              | `str`              | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size).          |
+| `--latency`            | `str`              | `p99`                   | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`.                   |
+| `--ttft-max-ms`        | `float`            | `3000.0`                | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds.      |
+| `--tpot-max-ms`        | `float`            | `100.0`                 | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds.      |
+
+***Valid Max Concurrency Summary***  
+
+Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result.  
+The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously.  
+This value is typically used in capacity planning and sizing guides.  
+
+| # | Configuration  | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. (Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) |
+| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- |
+| 0 | results-a      | 128.00                                      | 12.00                                     | 12.00                            | 127.76                     | 3000.82          | 93.24            |
+| 1 | results-b      | 128.00                                      | 32.00                                     | 32.00                            | 371.42                     | 2261.53          | 81.74            |
 
 More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
 
diff --git a/docs/benchmarking/sweeps.md b/docs/benchmarking/sweeps.md
index ee4d40d876deaa2a52a25d4da6cc639c3e321ab1..93b9f4d6273a7f182d35ac09e3a032efd8eab0e5 100644
--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -129,10 +129,10 @@ vllm bench sweep serve_sla \
 
 The algorithm for adjusting the SLA variable is as follows:
 
-1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
-    - For example, the initial request rate is set to the concurrency under infinite QPS.
-2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
-3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
+1. Run the benchmark once with maximum possible QPS, and once with minimum possible QPS. For each run, calculate the distance of the SLA metrics from their targets, resulting in data points of QPS vs SLA distance.
+2. Perform spline interpolation between the data points to estimate the QPS that results in zero SLA distance.
+3. Run the benchmark with the estimated QPS and add the resulting data point to the history.
+4. Repeat Steps 2 and 3 until the maximum QPS that passes SLA and the minimum QPS that fails SLA in the history are close enough to each other.
 
 !!! important
     SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
diff --git a/docs/cli/bench/latency.md b/docs/cli/bench/latency.md
index ea7ea7321ffcdb8acc6c2a6ee97385123a4280bc..9e1b905339757d48bcb51f37ecf5b75ea2bd4719 100644
--- a/docs/cli/bench/latency.md
+++ b/docs/cli/bench/latency.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/bench_latency.inc.md"
+--8<-- "docs/generated/argparse/bench_latency.inc.md"
diff --git a/docs/cli/bench/mm_processor.md b/docs/cli/bench/mm_processor.md
new file mode 100644
index 0000000000000000000000000000000000000000..af2c3a8cfd36b07bde4f21b34983b7ce8c0e8243
--- /dev/null
+++ b/docs/cli/bench/mm_processor.md
@@ -0,0 +1,9 @@
+# vllm bench mm-processor
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Arguments
+
+--8<-- "docs/generated/argparse/bench_mm_processor.inc.md"
diff --git a/docs/cli/bench/serve.md b/docs/cli/bench/serve.md
index f7dc8036cc262dcd3308c62980806d68e055bf24..792c6e094b35102cad0ba82555b4320e6d879ad8 100644
--- a/docs/cli/bench/serve.md
+++ b/docs/cli/bench/serve.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/bench_serve.inc.md"
+--8<-- "docs/generated/argparse/bench_serve.inc.md"
diff --git a/docs/cli/bench/sweep/plot.md b/docs/cli/bench/sweep/plot.md
index a101330e093cc4dfd94173b6bb2382c6792542ca..d7dc65e6df62c7c573a0e4ba729250831704d378 100644
--- a/docs/cli/bench/sweep/plot.md
+++ b/docs/cli/bench/sweep/plot.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/bench_sweep_plot.inc.md"
+--8<-- "docs/generated/argparse/bench_sweep_plot.inc.md"
diff --git a/docs/cli/bench/sweep/plot_pareto.md b/docs/cli/bench/sweep/plot_pareto.md
index f5dc257ce6772f4b6052576e8227526b0bd613ab..13dffd7f2b5c423808dbf702414e23c507bfd99d 100644
--- a/docs/cli/bench/sweep/plot_pareto.md
+++ b/docs/cli/bench/sweep/plot_pareto.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/bench_sweep_plot_pareto.inc.md"
+--8<-- "docs/generated/argparse/bench_sweep_plot_pareto.inc.md"
diff --git a/docs/cli/bench/sweep/serve.md b/docs/cli/bench/sweep/serve.md
index f0468f06fc287014c415b0c1bb3071ae767a6d01..6a8182feb40614359d4d0b88be6f913b47d8d275 100644
--- a/docs/cli/bench/sweep/serve.md
+++ b/docs/cli/bench/sweep/serve.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/bench_sweep_serve.inc.md"
+--8<-- "docs/generated/argparse/bench_sweep_serve.inc.md"
diff --git a/docs/cli/bench/sweep/serve_sla.md b/docs/cli/bench/sweep/serve_sla.md
index 5642ec67eb0077fe1861cc7dbaf546b9331d94e4..688d64f0bc24d0d1f8f607ff090368becc4caf1e 100644
--- a/docs/cli/bench/sweep/serve_sla.md
+++ b/docs/cli/bench/sweep/serve_sla.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/bench_sweep_serve_sla.inc.md"
+--8<-- "docs/generated/argparse/bench_sweep_serve_sla.inc.md"
diff --git a/docs/cli/bench/throughput.md b/docs/cli/bench/throughput.md
index e7f618fb4d14797042b241a77a65d092f1b383c1..66434c87819f1cc1cc362e62bbc3285d4eca27bb 100644
--- a/docs/cli/bench/throughput.md
+++ b/docs/cli/bench/throughput.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/bench_throughput.inc.md"
+--8<-- "docs/generated/argparse/bench_throughput.inc.md"
diff --git a/docs/cli/chat.md b/docs/cli/chat.md
index 0246bd431b10170ca3d3c75f8bff787bbc1f670a..7b8e718f625fe40f613855ae94728e1b91298cc7 100644
--- a/docs/cli/chat.md
+++ b/docs/cli/chat.md
@@ -2,4 +2,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/chat.inc.md"
+--8<-- "docs/generated/argparse/chat.inc.md"
diff --git a/docs/cli/complete.md b/docs/cli/complete.md
index eb2ffdaabac25fc97c3dbea0d6696b8b88674b8b..65d953a7c046a070f18dfc970b1c0fc9a1eb20c9 100644
--- a/docs/cli/complete.md
+++ b/docs/cli/complete.md
@@ -2,4 +2,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/complete.inc.md"
+--8<-- "docs/generated/argparse/complete.inc.md"
diff --git a/docs/cli/run-batch.md b/docs/cli/run-batch.md
index 758fbda283978596cce5eb75c7406115bc92af84..f2255e66373d0a81373847e0daa0e91246f097a6 100644
--- a/docs/cli/run-batch.md
+++ b/docs/cli/run-batch.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/run-batch.inc.md"
+--8<-- "docs/generated/argparse/run-batch.inc.md"
diff --git a/docs/cli/serve.md b/docs/cli/serve.md
index 35652fec587b3c4e2756cf6791e79a9fa16cd9f7..0326fe29ec7f0e3e1b028a543e0a368cb4ee8a52 100644
--- a/docs/cli/serve.md
+++ b/docs/cli/serve.md
@@ -6,4 +6,4 @@
 
 ## Arguments
 
---8<-- "docs/argparse/serve.inc.md"
+--8<-- "docs/generated/argparse/serve.inc.md"
diff --git a/docs/community/meetups.md b/docs/community/meetups.md
index d8cf4ecdd5a320406af43a9930b6e02ddcf15c97..43eb5cb246fc812d1d6fcb9f044bc7f17343b953 100644
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@@ -2,45 +2,4 @@
 
 We host regular meetups around the world. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights.
 
-## Upcoming Meetups
-
-Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_project), join our [Slack](https://slack.vllm.ai), and follow vLLM on [Luma](https://luma.com/vLLM-Meetups) to get notified about new events.
-
-## Past Meetups
-
-Below you'll find slides and recordings from our previous meetups:
-
-- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing)
-- [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
-- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
-- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)
-- [vLLM Toronto Meetup](https://luma.com/e80e0ymm), September 25th 2025. [[Slides]](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing)
-- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
-- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
-- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
-- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
-- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152).
-- [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
-- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
-- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
-- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
-- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
-- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
-- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
-- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
-- [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing)
-- [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing)
-- [The fourth vLLM meetup](https://lu.ma/agivllm), with Cloudflare and BentoML, June 11th 2024. [[Slides]](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing)
-- [The third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/), with Roblox, April 2nd 2024. [[Slides]](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing)
-- [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. [[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg)
-- [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing)
-
-## Get Involved
-
-**Want to host or speak at a vLLM meetup?** We're always looking for speakers and sponsors for our meetups. Whether you want to:
-
-- Share your vLLM feature, use case, project extension, or deployment experience
-- Host a meetup in your city
-- Sponsor an event
-
-Please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).
+Please visit [vllm.ai/events](https://vllm.ai/events) to learn more.
diff --git a/docs/community/sponsors.md b/docs/community/sponsors.md
index 847b99cce45c9d68c2ea24c2ddccddb64526966e..b645eaed0cd96d3b3b381d54d1873b622e337bc4 100644
--- a/docs/community/sponsors.md
+++ b/docs/community/sponsors.md
@@ -2,43 +2,4 @@
 
 vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
 
-<!-- Note: Please sort them in alphabetical order. -->
-<!-- Note: Please keep these consistent with README.md. -->
-
-Cash Donations:
-
-- a16z
-- Dropbox
-- Sequoia Capital
-- Skywork AI
-- ZhenFund
-
-Compute Resources:
-
-- Alibaba Cloud
-- AMD
-- Anyscale
-- Arm
-- AWS
-- Crusoe Cloud
-- Databricks
-- DeepInfra
-- Google Cloud
-- IBM
-- Intel
-- Lambda Lab
-- Nebius
-- Novita AI
-- NVIDIA
-- Red Hat
-- Replicate
-- Roblox
-- RunPod
-- Trainy
-- UC Berkeley
-- UC San Diego
-- Volcengine
-
-Slack Sponsor: Anyscale
-
-We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
+Please visit [vllm.ai/#sponsors](https://vllm.ai/#sponsors) to learn more.
diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md
index 05d4f762306a37d397e23db94d97a0e7691dc64e..14589478821f920d7a96ae5000b496ef8945b960 100644
--- a/docs/configuration/engine_args.md
+++ b/docs/configuration/engine_args.md
@@ -15,8 +15,8 @@ The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and
 
 ## `EngineArgs`
 
---8<-- "docs/argparse/engine_args.md"
+--8<-- "docs/generated/argparse/engine_args.inc.md"
 
 ## `AsyncEngineArgs`
 
---8<-- "docs/argparse/async_engine_args.md"
+--8<-- "docs/generated/argparse/async_engine_args.inc.md"
diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md
index 735bb2e2053323e15ef176fdde49b7890bb6e17a..74c0beb779c7db0d4899656ea4ccec3e3107a842 100644
--- a/docs/contributing/ci/update_pytorch_version.md
+++ b/docs/contributing/ci/update_pytorch_version.md
@@ -77,25 +77,20 @@ This complicates the process as we cannot use the out-of-the-box
     - `.buildkite/release-pipeline.yaml`
     - `.buildkite/scripts/upload-wheels.sh`
 
-## Address long vLLM build time
+## Manually running vLLM builds on BuildKiteCI
 
-When building vLLM with a new PyTorch/CUDA version, no cache will exist
-in the vLLM sccache S3 bucket, causing the build job on CI to potentially take more than 5 hours
-and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mode,
-it doesn't populate the cache, so re-running it to warm up the cache
-is ineffective.
+When building vLLM with a new PyTorch/CUDA version, the vLLM sccache S3 bucket
+will not have any cached artifacts, which can cause CI build jobs to exceed 5 hours.
+Furthermore, vLLM's fastcheck pipeline operates in read-only mode and does not
+populate the cache, making it ineffective for cache warm-up purposes.
 
-While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
-address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
-to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
-when manually triggering a build on Buildkite. This branch accomplishes two things:
+To address this, manually trigger a build on Buildkite to accomplish two objectives:
 
-1. Increase the timeout limit to 10 hours so that the build doesn't time out.
-2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket
-to warm it up so that future builds are faster.
+1. Run the complete test suite against the PyTorch RC build by setting the environment variables: `RUN_ALL=1` and `NIGHTLY=1`
+2. Populate the vLLM sccache S3 bucket with compiled artifacts, enabling faster subsequent builds
 
 <p align="center" width="100%">
-    <img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
+<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/3b07f71b-bb18-4ca3-aeaf-da0fe79d315f" />
 </p>
 
 ## Update all the different vLLM platforms
diff --git a/docs/contributing/deprecation_policy.md b/docs/contributing/deprecation_policy.md
index 904ef4ca058c008e71b88b1ab12f9a9273e84186..99b7c382da9c7cb4be7fd7c03ca5104ad62d38aa 100644
--- a/docs/contributing/deprecation_policy.md
+++ b/docs/contributing/deprecation_policy.md
@@ -46,7 +46,7 @@ warning (e.g., "This will be removed in v0.10.0").
     - GitHub Issue (RFC) for feedback
     - Documentation and use of the `@typing_extensions.deprecated` decorator for Python APIs
 
-### 2.Deprecated (Off By Default)
+### 2. Deprecated (Off By Default)
 
 - **Action**: Feature is disabled by default, but can still be re-enabled via a
 CLI flag or environment variable. Feature throws an error when used without
diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index d37501b86556fa02ccb9ec035c45d836b516bb3a..915fe1495f452364fd218e8974128999abd0a814 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -118,7 +118,7 @@ To support a model with interleaving sliding windows, we need to take care of th
 - Make sure the model's `config.json` contains `layer_types`.
 - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
 
-With these two steps, interleave sliding windows should work with the model.
+With these two steps, interleaved sliding windows should work with the model.
 
 ### How to support models that use Mamba?
 
@@ -142,7 +142,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl
 For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
 It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
 Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
-It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend.
+It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/v1/attention/backends/registry.py) when adding a new mamba backend.
 Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it.
 Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
 The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index cbce14ce992ec9758b635e697d01183a2a96b970..ce10adaf0cad294eb102a91bd0bd8061f476bab2 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -54,6 +54,29 @@ vllm bench serve \
     --num-prompts 2
 ```
 
+Or use http request:
+
+```shell
+# We need first call /start_profile api to start profile.
+$ curl -X POST http://localhost:8000/start_profile
+
+# Call model generate.
+curl -X POST http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+                "model": "meta-llama/Llama-3.1-8B-Instruct",
+                "messages": [
+                        {
+                                "role": "user",
+                                "content": "San Francisco is a"
+                        }
+                ]
+    }'
+
+# After need call /stop_profile api to stop profile.
+$ curl -X POST http://localhost:8000/stop_profile
+```
+
 ## Profile with NVIDIA Nsight Systems
 
 Nsight systems is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events.
diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md
index d70e0142e3202ceaa53c433e2c0da00b3c88360a..ae7cea4364b401b877b9923054b007a4749b5a55 100644
--- a/docs/deployment/docker.md
+++ b/docs/deployment/docker.md
@@ -80,6 +80,15 @@ DOCKER_BUILDKIT=1 docker build . \
     If you are using Podman instead of Docker, you might need to disable SELinux labeling by
     adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
 
+!!! note
+    If you have not changed any C++ or CUDA kernel code, you can use precompiled wheels to significantly reduce Docker build time.
+
+    *   **Enable the feature** by adding the build argument: `--build-arg VLLM_USE_PRECOMPILED="1"`.
+    *   **How it works**: By default, vLLM automatically finds the correct wheels from our [Nightly Builds](../contributing/ci/nightly_builds.md) by using the merge-base commit with the upstream `main` branch.
+    *   **Override commit**: To use wheels from a specific commit, provide the `--build-arg VLLM_PRECOMPILED_WHEEL_COMMIT=<commit_hash>` argument.
+
+    For a detailed explanation, refer to the documentation on 'Set up using Python-only build (without compilation)' part in [Build wheel from source](../contributing/ci/nightly_builds.md#precompiled-wheels-usage), these args are similar.
+
 ## Building for Arm64/aarch64
 
 A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.
diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md
index 960347d9525c45ad08d84906689352fe7a364b5b..1b7c5d5a921380d3e75bb5c1cd1ce261c40aec17 100644
--- a/docs/deployment/frameworks/cerebrium.md
+++ b/docs/deployment/frameworks/cerebrium.md
@@ -59,7 +59,7 @@ Then, run the following code to deploy it to the cloud:
 cerebrium deploy
 ```
 
-If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case`/run`)
+If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case `/run`)
 
 ??? console "Command"
 
diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md
index 05df0dacd8f11600e3f98f418f86e875f7d9b0f3..6217dc062d21a1f28f6103359bae215b5c830153 100644
--- a/docs/deployment/frameworks/hf_inference_endpoints.md
+++ b/docs/deployment/frameworks/hf_inference_endpoints.md
@@ -70,7 +70,7 @@ This method applies to models with the [`transformers` library tag](https://hugg
 
     ![Locate deploy button](../../assets/deployment/hf-inference-endpoints-locate-deploy-button.png)
 
-3. Click to **Deploy** button > **HF Inference Endpoints**. You will be taken to the Inference Endpoints interface to configure the deployment.
+3. Click the **Deploy** button > **HF Inference Endpoints**. You will be taken to the Inference Endpoints interface to configure the deployment.
 
     ![Click deploy button](../../assets/deployment/hf-inference-endpoints-click-deploy-button.png)
 
diff --git a/docs/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md
index 37b29aa1a487659d88158224837942e25f905a21..06ad5f29a1a65dc03084e9c5c0c2ff3858147cc8 100644
--- a/docs/deployment/integrations/kserve.md
+++ b/docs/deployment/integrations/kserve.md
@@ -2,4 +2,4 @@
 
 vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
 
-Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
+You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
diff --git a/docs/deployment/integrations/llm-d.md b/docs/deployment/integrations/llm-d.md
new file mode 100644
index 0000000000000000000000000000000000000000..cccf1773c6be676c7da46dc6e39a3389a689b356
--- /dev/null
+++ b/docs/deployment/integrations/llm-d.md
@@ -0,0 +1,5 @@
+# llm-d
+
+vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
+
+You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
diff --git a/docs/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md
index 624e98a08c98db9643b34befcd61aa6d5f2f87a5..4db595164e3de0338120d7a6e83fd088cd4a317f 100644
--- a/docs/deployment/integrations/production-stack.md
+++ b/docs/deployment/integrations/production-stack.md
@@ -10,7 +10,7 @@ If you are new to Kubernetes, don't worry: in the vLLM production stack [repo](h
 
 ## Pre-requisite
 
-Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-medal GPU machine).
+Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-metal GPU machine).
 
 ## Deployment using vLLM production stack
 
diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md
index 05814cbad9bfcb556ea93f001681efba81e632a7..77a159009aa8d5407bc7f672989937c65aa422fa 100644
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 
 - [Helm](frameworks/helm.md)
 - [InftyAI/llmaz](integrations/llmaz.md)
+- [llm-d](integrations/llm-d.md)
 - [KAITO](integrations/kaito.md)
 - [KServe](integrations/kserve.md)
 - [Kthena](integrations/kthena.md)
diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md
index 19c02fc88641ccca0b64e009c694ac4938ffe7a6..af9e5b5ba6f9b666fa3f1d183b5d6fb83b098ec2 100644
--- a/docs/design/cuda_graphs.md
+++ b/docs/design/cuda_graphs.md
@@ -149,7 +149,7 @@ The CUDA Graphs wrapper no longer manages the warm-up logic. The warm-up process
 
 ## CUDA Graphs Compatibility of Attention Backends
 
-To signal the CUDA Graphs compatibility of the attention backends, we introduce a new enum type [AttentionCGSupport][vllm.v1.attention.backends.utils.AttentionCGSupport], which is an enum type that tracks the capability of the attention backend to support CUDA Graphs. The value is sorted in the order of the capability, i.e., `ALWAYS`> `UNIFORM_BATCH`> `UNIFORM_SINGLE_TOKEN_DECODE`> `NEVER`.
+To signal the CUDA Graphs compatibility of the attention backends, we introduce a new enum type [AttentionCGSupport][vllm.v1.attention.backend.AttentionCGSupport], which is an enum type that tracks the capability of the attention backend to support CUDA Graphs. The value is sorted in the order of the capability, i.e., `ALWAYS`> `UNIFORM_BATCH`> `UNIFORM_SINGLE_TOKEN_DECODE`> `NEVER`.
 
 ```python
 class AttentionCGSupport(enum.Enum):
diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..13c2915abe8f3e31d31b5518efe4645de90d76d6
--- /dev/null
+++ b/docs/design/custom_op.md
@@ -0,0 +1,318 @@
+# CustomOp
+
+`CustomOp` is an abstract class used for dispatching the forward method of various operations to the appropriate backend. It also offers a mechanism for both vLLM and OOT (Out-Of-Tree) plugins to register their custom operations.
+
+This document will introduce how CustomOp works in vLLM and how to implement a new `CustomOp`.
+
+## How CustomOp Works in vLLM
+
+`CustomOp` manages two dictionaries of all custom ops (i.e., op classes, indexed by registered name) in its class, for vLLM and OOT plugins respectively.
+
+??? code
+
+    ```python
+    class CustomOp(nn.Module):
+
+        op_registry: dict[str, type["CustomOp"]] = {}
+        op_registry_oot: dict[str, type["CustomOp"]] = {}
+    ```
+
+We can use `@CustomOp.register("op_name")` to register an op class to the `CustomOp` system. After this, the `op_name` and its class will be added into the `op_registry` dictionary. In addition, We can also register an OOT op by `@CustomOp.register_oot("op_name")`. We will introduce this mechanism in detail later.
+
+When a `CustomOp` is called (i.e., call its `forward()` method), if it is enabled (i.e., with `--compilation_config.custom_ops '["+op_name"]'`), it will automatically dispatch the forward method to the appropriate backend according to `current_platform`. Otherwise (i.e., it is disabled), it will only call the `forward_native()` method to use PyTorch-native implementation of this forward method.
+
+- **CPU platform:** dispatch to `forward_cpu()`.
+- **CUDA platform:** dispatch to `forward_cuda()`.
+- **ROCm platform:** dispatch to `forward_hip()`. If `forward_hip()` is not implemented, it will use `forward_cuda()` as a fallback.
+- **XPU platform:** dispatch to `forward_xpu()`.
+- **TPU platform:** dispatch to `forward_tpu()`.
+- **OOT platform:** dispatch to `forward_oot()`. This will only be called on OOT platforms.
+- **Default:** dispatch to `forward_native()` as a final fallback for all platforms.
+
+!!! note
+    Note that the dispatching logic might not be absolute because of class inheritance. Derived class might override the behavior.
+
+Furthermore, vLLM decides whether to enable or disable a `CustomOp` based on `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered in `compilation_config.custom_ops` (i.e., uses the default config), it will be enabled if `compilation_config.custom_ops` contains `all`, or will be disabled if it contains `none`.
+
+!!! note
+    Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`.
+
+By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended into `compilation_config.custom_ops`, otherwise a `all` will be appended. In other words, this means `CustomOp` will be disabled in some platforms (i.e., those use `inductor` as dafault backend for `torch.compile`) when running with torch compile mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops.
+
+!!! note
+    For multi-modal models, vLLM has enforced the enabling of some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass a `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to enforce enable itself at object-level.
+
+    Note that this `enforce_enable` mechanism will be removed after we add a separate `compilation_config` for multi-modal part.
+
+## How to Customise Your Configuration for CustomOp
+
+vLLM also offers fine-grained control over which custom ops to enable or disable for users, by manually passing a `--compilation_config.custom_ops '["..."]'` when launching a server.
+
+For example:
+
+- Use `--compilation_config.custom_ops '["all"]'` to enable all custom ops.
+- Use `--compilation_config.custom_ops '["none"]'` to disable all custom ops.
+- Use `--compilation_config.custom_ops '["all,-op1"]'` to enable all custom ops except op1 (i.e., prefixed with a `-` means "disable").
+- Use `--compilation_config.custom_ops '["none,+op1,+op2"]'` to only enable op1 and op2 (i.e., prefixed with a `+` means "enable").
+
+## Types of Supported CustomOp in vLLM
+
+**1. Attention:**
+
+```python
+--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"
+
+--8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
+```
+
+**2. Activation:**
+
+```python
+--8<-- "vllm/model_executor/layers/activation.py:silu_and_mul"
+
+--8<-- "vllm/model_executor/layers/activation.py:mul_and_silu"
+
+--8<-- "vllm/model_executor/layers/activation.py:gelu_new"
+
+--8<-- "vllm/model_executor/layers/activation.py:gelu_fast"
+
+--8<-- "vllm/model_executor/layers/activation.py:quick_gelu"
+
+--8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul"
+
+--8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul_sparse"
+
+--8<-- "vllm/model_executor/layers/activation.py:relu2"
+
+--8<-- "vllm/model_executor/layers/activation.py:xielu"
+
+--8<-- "vllm/model_executor/layers/activation.py:swigluoai_and_mul"
+
+--8<-- "vllm/model_executor/layers/activation.py:fatrelu_and_mul"
+```
+
+**3. MM-Conv:**
+
+```python
+--8<-- "vllm/model_executor/layers/conv.py:conv2d"
+
+--8<-- "vllm/model_executor/layers/conv.py:conv3d"
+```
+
+**4. Embedding:**
+
+```python
+--8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding"
+
+--8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head"
+```
+
+**5. Linear:**
+
+```python
+--8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear"
+
+--8<-- "vllm/model_executor/layers/linear.py:column_parallel_linear"
+
+--8<-- "vllm/model_executor/layers/linear.py:replicated_linear"
+```
+
+**6. Logits Processor:**
+
+```python
+--8<-- "vllm/model_executor/layers/logits_processor.py:logits_processor"
+```
+
+**7. Mamba:**
+
+```python
+--8<-- "vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer"
+
+--8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2"
+
+--8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm"
+
+--8<-- "vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer"
+
+--8<-- "vllm/model_executor/layers/mamba/short_conv.py:short_conv"
+```
+
+**8. MoE:**
+
+```python
+--8<-- "vllm/model_executor/layers/fused_moe/layer.py:fused_moe"
+
+--8<-- "vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe"
+
+--8<-- "vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe"
+
+--8<-- "vllm/model_executor/models/transformers/moe.py:transformers_fused_moe"
+
+--8<-- "vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk"
+```
+
+**9. Norm:**
+
+```python
+--8<-- "vllm/model_executor/layers/layernorm.py:rms_norm"
+
+--8<-- "vllm/model_executor/layers/layernorm.py:rms_norm_gated"
+
+--8<-- "vllm/model_executor/layers/layernorm.py:gemma_rms_norm"
+```
+
+**10. Quantization:**
+
+```python
+--8<-- "vllm/model_executor/layers/quantization/input_quant_fp8.py:quant_fp8"
+```
+
+**11. Rope:**
+
+```python
+--8<-- "vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding"
+
+--8<-- "vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding"
+
+--8<-- "vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb"
+```
+
+## Guidelines for Implementing a New CustomOp
+
+### Implement a New CustomOp in vLLM
+
+This part is a tutorial of how to implement a New `CustomOp` in vLLM.
+
+Steps:
+
+1. Implement a new op class, which extends from `CustomOp` base class.
+2. Add the `@CustomOp.register("op_name")` decorator on this op class to register it into `CustomOp` system.
+3. Implement different `forward_xxx()` method according to your needs.
+
+Taking `MMEncoderAttention` as an example:
+
+??? code
+
+    ```python
+    @CustomOp.register("mm_encoder_attn")
+    class MMEncoderAttention(CustomOp):
+
+        def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float | None = None,
+            num_kv_heads: int | None = None,
+            prefix: str = "",
+            multimodal_config: MultiModalConfig | None = None,
+        ) -> None:
+            super().__init__()
+            # Init...
+
+        def forward_native(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call TORCH_SDPA implementation...
+
+        def forward_cuda(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call FA or TORCH_SDPA implementation...
+
+        def forward_cpu(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call TORCH_SDPA implementation...
+
+        def forward_xpu(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call FA implementation...
+
+        def forward_tpu(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call PALLAS implementation...
+    ```
+
+### Register a New CustomOp in OOT Device Plugins
+
+Currently, thanks to [vLLM's hardware-plugin mechanism](./plugin_system.md), there are various OOT device plugins emerging out to enable vLLM seamlessly runs on different hardwares. You can also find more details about this mechanism at [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
+
+- **Official device plugins:** [vllm-ascend](https://github.com/vllm-project/vllm-ascend) (for Huawei Ascend NPU), [vllm-spyre](https://github.com/vllm-project/vllm-spyre)
+(for Spyre), [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi) (for Intel Gaudi), [vllm-neuron](https://github.com/vllm-project/vllm-neuron) (for AWS Neuron), [vllm-meta](https://github.com/vllm-project/vllm-metal) (for Apple Silicon), etc.
+- **Non-official device plugins:** [vllm-metax](https://github.com/MetaX-MACA/vLLM-metax) (for MetaX GPU), [vllm-kunlun](https://github.com/baidu/vLLM-Kunlun) (for Baidu Kunlun XPU), etc.
+
+In this case, `CustomOp` can enable these hardware manufacturers to seamlessly replace vLLM's operations with their deep-optimized kernels for specific devices at runtime, by just registering an OOT `CustomOp` and implementing the `forward_oot()` method.
+
+Now, this part will show you how to register an OOT `CustomOp` for a device plugin.
+
+Taking `MMEncoderAttention` as an example:
+
+1. Implement a `CustomMMEncoderAttention` class which extends from `MMEncoderAttention` and implement its `forward_oot()` method.
+2. Register your `CustomMMEncoderAttention` into vLLM to replace `MMEncoderAttention`.
+
+??? code
+
+    ```python
+    from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
+    from vllm.model_executor.custom_op import CustomOp
+
+
+    @CustomOp.register_oot("MMEncoderAttention")
+    class CustomMMEncoderAttention(MMEncoderAttention):
+
+        def __init__(...):
+            super().__init__(...)
+        
+        def forward_oot(...):
+            # Call optimized device-specific kernels.
+            ...
+    ```
+
+In this case, a new item `{"MMEncoderAttention": CustomMMEncoderAttention}` will be added into `op_registry_oot`. When initializing a `MMEncoderAttention` op object, if the class name (i.e., `MMEncoderAttention`) is contained in the keys of `op_registry_oot`, vLLM will replace it with our registered class (i.e., `CustomMMEncoderAttention`) and instantiate it.
+
+After that, when this `MMEncoderAttention` op is called, your `forward_oot()` will be called if it is enabled. Thus, you will get expected performance on your hardwares without directly modify vLLM.
+
+In addition, you can also register all your `CustomOp` at one place for better management.
+
+??? code
+
+    ```python
+    from vllm.model_executor.custom_op import CustomOp
+
+
+    REGISTERED_CUSTOM_OPS = {
+        "CustomOP1": YourCustomOp1,
+        "CustomOP2": YourCustomOp2,
+        "CustomOP3": YourCustomOp3,
+    }
+
+    for op_name, op_cls in REGISTERED_CUSTOM_OPS.items():
+        CustomOp.register_oot(_decorated_op_cls=op_cls, name=op_name)
+    ```
diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md
index 731e542a0307bdaa86cc5f9c4d8e2440a1137cad..328df581627a21d8df5c7e03f6b97528f8d1a29c 100644
--- a/docs/design/debug_vllm_compile.md
+++ b/docs/design/debug_vllm_compile.md
@@ -33,7 +33,7 @@ goals while minimizing impact to performance and also helps us (vLLM) when you o
 For more details on the design, please see the following resources:
 
 - [Introduction to vLLM-torch.compile blogpost](https://blog.vllm.ai/2025/08/20/torch-compile.html)
-- [vLLM-torch.compile integration design](https://docs.vllm.ai/en/latest/design/torch_compile.html)
+- [vLLM-torch.compile integration design](./torch_compile.md)
 - [vLLM Office Hours #26](https://www.youtube.com/live/xLyxc7hxCJc?si=Xulo9pe53C6ywf0V&t=561)
 - [Talk at PyTorch Conference 2025](https://youtu.be/1wV1ESbGrVQ?si=s1GqymUfwiwOrDTg&t=725)
 
diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md
index e1a96be6c3445b7d517faca53daf19b0bb8231f8..975df8ba29dc41fffb54d4ce40347d3c734f28b1 100644
--- a/docs/design/fused_moe_modular_kernel.md
+++ b/docs/design/fused_moe_modular_kernel.md
@@ -2,7 +2,7 @@
 
 ## Introduction
 
-FusedMoEModularKernel is implemented [here](../..//vllm/model_executor/layers/fused_moe/modular_kernel.py)
+FusedMoEModularKernel is implemented [here](../../vllm/model_executor/layers/fused_moe/modular_kernel.py)
 
 Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.
 
diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md
index 8eadeb386fcf25c7622aa7ff15ca8a08d7052d15..af1d7b6bbb45d075b3239fddc4cb37fe9844b6d6 100644
--- a/docs/design/logits_processors.md
+++ b/docs/design/logits_processors.md
@@ -138,7 +138,7 @@ Note that the sampler will access the logits processors via `SamplingMetadata.lo
             # ...return sampler output data structure...
 
 
-        def sample(self, logits, sampling_metadta)
+        def sample(self, logits, sampling_metadata)
 
             ...
 
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 48341d199cb804afa9677ff61801b2aaf0aba033..18216b5965af292f5f7e3a74a77f79476547adf4 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -16,7 +16,7 @@ Async backends support the use of DBO (Dual Batch Overlap) and shared expert ove
 
 Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag.
 
-Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.
+Unless otherwise specified, backends are controlled via the `--all2all-backend` command-line argument (or the `all2all_backend` parameter in `ParallelConfig`). All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.
 
 <style>
 td {
@@ -86,13 +86,12 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
 | triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
 | triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
 | deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
-| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],</br>[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
-| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
+| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
+| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
 | flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
 | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
 | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
 | trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
-| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
 | iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
 | rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] |
 | cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
diff --git a/docs/design/paged_attention.md b/docs/design/paged_attention.md
index 5cc587842551510492d79d6baaacbf9f84b3f02c..7c0132cd2a213c40577d2cf4ffe620689311c419 100644
--- a/docs/design/paged_attention.md
+++ b/docs/design/paged_attention.md
@@ -139,18 +139,14 @@ token data.
 const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
 ```
 
-<figure markdown="span">
-  ![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" }
-</figure>
+![query](../assets/design/paged_attention/query.png)
 
 Each thread defines its own `q_ptr` which points to the assigned
 query token data on global memory. For example, if `VEC_SIZE` is 4
 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
 total of 128 elements divided into 128 / 4 = 32 vecs.
 
-<figure markdown="span">
-  ![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" }
-</figure>
+![q_vecs](../assets/design/paged_attention/q_vecs.png)
 
 ```cpp
 __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@@ -187,9 +183,7 @@ key token at different iterations. As shown above, that `k_ptr`
 points to key token data based on `k_cache` at assigned block,
 assigned head and assigned token.
 
-<figure markdown="span">
-  ![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" }
-</figure>
+![key](../assets/design/paged_attention/key.png)
 
 The diagram above illustrates the memory layout for key data. It
 assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@@ -202,9 +196,7 @@ iterations. Inside each rectangle, there are a total 32 vecs (128
 elements for one token) that will be processed by 2 threads (one
 thread group) separately.
 
-<figure markdown="span">
-  ![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" }
-</figure>
+![k_vecs](../assets/design/paged_attention/k_vecs.png)
 
 ```cpp
 K_vec k_vecs[NUM_VECS_PER_THREAD]
@@ -361,17 +353,11 @@ later steps. Now, it should store the normalized softmax result of
 
 ## Value
 
-<figure markdown="span">
-  ![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" }
-</figure>
+![value](../assets/design/paged_attention/value.png)
 
-<figure markdown="span">
-  ![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" }
-</figure>
+![logits_vec](../assets/design/paged_attention/logits_vec.png)
 
-<figure markdown="span">
-  ![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" }
-</figure>
+![v_vec](../assets/design/paged_attention/v_vec.png)
 
 Now we need to retrieve the value data and perform dot multiplication
 with `logits`. Unlike query and key, there is no thread group
diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
index b0ca2dad23d5ba964a5b950c96c2b9faf07b56ee..9cebaed51eeb215a6c29d494bc7e97ef13ce938c 100644
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -109,7 +109,7 @@ Every plugin has three parts:
     - `init_device`: This function is called to set up the device for the worker.
     - `initialize_cache`: This function is called to set cache config for the worker.
     - `load_model`: This function is called to load the model weights to device.
-    - `get_kv_cache_spaces`: This function is called to generate the kv cache spaces for the model.
+    - `get_kv_cache_spec`: This function is called to generate the kv cache spec for the model.
     - `determine_available_memory`: This function is called to profiles the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs.
     - `initialize_from_config`: This function is called to allocate device KV cache with the specified kv_cache_config
     - `execute_model`: This function is called every step to inference the model.
@@ -124,7 +124,7 @@ Every plugin has three parts:
 
     Please look at the worker base class [WorkerBase][vllm.v1.worker.worker_base.WorkerBase] for more functions that can be implemented.
 
-5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.attention.backends.abstract.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
+5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.v1.attention.backend.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
 
 6. Implement custom ops for high performance. Most ops can be ran by pytorch native implementation, while the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, there are kinds of custom ops vLLM supports:
 
@@ -153,4 +153,5 @@ The interface for the model/module may change during vLLM's development. If you
 
 !!! warning "Deprecations"
     - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
-    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
+    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
+    - `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
diff --git a/docs/design/torch_compile_multimodal.md b/docs/design/torch_compile_multimodal.md
new file mode 100644
index 0000000000000000000000000000000000000000..6c8e1d18f34ff064275c84a91e68c8f5a12f610d
--- /dev/null
+++ b/docs/design/torch_compile_multimodal.md
@@ -0,0 +1,111 @@
+# torch.compile with Multimodal Encoders
+
+`torch.compile` can now be applied to multimodal encoders and miscellaneous nn modules in vLLM, including vision-language models like LLaMA 4, Qwen-VL,
+and similar encoder-based architectures.
+
+This document covers the basics of how the `torch.compile` integration works for multimodal encoders in vLLM, as well as how to apply the decorator
+to new models to improve performance.
+
+!!! note
+    For general information about `torch.compile` integration in vLLM, see the [torch.compile design document](./torch_compile.md).
+
+## Overview
+
+We have recently enabled the `@supports_torch_compile` decorator to work for multiple nn module components within a model type; this enables
+turning compile on for multimodal encoders, bringing performance improvements to additional components of the stack.
+
+When applied to the vision block of [`Qwen2_5_vl`](https://github.com/vllm-project/vllm/pull/23207) we observe ~4.5% e2e perf improvements with
+some increase in compilation time
+
+This feature is off by default, but can be enabled by setting `compile_mm_encoder: true` in the compilation config when models have the
+`@supports_torch_compile` decorator.
+
+## How Compilation Works for Multimodal Components
+
+### APIs for Enablement
+
+To compile a multimodal component such as an encoder, we follow the same mechanism as the LLM text backbone, with a few additional scaffoldings:
+
+1. The `@supports_torch_compile` decorator should include `enable_if=should_torch_compile_mm_vit`. This will gate the compilation behind our
+`compile_mm_encoder` configuration
+
+2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile
+relies on caching artifacts to reduce start time, we must properly propagate the `<component_name>` information to the cache in order to avoid collisions
+with the LLM text-backbone, or other instances of the same artifact (as is the case with vision block). `is_encoder=True` is also needed for encoder
+components (see Compile Range Integration).
+
+3. `with set_forward_context` context manager should be used around the nn.Module's forward call. This will properly forward the vllm_config which is needed
+for torch.compile integration.
+
+### CompilationConfig
+
+With the exception of `compile_mm_encoder: true`, the multimodal encoder will inherit from the same compilation config as the text LLM. We may extend
+this for more configuration in the future.
+
+## Applying torch.compile to a New Multimodal Model/Component
+
+To apply `supports_torch_compile` to a new general nn.Module, we advise following the same steps in [`debug_vllm_compile`](./debug_vllm_compile.md); this includes:
+
+1. Applying `supports_torch_compile` on initially small modules (such as basic MLP layers), then raising to more general modules until one reaches a good performance
+tradeoff
+
+2. Leveraging [`tlparse`](https://github.com/meta-pytorch/tlparse) to identify and eliminate the source of recompiles and graph breaks
+
+3. Using `dynamic_arg_dims` and proper `dynamic_shapes_config` to handle dynamism.
+
+### Common pitfalls
+
+## VllmBackend Feature Support
+
+### Compile ranges
+
+The torch.compile integration will try to rely on max_batch_size to infer compilation ranges for dynamic shapes; however, for modules used in the encoder, this
+shape can be difficult to infer due to the unspecified range of shapes the encoder may see as input. Therefore, we rely on `is_encoder=True` in the `set_model_tag`
+to alert torch.compile to the fact that this range cannot be inferred, and we default to the range (1, MAX_INT).
+
+!!! note
+    We may seek to tighten this range for better performance in the future
+
+### Cudagraphs
+
+We have not yet explored compilation for multimodal encoders with CUDAGraph integration; behavior is currently unspecified.
+
+## Troubleshooting
+
+### Graph Breaks in Vision Encoders
+
+Some vision encoder operations may cause graph breaks. To identify them:
+
+```bash
+TORCH_LOGS="+dynamo" vllm serve <MODEL>
+```
+
+Common causes of graph breaks in multimodal models:
+
+- **Dynamic image sizes**: Use `dynamic_shapes_config` to handle variable resolutions
+- **Untraceable operations**: Some operations (such as to_list) may not be supported by Dynamo
+- **Conditional processing**: Data-dependent branching based on image properties
+
+### Compilation Errors
+
+If compilation fails for a multimodal model:
+
+1. **Disable and test**: First verify the model works without compilation:
+   ```bash
+   VLLM_TORCH_COMPILE_LEVEL=0 vllm serve <model> --compilation-config='{"compile_mm_encoder":"false"}'
+   ```
+
+2. **Check logs**: Enable debug logging to see compilation details:
+   ```bash
+   VLLM_LOGGING_LEVEL=DEBUG vllm serve <model> --compilation-config='{"compile_mm_encoder":"true"}'
+   ```
+
+3. **Report issues**: If you find a bug, [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose)
+
+## See Also
+
+- [torch.compile Integration](./torch_compile.md) - Core design document
+- [Debugging torch.compile](./debug_vllm_compile.md) - Detailed debugging guide
+- [Multimodal Inputs](../features/multimodal_inputs.md) - How to pass multimodal data
+- [Disaggregated Encoder](../features/disagg_encoder.md) - Scaling vision encoders
+- [Supported Multimodal Models](../models/supported_models.md#list-of-multimodal-language-models) - Model compatibility
diff --git a/docs/features/README.md b/docs/features/README.md
index e9e5232929b726bdfc7b2efa277a6cfab1432947..b9083b9993159d5ffde4d4aa9ed6cf37d37fb1a3 100644
--- a/docs/features/README.md
+++ b/docs/features/README.md
@@ -64,7 +64,7 @@ th:not(:first-child) {
 | [CP](../configuration/optimization.md#chunked-prefill)                                     | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | [APC](automatic_prefix_caching.md)                        | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | [LoRA](lora.md)                                           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
-| [SD](spec_decode.md)                                      | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | [🟠](https://github.com/vllm-project/vllm/issues/26963)       |
+| [SD](spec_decode.md)                                      | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | ✅        |
 | CUDA graph                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | [❌](https://github.com/vllm-project/vllm/issues/26970)        |
 | [pooling](../models/pooling_models.md)                    | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❌     | ✅        |
diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md
index 5ddef9db1611bf7fc0b43fbeace5f313a70c85d9..232f4363efeeccbdceb6de5dcbb5758453d5aa88 100644
--- a/docs/features/custom_logitsprocs.md
+++ b/docs/features/custom_logitsprocs.md
@@ -180,7 +180,7 @@ The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" re
 
 ### Wrapping an Existing Request-Level Logits Processor
 
-Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. This will be especially true if your logits processor was developed for vLLM version 0, which required it to be a `Callable` (as described [here](https://docs.vllm.ai/en/v0.10.1.1/api/vllm/logits_process.html)) conforming to the following type annotation:
+Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. This will be especially true if your logits processor was developed for vLLM version 0, which required it to be a `Callable` (as described [here][vllm.logits_process]) conforming to the following type annotation:
 
 ``` python
 RequestLogitsProcessor = Union[
diff --git a/docs/features/disagg_encoder.md b/docs/features/disagg_encoder.md
index f18a0e85e4b3bf4311f927540e0d6a6756eb2e22..d95427464196f0a801d2430327b292c5b8cf93dc 100644
--- a/docs/features/disagg_encoder.md
+++ b/docs/features/disagg_encoder.md
@@ -68,7 +68,7 @@ Here is a figure illustrating disaggregate encoder flow:
 
 ![Disaggregated Encoder Flow](../assets/features/disagg_encoder/disagg_encoder_flow.png)
 
-For the PD disaggregation part, the Prefill instance receive cache exactly the same as the disaggregate encoder flow above. Prefill instance executes 1 step (prefill -> 1 token output) and then transfer KV cache to the Decode instance for the remaining execution. The KV transfer part purely happens after the execute of the PDinstance.
+For the PD disaggregation part, the Prefill instance receives cache exactly the same as the disaggregated encoder flow above. Prefill instance executes 1 step (prefill -> 1 token output) and then transfers KV cache to the Decode instance for the remaining execution. The KV transfer part purely happens after the execution of the PD instance.
 
 `docs/features/disagg_prefill.md` shows the brief idea about the disaggregated prefill (v0)
 
diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md
index dc5e11ea257dac71278e9ed9b33a2482c88c7cc4..df69849bb92282213bd11dbe25dc4dd3c7192e09 100644
--- a/docs/features/disagg_prefill.md
+++ b/docs/features/disagg_prefill.md
@@ -1,6 +1,6 @@
 # Disaggregated Prefilling (experimental)
 
-This page introduces you the disaggregated prefilling feature in vLLM.
+This page introduces you to the disaggregated prefilling feature in vLLM.
 
 !!! note
     This feature is experimental and subject to change.
@@ -37,10 +37,10 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
   --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_buffer_device":"cuda", "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'
   ```
 
-- **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker):
+- **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and total CPU memory bytes to allocate:
 
   ```bash
-  --kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "num_cpu_blocks": 1000}}'
+  --kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "cpu_bytes_to_use": 1000000000}}'
   ```
 
 ## Benchmarks
diff --git a/docs/features/lora.md b/docs/features/lora.md
index d42a3cef76bdeadce708e6942422804e7979451d..dda6b4768ed855cf20014a4bb5add0d8f9bcd5ab 100644
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -10,7 +10,7 @@ them locally with
 ```python
 from huggingface_hub import snapshot_download
 
-sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+sql_lora_path = snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
 ```
 
 Then we instantiate the base model and pass in the `enable_lora=True` flag:
@@ -19,7 +19,7 @@ Then we instantiate the base model and pass in the `enable_lora=True` flag:
 from vllm import LLM, SamplingParams
 from vllm.lora.request import LoRARequest
 
-llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)
+llm = LLM(model="meta-llama/Llama-3.2-3B-Instruct", enable_lora=True)
 ```
 
 We can now submit the prompts and call `llm.generate` with the `lora_request` parameter. The first parameter
@@ -55,14 +55,11 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server.
 `--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server:
 
 ```bash
-vllm serve meta-llama/Llama-2-7b-hf \
+vllm serve meta-llama/Llama-3.2-3B-Instruct \
     --enable-lora \
-    --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
+    --lora-modules sql-lora=jeeejeee/llama32-3b-text2sql-spider
 ```
 
-!!! note
-    The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
-
 The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`,
 etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
 with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.):
@@ -75,7 +72,7 @@ with its base model (if `jq` is not installed, you can follow [this guide](https
         "object": "list",
         "data": [
             {
-                "id": "meta-llama/Llama-2-7b-hf",
+                "id": "meta-llama/Llama-3.2-3B-Instruct",
                 "object": "model",
                 ...
             },
@@ -218,14 +215,14 @@ Alternatively, follow these example steps to implement your own plugin:
 In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
 
 ```bash
---lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
+--lora-modules  sql-lora=jeeejeee/llama32-3b-text2sql-spider
 ```
 
 This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`.
 Now, you can specify a base_model_name alongside the name and path using JSON format. For example:
 
 ```bash
---lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}'
+--lora-modules '{"name": "sql-lora", "path": "jeeejeee/llama32-3b-text2sql-spider", "base_model_name": "meta-llama/Llama-3.2-3B-Instruct"}'
 ```
 
 To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case.
@@ -234,7 +231,7 @@ To provide the backward compatibility support, you can still use the old key-val
 
 The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this:
 
-- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
+- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-3.2-3B-Instruct`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
 - The `root` field points to the artifact location of the lora adapter.
 
 ??? console "Command output"
@@ -246,11 +243,11 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
         "object": "list",
         "data": [
             {
-            "id": "meta-llama/Llama-2-7b-hf",
+            "id": "meta-llama/Llama-3.2-3B-Instruct",
             "object": "model",
             "created": 1715644056,
             "owned_by": "vllm",
-            "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
+            "root": "meta-llama/Llama-3.2-3B-Instruct",
             "parent": null,
             "permission": [
                 {
@@ -263,8 +260,8 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
             "object": "model",
             "created": 1715644056,
             "owned_by": "vllm",
-            "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
-            "parent": meta-llama/Llama-2-7b-hf,
+            "root": "jeeejeee/llama32-3b-text2sql-spider",
+            "parent": "meta-llama/Llama-3.2-3B-Instruct",
             "permission": [
                 {
                 ....
@@ -275,6 +272,10 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
     }
     ```
 
+## LoRA Support for Tower and Connector of Multi-Modal Model
+
+Currently, vLLM experimentally supports LoRA for the Tower and Connector components of multi-modal models. To enable this feature, you need to implement the corresponding token helper functions for the tower and connector. For more details on the rationale behind this approach, please refer to [PR 26674](https://github.com/vllm-project/vllm/pull/26674). We welcome contributions to extend LoRA support to additional models' tower and connector. Please refer to [Issue 31479](https://github.com/vllm-project/vllm/issues/31479) to check the current model support status.
+
 ## Default LoRA Models For Multimodal Models
 
 Some models, e.g., [Granite Speech](https://huggingface.co/ibm-granite/granite-speech-3.3-8b) and [Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) multimodal, contain LoRA adapter(s) that are expected to always be applied when a given modality is present. This can be a bit tedious to manage with the above approaches, as it requires the user to send the `LoRARequest` (offline) or to filter requests between the base model and LoRA model (server) depending on the content of the request's multimodal data.
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index c3fd726e9938c9d35b07af364a3f08195e06ef16..8948652082a840742a6b76e93321b0e75cf221eb 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -166,49 +166,51 @@ Full example: [examples/offline_inference/vision_language_multi_image.py](../../
 
 If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
 
-```python
-from vllm import LLM
-from vllm.assets.image import ImageAsset
-
-llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-image_url = "https://picsum.photos/id/32/512/512"
-image_pil = ImageAsset('cherry_blossom').pil_image
-image_embeds = torch.load(...)
-
-conversation = [
-    {"role": "system", "content": "You are a helpful assistant"},
-    {"role": "user", "content": "Hello"},
-    {"role": "assistant", "content": "Hello! How can I assist you today?"},
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {"url": image_url},
-            },
-            {
-                "type": "image_pil",
-                "image_pil": image_pil,
-            },
-            {
-                "type": "image_embeds",
-                "image_embeds": image_embeds,
-            },
-            {
-                "type": "text",
-                "text": "What's in these images?",
-            },
-        ],
-    },
-]
+??? code
 
-# Perform inference and log output.
-outputs = llm.chat(conversation)
+    ```python
+    from vllm import LLM
+    from vllm.assets.image import ImageAsset
 
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-```
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    image_url = "https://picsum.photos/id/32/512/512"
+    image_pil = ImageAsset('cherry_blossom').pil_image
+    image_embeds = torch.load(...)
+
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello"},
+        {"role": "assistant", "content": "Hello! How can I assist you today?"},
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": image_url},
+                },
+                {
+                    "type": "image_pil",
+                    "image_pil": image_pil,
+                },
+                {
+                    "type": "image_embeds",
+                    "image_embeds": image_embeds,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in these images?",
+                },
+            ],
+        },
+    ]
+
+    # Perform inference and log output.
+    outputs = llm.chat(conversation)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+    ```
 
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
@@ -354,6 +356,44 @@ You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the mult
 
 Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
 
+#### Automatic Audio Channel Normalization
+
+vLLM automatically normalizes audio channels for models that require specific audio formats. When loading audio with libraries like `torchaudio`, stereo files return shape `[channels, time]`, but many audio models (particularly Whisper-based models) expect mono audio with shape `[time]`.
+
+**Supported models with automatic mono conversion:**
+
+- **Whisper** and all Whisper-based models
+- **Qwen2-Audio**
+- **Qwen2.5-Omni** / **Qwen3-Omni** (inherits from Qwen2.5-Omni)
+- **Ultravox**
+
+For these models, vLLM automatically:
+
+1. Detects if the model requires mono audio via the feature extractor
+2. Converts multi-channel audio to mono using channel averaging
+3. Handles both `(channels, time)` format (torchaudio) and `(time, channels)` format (soundfile)
+
+**Example with stereo audio:**
+
+```python
+import torchaudio
+from vllm import LLM
+
+# Load stereo audio file - returns (channels, time) shape
+audio, sr = torchaudio.load("stereo_audio.wav")
+print(f"Original shape: {audio.shape}")  # e.g., torch.Size([2, 16000])
+
+# vLLM automatically converts to mono for Whisper-based models
+llm = LLM(model="openai/whisper-large-v3")
+
+outputs = llm.generate({
+    "prompt": "",
+    "multi_modal_data": {"audio": (audio.numpy(), sr)},
+})
+```
+
+No manual conversion is needed - vLLM handles the channel normalization automatically based on the model's requirements.
+
 ### Embedding Inputs
 
 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
@@ -506,6 +546,7 @@ Then, you can use the OpenAI client as follows:
 ??? code
 
     ```python
+    import os
     from openai import OpenAI
 
     openai_api_key = "EMPTY"
@@ -517,8 +558,11 @@ Then, you can use the OpenAI client as follows:
     )
 
     # Single-image input inference
+
+    # Public image URL for testing remote image processing
     image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
+    # Create chat completion with remote image
     chat_response = client.chat.completions.create(
         model="microsoft/Phi-3.5-vision-instruct",
         messages=[
@@ -542,6 +586,35 @@ Then, you can use the OpenAI client as follows:
     )
     print("Chat completion output:", chat_response.choices[0].message.content)
 
+    # Local image file path (update this to point to your actual image file)
+    image_file = "/path/to/image.jpg"
+
+    # Create chat completion with local image file
+    # Launch the API server/engine with the --allowed-local-media-path argument.
+    if os.path.exists(image_file):
+        chat_completion_from_local_image_url = client.chat.completions.create(
+            model="microsoft/Phi-3.5-vision-instruct",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "What’s in this image?",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"file://{image_file}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        result = chat_completion_from_local_image_url.choices[0].message.content
+        print("Chat completion output from local image file:\n", result)
+    else:
+        print(f"Local image file not found at {image_file}, skipping local file test.")
+
     # Multi-image input inference
     image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
     image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
@@ -654,6 +727,31 @@ Full example: [examples/online_serving/openai_chat_completion_client_for_multimo
     export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
     ```
 
+#### Video Frame Recovery
+
+For improved robustness when processing potentially corrupted or truncated video files, vLLM supports optional frame recovery using a dynamic window forward-scan approach. When enabled, if a target frame fails to load during sequential reading, the next successfully grabbed frame (before the next target frame) will be used in its place.
+
+To enable video frame recovery, pass the `frame_recovery` parameter via `--media-io-kwargs`:
+
+```bash
+# Example: Enable frame recovery
+vllm serve Qwen/Qwen3-VL-30B-A3B-Instruct \
+  --media-io-kwargs '{"video": {"frame_recovery": true}}'
+```
+
+**Parameters:**
+
+- `frame_recovery`: Boolean flag to enable forward-scan recovery. When `true`, failed frames are recovered using the next available frame within the dynamic window (up to the next target frame). Default is `false`.
+
+**How it works:**
+
+1. The system reads frames sequentially
+2. If a target frame fails to grab, it's marked as "failed"
+3. The next successfully grabbed frame (before reaching the next target) is used to recover the failed frame
+4. This approach handles both mid-video corruption and end-of-video truncation
+
+Works with common video formats like MP4 when using OpenCV backends.
+
 #### Custom RGBA Background Color
 
 To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`:
@@ -860,6 +958,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
 
 For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this:
 
+??? code
+
     ```python
         # Image/video/audio URL:
         {
diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md
index 601205e1ed0b14614186cc6f43316d1063472c43..8aa23b24aeae431134362f6a22428262710bd4ab 100644
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -6,11 +6,17 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
 
 ### Installation
 
-Install the NIXL library: `uv pip install nixl`, as a quick start.
+Install the NIXL library: `uv pip install nixl`, as a quick start on Nvidia platform.
 
 - Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
 - The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
 
+For ROCm platform, the [base ROCm docker file](../../docker/Dockerfile.rocm_base) includes RIXL and ucx already.
+
+- Refer to [RIXL official repository](https://github.com/rocm/rixl) for more information
+- The supportive libraries for RIXL can be found in [requirements/kv_connectors_rocm.txt](../../requirements/kv_connectors_rocm.txt)
+- In the future we may remove RIXL from docker image file and users will be able to install from pre-compiled binary packages
+
 For non-cuda platform, please install nixl with ucx build from source, instructed as below.
 
 ```bash
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
index d4a6176b236f18a24d691e061f23048471df4489..f17ef89a5cbf954b163375be3cfaf34da258a49a 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
 Install `vllm` and `lm-evaluation-harness` for evaluation:
 
 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```
 
 Load and run the model in `vllm`:
diff --git a/docs/features/quantization/inc.md b/docs/features/quantization/inc.md
index 9875bc44c914466314831ba58ed3fb5f53d949c4..f2bbca498cd062d35b2632eb7db13a6a3ffb3896 100644
--- a/docs/features/quantization/inc.md
+++ b/docs/features/quantization/inc.md
@@ -19,7 +19,7 @@ Once you've completed the model calibration process and collected the measuremen
 
 ```bash
 export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
-vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_paralel_size 8
+vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor-parallel-size 8
 ```
 
 !!! tip
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md
index 9752039097d634c7841db81de6fc1abb77769342..049a7ceed079b9b732d32ee91847ee4d4ffadba3 100644
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -18,7 +18,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 
 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```
 
 ## Quantization Process
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index 701ca6378cb16ad02e48c57009969de9728d5384..8af3e24c7357c68eccbeba62836c2a26b09d3ee3 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -23,7 +23,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 
 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```
 
 ## Quantization Process
diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md
index b02d5ba9e89a208de7e00ea968ccf77a4d30d214..5c846767bc5b8c97317f0ff313c77fff8dcbb77c 100644
--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -8,6 +8,16 @@ We recommend installing the library with:
 pip install nvidia-modelopt
 ```
 
+## Supported ModelOpt checkpoint formats
+
+vLLM detects ModelOpt checkpoints via `hf_quant_config.json` and supports the
+following `quantization.quant_algo` values:
+
+- `FP8`: per-tensor weight scale (+ optional static activation scale).
+- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization.
+- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks).
+- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`).
+
 ## Quantizing HuggingFace Models with PTQ
 
 You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
@@ -80,3 +90,24 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
     if __name__ == "__main__":
         main()
     ```
+
+## Running the OpenAI-compatible server
+
+To serve a local ModelOpt checkpoint via the OpenAI-compatible API:
+
+```bash
+vllm serve <path_to_exported_checkpoint> \
+  --quantization modelopt \
+  --host 0.0.0.0 --port 8000
+```
+
+## Testing (local checkpoints)
+
+vLLM's ModelOpt unit tests are gated by local checkpoint paths and are skipped
+by default in CI. To run the tests locally:
+
+```bash
+export VLLM_TEST_MODELOPT_FP8_PC_PT_MODEL_PATH=<path_to_fp8_pc_pt_checkpoint>
+export VLLM_TEST_MODELOPT_FP8_PB_WO_MODEL_PATH=<path_to_fp8_pb_wo_checkpoint>
+pytest -q tests/quantization/test_modelopt.py
+```
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md
index d26a5e217f3146b52a68fd0b418dc6d26fa38238..586117272d3baa4761ee1bb28072e85aad5f025e 100644
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -17,6 +17,16 @@ The E4M3 format offers higher precision compared to E5M2. However, due to its sm
 
 For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel).
 
+### How FP8 KV Cache Works
+
+The FP8 KV cache implementation follows this workflow:
+
+1. **Storage**: Key and Value tensors are quantized to FP8 format using scaling factors before being stored in the KV cache
+2. **Retrieval**: When needed for attention computation, cached KV tensors are dequantized back to higher precision (FP16/BF16)
+3. **Attention**: The attention-value multiplication (softmax output × V) is performed using the dequantized higher-precision V tensor
+
+This means the final attention computation operates on dequantized values, not FP8 tensors. The quantization reduces memory usage during storage but maintains computation accuracy by using higher precision during the actual attention operations.
+
 ### Performance Impact
 
 The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either:
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index c54d7d22519997f4ec66fa6ff074755fa127bf26..bbab97740ff19485593aceee4391eacaf81334dc 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -20,7 +20,7 @@ for more installation details.
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 
 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```
 
 ## Quantization Process
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index 93cca23856a9bcd28d64c634397543c8a04f5d19..107d1d2b5bceecb95cadd9356355a726fb2074a7 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -204,6 +204,42 @@ The reasoning content is also available when both tool calling and the reasoning
 
 For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).
 
+## Server-Level Default Chat Template Kwargs
+
+You can set default `chat_template_kwargs` at the server level using the `--default-chat-template-kwargs` CLI argument. This is useful for configuring reasoning behavior across all requests without requiring clients to specify it in each request.
+
+### Disabling Thinking Mode by Default
+
+For models like Qwen3 where thinking is enabled by default, you can disable it server-wide:
+
+```bash
+vllm serve Qwen/Qwen3-8B \
+    --reasoning-parser qwen3 \
+    --default-chat-template-kwargs '{"enable_thinking": false}'
+```
+
+### Enabling Thinking Mode by Default
+
+For models like IBM Granite 3.2 or DeepSeek-V3.1 where thinking is disabled by default, you can enable it server-wide:
+
+```bash
+vllm serve ibm-granite/granite-3.2-2b-instruct \
+    --reasoning-parser granite \
+    --default-chat-template-kwargs '{"thinking": true}'
+```
+
+### Request-Level Override
+
+Request-level `chat_template_kwargs` always take priority over server defaults. For example, if the server is started with `enable_thinking=false`, a client can still enable it for a specific request:
+
+```python
+response = client.chat.completions.create(
+    model=model,
+    messages=messages,
+    extra_body={"chat_template_kwargs": {"enable_thinking": True}}  # Overrides server default
+)
+```
+
 ## Limitations
 
 - The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md
index 6097500cac01f581ffbcba219a4ce242859617cb..bd525ae3317cdff805f5abb7f09552616f5d20c5 100644
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@@ -173,7 +173,7 @@ Suffix Decoding can achieve better performance for tasks with high repetition, s
 ## Speculating using MLP speculators
 
 The following code configures vLLM to use speculative decoding where proposals are generated by
-draft models that conditioning draft predictions on both context vectors and sampled tokens.
+draft models that condition draft predictions on both context vectors and sampled tokens.
 For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
 [this technical report](https://arxiv.org/abs/2404.19124).
 
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index 3ac987559e622840c5cc3c7de6a8af68173cbc00..a1f78911120ad5117a45eaf20797e94d1ac4eefe 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -39,7 +39,7 @@ request. You may also choose a specific backend, along with
 some options. A full set of options is available in the `vllm serve --help`
 text.
 
-Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one:
+Now let's see an example for each of the cases, starting with the `choice`, as it's the easiest one:
 
 ??? code
 
@@ -126,12 +126,12 @@ The next example shows how to use the `response_format` parameter with a Pydanti
     ```
 
 !!! tip
-    While not strictly necessary, normally it´s better to indicate in the prompt the
+    While not strictly necessary, normally it's better to indicate in the prompt the
     JSON schema and how the fields should be populated. This can improve the
     results notably in most cases.
 
 Finally we have the `grammar` option, which is probably the most
-difficult to use, but it´s really powerful. It allows us to define complete
+difficult to use, but it's really powerful. It allows us to define complete
 languages like SQL queries. It works by using a context free EBNF grammar.
 As an example, we can use to define a specific format of simplified SQL queries:
 
@@ -303,7 +303,7 @@ An example of using `structural_tag` can be found here: [examples/online_serving
 ## Offline Inference
 
 Offline inference allows for the same types of structured outputs.
-To use it, we´ll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
+To use it, we'll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
 The main available options inside `StructuredOutputsParams` are:
 
 - `json`
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 70a11d6def566311e03a85460f0a208120540eab..a0a56160faf34101bdef1c178e28ab9b62a4b5e6 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -317,6 +317,15 @@ Supported models:
 
 Flags: `--tool-call-parser deepseek_v31 --chat-template {see_above}`
 
+### OpenAI OSS Models ('openai`)
+
+Supported models:
+
+* `openai/gpt-oss-20b`
+* `openai/gpt-oss-120b`
+
+Flags: `--tool-call-parser openai`
+
 ### Kimi-K2 Models (`kimi_k2`)
 
 Supported models:
@@ -352,15 +361,46 @@ Supported models:
 * `zai-org/GLM-4.5`
 * `zai-org/GLM-4.5-Air`
 * `zai-org/GLM-4.6`
-* `zai-org/GLM-4.6-Air`
 
 Flags: `--tool-call-parser glm45`
 
+### GLM-4.7 Models (`glm47`)
+
+Supported models:
+
+* `zai-org/GLM-4.7`
+
+Flags: `--tool-call-parser glm47`
+
+### FunctionGemma Models (`functiongemma`)
+
+Google's FunctionGemma is a lightweight (270M parameter) model specifically designed for function calling.
+It's built on Gemma 3 and optimized for edge deployment on devices like laptops and phones.
+
+Supported models:
+
+* `google/functiongemma-270m-it`
+
+FunctionGemma uses a unique output format with `<start_function_call>` and `<end_function_call>` tags:
+
+```text
+<start_function_call>call:get_weather{location:<escape>London<escape>}<end_function_call>
+```
+
+The model is designed to be fine-tuned for specific function-calling tasks for best results.
+
+Flags: `--tool-call-parser functiongemma --chat-template examples/tool_chat_template_functiongemma.jinja`
+
+!!! note
+    FunctionGemma is intended to be fine-tuned for your specific function-calling task.
+    The base model provides general function calling capabilities, but best results
+    are achieved with task-specific fine-tuning. See Google's [FunctionGemma documentation](https://ai.google.dev/gemma/docs/functiongemma) for fine-tuning guides.
+
 ### Qwen3-Coder Models (`qwen3_xml`)
 
 Supported models:
 
-* `Qwen/Qwen3-480B-A35B-Instruct`
+* `Qwen/Qwen3-Coder-480B-A35B-Instruct`
 * `Qwen/Qwen3-Coder-30B-A3B-Instruct`
 
 Flags: `--tool-call-parser qwen3_xml`
diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md
index cff7ce1a882a1c191bb6932d62232234272b0ae3..95a2bb041b62c2608b48544af4c027e46ace21b6 100644
--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@@ -14,16 +14,6 @@ vLLM supports the following hardware platforms:
 
 ## Hardware Plugins
 
-The backends below live **outside** the main `vllm` repository and follow the
-[Hardware-Pluggable RFC](../../design/plugin_system.md).
+vLLM supports third-party hardware plugins that live **outside** the main `vllm` repository. These follow the [Hardware-Pluggable RFC](../../design/plugin_system.md).
 
-| Accelerator | PyPI / package | Repository |
-|-------------|----------------|------------|
-| Google TPU | `tpu-inference` | <https://github.com/vllm-project/tpu-inference> |
-| Ascend NPU | `vllm-ascend` | <https://github.com/vllm-project/vllm-ascend> |
-| Intel Gaudi (HPU) | N/A, install from source | <https://github.com/vllm-project/vllm-gaudi> |
-| MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> |
-| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
-| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
-| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
-| Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> |
+A list of all supported hardware can be found on the [vllm.ai website](https://vllm.ai/#hardware). If you want to add new hardware, please contact us on [Slack](https://slack.vllm.ai/) or [Email](mailto:collaboration@vllm.ai).
diff --git a/docs/getting_started/installation/cpu.apple.inc.md b/docs/getting_started/installation/cpu.apple.inc.md
index 9f1f6e3821397573f8d8a12a0183c2ad5952ad47..c5a4d00ddcf4c1a7876569b9ae37f1a75e2acfd8 100644
--- a/docs/getting_started/installation/cpu.apple.inc.md
+++ b/docs/getting_started/installation/cpu.apple.inc.md
@@ -4,6 +4,9 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must
 
 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
+!!! tip "GPU-Accelerated Inference with vLLM-Metal"
+    For GPU-accelerated inference on Apple Silicon using Metal, check out [vllm-metal](https://github.com/vllm-project/vllm-metal), a community-maintained hardware plugin that uses MLX as the compute backend.
+
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md
index 657bf2509db0115da32a939eb602a22441832c95..b5eb777b711137a83b5df3a27653a5538c0d5ef6 100644
--- a/docs/getting_started/installation/cpu.arm.inc.md
+++ b/docs/getting_started/installation/cpu.arm.inc.md
@@ -1,6 +1,6 @@
 # --8<-- [start:installation]
 
-vLLM offers basic model inferencing and serving on Arm CPU platform, with support NEON, data types FP32, FP16 and BF16.
+vLLM offers basic model inferencing and serving on Arm CPU platform, with support for NEON, data types FP32, FP16 and BF16.
 
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
@@ -19,12 +19,26 @@ Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels c
 
 ```bash
 export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
+uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl
 ```
 
 ??? console "pip"
     ```bash
-    pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
+    pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl
+    ```
+
+!!! warning "set `LD_PRELOAD`"
+    Before use vLLM CPU installed via wheels, make sure TCMalloc is installed and added to `LD_PRELOAD`:
+    ```bash
+    # install TCMalloc
+    sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
+
+    # manually find the path
+    sudo find / -iname *libtcmalloc_minimal.so.4
+    TC_PATH=...
+
+    # add them to LD_PRELOAD
+    export LD_PRELOAD="$TC_PATH:$LD_PRELOAD"
     ```
 
 The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
@@ -37,7 +51,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
 
 To install from nightly index, run:
 ```bash
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index
 ```
 
 ??? console "pip (there's a caveat)"
@@ -56,7 +70,7 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
 
 ```bash
 export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu --index-strategy first-index
 ```
 
 # --8<-- [end:pre-built-wheels]
@@ -105,6 +119,20 @@ VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
 
 Testing has been conducted on AWS Graviton3 instances for compatibility.
 
+!!! warning "set `LD_PRELOAD`"
+    Before use vLLM CPU installed via wheels, make sure TCMalloc is installed and added to `LD_PRELOAD`:
+    ```bash
+    # install TCMalloc
+    sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
+
+    # manually find the path
+    sudo find / -iname *libtcmalloc_minimal.so.4
+    TC_PATH=...
+
+    # add them to LD_PRELOAD
+    export LD_PRELOAD="$TC_PATH:$LD_PRELOAD"
+    ```
+
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 210f720e2d92a6b2e2fd62edcb089d3ce2bec6c8..d3e23c359076fa6ec51dfba3ff9ea2656a376d70 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -18,6 +18,12 @@ vLLM is a Python library that supports the following CPU variants. Select your C
 
     --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation"
 
+## Technical Discussions
+
+The main discussions happen in the `#sig-cpu` channel of [vLLM Slack](https://slack.vllm.ai/).
+
+When open a Github issue about the CPU backend, please add `[CPU Backend]` in the title and it will be labeled with `cpu` for better awareness.
+
 ## Requirements
 
 - Python: 3.10 -- 3.13
@@ -166,13 +172,13 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
 
 ### What are supported models on CPU?
 
-For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu)
+For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](../../models/hardware_supported_models/cpu.md)
 
 ### How to find benchmark configuration examples for supported CPU models?
 
-For any model listed under [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json)
-For details on how these optimized configurations are determined, see: [performance-benchmark-details](https://github.com/vllm-project/vllm/tree/main/.buildkite/performance-benchmarks#performance-benchmark-details).
-To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](https://docs.vllm.ai/en/latest/contributing/benchmarks/#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.  
+For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](../../../.buildkite/performance-benchmarks/tests/serving-tests-cpu.json)
+For details on how these optimized configurations are determined, see: [performance-benchmark-details](../../../.buildkite/performance-benchmarks/README.md#performance-benchmark-details).
+To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](../../benchmarking/dashboard.md#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.  
 
 Below is an example command to benchmark all CPU-supported models using optimized configurations.
 
@@ -258,11 +264,6 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel
     - GPTQ (x86 only)
     - compressed-tensor INT8 W8A8 (x86, s390x)
 
-### (x86 only) What is the purpose of `VLLM_CPU_SGL_KERNEL`?
-
-- Both of them require `amx` CPU flag.
-    - `VLLM_CPU_SGL_KERNEL` can provide better performance for MoE models and small-batch scenarios.
-
 ### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?
 
 In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal.
diff --git a/docs/getting_started/installation/cpu.x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md
index 1fad7f43388222cf21414f06853b0bea969a6893..013750bc537bff23bf6d12b99a9dbdaba58b4ef7 100644
--- a/docs/getting_started/installation/cpu.x86.inc.md
+++ b/docs/getting_started/installation/cpu.x86.inc.md
@@ -17,7 +17,51 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
 
-Currently, there are no pre-built x86 CPU wheels.
+Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install release wheels:
+
+```bash
+export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
+
+# use uv
+uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl --torch-backend cpu
+```
+??? console "pip"
+    ```bash
+    # use pip
+    pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cpu
+    ```
+!!! warning "set `LD_PRELOAD`"
+    Before use vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`:
+    ```bash
+    # install TCMalloc, Intel OpenMP is installed with vLLM CPU
+    sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
+
+    # manually find the path
+    sudo find / -iname *libtcmalloc_minimal.so.4
+    sudo find / -iname *libiomp5.so
+    TC_PATH=...
+    IOMP_PATH=...
+
+    # add them to LD_PRELOAD
+    export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD"
+    ```
+
+**Install the latest code**
+
+To install the wheel built from the latest main branch:
+
+```bash
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index --torch-backend cpu
+```
+
+**Install specific revisions**
+
+If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
+
+```bash
+export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu --index-strategy first-index --torch-backend cpu
+```
 
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
@@ -26,10 +70,12 @@ Install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the def
 
 ```bash
 sudo apt-get update -y
-sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 ```
 
+--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
+
 Clone the vLLM project:
 
 ```bash
@@ -82,6 +128,22 @@ uv pip install dist/*.whl
     pip install dist/*.whl
     ```
 
+!!! warning "set `LD_PRELOAD`"
+    Before use vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`:
+    ```bash
+    # install TCMalloc, Intel OpenMP is installed with vLLM CPU
+    sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
+
+    # manually find the path
+    sudo find / -iname *libtcmalloc_minimal.so.4
+    sudo find / -iname *libiomp5.so
+    TC_PATH=...
+    IOMP_PATH=...
+
+    # add them to LD_PRELOAD
+    export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD"
+    ```
+
 !!! example "Troubleshooting"
     - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
     - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
@@ -95,7 +157,6 @@ uv pip install dist/*.whl
       "torch==X.Y.Z+cpu"   # <-------
     ]
     ```
-    - If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
 
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
@@ -112,6 +173,7 @@ uv pip install dist/*.whl
 docker build -f docker/Dockerfile.cpu \
         --build-arg VLLM_CPU_AVX512BF16=false (default)|true \
         --build-arg VLLM_CPU_AVX512VNNI=false (default)|true \
+        --build-arg VLLM_CPU_AMXBF16=false|true (default) \
         --build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \ 
         --tag vllm-cpu-env \
         --target vllm-openai .
@@ -123,9 +185,8 @@ docker run --rm \
             --shm-size=4g \
             -p 8000:8000 \
             -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
-            -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
             vllm-cpu-env \
-            --model=meta-llama/Llama-3.2-1B-Instruct \
+            meta-llama/Llama-3.2-1B-Instruct \
             --dtype=bfloat16 \
             other vLLM OpenAI server arguments
 ```
diff --git a/docs/getting_started/installation/gpu.rocm.inc.md b/docs/getting_started/installation/gpu.rocm.inc.md
index 21120cc6fcd983ab47671d9254a94c5279400b3d..f2b8a7a811f9b26ea035e2451a0dd3b582d9d3bd 100644
--- a/docs/getting_started/installation/gpu.rocm.inc.md
+++ b/docs/getting_started/installation/gpu.rocm.inc.md
@@ -98,9 +98,24 @@ Currently, there are no pre-built ROCm wheels.
     !!! note
         - You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose.
         - The validated `$AITER_BRANCH_OR_COMMIT` can be found in the [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base).
-        
 
-4. Build vLLM. For example, vLLM on ROCM 7.0 can be built with the following steps:
+
+4. If you want to use MORI for EP or PD disaggregation, you can install [MORI](https://github.com/ROCm/mori) using the following steps:
+
+    ```bash
+    git clone https://github.com/ROCm/mori.git
+    cd mori
+    git checkout $MORI_BRANCH_OR_COMMIT
+    git submodule sync; git submodule update --init --recursive
+    MORI_GPU_ARCHS="gfx942;gfx950" python3 install .
+    ```
+
+    !!! note
+        - You will need to config the `$MORI_BRANCH_OR_COMMIT` for your purpose.
+        - The validated `$MORI_BRANCH_OR_COMMIT` can be found in the [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base).
+
+
+5. Build vLLM. For example, vLLM on ROCM 7.0 can be built with the following steps:
 
     ???+ console "Commands"
 
diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md
index ba78c329723edeb7b44f04694b6f8551648020cd..06794f8d3120e5cf8d5f43d73f88f175454967dc 100644
--- a/docs/getting_started/installation/python_env_setup.inc.md
+++ b/docs/getting_started/installation/python_env_setup.inc.md
@@ -1,4 +1,4 @@
-On NVIDIA CUDA only, it's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
+It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
 
 ```bash
 uv venv --python 3.12 --seed
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index e3974354d8f3b1aa1e546816d14340b089e0637c..01025c43e27602cca779658433d7f046daca8b77 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform:
         For more detailed instructions, including Docker, installing from source, and troubleshooting, please refer to the [vLLM on TPU documentation](https://docs.vllm.ai/projects/tpu/en/latest/).
 
 !!! note
-    For more detail and non-CUDA platforms, please refer [here](installation/README.md) for specific instructions on how to install vLLM.
+    For more detail and non-CUDA platforms, please refer to the [installation guide](installation/README.md) for specific instructions on how to install vLLM.
 
 ## Offline Batched Inference
 
diff --git a/docs/governance/collaboration.md b/docs/governance/collaboration.md
index 5b3d2beffe5b98e821be54c99c2e4d6aee7d483e..7f4d3c0dc38623e0445810a862e6b727ca0865b9 100644
--- a/docs/governance/collaboration.md
+++ b/docs/governance/collaboration.md
@@ -18,7 +18,7 @@ For features that you intend to maintain, please feel free to add yourself in [`
 If you use vLLM, we recommend you making the model work with vLLM by following the [model registration](../contributing/model/registration.md) process before you release it publicly.
 
 The vLLM team helps with new model architectures not supported by vLLM, especially models pushing architectural frontiers.
-Here's how the vLLM team works with model providers. The vLLM team includes all [committers](./committers.md) of the project. model providers can exclude certain members but shouldn't, as this may harm release timelines due to missing expertise. Contact [project leads](./process.md) if you want to collaborate.
+Here's how the vLLM team works with model providers. The vLLM team includes all [committers](./committers.md) of the project. Model providers can exclude certain members but shouldn't, as this may harm release timelines due to missing expertise. Contact [project leads](./process.md) if you want to collaborate.
 
 Once we establish the connection between the vLLM team and model provider:
 
@@ -30,7 +30,7 @@ The vLLM team works with model providers on features, integrations, and release
 
 The vLLM maintainers will not publicly share details about model architecture, release timelines, or upcoming releases. We maintain model weights on secure servers with security measures (though we can work with security reviews and testing without certification). We delete pre-release weights or artifacts upon request.
 
-The vLLM team collaborates on marketing and promotional efforts for model releases. model providers can use vLLM's trademark and logo in publications and materials.
+The vLLM team collaborates on marketing and promotional efforts for model releases. Model providers can use vLLM's trademark and logo in publications and materials.
 
 ## Adding New Hardware
 
diff --git a/docs/governance/committers.md b/docs/governance/committers.md
index c9428027da9538f0228d1b968acbe3e971c30e91..2f0780a08978b65d8936ed099fd303813f2a7ed9 100644
--- a/docs/governance/committers.md
+++ b/docs/governance/committers.md
@@ -181,3 +181,4 @@ If you have PRs touching the area, please feel free to ping the area owner for r
 
 - Ascend NPU: [@wangxiyuan](https://github.com/wangxiyuan) and [see more details](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html#maintainers)
 - Intel Gaudi HPU [@xuechendi](https://github.com/xuechendi) and [@kzawora-intel](https://github.com/kzawora-intel)
+- Semantic Router: [@xunzhuo](https://github.com/xunzhuo), [@rootfs](https://github.com/rootfs) and [see more details](https://vllm-semantic-router.com/community/team)
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index 4ae64a6e4bfccbba7a66c8b817b21168e5fa930a..53bfce93ce21a70ea77900a9fa847d69f1ea8ef6 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -17,7 +17,7 @@ from pydantic_core import core_schema
 logger = logging.getLogger("mkdocs")
 
 ROOT_DIR = Path(__file__).parent.parent.parent.parent
-ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
+ARGPARSE_DOC_DIR = ROOT_DIR / "docs/generated/argparse"
 
 sys.path.insert(0, str(ROOT_DIR))
 
@@ -92,6 +92,7 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
 
 
 bench_latency = auto_mock("vllm.benchmarks", "latency")
+bench_mm_processor = auto_mock("vllm.benchmarks", "mm_processor")
 bench_serve = auto_mock("vllm.benchmarks", "serve")
 bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
 bench_sweep_plot_pareto = auto_mock(
@@ -222,6 +223,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
         "run-batch": create_parser(openai_run_batch.make_arg_parser),
         # Benchmark CLI
         "bench_latency": create_parser(bench_latency.add_cli_args),
+        "bench_mm_processor": create_parser(bench_mm_processor.add_cli_args),
         "bench_serve": create_parser(bench_serve.add_cli_args),
         "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
         "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
diff --git a/docs/mkdocs/hooks/generate_metrics.py b/docs/mkdocs/hooks/generate_metrics.py
index b20d43c4b2e9250f2423886a43f8368d831a9290..9cbf635994cc6a5f9b1bb69bbfc0f92e0b944b3d 100644
--- a/docs/mkdocs/hooks/generate_metrics.py
+++ b/docs/mkdocs/hooks/generate_metrics.py
@@ -13,14 +13,14 @@ GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"
 
 # Files to scan for metric definitions - each will generate a separate table
 METRIC_SOURCE_FILES = [
-    {"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
+    {"path": "vllm/v1/metrics/loggers.py", "output": "general.inc.md"},
     {
         "path": "vllm/v1/spec_decode/metrics.py",
-        "output": "spec_decode.md",
+        "output": "spec_decode.inc.md",
     },
     {
         "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
-        "output": "nixl_connector.md",
+        "output": "nixl_connector.inc.md",
     },
 ]
 
diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py
index f36a64ed7a3b872a9ed48e707541ec4d0c0bdcf4..66fa25d2ab5920db370c289f58aeb60a5740dcc9 100644
--- a/docs/mkdocs/hooks/url_schemes.py
+++ b/docs/mkdocs/hooks/url_schemes.py
@@ -34,9 +34,10 @@ TITLE = r"(?P<title>[^\[\]<>]+?)"
 REPO = r"(?P<repo>.+?/.+?)"
 TYPE = r"(?P<type>issues|pull|projects)"
 NUMBER = r"(?P<number>\d+)"
+PATH = r"(?P<path>[^\s]+?)"
 FRAGMENT = r"(?P<fragment>#[^\s]+)?"
 URL = f"https://github.com/{REPO}/{TYPE}/{NUMBER}{FRAGMENT}"
-RELATIVE = r"(?!(https?|ftp)://|#)(?P<path>[^\s]+?)"
+RELATIVE = rf"(?!(https?|ftp)://|#){PATH}{FRAGMENT}"
 
 # Common titles to use for GitHub links when none is provided in the link.
 TITLES = {"issues": "Issue ", "pull": "Pull Request ", "projects": "Project "}
@@ -55,6 +56,7 @@ def on_page_markdown(
         title = match.group("title")
         path = match.group("path")
         path = (Path(page.file.abs_src_path).parent / path).resolve()
+        fragment = match.group("fragment") or ""
 
         # Check if the path exists and is outside the docs dir
         if not path.exists() or path.is_relative_to(DOC_DIR):
@@ -64,7 +66,7 @@ def on_page_markdown(
         slug = "tree/main" if path.is_dir() else "blob/main"
 
         path = path.relative_to(ROOT_DIR)
-        url = f"https://github.com/vllm-project/vllm/{slug}/{path}"
+        url = f"https://github.com/vllm-project/vllm/{slug}/{path}{fragment}"
         return f"[{gh_icon} {title}]({url})"
 
     def replace_github_link(match: re.Match) -> str:
@@ -88,8 +90,4 @@ def on_page_markdown(
 
     markdown = relative_link.sub(replace_relative_link, markdown)
     markdown = github_link.sub(replace_github_link, markdown)
-
-    if "interface" in str(page.file.abs_src_path):
-        print(markdown)
-
     return markdown
diff --git a/docs/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md
index 0f30d4e2f69d267302f3b5d1da6419245842c85a..03c673f692840e1d371b18db03c1deaf7e8adb07 100644
--- a/docs/models/extensions/fastsafetensor.md
+++ b/docs/models/extensions/fastsafetensor.md
@@ -1,4 +1,4 @@
-Loading Model weights with fastsafetensors
+Loading model weights with fastsafetensors
 ===================================================================
 
 Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md
index be2f25bf06616d1607b2fecacf2d17c3132ed655..99914327e8fedde955ed4de7bd6218d93c68cd6f 100644
--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -2,7 +2,7 @@
 
 vLLM provides first-class support for generative models, which covers most of LLMs.
 
-In vLLM, generative models implement the[VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
+In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
 Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
 which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.
 
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 3ffbf63f9a18b75047b26a853cb6ad20d0ba893d..f6ac29877ccbfebf92fe74960c3ca4b5e55422de 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -363,7 +363,7 @@ th {
 | `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ |
 | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
 | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
-| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
+| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `thu-coai/ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
 | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ |
 | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
 | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
@@ -371,10 +371,11 @@ th {
 | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ |
 | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ |
 | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ |
-| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ |
+| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | ✅︎ | ✅︎ |
 | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ |
 | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ |
 | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `ExaoneMoeCausalLM` | K-EXAONE | `LGAI-EXAONE/K-EXAONE-236B-A23B`, etc. | | |
 | `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ |
 | `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |
 | `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ |
@@ -387,24 +388,27 @@ th {
 | `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
 | `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
 | `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
-| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
-| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
+| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
+| `GPT2LMHeadModel` | GPT-2 | `openai-community/gpt2`, `openai-community/gpt2-xl`, etc. | | ✅︎ |
 | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
 | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
 | `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ |
-| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ |
+| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | ✅︎ | ✅︎ |
 | `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ |
 | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ |
 | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ |
 | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ |
 | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
 | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ |
+| `Grok1ForCausalLM` | Grok2 | `xai-org/grok-2` | ✅︎ | ✅︎ |
 | `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ |
 | `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
 | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
 | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
 | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
 | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
+| `IQuestCoderForCausalLM` | IQuestCoderV1 | `IQuestLab/IQuest-Coder-V1-40B-Instruct`, etc. | | |
+| `IQuestLoopCoderForCausalLM` | IQuestLoopCoderV1 | `IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct`, etc. | | |
 | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ |
 | `Jais2ForCausalLM` | Jais2 | `inceptionai/Jais-2-8B-Chat`, `inceptionai/Jais-2-70B-Chat`, etc. | | ✅︎ |
 | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ |
@@ -415,9 +419,10 @@ th {
 | `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
 | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ |
 | `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
+| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | ︎| ✅︎ |
 | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
 | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
-| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
+| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
 | `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
 | `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
 | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
@@ -432,13 +437,14 @@ th {
 | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
 | `OuroForCausalLM` | ouro | `ByteDance/Ouro-1.4B`, `ByteDance/Ouro-2.6B`, etc. | ✅︎ | |
 | `PanguEmbeddedForCausalLM` |openPangu-Embedded-7B | `FreedomIntelligence/openPangu-Embedded-7B-V1.1` | ✅︎ | ✅︎ |
+| `PanguProMoEV2ForCausalLM` |openpangu-pro-moe-v2 | | ✅︎ | ✅︎ |
 | `PanguUltraMoEForCausalLM` |openpangu-ultra-moe-718b-model | `FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1` | ✅︎ | ✅︎ |
 | `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
 | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
 | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
 | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
-| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ |
-| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | | ✅︎ |
+| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | ✅ | ✅︎ |
+| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | ✅ | ✅︎ |
 | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
 | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ |
@@ -457,6 +463,9 @@ th {
 | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
 | `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
 
+!!! note
+    Grok2 requires `tokenizer.tok.json` with `tiktoken` installed. You can optionally override MoE router renormalization with `moe_router_renormalize`.
+
 Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
 
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
@@ -489,6 +498,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
 | `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. |  |  |
 | `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. |  |  |
 | `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. |  |  |
+| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
 | `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
 | `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
 | `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
@@ -537,20 +547,28 @@ If your model is not in the above list, we will try to automatically convert the
 Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
 These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
 
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
-|--------------|--------|-------------------|----------------------|---------------------------|
-| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | |
-| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ |
-| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. |  |  |
-| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
-| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
-| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | |
-| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | |
-| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
+| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|---------------------------|-----------------------------|-----------------------------------------|
+| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | N/A | | |
+| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ |
+| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | |
+| `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ |
+| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ |
+| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ |
+| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | |
+| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | N/A | | |
+| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | N/A | \* | \* |
 
 <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.
 
+!!! note
+    Some models require a specific prompt format to work correctly.
+
+    You can find Example HF Models's corresponding score template in [examples/pooling/score/template/](../../examples/pooling/score/template)
+
+    Examples : [examples/pooling/score/using_template_offline.py](../../examples/pooling/score/using_template_offline.py) [examples/pooling/score/using_template_online.py](../../examples/pooling/score/using_template_online.py)
+
 !!! note
     Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command.
 
@@ -569,7 +587,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
     ```
 
 !!! note
-    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/offline_reranker.py](../../examples/pooling/score/offline_reranker.py).
+    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker_offline.py](../../examples/pooling/score/qwen3_reranker_offline.py) [examples/pooling/score/qwen3_reranker_online.py](../../examples/pooling/score/qwen3_reranker_online.py).
 
     ```bash
     vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
@@ -625,29 +643,7 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp
     For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
 
 !!! note
-    vLLM currently only supports dynamic LoRA adapters on the language backbone of multimodal models.
-    If you wish to use a model with LoRA in the multi-modal encoder,
-    please merge the weights into the base model first before running it in vLLM like a regular model.
-
-    ```python
-    from peft import PeftConfig, PeftModel
-    from transformers import AutoModelForImageTextToText, AutoProcessor
-
-    def merge_and_save(model_id: str, output_dir: str):
-        base_model = AutoModelForImageTextToText.from_pretrained(model_id)
-        lora_model = PeftModel.from_pretrained(
-            base_model,
-            model_id,
-            config=PeftConfig.from_pretrained(model_id),
-        )
-        model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
-        model._hf_peft_config_loaded = False  # Needed to save the merged model
-
-        processor = AutoProcessor.from_pretrained(model_id)
-
-        model.save_pretrained(output_dir)
-        processor.save_pretrained(output_dir)
-    ```
+    vLLM currently supports adding LoRA adapters to the language backbone for most multimodal models. Additionally, vLLM now experimentally supports adding LoRA to the tower and connector modules for some multimodal models. See [this page](../features/lora.md).
 
 ### Generative Models
 
@@ -664,11 +660,11 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
 | `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
 | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
-| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
+| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ |
 | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
 | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
 | `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
-| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ |
+| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ |
 | `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
 | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
 | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>E+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
@@ -680,16 +676,19 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
 | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
+| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
 | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
 | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |
+| `KananaVForConditionalGeneration` | Kanana-V | T + I<sup>+</sup> | `kakaocorp/kanana-1.5-v-3b-instruct`, etc. | | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ |
 | `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
 | `LightOnOCRForConditionalGeneration`  | LightOnOCR-1B  | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
+| `Lfm2VlForConditionalGeneration` | LFM2-VL | T + I<sup>+</sup> | `LiquidAI/LFM2-VL-450M`, `LiquidAI/LFM2-VL-3B`, `LiquidAI/LFM2-VL-8B-A1B`, etc. | ✅︎ | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
 | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
-| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
+| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | ✅︎ | ✅︎ |
 | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ |
 | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ |
 | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ |
@@ -704,10 +703,10 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
 | `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
 | `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
-| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
+| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
-| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | | ✅︎ |
+| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | ✅︎ | ✅︎ |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
 | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
@@ -755,19 +754,17 @@ Some models are supported only via the [Transformers modeling backend](#transfor
     The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
     For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>
 
-!!! note
-    For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
-
 #### Transcription
 
 Speech2Text models trained specifically for Automatic Speech Recognition.
 
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 |--------------|--------|-------------------|----------------------|---------------------------|
-| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
-| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
 | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
+| `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
+| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
+| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
 
 !!! note
     `VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
@@ -790,6 +787,7 @@ The following table lists those that are tested in vLLM.
 | `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
 | `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
 | `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
+| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
 | `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
 | `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
 
@@ -806,10 +804,18 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 |--------------|--------|--------|-------------------|----------------------|---------------------------|
 | `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
+| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ |
 
 <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.
 
+!!! note
+    Similar to Qwen3-Reranker, you need to use the following `--hf_overrides` to load the official original `Qwen3-VL-Reranker`.
+
+    ```bash
+    vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
+    ```
+
 ## Model Support Policy
 
 At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
diff --git a/docs/serving/integrations/claude_code.md b/docs/serving/integrations/claude_code.md
new file mode 100644
index 0000000000000000000000000000000000000000..c4c409d285da81e266170aee151d80acf323a801
--- /dev/null
+++ b/docs/serving/integrations/claude_code.md
@@ -0,0 +1,75 @@
+# Claude Code
+
+[Claude Code](https://code.claude.com/docs/en/quickstart) is Anthropic's official agentic coding tool that lives in your terminal. It can understand your codebase, edit files, run commands, and help you write code more efficiently.
+
+By pointing Claude Code at a vLLM server, you can use your own models as the backend instead of the Anthropic API. This is useful for:
+
+- Running fully local/private coding assistance
+- Using open-weight models with tool calling capabilities
+- Testing and developing with custom models
+
+## How It Works
+
+vLLM implements the Anthropic Messages API, which is the same API that Claude Code uses to communicate with Anthropic's servers. By setting `ANTHROPIC_BASE_URL` to point at your vLLM server, Claude Code sends its requests to vLLM instead of Anthropic. vLLM then translates these requests to work with your local model and returns responses in the format Claude Code expects.
+
+This means any model served by vLLM with proper tool calling support can act as a drop-in replacement for Claude models in Claude Code.
+
+## Requirements
+
+Claude Code requires a model with strong tool calling capabilities. The model must support the OpenAI-compatible tool calling API. See [Tool Calling](../../features/tool_calling.md) for details on enabling tool calling for your model.
+
+## Installation
+
+First, install Claude Code by following the [official installation guide](https://docs.anthropic.com/en/docs/claude-code/getting-started).
+
+## Starting the vLLM Server
+
+Start vLLM with a tool-calling capable model - here's an example using `openai/gpt-oss-120b`:
+
+```bash
+vllm serve openai/gpt-oss-120b --served-model-name my-model --enable-auto-tool-choice --tool-call-parser openai
+```
+
+For other models, you'll need to enable tool calling explicitly with `--enable-auto-tool-choice` and the right `--tool-call-parser`. Refer to the [Tool Calling documentation](../../features/tool_calling.md) for the correct flags for your model.
+
+## Configuring Claude Code
+
+Launch Claude Code with environment variables pointing to your vLLM server:
+
+```bash
+ANTHROPIC_BASE_URL=http://localhost:8000 \
+ANTHROPIC_API_KEY=dummy \
+ANTHROPIC_DEFAULT_OPUS_MODEL=my-model \
+ANTHROPIC_DEFAULT_SONNET_MODEL=my-model \
+ANTHROPIC_DEFAULT_HAIKU_MODEL=my-model \
+claude
+```
+
+The environment variables:
+
+| Variable                         | Description                                                           |
+| -------------------------------- | --------------------------------------------------------------------- |
+| `ANTHROPIC_BASE_URL`             | Points to your vLLM server (default port is 8000)                     |
+| `ANTHROPIC_API_KEY`              | Can be any value since vLLM doesn't require authentication by default |
+| `ANTHROPIC_DEFAULT_OPUS_MODEL`   | Model name for Opus-tier requests                                     |
+| `ANTHROPIC_DEFAULT_SONNET_MODEL` | Model name for Sonnet-tier requests                                   |
+| `ANTHROPIC_DEFAULT_HAIKU_MODEL`  | Model name for Haiku-tier requests                                    |
+
+!!! tip
+    You can add these environment variables to your shell profile (e.g., `.bashrc`, `.zshrc`), Claude Code configuration file (`~/.claude/settings.json`), or create a wrapper script for convenience.
+
+## Testing the Setup
+
+Once Claude Code launches, try a simple prompt to verify the connection:
+
+![Claude Code example chat](../../assets/deployment/claude-code-example.png)
+
+If the model responds correctly, your setup is working. You can now use Claude Code with your vLLM-served model for coding tasks.
+
+## Troubleshooting
+
+**Connection refused**: Ensure vLLM is running and accessible at the specified URL. Check that the port matches.
+
+**Tool calls not working**: Verify that your model supports tool calling and that you've enabled it with the correct `--tool-call-parser` flag. See [Tool Calling](../../features/tool_calling.md).
+
+**Model not found**: Ensure the `--served-model-name` matches the model names in your environment variables. You cannot use model names with `/` in them, such as `openai/gpt-oss-120b` directly from Huggingface, so beware of that limitation with Claude Code.
diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md
index 192a61ea5b903aa165af3340cd8a9d7cf108041b..14b336dffa78f4accc6fa9a67603e1894ade353f 100644
--- a/docs/serving/integrations/langchain.md
+++ b/docs/serving/integrations/langchain.md
@@ -16,7 +16,7 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
     from langchain_community.llms import VLLM
 
     llm = VLLM(
-        model="mosaicml/mpt-7b",
+        model="Qwen/Qwen3-4B",
         trust_remote_code=True,  # mandatory for hf models
         max_new_tokens=128,
         top_k=10,
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 0e29204f8947c0b692c1ed6e179463b4e5cb5e09..8c3cfe46a7c85c30f94fb8f8ca34189168560179 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -47,6 +47,8 @@ We currently support the following OpenAI APIs:
 - [Completions API](#completions-api) (`/v1/completions`)
     - Only applicable to [text generation models](../models/generative_models.md).
     - *Note: `suffix` parameter is not supported.*
+- [Responses API](#responses-api) (`/v1/responses`)
+    - Only applicable to [text generation models](../models/generative_models.md).
 - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
     - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
     - *Note: `user` parameter is ignored.*
@@ -171,6 +173,14 @@ with `--enable-request-id-headers`.
     print(completion._request_id)
     ```
 
+## Offline API Documentation
+
+The FastAPI `/docs` endpoint requires an internet connection by default. To enable offline access in air-gapped environments, use the `--enable-offline-docs` flag:
+
+```bash
+vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs
+```
+
 ## API Reference
 
 ### Completions API
@@ -229,6 +239,31 @@ The following extra parameters are supported:
     --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
     ```
 
+### Responses API
+
+Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
+
+Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/online_serving/openai_responses_client_with_tools.py)
+
+#### Extra parameters
+
+The following extra parameters in the request object are supported:
+
+??? code
+
+    ```python
+    --8<-- "vllm/entrypoints/openai/protocol.py:responses-extra-params"
+    ```
+
+The following extra parameters in the response object are supported:
+
+??? code
+
+    ```python
+    --8<-- "vllm/entrypoints/openai/protocol.py:responses-response-extra-params"
+    ```
+
 ### Embeddings API
 
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
@@ -335,7 +370,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
         `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
         example below for details.
 
-Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py)
+Full example: [examples/pooling/embed/vision_embedding_online.py](../../examples/pooling/embed/vision_embedding_online.py)
 
 #### Extra parameters
 
@@ -640,7 +675,22 @@ Usually, the score for a sentence pair refers to the similarity between two sent
 
 You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
 
-Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
+Code example: [examples/pooling/score/score_api_online.py](../../examples/pooling/score/score_api_online.py)
+
+#### Score Template
+
+Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)).
+
+Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template.
+
+Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter:
+
+- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}`
+- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}`
+
+This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future.
+
+Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja)
 
 #### Single inference
 
@@ -821,7 +871,10 @@ You can pass multi-modal inputs to scoring models by passing `content` including
         print("Scoring output:", response_json["data"][0]["score"])
         print("Scoring output:", response_json["data"][1]["score"])
         ```
-Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py)
+Full example:
+
+- [examples/pooling/score/vision_score_api_online.py](../../examples/pooling/score/vision_score_api_online.py)
+- [examples/pooling/score/vision_rerank_api_online.py](../../examples/pooling/score/vision_rerank_api_online.py)
 
 #### Extra parameters
 
@@ -851,7 +904,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
 [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
 popular open-source tools.
 
-Code example: [examples/pooling/score/openai_reranker.py](../../examples/pooling/score/openai_reranker.py)
+Code example: [examples/pooling/score/rerank_api_online.py](../../examples/pooling/score/rerank_api_online.py)
 
 #### Example Request
 
diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md
index 829533b84328f2cd47e3fafd2f2809dd6b1a5ec9..421d5df4a0e613ff9136b92adf3450dba574bcaf 100644
--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@@ -35,15 +35,15 @@ The following metrics are exposed:
 
 ## General Metrics
 
---8<-- "docs/generated/metrics/general.md"
+--8<-- "docs/generated/metrics/general.inc.md"
 
 ## Speculative Decoding Metrics
 
---8<-- "docs/generated/metrics/spec_decode.md"
+--8<-- "docs/generated/metrics/spec_decode.inc.md"
 
 ## NIXL KV Connector Metrics
 
---8<-- "docs/generated/metrics/nixl_connector.md"
+--8<-- "docs/generated/metrics/nixl_connector.inc.md"
 
 ## Deprecation Policy
 
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index 1995045fd5562028e6864756f65f69d8eb9f948a..128c36b784d8ad207634d6e8870692c5db6524ef 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -320,6 +320,32 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to
 
 If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain.`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. The released vLLM wheels have to be compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [cuda compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. The solution is to install `cuda-compat` package from your package manager. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then add `export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH` to your `.bashrc` file. When successfully installed, you should see that the output of `nvidia-smi` will show `CUDA Version: 12.9`. Note that we use CUDA 12.9 as an example here, you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher.
 
+## ptxas fatal: Value 'sm_110a' is not defined for option 'gpu-name'
+
+If you use triton kernels with cuda 13, you might see an error like `ptxas fatal: Value 'sm_110a' is not defined for option 'gpu-name'`:
+
+```text
+(EngineCore_0 pid=9492) triton.runtime.errors.PTXASError: PTXAS error: Internal Triton PTX codegen error
+(EngineCore_0 pid=9492) `ptxas` stderr:
+(EngineCore_0 pid=9492) ptxas fatal   : Value 'sm_110a' is not defined for option 'gpu-name'
+(EngineCore_0 pid=9492) 
+(EngineCore_0 pid=9492) Repro command: /home/jetson/.venv/lib/python3.12/site-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_110a /tmp/tmp95oy_b9d.ptx -o /tmp/tmp95oy_b9d.ptx.o
+(EngineCore_0 pid=9492) 
+    outputs = self.engine_core.get_output()
+              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/jetson/.venv/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 668, in get_output
+    raise self._format_exception(outputs) from None
+vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
+```
+
+It means that the ptxas in the triton bundle is not compatible with your device. You need to set `TRITON_PTXAS_PATH` environment variable to use cuda toolkit's ptxas manually instead:
+
+```shell
+export CUDA_HOME=/usr/local/cuda
+export TRITON_PTXAS_PATH="${CUDA_HOME}/bin/ptxas"
+export PATH="${CUDA_HOME}/bin:$PATH"
+```
+
 ## Known Issues
 
 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](https://github.com/vllm-project/vllm/pull/6759).
diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index 5f647aafd61d439bd0d569684b4484a2938e731f..8506e01b96d048fbd9fd52d7faf54a8ff64a0c69 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -123,7 +123,7 @@ We are working on enabling prefix caching and chunked prefill for more categorie
 #### Mamba Models
 
 Models using selective state-space mechanisms instead of standard transformer attention are supported.
-Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`,`FalconMambaForCausalLM`) are supported.
+Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`, `FalconMambaForCausalLM`) are supported.
 
 Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
 `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`, `Plamo2ForCausalLM`).
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index a6d0c5d12dd4164024bc2de82278f98421b5bc80..e9878382b9c53f70f8ef4ca7c89a2dcc81c1bf5e 100755
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -213,37 +213,6 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
     )
 
 
-def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
-    """
-    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
-    show how to process audio inputs.
-    """
-    model_path = snapshot_download(
-        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
-    )
-    # Since the vision-lora and speech-lora co-exist with the base model,
-    # we have to manually specify the path of the lora weights.
-    speech_lora_path = os.path.join(model_path, "speech-lora")
-    placeholders = "<|audio|>" * audio_count
-
-    prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
-
-    engine_args = EngineArgs(
-        model=model_path,
-        max_model_len=12800,
-        max_num_seqs=2,
-        enable_lora=True,
-        max_lora_rank=320,
-        limit_mm_per_prompt={"audio": audio_count},
-    )
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompts,
-        lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
-    )
-
-
 # Qwen2-Audio
 def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
     model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -389,6 +358,34 @@ def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
     )
 
 
+# GLM-ASR
+def run_glmasr(question: str, audio_count: int) -> ModelRequestData:
+    model_name = "zai-org/GLM-ASR-Nano-2512"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    # GLM-ASR uses <|pad|> token for audio
+    audio_placeholder = "<|pad|>" * audio_count
+
+    messages = [{"role": "user", "content": f"{audio_placeholder}{question}"}]
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )
+
+
 # Whisper
 def run_whisper(question: str, audio_count: int) -> ModelRequestData:
     assert audio_count == 1, "Whisper only support single audio input per prompt"
@@ -412,11 +409,11 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
 model_example_map = {
     "audioflamingo3": run_audioflamingo3,
     "gemma3n": run_gemma3n,
+    "glmasr": run_glmasr,
     "granite_speech": run_granite_speech,
     "midashenglm": run_midashenglm,
     "minicpmo": run_minicpmo,
     "phi4_mm": run_phi4mm,
-    "phi4_multimodal": run_phi4_multimodal,
     "qwen2_audio": run_qwen2_audio,
     "qwen2_5_omni": run_qwen2_5_omni,
     "ultravox": run_ultravox,
@@ -498,27 +495,40 @@ def main(args):
         temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
     )
 
-    mm_data = req_data.multi_modal_data
-    if not mm_data:
-        mm_data = {}
-        if audio_count > 0:
-            mm_data = {
-                "audio": [
-                    asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
-                ]
-            }
+    def get_input(start, end):
+        mm_data = req_data.multi_modal_data
+        if not mm_data:
+            mm_data = {}
+            if end - start > 0:
+                mm_data = {
+                    "audio": [
+                        asset.audio_and_sample_rate for asset in audio_assets[start:end]
+                    ]
+                }
 
-    assert args.num_prompts > 0
-    inputs = {"multi_modal_data": mm_data}
+        inputs = {"multi_modal_data": mm_data}
 
-    if req_data.prompt:
-        inputs["prompt"] = req_data.prompt
-    else:
-        inputs["prompt_token_ids"] = req_data.prompt_token_ids
+        if req_data.prompt:
+            inputs["prompt"] = req_data.prompt
+        else:
+            inputs["prompt_token_ids"] = req_data.prompt_token_ids
+
+        return inputs
 
-    if args.num_prompts > 1:
-        # Batch inference
+    # Batch inference
+    assert args.num_prompts > 0
+    if audio_count != 1:
+        inputs = get_input(0, audio_count)
         inputs = [inputs] * args.num_prompts
+    else:
+        # For single audio input, we need to vary the audio input
+        # to avoid deduplication in vLLM engine.
+        inputs = []
+        for i in range(args.num_prompts):
+            start = i % len(audio_assets)
+            inp = get_input(start, start + 1)
+            inputs.append(inp)
+
     # Add LoRA request if applicable
     lora_request = (
         req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py
index c42b00730fe43beda6d4717d7d199aaf2f001a95..bca962597c6bad025f646e944093ce7ee2cdd39a 100644
--- a/examples/offline_inference/basic/chat.py
+++ b/examples/offline_inference/basic/chat.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, EngineArgs
+from vllm.outputs import RequestOutput
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
@@ -44,12 +45,12 @@ def main(args: dict):
     if top_k is not None:
         sampling_params.top_k = top_k
 
-    def print_outputs(outputs):
+    def print_outputs(outputs: list[RequestOutput], prompts: list):
+        assert len(outputs) == len(prompts)
         print("\nGenerated Outputs:\n" + "-" * 80)
-        for output in outputs:
-            prompt = output.prompt
+        for i, output in enumerate(outputs):
             generated_text = output.outputs[0].text
-            print(f"Prompt: {prompt!r}\n")
+            print(f"Prompt: {prompts[i]!r}\n")
             print(f"Generated text: {generated_text!r}")
             print("-" * 80)
 
@@ -66,14 +67,19 @@ def main(args: dict):
         },
     ]
     outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
-    print_outputs(outputs)
+    print_outputs(
+        outputs,
+        [
+            conversation,
+        ],
+    )
 
     # You can run batch inference with llm.chat API
     conversations = [conversation for _ in range(10)]
 
     # We turn on tqdm progress bar to verify it's indeed running batch inference
     outputs = llm.chat(conversations, sampling_params, use_tqdm=True)
-    print_outputs(outputs)
+    print_outputs(outputs, conversations)
 
     # A chat template can be optionally supplied.
     # If not, the model will use its default chat template.
@@ -87,7 +93,7 @@ def main(args: dict):
             use_tqdm=False,
             chat_template=chat_template,
         )
-        print_outputs(outputs)
+        print_outputs(outputs, conversations)
 
 
 if __name__ == "__main__":
diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py
index 17f727b33d321fa4355662306dd2643c4c2bfa89..eeb7137ff7bae59b6c9c37f758527a9da2e6411b 100644
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -4,9 +4,6 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.config import AttentionConfig
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
@@ -23,11 +20,6 @@ def parse_args():
 
 
 def main(args: Namespace):
-    if current_platform.is_rocm():
-        args.attention_config = AttentionConfig(
-            backend=AttentionBackendEnum.FLEX_ATTENTION
-        )
-
     # Sample prompts.
     prompts = [
         "Hello, my name is",
diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py
index b2dadffd249f5c7d5b4c0bfd189f6789034cd2ef..cbca50eb5efa8a8771759e20a531cc881167ab7d 100644
--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -4,9 +4,6 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.config import AttentionConfig
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
@@ -23,11 +20,6 @@ def parse_args():
 
 
 def main(args: Namespace):
-    if current_platform.is_rocm():
-        args.attention_config = AttentionConfig(
-            backend=AttentionBackendEnum.FLEX_ATTENTION
-        )
-
     # Sample prompts.
     text_1 = "What is the capital of France?"
     texts_2 = [
diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py
index 67d33e1881ee90b9fa6f5e2f88e94574fadf4219..fae8590f914eac1e451516f423f387a1583c1442 100644
--- a/examples/offline_inference/context_extension.py
+++ b/examples/offline_inference/context_extension.py
@@ -9,7 +9,7 @@ Usage:
     python examples/offline_inference/context_extension.py
 """
 
-from vllm import LLM, SamplingParams
+from vllm import LLM, RequestOutput, SamplingParams
 
 
 def create_llm():
@@ -45,13 +45,15 @@ def run_llm_chat(llm):
         {"role": "assistant", "content": "Hello! How can I assist you today?"},
     ]
     outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
-    return outputs
+    return outputs, [
+        conversation,
+    ]
 
 
-def print_outputs(outputs):
+def print_outputs(outputs: list[RequestOutput], conversations: list):
     print("\nGenerated Outputs:\n" + "-" * 80)
-    for output in outputs:
-        prompt = output.prompt
+    for i, output in enumerate(outputs):
+        prompt = conversations[i]
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}\n")
         print(f"Generated text: {generated_text!r}")
@@ -60,8 +62,8 @@ def print_outputs(outputs):
 
 def main():
     llm = create_llm()
-    outputs = run_llm_chat(llm)
-    print_outputs(outputs)
+    outputs, conversations = run_llm_chat(llm)
+    print_outputs(outputs, conversations)
 
 
 if __name__ == "__main__":
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index be0b846995a924c709f387f11d9e95da144b04f2..287409fa2b5c12cdca6d5dee678d0da9514e7b23 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -5,130 +5,91 @@ Usage:
 Single node:
     python examples/offline_inference/data_parallel.py \
             --model="ibm-research/PowerMoE-3b" \
-            --dp-size=2 \
-            --tp-size=2
+            -dp=2 \
+            -tp=2
 
 Multi-node:
     Node 0 (assume the node has ip of 10.99.48.128):
             python examples/offline_inference/data_parallel.py \
                     --model="ibm-research/PowerMoE-3b" \
-                    --dp-size=2 \
-                    --tp-size=2 \
-                    --node-size=2 \
-                    --node-rank=0 \
-                    --master-addr=10.99.48.128 \
-                    --master-port=13345
+                    -dp=2 \
+                    -tp=2 \
+                    --dp-num-nodes=2 \
+                    --dp-node-rank=0 \
+                    --dp-master-addr=10.99.48.128 \
+                    --dp-master-port=13345
     Node 1:
             python examples/offline_inference/data_parallel.py \
                     --model="ibm-research/PowerMoE-3b" \
-                    --dp-size=2 \
-                    --tp-size=2 \
-                    --node-size=2 \
-                    --node-rank=1 \
-                    --master-addr=10.99.48.128 \
-                    --master-port=13345
+                    -dp=2 \
+                    -tp=2 \
+                    --dp-num-nodes=2 \
+                    --dp-node-rank=1 \
+                    --dp-master-addr=10.99.48.128 \
+                    --dp-master-port=13345
 """
 
 import os
 from time import sleep
 
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
 from vllm.platforms import current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import get_open_port
 
 
-def parse_args():
-    import argparse
+def create_parser():
+    parser = FlexibleArgumentParser(description="Data Parallel Inference")
 
-    parser = argparse.ArgumentParser(description="Data Parallel Inference")
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="ibm-research/PowerMoE-3b",
-        help="Model name or path",
-    )
-    parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
-    parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
-    parser.add_argument(
-        "--node-size", type=int, default=1, help="Total number of nodes"
-    )
-    parser.add_argument(
-        "--node-rank", type=int, default=0, help="Rank of the current node"
-    )
-    parser.add_argument(
-        "--master-addr", type=str, default="", help="Master node IP address"
+    # Add all engine args
+    EngineArgs.add_cli_args(parser)
+    parser.set_defaults(
+        model="ibm-research/PowerMoE-3b",
+        enable_expert_parallel=True,
     )
-    parser.add_argument("--master-port", type=int, default=0, help="Master node port")
+
+    # Add DP-specific args (separate from engine args to avoid conflicts)
     parser.add_argument(
-        "--enforce-eager", action="store_true", help="Enforce eager mode execution."
+        "--dp-num-nodes",
+        type=int,
+        default=1,
+        help="Total number of nodes for data parallel.",
     )
     parser.add_argument(
-        "--trust-remote-code", action="store_true", help="Trust remote code."
+        "--dp-node-rank",
+        type=int,
+        default=0,
+        help="Rank of the current node for data parallel.",
     )
     parser.add_argument(
-        "--max-num-seqs",
-        type=int,
-        default=64,
-        help=("Maximum number of sequences to be processed in a single iteration."),
+        "--dp-master-addr",
+        type=str,
+        default="",
+        help="Master node IP address for DP coordination.",
     )
     parser.add_argument(
-        "--max-model-len",
+        "--dp-master-port",
         type=int,
-        help=("Maximum number of tokens to be processed in a single iteration."),
+        default=0,
+        help="Master node port for DP coordination.",
     )
     parser.add_argument(
         "--timeout",
         type=int,
         default=300,
-        help=("Number of seconds before unresponsive process is killed."),
-    )
-    parser.add_argument(
-        "--gpu-memory-utilization",
-        type=float,
-        default=0.8,
-        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
-    )
-    parser.add_argument(
-        "--enable-dbo",
-        action="store_true",
-        help=("Enable microbatched execution"),
-    )
-    parser.add_argument(
-        "--compilation-config",
-        type=int,
-        help=("Compilation optimization (O) mode 0-3."),
-    )
-    parser.add_argument(
-        "--quantization",
-        type=str,
-    )
-    parser.add_argument(
-        "--disable-expert-parallel",
-        dest="enable_expert_parallel",
-        action="store_false",
-        help="Disable expert parallel (default: enabled).",
+        help="Number of seconds before unresponsive process is killed.",
     )
-    parser.set_defaults(enable_expert_parallel=True)
-    return parser.parse_args()
+
+    return parser
 
 
 def main(
-    model,
     dp_size,
     local_dp_rank,
     global_dp_rank,
     dp_master_ip,
     dp_master_port,
-    GPUs_per_dp_rank,
-    enforce_eager,
-    enable_expert_parallel,
-    trust_remote_code,
-    max_num_seqs,
-    max_model_len,
-    compilation_config,
-    gpu_memory_utilization,
-    enable_dbo,
-    quantization,
+    engine_args,
 ):
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@@ -173,19 +134,7 @@ def main(
     )
 
     # Create an LLM.
-    llm = LLM(
-        model=model,
-        tensor_parallel_size=GPUs_per_dp_rank,
-        enforce_eager=enforce_eager,
-        enable_expert_parallel=enable_expert_parallel,
-        trust_remote_code=trust_remote_code,
-        max_num_seqs=max_num_seqs,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=gpu_memory_utilization,
-        enable_dbo=enable_dbo,
-        quantization=quantization,
-        compilation_config=compilation_config,
-    )
+    llm = LLM(**engine_args)
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
     for i, output in enumerate(outputs):
@@ -204,22 +153,29 @@ def main(
 
 
 if __name__ == "__main__":
-    args = parse_args()
+    parser = create_parser()
+    args = vars(parser.parse_args())
+
+    # Extract DP-specific args (pop to remove from engine_args)
+    dp_size = args.pop("data_parallel_size")
+    dp_num_nodes = args.pop("dp_num_nodes")
+    dp_node_rank = args.pop("dp_node_rank")
+    dp_master_addr = args.pop("dp_master_addr")
+    dp_master_port = args.pop("dp_master_port")
+    timeout = args.pop("timeout")
 
-    dp_size = args.dp_size
-    tp_size = args.tp_size
-    node_size = args.node_size
-    node_rank = args.node_rank
+    # Remaining args are engine args
+    engine_args = args
 
-    if node_size == 1:
+    if dp_num_nodes == 1:
         dp_master_ip = "127.0.0.1"
-        dp_master_port = get_open_port()
+        dp_master_port_val = get_open_port()
     else:
-        dp_master_ip = args.master_addr
-        dp_master_port = args.master_port
+        dp_master_ip = dp_master_addr
+        dp_master_port_val = dp_master_port
 
-    assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
-    dp_per_node = dp_size // node_size
+    assert dp_size % dp_num_nodes == 0, "dp_size should be divisible by dp_num_nodes"
+    dp_per_node = dp_size // dp_num_nodes
 
     from multiprocessing import Process
 
@@ -230,34 +186,24 @@ if __name__ == "__main__":
 
     procs = []
     for local_dp_rank, global_dp_rank in enumerate(
-        range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
+        range(dp_node_rank * dp_per_node, (dp_node_rank + 1) * dp_per_node)
     ):
         proc = Process(
             target=main,
             args=(
-                args.model,
                 dp_size,
                 local_dp_rank,
                 global_dp_rank,
                 dp_master_ip,
-                dp_master_port,
-                tp_size,
-                args.enforce_eager,
-                args.enable_expert_parallel,
-                args.trust_remote_code,
-                args.max_num_seqs,
-                args.max_model_len,
-                args.compilation_config,
-                args.gpu_memory_utilization,
-                args.enable_dbo,
-                args.quantization,
+                dp_master_port_val,
+                engine_args,
             ),
         )
         proc.start()
         procs.append(proc)
     exit_code = 0
     for proc in procs:
-        proc.join(timeout=args.timeout)
+        proc.join(timeout=timeout)
         if proc.exitcode is None:
             print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
             proc.kill()
diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md
index d8fb50d7fe55c823792acb0857ae95dc5a2eb49b..409ac0223b555aaba05ad13855db6121f564beda 100644
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -10,7 +10,6 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q mixed_modalities
 
 # Read vision and audio inputs from a single video file
-# NOTE: V1 engine does not support interleaved modalities yet.
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q use_audio_in_video
 
diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 29b2e95d262f8fbaf23cf5701cfef6d77ab9bbd9..a84d5b11638da4ec6b18354f62be9546a24f6227 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -152,9 +152,12 @@ def main(args):
 
     # print the generated text
     if args.print_output:
-        for output in outputs:
+        for i, output in enumerate(outputs):
             print("-" * 50)
-            print(f"prompt: {output.prompt}")
+            if not args.custom_mm_prompts:
+                print(f"prompt: {prompts[i].prompt}")
+            else:
+                print(f"prompt: {prompts[i]}")
             print(f"generated text: {output.outputs[0].text}")
             print("-" * 50)
 
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index dd5b22ae9b0f64f2e46c9578d96152e4575f5b52..2d8c6081e5f93cf98202a728d2017778a672ee0d 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -15,7 +15,7 @@ from dataclasses import asdict
 from typing import NamedTuple
 
 from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
+from transformers import AutoProcessor, AutoTokenizer
 
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.image import ImageAsset
@@ -769,6 +769,33 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# Kanana-V
+def run_kanana_v(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "kakaocorp/kanana-1.5-v-3b-instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        trust_remote_code=True,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # Keye-VL
 def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@@ -875,6 +902,37 @@ def run_lightonocr(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+def run_lfm2_vl(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "LiquidAI/LFM2-VL-450M"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    processor = AutoProcessor.from_pretrained(model_name)
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [{"type": "image"}, {"type": "text", "text": question}],
+            }
+        ]
+        for question in questions
+    ]
+    prompts = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
 
@@ -1424,41 +1482,6 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# HF format Phi-4-multimodal-instruct
-def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
-    """
-    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
-    show how to process image inputs.
-    """
-    assert modality == "image"
-    model_path = snapshot_download(
-        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
-    )
-    # Since the vision-lora and speech-lora co-exist with the base model,
-    # we have to manually specify the path of the lora weights.
-    vision_lora_path = os.path.join(model_path, "vision-lora")
-    prompts = [
-        f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions
-    ]
-    engine_args = EngineArgs(
-        model=model_path,
-        max_model_len=5120,
-        max_num_seqs=2,
-        max_num_batched_tokens=12800,
-        enable_lora=True,
-        max_lora_rank=320,
-        # Note - mm_processor_kwargs can also be passed to generate/chat calls
-        mm_processor_kwargs={"dynamic_hd": 16},
-        limit_mm_per_prompt={"image": 1},
-    )
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
-    )
-
-
 # Pixtral HF-format
 def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1880,10 +1903,12 @@ model_example_map = {
     "idefics3": run_idefics3,
     "interns1": run_interns1,
     "internvl_chat": run_internvl,
+    "kanana_v": run_kanana_v,
     "keye_vl": run_keye_vl,
     "keye_vl1_5": run_keye_vl1_5,
     "kimi_vl": run_kimi_vl,
     "lightonocr": run_lightonocr,
+    "lfm2_vl": run_lfm2_vl,
     "llama4": run_llama4,
     "llava": run_llava,
     "llava-next": run_llava_next,
@@ -1904,7 +1929,6 @@ model_example_map = {
     "paligemma2": run_paligemma2,
     "phi3_v": run_phi3v,
     "phi4_mm": run_phi4mm,
-    "phi4_multimodal": run_phi4_multimodal,
     "pixtral_hf": run_pixtral_hf,
     "qwen_vl": run_qwen_vl,
     "qwen2_vl": run_qwen2_vl,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 3c01806baa2036066ad225d313680d108eea2364..2d7aece527aeeb5372a48208ca736f3bfea56eb8 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -932,40 +932,6 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
-    """
-    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
-    show how to process multi images inputs.
-    """
-
-    model_path = snapshot_download(
-        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
-    )
-    # Since the vision-lora and speech-lora co-exist with the base model,
-    # we have to manually specify the path of the lora weights.
-    vision_lora_path = os.path.join(model_path, "vision-lora")
-    engine_args = EngineArgs(
-        model=model_path,
-        max_model_len=4096,
-        max_num_seqs=2,
-        limit_mm_per_prompt={"image": len(image_urls)},
-        enable_lora=True,
-        max_lora_rank=320,
-        # Note - mm_processor_kwargs can also be passed to generate/chat calls
-        mm_processor_kwargs={"dynamic_hd": 4},
-    )
-
-    placeholders = "<|image|>" * len(image_urls)
-    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
-        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
-    )
-
-
 def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "Qwen/Qwen-VL-Chat"
     engine_args = EngineArgs(
@@ -1363,7 +1329,6 @@ model_example_map = {
     "paddleocr_vl": load_paddleocr_vl,
     "phi3_v": load_phi3v,
     "phi4_mm": load_phi4mm,
-    "phi4_multimodal": load_phi4_multimodal,
     "pixtral_hf": load_pixtral_hf,
     "qwen_vl_chat": load_qwen_vl_chat,
     "qwen2_vl": load_qwen2_vl,
diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md
index b2c3bb974dfabcfc3371199024af663fac108c16..2a59f86d15fb7bd2c93fa938c807268e426fef60 100644
--- a/examples/online_serving/disaggregated_encoder/README.md
+++ b/examples/online_serving/disaggregated_encoder/README.md
@@ -38,6 +38,8 @@ Encoder engines should be launched with the following flags:
 
 - `--max-num-batched-tokens=<large value>` **(default: 2048)** – This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.
 
+- `--convert "mm_encoder_only"` **(Optional)** - The language model is skipped during initialization to reduce device memory usage. **Models using this option must implement the `get_language_model_spec` interface.**
+
 ## Local media inputs
 
 To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance:
diff --git a/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py b/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..50732061243299037ac972e22ca4062021f056ed
--- /dev/null
+++ b/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
@@ -0,0 +1,320 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import copy
+import logging
+import os
+import re
+import socket
+import threading
+import uuid
+
+import aiohttp
+import msgpack
+import zmq
+from quart import Quart, make_response, request
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+prefill_instances: list[dict] = []
+decode_instances: list[dict] = []
+request_nums = 0
+app = Quart(__name__)
+
+IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)")
+
+
+TRANSFER_TYPE = None
+
+
+def _append_whole_dict_unique(target_list, data_dict):
+    new_filtered = {k: v for k, v in data_dict.items() if k != "index"}
+    for existed in target_list:
+        existed_filtered = {k: v for k, v in existed.items() if k != "index"}
+        if existed_filtered == new_filtered:
+            return False
+    print("!!APPEND!!", data_dict)
+    target_list.append(data_dict)
+    transfer_mode = data_dict.get("transfer_mode", "unknown")
+    global TRANSFER_TYPE
+
+    if TRANSFER_TYPE is None:
+        TRANSFER_TYPE = transfer_mode
+        logger.info("SET TRANSFER TYPE TO %s", TRANSFER_TYPE)
+    elif transfer_mode != TRANSFER_TYPE:
+        raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}")
+
+    return True
+
+
+_list_lock = threading.RLock()
+
+
+def _listen_for_register(hostname, port):
+    context = zmq.Context()
+    router_socket = context.socket(zmq.ROUTER)
+    router_socket.bind(f"tcp://{hostname}:{port}")
+    poller = zmq.Poller()
+    poller.register(router_socket, zmq.POLLIN)
+    global prefill_instances
+    global decode_instances
+
+    while True:
+        socks = dict(poller.poll())
+        if router_socket in socks:
+            remote_addr, msg = router_socket.recv_multipart()
+            data = msgpack.loads(msg)
+            if data["type"] == "HELLO":
+                pass
+            elif (
+                data["type"] == "register"
+                and data["role"] == "P"
+                and data["request_address"] not in prefill_instances
+            ):
+                with _list_lock:
+                    _append_whole_dict_unique(prefill_instances, data)
+
+            elif (
+                data["type"] == "register"
+                and data["role"] == "D"
+                and data["request_address"] not in decode_instances
+            ):
+                with _list_lock:
+                    _append_whole_dict_unique(decode_instances, data)
+
+
+def start_service_discovery(hostname, port):
+    if not hostname:
+        hostname = socket.gethostname()
+    if port == 0:
+        raise ValueError("Port cannot be 0")
+
+    _listener_thread = threading.Thread(
+        target=_listen_for_register, args=(hostname, port), daemon=True
+    )
+    _listener_thread.start()
+    return _listener_thread
+
+
+async def send_request_to_prefill(
+    endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank
+):
+    req_data_copy = req_data
+
+    req_data_copy["kv_transfer_params"].update(
+        {
+            "do_remote_decode": True,
+            "do_remote_prefill": False,
+            "remote_handshake_port": d_endpoint["handshake_port"],
+            "remote_notify_port": d_endpoint["notify_port"],
+            "remote_engine_id": None,
+            "remote_block_ids": None,
+            "remote_host": dip,
+            "remote_port": dport,
+        }
+    )
+    req_data_copy["stream"] = False
+    req_data_copy["max_tokens"] = 1
+    if "max_completion_tokens" in req_data_copy:
+        req_data_copy["max_completion_tokens"] = 1
+    if "stream_options" in req_data_copy:
+        del req_data_copy["stream_options"]
+    async with aiohttp.ClientSession(
+        timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000)
+    ) as session:
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+            "X-Request-Id": request_id,
+        }
+        if selected_prefill_dp_rank is not None:
+            headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank)
+        async with session.post(
+            url=endpoint, json=req_data_copy, headers=headers
+        ) as response:
+            if response.status == 200:
+                return await response.json()
+
+            else:
+                raise RuntimeError(
+                    "send_request_to_prefill response.status != 200response.status = ",
+                    response.status,
+                )
+
+
+async def start_decode_request(endpoint, req_data, request_id):
+    session = aiohttp.ClientSession(
+        timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000)
+    )
+    headers = {
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        "X-Request-Id": request_id,
+    }
+    response = await session.post(url=endpoint, json=req_data, headers=headers)
+    return session, response
+
+
+async def stream_decode_response(session, response, request_id):
+    try:
+        if response.status == 200:
+            async for chunk_bytes in response.content.iter_chunked(1024):
+                yield chunk_bytes
+        else:
+            raise RuntimeError(
+                f"decode response.status != 200, status = {response.status}"
+            )
+    finally:
+        await session.close()
+
+
+async def send_request_to_decode(endpoint, req_data, request_id):
+    async with aiohttp.ClientSession(
+        timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000)
+    ) as session:
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+            "X-Request-Id": request_id,
+        }
+        async with session.post(
+            url=endpoint, json=req_data, headers=headers
+        ) as response:
+            if response.status == 200:
+                async for chunk_bytes in response.content.iter_chunked(1024):
+                    yield chunk_bytes
+            else:
+                raise RuntimeError(
+                    "send_request_to_decode response.status != 200,response.statuus = ",
+                    response.status,
+                )
+
+
+def example_round_robin_dp_loader(request_number, dp_size):
+    return request_nums % dp_size
+
+
+@app.route("/v1/completions", methods=["POST"])
+@app.route("/v1/chat/completions", methods=["POST"])
+async def handle_request():
+    try:
+        with _list_lock:
+            global request_nums
+            request_nums += 1
+
+        def extract_ip_port_fast(url):
+            match = IP_PORT_PATTERN.search(url)
+            if not match:
+                raise ValueError(f"Invalid URL format: {url}")
+            return match.groups()
+
+        req_data = await request.get_json()
+        request_id = str(uuid.uuid4())
+
+        prefill_instance_endpoint = None
+        decode_instance_endpoint = None
+        error_msg = (
+            "Service Unavailable: No prefill or decode instances are registered."
+        )
+        if not prefill_instances or not decode_instances:
+            return await make_response(
+                (
+                    error_msg,
+                    503,
+                )
+            )
+        pid = request_nums % len(prefill_instances)
+        did = request_nums % len(decode_instances)
+        prefill_instance_endpoint = prefill_instances[pid]
+        decode_instance_endpoint = decode_instances[did]
+
+        selected_prefill_dp_rank = None
+        if prefill_instance_endpoint["dp_size"] > 1:
+            selected_prefill_dp_rank = example_round_robin_dp_loader(
+                request_nums // len(prefill_instance_endpoint),
+                prefill_instance_endpoint["dp_size"],
+            )
+
+        dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"])
+        ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"])
+
+        req_data_to_prefill = copy.deepcopy(req_data)
+        req_data_to_prefill["kv_transfer_params"] = {}
+        req_data["kv_transfer_params"] = {}
+        req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = (
+            decode_instance_endpoint["dp_size"]
+        )
+        req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = (
+            decode_instance_endpoint["tp_size"]
+        )
+
+        send_prefill_task = asyncio.create_task(
+            send_request_to_prefill(
+                prefill_instance_endpoint["request_address"],
+                req_data_to_prefill,
+                request_id,
+                decode_instance_endpoint,
+                dip,
+                dport,
+                selected_prefill_dp_rank,
+            )
+        )
+        ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"])
+
+        req_data["max_tokens"] -= 1
+
+        req_data["kv_transfer_params"] = {
+            "do_remote_decode": False,
+            "do_remote_prefill": True,
+            "remote_handshake_port": prefill_instance_endpoint["handshake_port"],
+            "remote_notify_port": prefill_instance_endpoint["notify_port"],
+            "remote_engine_id": None,
+            "remote_block_ids": None,
+            "remote_host": ip,
+            "remote_port": port,
+        }
+        if TRANSFER_TYPE == "READ":
+            # In read mode, prefill and decode are executed serially.
+            prefill_response = await send_prefill_task
+            req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[
+                "kv_transfer_params"
+            ]["remote_engine_id"]
+            req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[
+                "kv_transfer_params"
+            ]["remote_block_ids"]
+
+        req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[
+            "dp_size"
+        ]
+        req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[
+            "tp_size"
+        ]
+
+        if selected_prefill_dp_rank is not None:
+            req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank
+
+        decode_request_task = asyncio.create_task(
+            start_decode_request(
+                decode_instance_endpoint["request_address"], req_data, request_id
+            )
+        )
+
+        session, decode_response = await decode_request_task
+        stream_generator = stream_decode_response(session, decode_response, request_id)
+        response = await make_response(stream_generator)
+        return response
+    except Exception as e:
+        logger.exception("An error occurred while handling the request: %s", e)
+        return await make_response(
+            (
+                f"Internal Server Error: {e!s}",
+                500,
+            )
+        )
+
+
+if __name__ == "__main__":
+    t = start_service_discovery("0.0.0.0", 36367)
+    app.debug = True
+    app.config["BODY_TIMEOUT"] = 360000
+    app.config["RESPONSE_TIMEOUT"] = 360000
+
+    app.run(host="0.0.0.0", port=10001)
+    t.join()
diff --git a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
index 6845545b6fd17777a928e2adb521f890304a4cee..20bf598c03e26e5dabebdf373e494f02a9bf1a19 100644
--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@@ -55,7 +55,6 @@ done
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
 
 export RAY_DEDUP_LOGS=0
-export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1
 
 vllm serve $MODEL_NAME \
@@ -65,6 +64,7 @@ vllm serve $MODEL_NAME \
     --enforce-eager \
     --enable-expert-parallel \
     --enable-eplb \
+    --all2all-backend pplx \
     --num-redundant-experts $REDUNDANT_EXPERTS \
     --trust-remote-code \
     --host $HOST \
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py
index 19f6bd5726102f9ed02bfa9eb502a4fa3526a95c..30c3986f2fa40968c65be7d3eeea3fa7f236612e 100644
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -28,8 +28,14 @@ class BlockStored(KVCacheEvent):
     parent_block_hash: ExternalBlockHash | None
     token_ids: list[int]
     block_size: int
+
     lora_id: int | None
+    """Deprecated: use `lora_name` for KV block key hash.
+    Retained for backward compatibility.
+    """
+
     medium: str | None
+    lora_name: str | None
 
 
 class BlockRemoved(KVCacheEvent):
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 3d1259276998d7097023572e4723f1f0034d5561..198863ae4a8b4373a487f21631b55d59ba6b2c67 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -21,6 +21,7 @@ python openai_chat_completion_client_for_multimodal.py --chat-type audio
 """
 
 import base64
+import os
 
 import requests
 from openai import OpenAI
@@ -51,6 +52,16 @@ def encode_base64_content_from_url(content_url: str) -> str:
     return result
 
 
+def encode_base64_content_from_file(file_path: str) -> str:
+    """Encode a local file content to base64 format."""
+
+    with open(file_path, "rb") as file:
+        file_content = file.read()
+        result = base64.b64encode(file_content).decode("utf-8")
+
+    return result
+
+
 # Text-only inference
 def run_text_only(model: str, max_completion_tokens: int) -> None:
     chat_completion = client.chat.completions.create(
@@ -67,6 +78,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None:
 def run_single_image(model: str, max_completion_tokens: int) -> None:
     ## Use image url in the payload
     image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    image_file = "/path/to/image.jpg"  # local file
     chat_completion_from_url = client.chat.completions.create(
         messages=[
             {
@@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
     result = chat_completion_from_url.choices[0].message.content
     print("Chat completion output from image url:\n", result)
 
+    ## Use local image url in the payload
+    # Launch the API server/engine with the --allowed-local-media-path argument.
+    if os.path.exists(image_file):
+        chat_completion_from_local_image_url = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"file://{image_file}"},
+                        },
+                    ],
+                }
+            ],
+            model=model,
+            max_completion_tokens=max_completion_tokens,
+        )
+        result = chat_completion_from_local_image_url.choices[0].message.content
+        print("Chat completion output from local image file:\n", result)
+    else:
+        print(f"Local image file not found at {image_file}, skipping local file test.")
+
     ## Use base64 encoded image in the payload
     image_base64 = encode_base64_content_from_url(image_url)
     chat_completion_from_base64 = client.chat.completions.create(
@@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
     result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from base64 encoded image:", result)
 
+    ## Use base64 encoded local image in the payload
+    if os.path.exists(image_file):
+        local_image_base64 = encode_base64_content_from_file(image_file)
+        chat_completion_from_local_image_base64 = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{local_image_base64}"
+                            },
+                        },
+                    ],
+                }
+            ],
+            model=model,
+            max_completion_tokens=max_completion_tokens,
+        )
+
+        result = chat_completion_from_local_image_base64.choices[0].message.content
+        print("Chat completion output from base64 encoded local image:", result)
+    else:
+        print(f"Local image file not found at {image_file}, skipping local file test.")
+
 
 # Multi-image input inference
 def run_multi_image(model: str, max_completion_tokens: int) -> None:
diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py
index 0d1d73fb14a3ab9bdc6f9f9a53495dfcc36ee35a..966bfd2a47a009c8795d33deeb385b5ec363c3a3 100644
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -18,6 +18,7 @@ The script performs:
 2. Streaming transcription using raw HTTP request to the vLLM server.
 """
 
+import argparse
 import asyncio
 
 from openai import AsyncOpenAI, OpenAI
@@ -25,14 +26,14 @@ from openai import AsyncOpenAI, OpenAI
 from vllm.assets.audio import AudioAsset
 
 
-def sync_openai(audio_path: str, client: OpenAI):
+def sync_openai(audio_path: str, client: OpenAI, model: str):
     """
     Perform synchronous transcription using OpenAI-compatible API.
     """
     with open(audio_path, "rb") as f:
         transcription = client.audio.transcriptions.create(
             file=f,
-            model="openai/whisper-large-v3",
+            model=model,
             language="en",
             response_format="json",
             temperature=0.0,
@@ -42,18 +43,18 @@ def sync_openai(audio_path: str, client: OpenAI):
                 repetition_penalty=1.3,
             ),
         )
-        print("transcription result:", transcription.text)
+        print("transcription result [sync]:", transcription.text)
 
 
-async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
+async def stream_openai_response(audio_path: str, client: AsyncOpenAI, model: str):
     """
     Perform asynchronous transcription using OpenAI-compatible API.
     """
-    print("\ntranscription result:", end=" ")
+    print("\ntranscription result [stream]:", end=" ")
     with open(audio_path, "rb") as f:
         transcription = await client.audio.transcriptions.create(
             file=f,
-            model="openai/whisper-large-v3",
+            model=model,
             language="en",
             response_format="json",
             temperature=0.0,
@@ -72,7 +73,47 @@ async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
     print()  # Final newline after stream ends
 
 
-def main():
+def stream_api_response(audio_path: str, model: str, openai_api_base: str):
+    """
+    Perform streaming transcription using raw HTTP requests to the vLLM API server.
+    """
+    import json
+    import os
+
+    import requests
+
+    api_url = f"{openai_api_base}/audio/transcriptions"
+    headers = {"User-Agent": "Transcription-Client"}
+    with open(audio_path, "rb") as f:
+        files = {"file": (os.path.basename(audio_path), f)}
+        data = {
+            "stream": "true",
+            "model": model,
+            "language": "en",
+            "response_format": "json",
+        }
+
+        print("\ntranscription result [stream]:", end=" ")
+        response = requests.post(
+            api_url, headers=headers, files=files, data=data, stream=True
+        )
+        for chunk in response.iter_lines(
+            chunk_size=8192, decode_unicode=False, delimiter=b"\n"
+        ):
+            if chunk:
+                data = chunk[len("data: ") :]
+                data = json.loads(data.decode("utf-8"))
+                data = data["choices"][0]
+                delta = data["delta"]["content"]
+                print(delta, end="", flush=True)
+
+                finish_reason = data.get("finish_reason")
+                if finish_reason is not None:
+                    print(f"\n[Stream finished reason: {finish_reason}]")
+                    break
+
+
+def main(args):
     mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
     winning_call = str(AudioAsset("winning_call").get_local_path())
 
@@ -84,14 +125,41 @@ def main():
         base_url=openai_api_base,
     )
 
-    sync_openai(mary_had_lamb, client)
+    model = client.models.list().data[0].id
+    print(f"Using model: {model}")
+
+    # Run the synchronous function
+    sync_openai(args.audio_path if args.audio_path else mary_had_lamb, client, model)
+
     # Run the asynchronous function
-    client = AsyncOpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-    asyncio.run(stream_openai_response(winning_call, client))
+    if "openai" in model:
+        client = AsyncOpenAI(
+            api_key=openai_api_key,
+            base_url=openai_api_base,
+        )
+        asyncio.run(
+            stream_openai_response(
+                args.audio_path if args.audio_path else winning_call, client, model
+            )
+        )
+    else:
+        stream_api_response(
+            args.audio_path if args.audio_path else winning_call,
+            model,
+            openai_api_base,
+        )
 
 
 if __name__ == "__main__":
-    main()
+    # setup argparser
+    parser = argparse.ArgumentParser(
+        description="OpenAI Transcription Client using vLLM API Server"
+    )
+    parser.add_argument(
+        "--audio_path",
+        type=str,
+        default=None,
+        help="The path to the audio file to transcribe.",
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/online_serving/openai_translation_client.py b/examples/online_serving/openai_translation_client.py
index 6f7253e2a7894dfad9cb259f770c24aebea1647a..264e386436971011ee34c6522f7a6113168c9563 100644
--- a/examples/online_serving/openai_translation_client.py
+++ b/examples/online_serving/openai_translation_client.py
@@ -9,11 +9,11 @@ from openai import OpenAI
 from vllm.assets.audio import AudioAsset
 
 
-def sync_openai(audio_path: str, client: OpenAI):
+def sync_openai(audio_path: str, client: OpenAI, model: str):
     with open(audio_path, "rb") as f:
         translation = client.audio.translations.create(
             file=f,
-            model="openai/whisper-large-v3",
+            model=model,
             response_format="json",
             temperature=0.0,
             # Additional params not provided by OpenAI API.
@@ -26,11 +26,13 @@ def sync_openai(audio_path: str, client: OpenAI):
         print("translation result:", translation.text)
 
 
-async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
+async def stream_openai_response(
+    audio_path: str, base_url: str, api_key: str, model: str
+):
     data = {
         "language": "it",
         "stream": True,
-        "model": "openai/whisper-large-v3",
+        "model": model,
     }
     url = base_url + "/audio/translations"
     headers = {"Authorization": f"Bearer {api_key}"}
@@ -66,9 +68,13 @@ def main():
         api_key=openai_api_key,
         base_url=openai_api_base,
     )
-    sync_openai(foscolo, client)
+
+    model = client.models.list().data[0].id
+    print(f"Using model: {model}")
+
+    sync_openai(foscolo, client, model)
     # Run the asynchronous function
-    asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key))
+    asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key, model))
 
 
 if __name__ == "__main__":
diff --git a/examples/pooling/embed/openai_embedding_long_text/README.md b/examples/pooling/embed/openai_embedding_long_text/README.md
index 00d3ded3e41c1b98a244b40891a7d01f9f311e94..0eda6081035843acc8fd3940d1d4f3a2700f7569 100644
--- a/examples/pooling/embed/openai_embedding_long_text/README.md
+++ b/examples/pooling/embed/openai_embedding_long_text/README.md
@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`:
 ```json
 {
   "pooling_type": "auto",
-  "normalize": true,
+  "use_activation": true,
   "enable_chunked_processing": true,
   "max_embed_len": 3072000
 }
diff --git a/examples/pooling/embed/openai_embedding_long_text/client.py b/examples/pooling/embed/openai_embedding_long_text/client.py
index 4a3674bb3f2a8852818c2910d89d18b0c8e3b16e..7cc33b1f24e20a682fb846aa65b917bddc757a0b 100644
--- a/examples/pooling/embed/openai_embedding_long_text/client.py
+++ b/examples/pooling/embed/openai_embedding_long_text/client.py
@@ -14,7 +14,7 @@ Prerequisites:
    # MEAN pooling (processes all chunks, recommended for complete coverage)
    vllm serve intfloat/multilingual-e5-large \
      --pooler-config \
-      '{"pooling_type": "MEAN", "normalize": true, ' \
+      '{"pooling_type": "MEAN", "use_activation": true, ' \
       '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
      --served-model-name multilingual-e5-large \
      --trust-remote-code \
@@ -24,7 +24,7 @@ Prerequisites:
    # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
    vllm serve BAAI/bge-large-en-v1.5 \
      --pooler-config \
-      '{"pooling_type": "CLS", "normalize": true, ' \
+      '{"pooling_type": "CLS", "use_activation": true, ' \
       '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
      --served-model-name bge-large-en-v1.5 \
      --trust-remote-code \
diff --git a/examples/pooling/embed/openai_embedding_long_text/service.sh b/examples/pooling/embed/openai_embedding_long_text/service.sh
index b5c92749466b05b7e5cb90a53df2baa0fff09c7b..0353b8f5a2be8085519d84c5bb92a7162e181f91 100644
--- a/examples/pooling/embed/openai_embedding_long_text/service.sh
+++ b/examples/pooling/embed/openai_embedding_long_text/service.sh
@@ -96,7 +96,7 @@ echo ""
 echo "🔧 Starting server with enhanced chunked processing configuration..."
 
 # Build pooler config JSON
-POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
+POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
 
 # Start vLLM server with enhanced chunked processing
 vllm serve "$MODEL_NAME" \
diff --git a/examples/pooling/embed/vision_embedding_offline.py b/examples/pooling/embed/vision_embedding_offline.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef272badebbd60bea745dd1836901a8e004fae91
--- /dev/null
+++ b/examples/pooling/embed/vision_embedding_offline.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+"""
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for multimodal embedding.
+
+For most models, the prompt format should follow corresponding examples
+on HuggingFace model repository.
+"""
+
+import argparse
+from dataclasses import asdict
+
+from vllm import LLM, EngineArgs
+from vllm.multimodal.utils import fetch_image
+
+image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
+text = "A cat standing in the snow."
+multi_modal_data = {"image": fetch_image(image_url)}
+
+
+def print_embeddings(embeds):
+    embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
+    print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+
+
+def run_qwen3_vl():
+    engine_args = EngineArgs(
+        model="Qwen/Qwen3-VL-Embedding-2B",
+        runner="pooling",
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": 1},
+    )
+    default_instruction = "Represent the user's input."
+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
+    image_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
+    image_text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
+
+    llm = LLM(**asdict(engine_args))
+
+    print("Text embedding output:")
+    outputs = llm.embed(text_prompt, use_tqdm=False)
+    print_embeddings(outputs[0].outputs.embedding)
+
+    print("Image embedding output:")
+    outputs = llm.embed(
+        {
+            "prompt": image_prompt,
+            "multi_modal_data": multi_modal_data,
+        },
+        use_tqdm=False,
+    )
+    print_embeddings(outputs[0].outputs.embedding)
+
+    print("Image+Text embedding output:")
+    outputs = llm.embed(
+        {
+            "prompt": image_text_prompt,
+            "multi_modal_data": multi_modal_data,
+        },
+        use_tqdm=False,
+    )
+    print_embeddings(outputs[0].outputs.embedding)
+
+
+model_example_map = {
+    "qwen3_vl": run_qwen3_vl,
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        "Script to run a specified VLM through vLLM offline api."
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        choices=model_example_map.keys(),
+        required=True,
+        help="The name of the embedding model.",
+    )
+    return parser.parse_args()
+
+
+def main(args):
+    model_example_map[args.model]()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py b/examples/pooling/embed/vision_embedding_online.py
similarity index 66%
rename from examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
rename to examples/pooling/embed/vision_embedding_online.py
index a7ab7e73e7d42c2ee1e3a661b5b2edad4a662e90..66c824739f415ea380d4560a987710ab0241ffce 100644
--- a/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/pooling/embed/vision_embedding_online.py
@@ -21,7 +21,8 @@ from PIL import Image
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
+text = "A cat standing in the snow."
 
 
 def create_chat_embeddings(
@@ -30,6 +31,8 @@ def create_chat_embeddings(
     messages: list[ChatCompletionMessageParam],
     model: str,
     encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
+    continue_final_message: bool = False,
+    add_special_tokens: bool = False,
 ) -> CreateEmbeddingResponse:
     """
     Convenience function for accessing vLLM's Chat Embeddings API,
@@ -38,10 +41,21 @@ def create_chat_embeddings(
     return client.post(
         "/embeddings",
         cast_to=CreateEmbeddingResponse,
-        body={"messages": messages, "model": model, "encoding_format": encoding_format},
+        body={
+            "messages": messages,
+            "model": model,
+            "encoding_format": encoding_format,
+            "continue_final_message": continue_final_message,
+            "add_special_tokens": add_special_tokens,
+        },
     )
 
 
+def print_embeddings(embeds):
+    embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
+    print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+
+
 def run_clip(client: OpenAI, model: str):
     """
     Start the server using:
@@ -145,6 +159,113 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
     print("Text embedding output:", response.data[0].embedding)
 
 
+def run_qwen3_vl(client: OpenAI, model: str):
+    """
+    Start the server using:
+
+    vllm serve Qwen/Qwen3-VL-Embedding-2B \
+        --runner pooling \
+        --max-model-len 8192
+    """
+
+    default_instruction = "Represent the user's input."
+
+    print("Text embedding output:")
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": default_instruction},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": text},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": ""},
+                ],
+            },
+        ],
+        model=model,
+        encoding_format="float",
+        continue_final_message=True,
+        add_special_tokens=True,
+    )
+    print_embeddings(response.data[0].embedding)
+
+    print("Image embedding output:")
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": default_instruction},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": ""},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": ""},
+                ],
+            },
+        ],
+        model=model,
+        encoding_format="float",
+        continue_final_message=True,
+        add_special_tokens=True,
+    )
+    print_embeddings(response.data[0].embedding)
+
+    print("Image+Text embedding output:")
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": default_instruction},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "text",
+                        "text": f"{text}",
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": ""},
+                ],
+            },
+        ],
+        model=model,
+        encoding_format="float",
+        continue_final_message=True,
+        add_special_tokens=True,
+    )
+    print_embeddings(response.data[0].embedding)
+
+
 def run_siglip(client: OpenAI, model: str):
     """
     Start the server using:
@@ -213,7 +334,8 @@ def run_vlm2vec(client: OpenAI, model: str):
         encoding_format="float",
     )
 
-    print("Image embedding output:", response.data[0].embedding)
+    print("Image embedding output:")
+    print_embeddings(response.data[0].embedding)
 
     response = create_chat_embeddings(
         client,
@@ -233,7 +355,8 @@ def run_vlm2vec(client: OpenAI, model: str):
         encoding_format="float",
     )
 
-    print("Image+Text embedding output:", response.data[0].embedding)
+    print("Image+Text embedding output:")
+    print_embeddings(response.data[0].embedding)
 
     response = create_chat_embeddings(
         client,
@@ -249,11 +372,13 @@ def run_vlm2vec(client: OpenAI, model: str):
         encoding_format="float",
     )
 
-    print("Text embedding output:", response.data[0].embedding)
+    print("Text embedding output:")
+    print_embeddings(response.data[0].embedding)
 
 
 model_example_map = {
     "clip": run_clip,
+    "qwen3_vl": run_qwen3_vl,
     "dse_qwen2_vl": run_dse_qwen2_vl,
     "siglip": run_siglip,
     "vlm2vec": run_vlm2vec,
diff --git a/examples/pooling/pooling/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py
index dda56bc34df2e8be7ee498478f75195e13ee7a1e..e2149a7a6329555f5d2a70c205dc8bf9a6d46da9 100644
--- a/examples/pooling/pooling/vision_language_pooling.py
+++ b/examples/pooling/pooling/vision_language_pooling.py
@@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
     )
 
 
+def run_qwen3_vl(query: Query) -> ModelRequestData:
+    image_placeholder = "<vision_start><|image_pad|><vision_end>"
+    if query["modality"] == "text":
+        prompt = query["text"]
+        image = None
+    elif query["modality"] == "image":
+        prompt = image_placeholder
+        image = query["image"]
+    elif query["modality"] == "text+image":
+        text = query["text"]
+        prompt = f"{image_placeholder}\n{text}"
+        image = query["image"]
+    else:
+        modality = query["modality"]
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    engine_args = EngineArgs(
+        model="Qwen/Qwen3-VL-Embedding-2B",
+        runner="pooling",
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image=image,
+    )
+
+
 def run_siglip(query: Query) -> ModelRequestData:
     if query["modality"] == "text":
         prompt = query["text"]
@@ -353,6 +383,7 @@ model_example_map = {
     "clip": run_clip,
     "e5_v": run_e5_v,
     "jinavl_reranker": run_jinavl_reranker,
+    "qwen3_vl": run_qwen3_vl,
     "siglip": run_siglip,
     "vlm2vec_phi3v": run_vlm2vec_phi3v,
     "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
diff --git a/examples/pooling/score/cohere_rerank_client.py b/examples/pooling/score/cohere_rerank_online.py
similarity index 100%
rename from examples/pooling/score/cohere_rerank_client.py
rename to examples/pooling/score/cohere_rerank_online.py
diff --git a/examples/pooling/score/convert_model_to_seq_cls.py b/examples/pooling/score/convert_model_to_seq_cls.py
index 72356020330fe479aa55573ce1f649c767f2d11d..a3d31ceb12a70c712634cfa63ec36799b1733bda 100644
--- a/examples/pooling/score/convert_model_to_seq_cls.py
+++ b/examples/pooling/score/convert_model_to_seq_cls.py
@@ -2,35 +2,70 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
 
+"""
+Script to convert Large Language Models (LLMs) to Sequence Classification models.
+This is particularly useful for converting reranker models that use next-token
+prediction to a sequence classification format for compatibility with standard
+classification and rerank pipelines.
+
+Usage examples:
+- For BAAI/bge-reranker-v2-gemma:
+  python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma \
+    --classifier_from_tokens '["Yes"]' --method no_post_processing \
+    --path ./bge-reranker-v2-gemma-seq-cls
+
+- For mxbai-rerank-v2:
+  python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 \
+    --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax \
+    --path ./mxbai-rerank-base-v2-seq-cls
+
+- For Qwen3-Reranker:
+  python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B \
+    --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax \
+    --path ./Qwen3-Reranker-0.6B-seq-cls
+
+Note: For BAAI/bge-reranker-v2-gemma, "Yes" and "yes" are different tokens.
+"""
+
 import argparse
 import json
 
 import torch
 import transformers
 
-# Usage:
-# for BAAI/bge-reranker-v2-gemma
-# Caution: "Yes" and "yes" are two different tokens
-# python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
-# for mxbai-rerank-v2
-# python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
-# for Qwen3-Reranker
-# python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
-
 
 def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):
-    # refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
-    assert len(tokens) == 2
+    """
+    This method extracts the difference between weights for 'true' and 'false' tokens
+    from the language model head to create a single classification weight vector.
+
+    Args:
+        causal_lm: The original causal language model
+        seq_cls_model: The target sequence classification model
+        tokenizer: Model tokenizer
+        tokens: List of two tokens representing [false_token, true_token]
+        device: Target device (cpu/cuda)
+
+    Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
+    """
+    assert len(tokens) == 2, (
+        "Method requires exactly two tokens for binary classification"
+    )
 
+    # Get the language model head weights (vocabulary_size x hidden_size)
     lm_head_weights = causal_lm.lm_head.weight
 
+    # Convert token strings to their corresponding token IDs
     false_id = tokenizer.convert_tokens_to_ids(tokens[0])
     true_id = tokenizer.convert_tokens_to_ids(tokens[1])
 
+    # Compute the classification weight as the difference between true and false token weights
+    # This follows the approach in: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
     score_weight = lm_head_weights[true_id].to(device).to(
         torch.float32
     ) - lm_head_weights[false_id].to(device).to(torch.float32)
 
+    # Copy the computed weights to the sequence classification model
     with torch.no_grad():
         seq_cls_model.score.weight.copy_(score_weight.unsqueeze(0))
         if seq_cls_model.score.bias is not None:
@@ -38,12 +73,29 @@ def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):
 
 
 def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device):
+    """
+    Directly use token weights from the language model head for classification.
+
+    This method maps each classification label directly to a corresponding token
+    in the vocabulary without additional transformation.
+
+    Args:
+        causal_lm: The original causal language model
+        seq_cls_model: The target sequence classification model
+        tokenizer: Model tokenizer
+        tokens: List of tokens representing class labels
+        device: Target device (cpu/cuda)
+    """
+    # Get the language model head weights (vocabulary_size x hidden_size)
     lm_head_weights = causal_lm.lm_head.weight
 
+    # Convert all tokens to their corresponding token IDs
     token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
 
+    # Extract weights for the specific tokens (num_tokens x hidden_size)
     score_weight = lm_head_weights[token_ids].to(device)
 
+    # Copy the weights to the sequence classification model
     with torch.no_grad():
         seq_cls_model.score.weight.copy_(score_weight)
         if seq_cls_model.score.bias is not None:
@@ -56,21 +108,35 @@ method_map = {
 
 
 def converting(
-    model_name, classifier_from_tokens, path, method, use_pad_token=False, device="cpu"
+    model_name, classifier_from_tokens, path, method, use_sep_token=False, device="cpu"
 ):
-    assert method in method_map
-
+    """
+    Main conversion function to transform a CausalLM model to SequenceClassification.
+
+    Args:
+        model_name: Name or path of the pretrained model
+        classifier_from_tokens: List of tokens used for classification
+        path: Output path to save the converted model
+        method: Conversion method ('from_2_way_softmax' or 'no_post_processing')
+        use_sep_token: Whether to use separating token in the sequence classification model
+        device: Device to load the model on ('cpu' or 'cuda')
+    """
+    assert method in method_map, f"Unknown method: {method}"
+
+    # Determine number of labels based on conversion method
     if method == "from_2_way_softmax":
         assert len(classifier_from_tokens) == 2
         num_labels = 1
     else:
         num_labels = len(classifier_from_tokens)
 
+    # Load tokenizer and original causal language model
     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
     causal_lm = transformers.AutoModelForCausalLM.from_pretrained(
         model_name, device_map=device
     )
 
+    # Load an empty sequence classification model with the same architecture
     seq_cls_model = transformers.AutoModelForSequenceClassification.from_pretrained(
         model_name,
         num_labels=num_labels,
@@ -78,14 +144,17 @@ def converting(
         device_map=device,
     )
 
+    # Apply the selected conversion method to transfer weights
     method_map[method](
         causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device
     )
 
-    # `llm as reranker` defaults to not using pad_token
-    seq_cls_model.config.use_pad_token = use_pad_token
-    seq_cls_model.config.pad_token_id = tokenizer.pad_token_id
+    # Configure separating token settings
+    # Note: `llm as reranker` defaults to not using separating token.
+    seq_cls_model.config.use_sep_token = use_sep_token
+    seq_cls_model.config.sep_token_id = tokenizer.sep_token_id
 
+    # Save the converted model and tokenizer
     seq_cls_model.save_pretrained(path)
     tokenizer.save_pretrained(path)
 
@@ -99,25 +168,30 @@ def parse_args():
         "--model_name",
         type=str,
         default="BAAI/bge-reranker-v2-gemma",
-        help="Model name",
+        help="HuggingFace model name or local path",
     )
     parser.add_argument(
         "--classifier_from_tokens",
         type=str,
         default='["Yes"]',
-        help="classifier from tokens",
+        help="JSON string of tokens used for classification labels",
     )
     parser.add_argument(
-        "--method", type=str, default="no_post_processing", help="Converting converting"
+        "--method",
+        type=str,
+        default="no_post_processing",
+        help="Conversion method to use",
     )
     parser.add_argument(
-        "--use-pad-token", action="store_true", help="Whether to use pad_token"
+        "--use-pad-token",
+        action="store_true",
+        help="Enable padding token in the sequence classification model",
     )
     parser.add_argument(
         "--path",
         type=str,
         default="./bge-reranker-v2-gemma-seq-cls",
-        help="Path to save converted model",
+        help="Output directory to save the converted model",
     )
     return parser.parse_args()
 
@@ -129,6 +203,6 @@ if __name__ == "__main__":
         model_name=args.model_name,
         classifier_from_tokens=json.loads(args.classifier_from_tokens),
         method=args.method,
-        use_pad_token=args.use_pad_token,
+        use_sep_token=args.use_sep_token,
         path=args.path,
     )
diff --git a/examples/pooling/score/offline_reranker.py b/examples/pooling/score/offline_reranker.py
deleted file mode 100644
index 7bc48277f5512173c3706035e185331c531ced8e..0000000000000000000000000000000000000000
--- a/examples/pooling/score/offline_reranker.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: E501
-
-from vllm import LLM
-
-model_name = "Qwen/Qwen3-Reranker-0.6B"
-
-# What is the difference between the official original version and one
-# that has been converted into a sequence classification model?
-# Qwen3-Reranker is a language model that doing reranker by using the
-# logits of "no" and "yes" tokens.
-# It needs to computing 151669 tokens logits, making this method extremely
-# inefficient, not to mention incompatible with the vllm score API.
-# A method for converting the original model into a sequence classification
-# model was proposed. See：https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
-# Models converted offline using this method can not only be more efficient
-# and support the vllm score API, but also make the init parameters more
-# concise, for example.
-# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")
-
-# If you want to load the official original version, the init parameters are
-# as follows.
-
-
-def get_llm() -> LLM:
-    """Initializes and returns the LLM model for Qwen3-Reranker."""
-    return LLM(
-        model=model_name,
-        runner="pooling",
-        hf_overrides={
-            "architectures": ["Qwen3ForSequenceClassification"],
-            "classifier_from_token": ["no", "yes"],
-            "is_original_qwen3_reranker": True,
-        },
-    )
-
-
-# Why do we need hf_overrides for the official original version:
-# vllm converts it to Qwen3ForSequenceClassification when loaded for
-# better performance.
-# - Firstly, we need using `"architectures": ["Qwen3ForSequenceClassification"],`
-# to manually route to Qwen3ForSequenceClassification.
-# - Then, we will extract the vector corresponding to classifier_from_token
-# from lm_head using `"classifier_from_token": ["no", "yes"]`.
-# - Third, we will convert these two vectors into one vector.  The use of
-# conversion logic is controlled by `using "is_original_qwen3_reranker": True`.
-
-# Please use the query_template and document_template to format the query and
-# document for better reranker results.
-
-prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
-suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
-
-query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
-document_template = "<Document>: {doc}{suffix}"
-
-
-def main() -> None:
-    instruction = (
-        "Given a web search query, retrieve relevant passages that answer the query"
-    )
-
-    queries = [
-        "What is the capital of China?",
-        "Explain gravity",
-    ]
-
-    documents = [
-        "The capital of China is Beijing.",
-        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
-    ]
-
-    queries = [
-        query_template.format(prefix=prefix, instruction=instruction, query=query)
-        for query in queries
-    ]
-    documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
-
-    llm = get_llm()
-    outputs = llm.score(queries, documents)
-
-    print("-" * 30)
-    print([output.outputs.score for output in outputs])
-    print("-" * 30)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py b/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py
deleted file mode 100644
index 80ed2c27dfb117117178bd42ea140afedf0faf47..0000000000000000000000000000000000000000
--- a/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Example online usage of Score API.
-
-Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
-"""
-
-import argparse
-import pprint
-
-import requests
-
-
-def post_http_request(prompt: dict, api_url: str) -> requests.Response:
-    headers = {"User-Agent": "Test Client"}
-    response = requests.post(api_url, headers=headers, json=prompt)
-    return response
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--model", type=str, default="jinaai/jina-reranker-m0")
-    return parser.parse_args()
-
-
-def main(args):
-    api_url = f"http://{args.host}:{args.port}/score"
-    model_name = args.model
-
-    text_1 = "slm markdown"
-    text_2 = {
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
-                },
-            },
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-                },
-            },
-        ]
-    }
-    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
-    score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("\nPrompt when text_1 is string and text_2 is a image list:")
-    pprint.pprint(prompt)
-    print("\nScore Response:")
-    pprint.pprint(score_response.json())
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
diff --git a/examples/pooling/score/qwen3_reranker_offline.py b/examples/pooling/score/qwen3_reranker_offline.py
new file mode 100644
index 0000000000000000000000000000000000000000..c79ebf97fe649f900a3a45ea8152e7aa3a597535
--- /dev/null
+++ b/examples/pooling/score/qwen3_reranker_offline.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+"""
+What is the difference between the official original version and one
+that has been converted into a sequence classification model?
+
+Qwen3-Reranker is a language model that doing reranker by using the
+logits of "no" and "yes" tokens.
+This requires computing logits for all 151,669 tokens in the vocabulary,
+making it inefficient and incompatible with vLLM's score() API.
+
+A conversion method has been proposed to transform the original model into a
+sequence classification model. This converted model:
+1. Is significantly more efficient
+2. Fully supports vLLM's score() API
+3. Simplifies initialization parameters
+Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
+Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
+
+For the converted model, initialization would simply be:
+llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")
+
+This example demonstrates loading the ORIGINAL model with special overrides
+to make it compatible with vLLM's score API.
+"""
+
+from pathlib import Path
+
+from vllm import LLM
+
+model_name = "Qwen/Qwen3-Reranker-0.6B"
+
+
+def get_llm() -> LLM:
+    """
+    Initializes and returns the LLM model for Qwen3-Reranker.
+
+    Returns:
+        LLM: Configured vLLM instance for reranking tasks.
+
+    Note:
+        This function loads the ORIGINAL Qwen3-Reranker model with specific
+        overrides to make it compatible with vLLM's score API.
+    """
+    return LLM(
+        # Specify the original model from HuggingFace
+        model=model_name,
+        # Use pooling runner for score task
+        runner="pooling",
+        # HuggingFace model configuration overrides required for compatibility
+        hf_overrides={
+            # Manually route to sequence classification architecture
+            # This tells vLLM to use Qwen3ForSequenceClassification instead of
+            # the default Qwen3ForCausalLM
+            "architectures": ["Qwen3ForSequenceClassification"],
+            # Specify which token logits to extract from the language model head
+            # The original reranker uses "no" and "yes" token logits for scoring
+            "classifier_from_token": ["no", "yes"],
+            # Enable special handling for original Qwen3-Reranker models
+            # This flag triggers conversion logic that transforms the two token
+            # vectors into a single classification vector
+            "is_original_qwen3_reranker": True,
+        },
+    )
+
+
+def main() -> None:
+    # Load the Jinja template for formatting query-document pairs
+    # The template ensures proper formatting for the reranker model
+    template_home = Path(__file__).parent / "template"
+    template_path = "qwen3_reranker.jinja"
+    chat_template = (template_home / template_path).read_text()
+
+    # Sample queries for testing the reranker
+    queries = [
+        "What is the capital of China?",
+        "Explain gravity",
+    ]
+
+    # Corresponding documents to be scored against each query
+    documents = [
+        "The capital of China is Beijing.",
+        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+    ]
+
+    # Initialize the LLM model with the original Qwen3-Reranker configuration
+    llm = get_llm()
+
+    # Compute relevance scores for each query-document pair
+    # The score() method returns a relevance score for each pair
+    # Higher scores indicate better relevance
+    outputs = llm.score(queries, documents, chat_template=chat_template)
+
+    # Extract and print the relevance scores from the outputs
+    # Each output contains a score representing query-document relevance
+    print("-" * 30)
+    print("Relevance scores:", [output.outputs.score for output in outputs])
+    print("-" * 30)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pooling/score/qwen3_reranker_online.py b/examples/pooling/score/qwen3_reranker_online.py
new file mode 100644
index 0000000000000000000000000000000000000000..441c1709dc524898b8a3fd84eb25b26476724344
--- /dev/null
+++ b/examples/pooling/score/qwen3_reranker_online.py
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+"""
+What is the difference between the official original version and one
+that has been converted into a sequence classification model?
+
+Qwen3-Reranker is a language model that doing reranker by using the
+logits of "no" and "yes" tokens.
+This requires computing logits for all 151,669 tokens in the vocabulary,
+making it inefficient and incompatible with vLLM's score() API.
+
+A conversion method has been proposed to transform the original model into a
+sequence classification model. This converted model:
+1. Is significantly more efficient
+2. Fully supports vLLM's score() API
+3. Simplifies initialization parameters
+Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
+Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
+
+For the converted model, initialization would simply be:
+    vllm serve tomaarsen/Qwen3-Reranker-0.6B-seq-cls --runner pooling --chat-template examples/pooling/score/template/qwen3_reranker.jinja
+
+This example demonstrates loading the ORIGINAL model with special overrides
+to make it compatible with vLLM's score API.
+    vllm serve Qwen/Qwen3-Reranker-0.6B --runner pooling --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' --chat-template examples/pooling/score/template/qwen3_reranker.jinja
+"""
+
+import json
+
+import requests
+
+# URL of the vLLM server's score endpoint
+# Default vLLM server runs on localhost port 8000
+url = "http://127.0.0.1:8000/score"
+
+# HTTP headers for the request
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+# Example queries & documents
+queries = [
+    "What is the capital of China?",
+    "Explain gravity",
+]
+documents = [
+    "The capital of China is Beijing.",
+    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+]
+
+# Request payload for the score API
+data = {
+    "model": "Qwen/Qwen3-Reranker-0.6B",
+    "text_1": queries,
+    "text_2": documents,
+}
+
+
+def main():
+    """Main function to send a score request to the vLLM server.
+
+    This function sends a POST request to the /score endpoint with
+    the query and documents, then prints the relevance scores.
+    """
+    # Send POST request to the vLLM server's score endpoint
+    response = requests.post(url, headers=headers, json=data)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        print("Request successful!")
+        # Pretty print the JSON response containing relevance scores
+        # The response includes scores for each document's relevance to the query
+        print(json.dumps(response.json(), indent=2))
+    else:
+        # Handle request failure
+        print(f"Request failed with status code: {response.status_code}")
+        print(response.text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pooling/score/openai_reranker.py b/examples/pooling/score/rerank_api_online.py
similarity index 100%
rename from examples/pooling/score/openai_reranker.py
rename to examples/pooling/score/rerank_api_online.py
diff --git a/examples/pooling/score/openai_cross_encoder_score.py b/examples/pooling/score/score_api_online.py
similarity index 100%
rename from examples/pooling/score/openai_cross_encoder_score.py
rename to examples/pooling/score/score_api_online.py
diff --git a/examples/pooling/score/template/bge-reranker-v2-gemma.jinja b/examples/pooling/score/template/bge-reranker-v2-gemma.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..cdc83aeab6cb7b56bbcfdcec42963a5d8b1f5629
--- /dev/null
+++ b/examples/pooling/score/template/bge-reranker-v2-gemma.jinja
@@ -0,0 +1,3 @@
+A: {{ (messages | selectattr("role", "eq", "query") | first).content }}
+B: {{ (messages | selectattr("role", "eq", "document") | first).content }}
+Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'.
\ No newline at end of file
diff --git a/examples/pooling/score/template/mxbai_rerank_v2.jinja b/examples/pooling/score/template/mxbai_rerank_v2.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..32488c48b3afb0e1371b85913cfbc204cf815a83
--- /dev/null
+++ b/examples/pooling/score/template/mxbai_rerank_v2.jinja
@@ -0,0 +1,8 @@
+<|im_start|>system
+You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
+<|im_start|>user
+query: {{ (messages | selectattr("role", "eq", "query") | first).content }}
+document: {{ (messages | selectattr("role", "eq", "document") | first).content }}
+You are a search relevance expert who evaluates how well documents match search queries. For each query-document pair, carefully analyze the semantic relationship between them, then provide your binary relevance judgment (0 for not relevant, 1 for relevant).
+Relevance:<|im_end|>
+<|im_start|>assistant
diff --git a/examples/pooling/score/template/nemotron-rerank.jinja b/examples/pooling/score/template/nemotron-rerank.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..0447d7bcd5d597171981b4e2f707878aadc586e4
--- /dev/null
+++ b/examples/pooling/score/template/nemotron-rerank.jinja
@@ -0,0 +1,3 @@
+question:{{ (messages | selectattr("role", "eq", "query") | first).content }} 
+ 
+ passage:{{ (messages | selectattr("role", "eq", "document") | first).content }}
\ No newline at end of file
diff --git a/examples/pooling/score/template/qwen3_reranker.jinja b/examples/pooling/score/template/qwen3_reranker.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..f33f526dc054ca306a83c4398ad62cc903b30237
--- /dev/null
+++ b/examples/pooling/score/template/qwen3_reranker.jinja
@@ -0,0 +1,11 @@
+<|im_start|>system
+Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{ messages | selectattr("role", "eq", "system") | map(attribute="content") | first | default("Given a web search query, retrieve relevant passages that answer the query") }}
+<Query>: {{ messages | selectattr("role", "eq", "query") | map(attribute="content") | first }}
+<Document>: {{ messages | selectattr("role", "eq", "document") | map(attribute="content") | first }}<|im_end|>
+<|im_start|>assistant
+<think>
+
+</think>
+
diff --git a/examples/pooling/score/template/qwen3_vl_reranker.jinja b/examples/pooling/score/template/qwen3_vl_reranker.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..ed89f2a547ac94f446f591fe44677590984a7dbb
--- /dev/null
+++ b/examples/pooling/score/template/qwen3_vl_reranker.jinja
@@ -0,0 +1,23 @@
+<|im_start|>system
+Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{
+    messages
+    | selectattr("role", "eq", "system")
+    | map(attribute="content")
+    | first
+    | default("Given a search query, retrieve relevant candidates that answer the query.")
+}}<Query>:{{
+    messages
+    | selectattr("role", "eq", "query")
+    | map(attribute="content")
+    | first
+}}
+<Document>:{{
+    messages
+    | selectattr("role", "eq", "document")
+    | map(attribute="content")
+    | first
+}}<|im_end|>
+<|im_start|>assistant
+
diff --git a/examples/pooling/score/using_template_offline.py b/examples/pooling/score/using_template_offline.py
new file mode 100644
index 0000000000000000000000000000000000000000..f434e699ff0ec82156c2ab0fa87b6a43aa409741
--- /dev/null
+++ b/examples/pooling/score/using_template_offline.py
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+from argparse import Namespace
+from pathlib import Path
+from typing import Any
+
+from vllm import LLM, EngineArgs
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+
+def parse_args():
+    """Parse command line arguments for the reranking example.
+
+    This function sets up the argument parser with default values
+    specific to reranking models, including the model name and
+    runner type.
+    """
+    parser = FlexibleArgumentParser()
+    # Add all EngineArgs command line arguments to the parser
+    parser = EngineArgs.add_cli_args(parser)
+
+    # Set default values specific to this reranking example
+    # These defaults ensure the script works out-of-the-box for reranking tasks
+    parser.set_defaults(
+        model="nvidia/llama-nemotron-rerank-1b-v2",  # Default reranking model
+        runner="pooling",  # Required for cross-encoder/reranking models
+        trust_remote_code=True,  # Allow loading models with custom code
+    )
+    return parser.parse_args()
+
+
+def get_chat_template(model: str) -> str:
+    """Load the appropriate chat template for the specified model.
+
+    Reranking models require specific prompt templates to format
+    query-document pairs correctly. This function maps model names
+    to their corresponding template files.
+    """
+    # Directory containing all chat template files
+    template_home = Path(__file__).parent / "template"
+
+    # Mapping from model names to their corresponding template files
+    # Each reranking model has its own specific prompt format
+    model_name_to_template_path_map = {
+        "BAAI/bge-reranker-v2-gemma": "bge-reranker-v2-gemma.jinja",
+        "Qwen/Qwen3-Reranker-0.6B": "qwen3_reranker.jinja",
+        "Qwen/Qwen3-Reranker-4B": "qwen3_reranker.jinja",
+        "Qwen/Qwen3-Reranker-8B": "qwen3_reranker.jinja",
+        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls": "qwen3_reranker.jinja",
+        "tomaarsen/Qwen3-Reranker-4B-seq-cls": "qwen3_reranker.jinja",
+        "tomaarsen/Qwen3-Reranker-8B-seq-cls": "qwen3_reranker.jinja",
+        "mixedbread-ai/mxbai-rerank-base-v2": "mxbai_rerank_v2.jinja",
+        "mixedbread-ai/mxbai-rerank-large-v2": "mxbai_rerank_v2.jinja",
+        "nvidia/llama-nemotron-rerank-1b-v2": "nemotron-rerank.jinja",
+    }
+
+    # Get the template filename for the specified model
+    template_path = model_name_to_template_path_map.get(model)
+
+    if template_path is None:
+        raise ValueError(f"This demo does not support model name: {model}.")
+
+    # Read and return the template content
+    return (template_home / template_path).read_text()
+
+
+def get_hf_overrides(model: str) -> dict[str, Any]:
+    """Convert Large Language Models (LLMs) to Sequence Classification models.
+
+    note:
+        Some reranking models require special configuration overrides to work
+        correctly with vLLM's score API.
+        Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/qwen3_reranker_offline.py
+        Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
+    """
+
+    model_name_to_hf_overrides_map = {
+        "BAAI/bge-reranker-v2-gemma": {
+            "architectures": ["GemmaForSequenceClassification"],
+            "classifier_from_token": ["Yes"],
+            "method": "no_post_processing",
+        },
+        "Qwen/Qwen3-Reranker-0.6B": {
+            "architectures": ["Qwen3ForSequenceClassification"],
+            "classifier_from_token": ["no", "yes"],
+            "is_original_qwen3_reranker": True,
+        },
+        "Qwen/Qwen3-Reranker-4B": {
+            "architectures": ["Qwen3ForSequenceClassification"],
+            "classifier_from_token": ["no", "yes"],
+            "is_original_qwen3_reranker": True,
+        },
+        "Qwen/Qwen3-Reranker-8B": {
+            "architectures": ["Qwen3ForSequenceClassification"],
+            "classifier_from_token": ["no", "yes"],
+            "is_original_qwen3_reranker": True,
+        },
+        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls": {},
+        "tomaarsen/Qwen3-Reranker-4B-seq-cls": {},
+        "tomaarsen/Qwen3-Reranker-8B-seq-cls": {},
+        "mixedbread-ai/mxbai-rerank-base-v2": {
+            "architectures": ["Qwen2ForSequenceClassification"],
+            "classifier_from_token": ["0", "1"],
+            "method": "from_2_way_softmax",
+        },
+        "mixedbread-ai/mxbai-rerank-large-v2": {
+            "architectures": ["Qwen2ForSequenceClassification"],
+            "classifier_from_token": ["0", "1"],
+            "method": "from_2_way_softmax",
+        },
+        "nvidia/llama-nemotron-rerank-1b-v2": {},
+    }
+
+    hf_overrides = model_name_to_hf_overrides_map.get(model)
+
+    if hf_overrides is None:
+        raise ValueError(f"This demo does not support model name: {model}.")
+
+    return hf_overrides
+
+
+def main(args: Namespace):
+    """Main execution function for the reranking example."""
+
+    # Get the overrides for the specified model
+    args.hf_overrides = get_hf_overrides(args.model)
+
+    # Initialize the LLM with all provided arguments
+    llm = LLM(**vars(args))
+
+    # Example query for demonstration
+    query = "how much protein should a female eat?"
+
+    # Example documents to be reranked based on relevance to the query
+    documents = [
+        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
+        "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
+        "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
+    ]
+
+    # Load the appropriate chat template for the selected model
+    # The template formats query-document pairs for the reranking model
+    chat_template = get_chat_template(args.model)
+
+    # Score documents based on relevance to the query
+    # The score method returns relevance scores for each document
+    outputs = llm.score(query, documents, chat_template=chat_template)
+
+    # Display the relevance scores
+    # Higher scores indicate more relevant documents
+    print("-" * 30)
+    print([output.outputs.score for output in outputs])
+    print("-" * 30)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/pooling/score/using_template_online.py b/examples/pooling/score/using_template_online.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0bfa7d157697e23499ec2c67c80089f035c821d
--- /dev/null
+++ b/examples/pooling/score/using_template_online.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+"""
+Example of using the rerank API with template.
+
+This script demonstrates how to interact with a vLLM server running
+a reranking model via the REST API.
+Before running this script, start the vLLM server with one of the
+supported reranking models using the commands below.
+
+note:
+    Some reranking models require special configuration overrides to work correctly
+    with vLLM's score API.
+    Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/qwen3_reranker_online.py
+    Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
+
+run:
+    vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}' --chat-template examples/pooling/score/template/bge-reranker-v2-gemma.jinja
+    vllm serve tomaarsen/Qwen3-Reranker-0.6B-seq-cls --chat-template examples/pooling/score/template/qwen3_reranker.jinja
+    vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}' --chat-template examples/pooling/score/template/mxbai_rerank_v2.jinja
+    vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja
+    vllm serve Qwen/Qwen3-Reranker-0.6B --runner pooling --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' --chat-template examples/pooling/score/template/qwen3_reranker.jinja
+"""
+
+import json
+
+import requests
+
+# URL of the vLLM server's rerank endpoint
+# Default vLLM server runs on localhost port 8000
+url = "http://127.0.0.1:8000/rerank"
+
+# HTTP headers for the request
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+# Example query & documents
+query = "how much protein should a female eat?"
+documents = [
+    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
+    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
+    "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
+]
+
+# Request payload for the rerank API
+data = {
+    "model": "nvidia/llama-nemotron-rerank-1b-v2",  # Model to use for reranking
+    "query": query,  # The query to score documents against
+    "documents": documents,  # List of documents to be scored
+}
+
+
+def main():
+    """Main function to send a rerank request to the vLLM server.
+
+    This function sends a POST request to the /rerank endpoint with
+    the query and documents, then prints the relevance scores.
+    """
+    # Send POST request to the vLLM server's rerank endpoint
+    response = requests.post(url, headers=headers, json=data)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        print("Request successful!")
+        # Pretty print the JSON response containing relevance scores
+        # The response includes scores for each document's relevance to the query
+        print(json.dumps(response.json(), indent=2))
+    else:
+        # Handle request failure
+        print(f"Request failed with status code: {response.status_code}")
+        print(response.text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pooling/score/vision_rerank_api_online.py b/examples/pooling/score/vision_rerank_api_online.py
new file mode 100644
index 0000000000000000000000000000000000000000..37a7decf39830d8c04116b949a94996502714e0b
--- /dev/null
+++ b/examples/pooling/score/vision_rerank_api_online.py
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+"""
+Example Python client for multimodal rerank API which is compatible with
+Jina and Cohere https://jina.ai/reranker
+
+Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
+e.g.
+    vllm serve jinaai/jina-reranker-m0 --runner pooling
+
+    vllm serve Qwen/Qwen3-VL-Reranker-2B \
+        --runner pooling \
+        --max-model-len 4096 \
+        --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' \
+        --chat-template examples/pooling/score/template/qwen3_vl_reranker.jinja
+"""
+
+import argparse
+import json
+
+import requests
+
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+query = "A woman playing with her dog on a beach at sunset."
+documents = {
+    "content": [
+        {
+            "type": "text",
+            "text": (
+                "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
+                "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
+            ),
+        },
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+            },
+        },
+    ]
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    return parser.parse_args()
+
+
+def main(args):
+    base_url = f"http://{args.host}:{args.port}"
+    models_url = base_url + "/v1/models"
+    rerank_url = base_url + "/rerank"
+
+    response = requests.get(models_url, headers=headers)
+    model = response.json()["data"][0]["id"]
+
+    data = {
+        "model": model,
+        "query": query,
+        "documents": documents,
+    }
+    response = requests.post(rerank_url, headers=headers, json=data)
+
+    # Check the response
+    if response.status_code == 200:
+        print("Request successful!")
+        print(json.dumps(response.json(), indent=2))
+    else:
+        print(f"Request failed with status code: {response.status_code}")
+        print(response.text)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/pooling/score/vision_reranker_offline.py b/examples/pooling/score/vision_reranker_offline.py
new file mode 100644
index 0000000000000000000000000000000000000000..657aced98ec51a79de18434e610ab7f2a3e58b06
--- /dev/null
+++ b/examples/pooling/score/vision_reranker_offline.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This example shows how to use vLLM for running offline inference with
+vision language reranker models for multimodal scoring tasks.
+
+Vision language rerankers score the relevance between a text query and
+multimodal documents (text + images/videos).
+"""
+
+from argparse import Namespace
+from collections.abc import Callable
+from dataclasses import asdict
+from pathlib import Path
+from typing import NamedTuple
+
+from vllm import LLM, EngineArgs
+from vllm.entrypoints.score_utils import ScoreMultiModalParam
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+TEMPLATE_HOME = Path(__file__).parent / "template"
+
+
+class RerankModelData(NamedTuple):
+    engine_args: EngineArgs
+    chat_template: str | None = None
+
+
+def run_jinavl_reranker(modality: str) -> RerankModelData:
+    assert modality == "image"
+
+    engine_args = EngineArgs(
+        model="jinaai/jina-reranker-m0",
+        runner="pooling",
+        max_model_len=32768,
+        trust_remote_code=True,
+        mm_processor_kwargs={
+            "min_pixels": 3136,
+            "max_pixels": 602112,
+        },
+        limit_mm_per_prompt={modality: 1},
+    )
+    return RerankModelData(
+        engine_args=engine_args,
+    )
+
+
+def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
+    engine_args = EngineArgs(
+        model="Qwen/Qwen3-VL-Reranker-2B",
+        runner="pooling",
+        max_model_len=16384,
+        limit_mm_per_prompt={modality: 1},
+        # HuggingFace model configuration overrides required for compatibility
+        hf_overrides={
+            # Manually route to sequence classification architecture
+            # This tells vLLM to use Qwen3VLForSequenceClassification instead of
+            # the default Qwen3VLForConditionalGeneration
+            "architectures": ["Qwen3VLForSequenceClassification"],
+            # Specify which token logits to extract from the language model head
+            # The original reranker uses "no" and "yes" token logits for scoring
+            "classifier_from_token": ["no", "yes"],
+            # Enable special handling for original Qwen3-Reranker models
+            # This flag triggers conversion logic that transforms the two token
+            # vectors into a single classification vector
+            "is_original_qwen3_reranker": True,
+        },
+    )
+    chat_template_path = "qwen3_vl_reranker.jinja"
+    chat_template = (TEMPLATE_HOME / chat_template_path).read_text()
+    return RerankModelData(
+        engine_args=engine_args,
+        chat_template=chat_template,
+    )
+
+
+model_example_map: dict[str, Callable[[str], RerankModelData]] = {
+    "jinavl_reranker": run_jinavl_reranker,
+    "qwen3_vl_reranker": run_qwen3_vl_reranker,
+}
+
+
+def parse_args():
+    parser = FlexibleArgumentParser(
+        description="Demo on using vLLM for offline inference with "
+        "vision language reranker models for multimodal scoring tasks."
+    )
+    parser.add_argument(
+        "--model-name",
+        "-m",
+        type=str,
+        default="jinavl_reranker",
+        choices=model_example_map.keys(),
+        help="The name of the reranker model.",
+    )
+    parser.add_argument(
+        "--modality",
+        type=str,
+        default="image",
+        choices=["image", "video"],
+        help="Modality of the multimodal input (image or video).",
+    )
+    return parser.parse_args()
+
+
+def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
+    # Sample query for testing the reranker
+    if modality == "image":
+        query = "A woman playing with her dog on a beach at sunset."
+        # Sample multimodal documents to be scored against the query
+        # Each document contains an image URL that will be fetched and processed
+        documents: ScoreMultiModalParam = {
+            "content": [
+                {
+                    "type": "text",
+                    "text": (
+                        "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
+                        "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
+                    ),
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+                    },
+                },
+            ]
+        }
+    elif modality == "video":
+        query = "A girl is drawing pictures on an ipad."
+        # Sample video documents to be scored against the query
+        documents: ScoreMultiModalParam = {
+            "content": [
+                {
+                    "type": "text",
+                    "text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
+                    },
+                },
+            ]
+        }
+    else:
+        raise ValueError(f"Unsupported modality: {modality}")
+    return query, documents
+
+
+def main(args: Namespace):
+    # Run the selected reranker model
+    modality = args.modality
+    model_request = model_example_map[args.model_name](modality)
+    engine_args = model_request.engine_args
+
+    llm = LLM(**asdict(engine_args))
+
+    query, documents = get_multi_modal_input(modality)
+    outputs = llm.score(query, documents, chat_template=model_request.chat_template)
+
+    print("-" * 50)
+    print(f"Model: {engine_args.model}")
+    print(f"Modality: {modality}")
+    print(f"Query: {query}")
+    print("Relevance scores:", [output.outputs.score for output in outputs])
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/pooling/score/vision_score_api_online.py b/examples/pooling/score/vision_score_api_online.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4b4825ee46892c44aa07facde47f93339356d97
--- /dev/null
+++ b/examples/pooling/score/vision_score_api_online.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+"""
+Example online usage of Score API.
+
+Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
+e.g.
+    vllm serve jinaai/jina-reranker-m0 --runner pooling
+
+    vllm serve Qwen/Qwen3-VL-Reranker-2B \
+        --runner pooling \
+        --max-model-len 4096 \
+        --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' \
+        --chat-template examples/pooling/score/template/qwen3_vl_reranker.jinja
+"""
+
+import argparse
+import json
+import pprint
+
+import requests
+
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+text_1 = "slm markdown"
+text_2 = {
+    "content": [
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+            },
+        },
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+            },
+        },
+    ]
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    return parser.parse_args()
+
+
+def main(args):
+    base_url = f"http://{args.host}:{args.port}"
+    models_url = base_url + "/v1/models"
+    score_url = base_url + "/score"
+
+    response = requests.get(models_url, headers=headers)
+    model = response.json()["data"][0]["id"]
+
+    prompt = {"model": model, "text_1": text_1, "text_2": text_2}
+    response = requests.post(score_url, headers=headers, json=prompt)
+    print("\nPrompt when text_1 is string and text_2 is a image list:")
+    pprint.pprint(prompt)
+    print("\nScore Response:")
+    print(json.dumps(response.json(), indent=2))
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/tool_chat_template_functiongemma.jinja b/examples/tool_chat_template_functiongemma.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..63b5d336a76b7c314ea9051ec93fcfcdd9f0b477
--- /dev/null
+++ b/examples/tool_chat_template_functiongemma.jinja
@@ -0,0 +1,54 @@
+{%- set ns = namespace(developer_content='', has_tools=false) -%}
+
+{%- if tools is defined and tools | length > 0 -%}
+    {%- set ns.has_tools = true -%}
+{%- endif -%}
+
+{%- for message in messages -%}
+    {%- if message.role == 'developer' or message.role == 'system' -%}
+<start_of_turn>user
+{{ message.content }}
+{%- if ns.has_tools %}
+
+Available functions:
+{%- for tool in tools %}
+{%- if tool.type == 'function' %}
+
+Function: {{ tool.function.name }}
+Description: {{ tool.function.description | default('No description provided') }}
+Parameters: {{ tool.function.parameters | tojson }}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+<end_of_turn>
+    {%- elif message.role == 'user' -%}
+<start_of_turn>user
+{{ message.content }}<end_of_turn>
+    {%- elif message.role == 'assistant' -%}
+        {%- if message.tool_calls is defined and message.tool_calls | length > 0 -%}
+<start_of_turn>model
+{%- for tool_call in message.tool_calls %}
+<start_function_call>call:{{ tool_call.function.name }}{
+{%- set args = tool_call.function.arguments -%}
+{%- if args is string -%}
+{%- set args = args | fromjson -%}
+{%- endif -%}
+{%- for key, value in args.items() -%}
+{{ key }}:<escape>{{ value }}<escape>{% if not loop.last %},{% endif %}
+{%- endfor -%}
+}<end_function_call>
+{%- endfor %}
+<end_of_turn>
+        {%- else -%}
+<start_of_turn>model
+{{ message.content }}<end_of_turn>
+        {%- endif -%}
+    {%- elif message.role == 'tool' -%}
+<start_of_turn>user
+Function result for {{ message.name | default('function') }}: {{ message.content }}<end_of_turn>
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+<start_of_turn>model
+{%- endif -%}
diff --git a/examples/tool_chat_template_glm4.jinja b/examples/tool_chat_template_glm4.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..11f76b4d4af44b113e8541612051b9c4572981c6
--- /dev/null
+++ b/examples/tool_chat_template_glm4.jinja
@@ -0,0 +1,54 @@
+{%- set counter = namespace(index=0) -%}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{%- if messages and messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "You are a helpful assistant." %}
+{%- endif %}
+
+{%- if tools is not none %}
+    {%- set tool_instruction %}
+You have access to the following tools. When you need to call a tool, you MUST use the following format:
+
+<tool_call>function_name
+<arg_key>parameter_name</arg_key>
+<arg_value>parameter_value</arg_value>
+</tool_call>
+
+Important rules:
+- Always wrap tool calls with <tool_call>...</tool_call> tags
+- Put the function name on the first line after <tool_call>
+- Use <arg_key> and <arg_value> tags for each parameter
+- If a parameter value is a string, keep it as-is. If it's a number or boolean, convert it appropriately
+- You can make multiple tool calls if needed
+- If no tool is suitable, respond with regular text
+
+Available tools:
+{% endset %}
+    {{- tool_instruction + "\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' -%}
+        {{- '[Round ' + counter.index|string + ']\n问：' + message['content'] -}}
+        {%- set counter.index = counter.index + 1 -%}
+    {%- endif -%}
+    {%- if message['role'] == 'assistant' -%}
+        {{- '\n答：' + message['content'] -}}
+        {%- if (loop.last and add_generation_prompt) or not loop.last -%}
+            {{- '\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
+    {{- '\n答：' -}}
+{%- endif -%}
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 8fb8f0568c6efc967f26e72588d604c7dac0584c..c5501e7db0f0e1c8632797d3eacfcc75b8931d1a 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -80,6 +80,7 @@ plugins:
         - "re:vllm\\._.*"  # Internal modules
         - "vllm.third_party"
         - "vllm.vllm_flash_attn"
+        - "re:vllm\\.grpc\\..*_pb2.*"  # Auto-generated protobuf files
         - !ENV [API_AUTONAV_EXCLUDE, "re:^$"]  # Match nothing by default
   - mkdocstrings:
       handlers:
@@ -87,7 +88,8 @@ plugins:
           options:
             show_symbol_type_heading: true
             show_symbol_type_toc: true
-            filters: []
+            filters:
+              - "!.*_pb2_grpc"  # Exclude auto-generated gRPC stubs
             summary:
               modules: true
             show_if_no_docstring: true
diff --git a/pyproject.toml b/pyproject.toml
index ee765b6937e7d835cfed3125d70f98a47aeffa1a..ba14ac535081614d71736067cc21939c57e8ef03 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,9 +6,10 @@ requires = [
     "packaging>=24.2",
     "setuptools>=77.0.3,<81.0.0",
     "setuptools-scm>=8.0",
-    "torch >= 2.7.1",
+    "torch == 2.9.0",
     "wheel",
     "jinja2",
+    "grpcio-tools>=1.76.0",
 ]
 build-backend = "setuptools.build_meta"
 
@@ -55,6 +56,10 @@ include = ["vllm*"]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
+# Exclude generated protobuf files
+"vllm/grpc/*_pb2.py" = ["ALL"]
+"vllm/grpc/*_pb2_grpc.py" = ["ALL"]
+"vllm/grpc/*_pb2.pyi" = ["ALL"]
 
 [tool.ruff.lint]
 select = [
@@ -120,7 +125,7 @@ python = "./.venv"
 # these files may be written in non english words
 extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
     "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
-    "vllm/third_party/*"]
+    "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*"]
 ignore-hidden = true
 ignore-files = true
 ignore-dot = true
@@ -162,6 +167,7 @@ depthwise_seperable_CNN = "depthwise_seperable_CNN"
 [tool.typos.default.extend-words]
 iy = "iy"
 tendencias = "tendencias"
+indx = "indx"
 # intel cpu features
 tme = "tme"
 dout = "dout"
@@ -302,4 +308,4 @@ windo = "windo"
 [tool.typos.type.vimscript.extend-words]
 
 [tool.uv]
-no-build-isolation-package = ["torch"]
+no-build-isolation-package = ["torch"]
\ No newline at end of file
diff --git a/requirements/build.txt b/requirements/build.txt
index b9c1aebe029ed62bc0908f39f3b7936975c54b53..9e62d6db40970e04cce1d60f9a62f28619155ad3 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,8 +4,10 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.9.0
+torch==2.9.1
 wheel
 jinja2>=3.1.6
 regex
 build
+protobuf>=6.33.2
+grpcio-tools>=1.76.0
diff --git a/requirements/common.txt b/requirements/common.txt
index a9c8460772c3741a92a91f20de9d877c23c72426..c03d63dc643d457a6cb2885c86b67d87883bbcb4 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -9,7 +9,7 @@ blake3
 py-cpuinfo
 transformers >= 4.56.0, < 5
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
-protobuf # Required by LlamaTokenizer.
+protobuf >= 6.30.0 # Required by LlamaTokenizer, gRPC.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
 openai >= 1.99.1  # For Responses API with reasoning content
@@ -24,25 +24,24 @@ outlines_core == 0.2.11
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
+xgrammar == 0.1.29; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq >= 25.0.0
 msgspec
 gguf >= 0.17.0
-mistral_common[image] >= 1.8.5
+mistral_common[image] >= 1.8.8
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.12.2 # required for compressed-tensors
+compressed-tensors == 0.13.0 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/others/logging_configuration.md
-scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
@@ -50,5 +49,7 @@ ijson # Required for mistral streaming tool parser
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic == 0.71.0
-model-hosting-container-standards >= 0.1.9, < 1.0.0
-mcp
\ No newline at end of file
+model-hosting-container-standards >= 0.1.10, < 1.0.0
+mcp
+grpcio>=1.76.0
+grpcio-reflection>=1.76.0
\ No newline at end of file
diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt
index 1ea401a04a12c5072fd6c889c2612133848dbd17..a7bd3b17b63231fa540e36ce71e6b7e78f24dd76 100644
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
@@ -1,7 +1,7 @@
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<81.0.0
+setuptools==77.0.3 # this version can reuse CMake build dir
 setuptools-scm>=8
 torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
 torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 7a670812e89431a0523366236c81b1bf81857988..111b8a55115625ae250a8be1eee08187b2ad34fe 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -1,6 +1,8 @@
 # Common dependencies
 -r common.txt
 
+setuptools==77.0.3 # this version can reuse CMake build dir
+
 numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
 
 # Dependencies for CPUs
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 462f18ef7159b73e80f7d2fb8c67ca40e872514c..1417fb99120bc03f0e2d9fec9f4859b7de23be1e 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
 
 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.9.0
-torchaudio==2.9.0
+torch==2.9.1
+torchaudio==2.9.1
 # These must be updated alongside torch
-torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.5.3
diff --git a/requirements/kv_connectors_rocm.txt b/requirements/kv_connectors_rocm.txt
new file mode 100644
index 0000000000000000000000000000000000000000..604b96ec5bb57815552f1857453e983223936a0a
--- /dev/null
+++ b/requirements/kv_connectors_rocm.txt
@@ -0,0 +1,2 @@
+tblib
+lm_eval[api]
\ No newline at end of file
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index 7b2c665448a3b61f8e7118b25ded544be9674531..72fa1369249e623ebf8b5828736ce52ffbac8a2f 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -17,17 +17,17 @@ vocos # required for minicpmo_26 test
 peft
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
-sentence-transformers # required for embedding tests
+sentence-transformers>=5.2.0 # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.5 # required for voxtral test
+mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
+lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
 transformers==4.57.3
 tokenizers==0.22.0
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index b977e80be067f65024d0ddf1b356b60e6fee73b6..54af9d995c4a2fd8ed8c4de13fc4fa3b701e25fa 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -2,11 +2,11 @@
 -r common.txt
 
 --extra-index-url https://download.pytorch.org/whl/rocm6.4
-torch==2.9.0
-torchvision==0.24.0
-torchaudio==2.9.0
+torch==2.9.1
+torchvision==0.24.1
+torchaudio==2.9.1
 
-triton==3.5.0
+triton==3.5.1
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index 3f0fd235fba50f00a8c5fc5453115e8f45aa4b4e..be8622065d985e76003fb0da41b6df4c7910b026 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -58,7 +58,7 @@ schemathesis==3.39.15
     # OpenAI schema test
 
 # Evaluation and benchmarking
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+lm-eval[api]>=0.4.9.2
 jiwer==4.0.0
 
 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
@@ -74,17 +74,21 @@ torchgeo==0.7.0
 # MTEB Benchmark Test
 mteb==2.1.2
 
-# Data processing
-xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
-# Test async scheduling
-
 # Utilities
 num2words==0.5.14
     # via lm-eval
 pqdm==0.2.0
     # via lm-eval
 
+# Required for fastsafetensors test
+fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
 # Required for suffix decoding test
 arctic-inference == 0.1.1
 # Required for Nemotron test
 open-clip-torch==2.32.0
+# Required for isaac Multi-Modal generation test
+perceptron==0.1.4
+# Required for the multi-modal models test
+timm==1.0.17
+# Required for plugins test
+albumentations==1.4.6
\ No newline at end of file
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 85db5fcb1ab13ac210cc5f571ffb3e4b14e9c237..6bef80f4f35a5a7590eef02bcff783ec206a6620 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -15,7 +15,7 @@ setuptools-scm>=8
 runai-model-streamer[s3,gcs]==0.15.3
 # conch-triton-kernels==1.2.1
 timm>=1.0.17
-fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
+grpcio-tools>=1.76.0
 
 numa
 pytrie
@@ -23,10 +23,10 @@ setuptools_scm>=8
 cmake==3.29
 quart
 fastrlock==0.8.3
-cupy==12.3.0
+# cupy==12.3.0
 
-torch >= 2.7.1
-triton == 3.1
+torch == 2.9.0
+triton == 3.3
 flash_attn == 2.6.1
 flash_mla == 1.0.0
 lightop == 0.6.0
diff --git a/requirements/test.in b/requirements/test.in
index dfae5b75821f8ba2e61829a78f0ac058aa84a479..5fc405a6367236e4309a6938305062d38b6661c5 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -9,6 +9,7 @@ pytest-timeout
 pytest-cov
 
 # testing utils
+albumentations # required for Nemotron Parse in test_common.py
 backoff # required for phi4mm test
 blobfile # required for kimi-vl test
 einops # required for MPT, qwen-vl
@@ -19,23 +20,22 @@ vocos # required for minicpmo_26 test
 peft>=0.15.0 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
-sentence-transformers # required for embedding tests
+sentence-transformers>=5.2.0 # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
-timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.9.0
-torchaudio==2.9.0
-torchvision==0.24.0
+timm==1.0.17 # required for internvl and gemma3n-mm test
+torch==2.9.1
+torchaudio==2.9.1
+torchvision==0.24.1
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.5 # required for voxtral test
+mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
-open_clip_torch==2.32.0 # Required for nemotron_vl test
+open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-# TODO: Use lm-eval[api]==0.4.10 once released
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
+lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.3
 tokenizers==0.22.0
@@ -57,3 +57,5 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
 terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
 gpt-oss >= 0.0.7; python_version > '3.11'
+
+perceptron # required for isaac test
diff --git a/requirements/test.txt b/requirements/test.txt
index 571194e05c1babe1d01fee4b967d4171f34521ee..e78431ab39a4a692cbf3ed1e809f413e40e54216 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -27,7 +27,9 @@ aiosignal==1.4.0
 albucore==0.0.16
     # via terratorch
 albumentations==1.4.6
-    # via terratorch
+    # via
+    #   -r requirements/test.in
+    #   terratorch
 alembic==1.16.4
     # via mlflow
 annotated-types==0.7.0
@@ -135,6 +137,7 @@ cloudpickle==3.1.1
     # via mlflow-skinny
 colorama==0.4.6
     # via
+    #   perceptron
     #   sacrebleu
     #   schemathesis
     #   tqdm-multiprocess
@@ -294,7 +297,7 @@ graphql-relay==3.2.0
     # via graphene
 greenlet==3.2.3
     # via sqlalchemy
-grpcio==1.71.0
+grpcio==1.76.0
     # via ray
 gunicorn==23.0.0
     # via mlflow
@@ -302,6 +305,8 @@ h11==0.14.0
     # via
     #   httpcore
     #   uvicorn
+h2==4.3.0
+    # via httpx
 h5py==3.13.0
     # via terratorch
 harfile==0.3.0
@@ -310,6 +315,8 @@ hf-xet==1.1.7
     # via huggingface-hub
 hiredis==3.0.0
     # via tensorizer
+hpack==4.1.0
+    # via h2
 html2text==2025.4.15
     # via gpt-oss
 httpcore==1.0.6
@@ -317,6 +324,7 @@ httpcore==1.0.6
 httpx==0.27.2
     # via
     #   -r requirements/test.in
+    #   perceptron
     #   schemathesis
 huggingface-hub==0.34.3
     # via
@@ -338,6 +346,8 @@ hydra-core==1.3.2
     # via
     #   lightly
     #   lightning
+hyperframe==6.1.0
+    # via h2
 hypothesis==6.131.0
     # via
     #   hypothesis-graphql
@@ -441,7 +451,7 @@ lightning-utilities==0.14.3
     #   torchmetrics
 llvmlite==0.44.0
     # via numba
-lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+lm-eval==0.4.9.2
     # via -r requirements/test.in
 lxml==5.3.0
     # via
@@ -474,7 +484,7 @@ mbstrdecoder==1.1.3
     #   typepy
 mdurl==0.1.2
     # via markdown-it-py
-mistral-common==1.8.5
+mistral-common==1.8.8
     # via -r requirements/test.in
 mlflow==2.22.0
     # via terratorch
@@ -549,6 +559,7 @@ numpy==1.26.4
     #   pandas
     #   patsy
     #   peft
+    #   perceptron
     #   pycocotools
     #   pyogrio
     #   rasterio
@@ -702,6 +713,8 @@ peft==0.16.0
     # via
     #   -r requirements/test.in
     #   lm-eval
+perceptron==0.1.4
+    # via -r requirements/test.in
 pillow==10.4.0
     # via
     #   genai-perf
@@ -709,9 +722,9 @@ pillow==10.4.0
     #   lightly-utils
     #   matplotlib
     #   mistral-common
+    #   perceptron
     #   scikit-image
     #   segmentation-models-pytorch
-    #   sentence-transformers
     #   torchgeo
     #   torchvision
 platformdirs==4.3.6
@@ -745,7 +758,7 @@ propcache==0.2.0
     #   yarl
 proto-plus==1.26.1
     # via google-api-core
-protobuf==5.28.3
+protobuf==6.33.2
     # via
     #   google-api-core
     #   googleapis-common-protos
@@ -952,6 +965,7 @@ rich==13.9.4
     #   genai-perf
     #   lightning
     #   mteb
+    #   perceptron
     #   typer
 rioxarray==0.19.0
     # via terratorch
@@ -1010,7 +1024,7 @@ segmentation-models-pytorch==0.4.0
     # via
     #   terratorch
     #   torchgeo
-sentence-transformers==3.2.1
+sentence-transformers==5.2.0
     # via
     #   -r requirements/test.in
     #   mteb
@@ -1024,7 +1038,9 @@ shapely==2.1.1
     #   geopandas
     #   torchgeo
 shellingham==1.5.4
-    # via typer
+    # via
+    #   perceptron
+    #   typer
 six==1.16.0
     # via
     #   junit-xml
@@ -1123,7 +1139,7 @@ tomli==2.2.1
     # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
-torch==2.9.0+cu129
+torch==2.9.1+cu129
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -1152,7 +1168,7 @@ torch==2.9.0+cu129
     #   torchvision
     #   vector-quantize-pytorch
     #   vocos
-torchaudio==2.9.0+cu129
+torchaudio==2.9.1+cu129
     # via
     #   -r requirements/test.in
     #   encodec
@@ -1165,7 +1181,7 @@ torchmetrics==1.7.4
     #   pytorch-lightning
     #   terratorch
     #   torchgeo
-torchvision==0.24.0+cu129
+torchvision==0.24.1+cu129
     # via
     #   -r requirements/test.in
     #   lightly
@@ -1206,7 +1222,7 @@ transformers==4.57.3
     #   transformers-stream-generator
 transformers-stream-generator==0.0.5
     # via -r requirements/test.in
-triton==3.5.0
+triton==3.5.1
     # via torch
 tritonclient==2.51.0
     # via
@@ -1218,7 +1234,9 @@ typepy==1.3.2
     #   pytablewriter
     #   tabledata
 typer==0.15.2
-    # via fastsafetensors
+    # via
+    #   fastsafetensors
+    #   perceptron
 types-python-dateutil==2.9.0.20241206
     # via arrow
 typeshed-client==2.8.2
@@ -1231,6 +1249,7 @@ typing-extensions==4.15.0
     #   chz
     #   fastapi
     #   graphene
+    #   grpcio
     #   huggingface-hub
     #   librosa
     #   lightning
@@ -1246,6 +1265,7 @@ typing-extensions==4.15.0
     #   pydantic-core
     #   pydantic-extra-types
     #   pytorch-lightning
+    #   sentence-transformers
     #   sqlalchemy
     #   torch
     #   torchgeo
diff --git a/setup.py b/setup.py
index 37136154dbda392f753c7e06acd2264667a84817..dec3d7749b62ef71ae8ba4ad1d8886b877bf1911 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,8 @@ import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
+from setuptools.command.build_py import build_py
+from setuptools.command.develop import develop
 # from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
 
@@ -62,15 +64,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
         sys.platform,
     )
     VLLM_TARGET_DEVICE = "empty"
-elif (
-    sys.platform.startswith("linux")
-    and torch.version.cuda is None
-    and os.getenv("VLLM_TARGET_DEVICE") is None
-    and torch.version.hip is None
-):
-    # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
-    # fallback to cpu
-    VLLM_TARGET_DEVICE = "cpu"
+elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
+    if torch.version.hip is not None:
+        VLLM_TARGET_DEVICE = "rocm"
+        logger.info("Auto-detected ROCm")
+    elif torch.version.cuda is not None:
+        VLLM_TARGET_DEVICE = "cuda"
+        logger.info("Auto-detected CUDA")
+    else:
+        VLLM_TARGET_DEVICE = "cpu"
 
 
 def is_sccache_available() -> bool:
@@ -91,6 +93,81 @@ def is_freethreaded():
     return bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
 
 
+def compile_grpc_protos():
+    """Compile gRPC protobuf definitions during build.
+
+    This generates *_pb2.py, *_pb2_grpc.py, and *_pb2.pyi files from
+    the vllm_engine.proto definition.
+    """
+    try:
+        from grpc_tools import protoc
+    except ImportError:
+        logger.warning(
+            "grpcio-tools not installed, skipping gRPC proto compilation. "
+            "gRPC server functionality will not be available."
+        )
+        return False
+
+    proto_file = ROOT_DIR / "vllm" / "grpc" / "vllm_engine.proto"
+    if not proto_file.exists():
+        logger.warning("Proto file not found at %s, skipping compilation", proto_file)
+        return False
+
+    logger.info("Compiling gRPC protobuf: %s", proto_file)
+
+    result = protoc.main(
+        [
+            "grpc_tools.protoc",
+            f"--proto_path={ROOT_DIR}",
+            f"--python_out={ROOT_DIR}",
+            f"--grpc_python_out={ROOT_DIR}",
+            f"--pyi_out={ROOT_DIR}",
+            str(proto_file),
+        ]
+    )
+
+    if result != 0:
+        logger.error("protoc failed with exit code %s", result)
+        return False
+
+    # Add SPDX headers and mypy ignore to generated files
+    spdx_header = (
+        "# SPDX-License-Identifier: Apache-2.0\n"
+        "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
+        "# mypy: ignore-errors\n"
+    )
+
+    grpc_dir = ROOT_DIR / "vllm" / "grpc"
+    for generated_file in [
+        grpc_dir / "vllm_engine_pb2.py",
+        grpc_dir / "vllm_engine_pb2_grpc.py",
+        grpc_dir / "vllm_engine_pb2.pyi",
+    ]:
+        if generated_file.exists():
+            content = generated_file.read_text()
+            if not content.startswith("# SPDX-License-Identifier"):
+                generated_file.write_text(spdx_header + content)
+
+    logger.info("gRPC protobuf compilation successful")
+    return True
+
+
+class BuildPyAndGenerateGrpc(build_py):
+    """Build Python modules and generate gRPC stubs from proto files."""
+
+    def run(self):
+        compile_grpc_protos()
+        super().run()
+
+
+class DevelopAndGenerateGrpc(develop):
+    """Develop mode that also generates gRPC stubs from proto files."""
+
+    def run(self):
+        compile_grpc_protos()
+        super().run()
+
+
 class CMakeExtension(Extension):
     def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
         super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa)
@@ -120,20 +197,26 @@ class cmake_build_ext(build_ext):
                 num_jobs = os.cpu_count()
 
         nvcc_threads = None
-        if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
-            # `nvcc_threads` is either the value of the NVCC_THREADS
-            # environment variable (if defined) or 1.
-            # when it is set, we reduce `num_jobs` to avoid
-            # overloading the system.
-            nvcc_threads = envs.NVCC_THREADS
-            if nvcc_threads is not None:
-                nvcc_threads = int(nvcc_threads)
-                logger.info(
-                    "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
-                )
-            else:
-                nvcc_threads = 1
-            num_jobs = max(1, num_jobs // nvcc_threads)
+        if _is_cuda() and CUDA_HOME is not None:
+            try:
+                nvcc_version = get_nvcc_cuda_version()
+                if nvcc_version >= Version("11.2"):
+                    # `nvcc_threads` is either the value of the NVCC_THREADS
+                    # environment variable (if defined) or 1.
+                    # when it is set, we reduce `num_jobs` to avoid
+                    # overloading the system.
+                    nvcc_threads = envs.NVCC_THREADS
+                    if nvcc_threads is not None:
+                        nvcc_threads = int(nvcc_threads)
+                        logger.info(
+                            "Using NVCC_THREADS=%d as the number of nvcc threads.",
+                            nvcc_threads,
+                        )
+                    else:
+                        nvcc_threads = 1
+                    num_jobs = max(1, num_jobs // nvcc_threads)
+            except Exception as e:
+                logger.warning("Failed to get NVCC version: %s", e)
 
         return num_jobs, nvcc_threads
 
@@ -211,9 +294,9 @@ class cmake_build_ext(build_ext):
             # Default build tool to whatever cmake picks.
             build_tool = []
         # Make sure we use the nvcc from CUDA_HOME
-        if _is_cuda():
+        if _is_cuda() and CUDA_HOME is not None:
             cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
-        elif _is_hip():
+        elif _is_hip() and ROCM_HOME is not None:
             cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
 
         other_cmake_args = os.environ.get("CMAKE_ARGS")
@@ -351,6 +434,89 @@ class precompiled_wheel_utils:
             wheels = json.loads(resp.read().decode("utf-8"))
         return wheels, repo_url
 
+    @staticmethod
+    def is_rocm_system() -> bool:
+        """Detect ROCm without relying on torch (for build environment)."""
+        if os.getenv("ROCM_PATH"):
+            return True
+        if os.path.isdir("/opt/rocm"):
+            return True
+        if which("rocminfo") is not None:
+            return True
+        try:
+            import torch
+
+            return torch.version.hip is not None
+        except ImportError:
+            return False
+
+    @staticmethod
+    def find_local_rocm_wheel() -> str | None:
+        """Search for a local vllm wheel in common locations."""
+        import glob
+
+        for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
+            wheels = glob.glob(pattern)
+            if wheels:
+                return sorted(wheels)[-1]
+        return None
+
+    @staticmethod
+    def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
+        """Fetch the latest wheel URL from a PyPI-style simple index."""
+        import platform
+        from html.parser import HTMLParser
+        from urllib.parse import urljoin
+        from urllib.request import urlopen
+
+        arch = platform.machine()
+
+        class WheelLinkParser(HTMLParser):
+            def __init__(self):
+                super().__init__()
+                self.wheels = []
+
+            def handle_starttag(self, tag, attrs):
+                if tag == "a":
+                    for name, value in attrs:
+                        if name == "href" and value.endswith(".whl"):
+                            self.wheels.append(value)
+
+        simple_url = f"{index_url.rstrip('/')}/{package}/"
+        print(f"Fetching wheel list from {simple_url}")
+        with urlopen(simple_url) as resp:
+            html = resp.read().decode("utf-8")
+
+        parser = WheelLinkParser()
+        parser.feed(html)
+
+        for wheel in reversed(parser.wheels):
+            if arch in wheel:
+                if wheel.startswith("http"):
+                    return wheel
+                return urljoin(simple_url, wheel)
+
+        raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
+
+    @staticmethod
+    def determine_wheel_url_rocm() -> tuple[str, str | None]:
+        """Determine the precompiled wheel for ROCm."""
+        # Search for local wheel first
+        local_wheel = precompiled_wheel_utils.find_local_rocm_wheel()
+        if local_wheel is not None:
+            print(f"Found local ROCm wheel: {local_wheel}")
+            return local_wheel, None
+
+        # Fall back to AMD's PyPI index
+        index_url = os.getenv(
+            "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
+        )
+        print(f"Fetching ROCm precompiled wheel from {index_url}")
+        wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url)
+        download_filename = wheel_url.split("/")[-1].split("#")[0]
+        print(f"Using ROCm precompiled wheel: {wheel_url}")
+        return wheel_url, download_filename
+
     @staticmethod
     def determine_wheel_url() -> tuple[str, str | None]:
         """
@@ -371,6 +537,11 @@ class precompiled_wheel_utils:
             print(f"Using user-specified precompiled wheel location: {wheel_location}")
             return wheel_location, None
         else:
+            # ROCm: use local wheel or AMD's PyPI index
+            # TODO: When we have ROCm nightly wheels, we can update this logic.
+            if precompiled_wheel_utils.is_rocm_system():
+                return precompiled_wheel_utils.determine_wheel_url_rocm()
+
             import platform
 
             arch = platform.machine()
@@ -477,6 +648,8 @@ class precompiled_wheel_utils:
                     "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                     "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                     "vllm/cumem_allocator.abi3.so",
+                    # ROCm-specific libraries
+                    "vllm/_rocm_C.abi3.so",
                 ]
 
                 flash_attn_regex = re.compile(
@@ -614,6 +787,8 @@ def get_rocm_version():
     # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
     # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
     try:
+        if ROCM_HOME is None:
+            return None
         librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
         if not librocm_core_file.is_file():
             return None
@@ -690,9 +865,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
     
     new_version_content = f"""
 try:
-    __version__ = "0.13.0"
-    __version_tuple__ = (0, 13, 0)
-    __hcu_version__ = f'0.13.0+{version}' 
+    __version__ = "0.14.0"
+    __version_tuple__ = (0, 14, 0)
+    __hcu_version__ = f'0.14.0+{version}' 
     
     from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:
@@ -863,7 +1038,9 @@ if _is_cuda() or _is_hip():
 
 if _is_cuda():
     ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
+    if envs.VLLM_USE_PRECOMPILED or (
+        CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
+    ):
         # FA3 requires CUDA 12.3 or later
         ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
         # Optional since this doesn't get built (produce an .so file) when
@@ -882,9 +1059,10 @@ if skip_vllm_build:
             "py.typed",
             "model_executor/layers/fused_moe/configs/*.json",
             "model_executor/layers/quantization/utils/configs/*.json",
-            "perf/*.py",
-            "attention/backends/configs/*.json",
+            "entrypoints/serve/instrumentator/static/*.js",
+            "entrypoints/serve/instrumentator/static/*.css",
             "model_executor/layers/quantization/configs/awq/*.json",
+            "attention/backends/configs/*.json",
             "_C.abi3.so",
             "_moe_C.abi3.so",
         ]
@@ -895,7 +1073,8 @@ else:
             "py.typed",
             "model_executor/layers/fused_moe/configs/*.json",
             "model_executor/layers/quantization/utils/configs/*.json",
-            "perf/*.py",
+            "entrypoints/serve/instrumentator/static/*.js",
+            "entrypoints/serve/instrumentator/static/*.css",
             "attention/backends/configs/*.json",
             "model_executor/layers/quantization/configs/awq/*.json",
         ]
@@ -915,12 +1094,17 @@ if _no_device() or skip_vllm_build:
     ext_modules = []
 
 if not ext_modules:
-    cmdclass = {}
+    cmdclass = {
+        "build_py": BuildPyAndGenerateGrpc,
+        "develop": DevelopAndGenerateGrpc,
+    }
 else:
     cmdclass = {
         "build_ext": precompiled_build_ext
         if envs.VLLM_USE_PRECOMPILED
-        else cmake_build_ext
+        else cmake_build_ext,
+        "build_py": BuildPyAndGenerateGrpc,
+        "develop": DevelopAndGenerateGrpc,
     }
 
 setup(
@@ -929,12 +1113,13 @@ setup(
     ext_modules=ext_modules,
     install_requires=get_requirements(),
     extras_require={
-        "bench": ["pandas", "matplotlib", "seaborn", "datasets"],
+        "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
         "tensorizer": ["tensorizer==2.10.1"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
         "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
         "audio": [
             "librosa",
+            "scipy",
             "soundfile",
             "mistral_common[audio]",
         ],  # Required for audio processing
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 523230fb55c68c8514ad36043ccd7a1e5c7010dc..dc05ffc722c777a38d7ebda79052936a358031f4 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -72,7 +72,6 @@ def _fix_prompt_embed_outputs(
 @pytest.mark.parametrize("model_executor", ["uni", "mp"])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models(
-    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -82,82 +81,80 @@ def test_models(
     model_executor: str,
     enable_prompt_embeds: bool,
 ) -> None:
+    # 5042 tokens for gemma2
+    # gemma2 has alternating sliding window size of 4096
+    # we need a prompt with more than 4096 tokens to test the sliding window
+    prompt = (
+        "The following numbers of the sequence "
+        + ", ".join(str(i) for i in range(1024))
+        + " are:"
+    )
+    example_prompts = [prompt]
+
+    with hf_runner(model) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        if enable_prompt_embeds:
+            with torch.no_grad():
+                prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", backend)
-
-        # 5042 tokens for gemma2
-        # gemma2 has alternating sliding window size of 4096
-        # we need a prompt with more than 4096 tokens to test the sliding window
-        prompt = (
-            "The following numbers of the sequence "
-            + ", ".join(str(i) for i in range(1024))
-            + " are:"
-        )
-        example_prompts = [prompt]
-
-        with hf_runner(model) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    if not current_platform.is_rocm():
+        with VllmRunner(
+            model,
+            max_model_len=8192,
+            enforce_eager=enforce_eager,
+            enable_prompt_embeds=enable_prompt_embeds,
+            gpu_memory_utilization=0.7,
+            async_scheduling=async_scheduling,
+            distributed_executor_backend=model_executor,
+            attention_config={"backend": backend},
+        ) as vllm_model:
             if enable_prompt_embeds:
-                with torch.no_grad():
-                    prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
-
-        if not current_platform.is_rocm():
-            with VllmRunner(
-                model,
-                max_model_len=8192,
-                enforce_eager=enforce_eager,
-                enable_prompt_embeds=enable_prompt_embeds,
-                gpu_memory_utilization=0.7,
-                async_scheduling=async_scheduling,
-                distributed_executor_backend=model_executor,
-            ) as vllm_model:
-                if enable_prompt_embeds:
-                    vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
-                    vllm_outputs = _fix_prompt_embed_outputs(
-                        vllm_outputs, hf_model, example_prompts
-                    )
-                else:
-                    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        else:
-            with VllmRunner(
-                model,
-                max_model_len=8192,
-                enforce_eager=enforce_eager,
-                enable_prompt_embeds=enable_prompt_embeds,
-                gpu_memory_utilization=0.7,
-                async_scheduling=async_scheduling,
-                distributed_executor_backend=model_executor,
-                block_size=64,
-            ) as vllm_model:
-                if enable_prompt_embeds:
-                    vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
-                    vllm_outputs = _fix_prompt_embed_outputs(
-                        vllm_outputs, hf_model, example_prompts
-                    )
-                else:
-                    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+                vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
+                vllm_outputs = _fix_prompt_embed_outputs(
+                    vllm_outputs, hf_model, example_prompts
+                )
+            else:
+                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    else:
+        with VllmRunner(
+            model,
+            max_model_len=8192,
+            enforce_eager=enforce_eager,
+            enable_prompt_embeds=enable_prompt_embeds,
+            gpu_memory_utilization=0.7,
+            async_scheduling=async_scheduling,
+            distributed_executor_backend=model_executor,
+            attention_config={"backend": backend},
+            block_size=64,
+        ) as vllm_model:
+            if enable_prompt_embeds:
+                vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
+                vllm_outputs = _fix_prompt_embed_outputs(
+                    vllm_outputs, hf_model, example_prompts
+                )
+            else:
+                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 # @multi_gpu_test(num_gpus=2)
 # @pytest.mark.parametrize(
 #     "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
 #     [
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4", {}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4", {}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
-#         (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "ray", "", "L4", {}),
-#         (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "mp", "", "L4", {}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "A100", {}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "A100", {}),
+#         ("facebook/opt-125m", "ray", "", "L4", {}),
+#         ("facebook/opt-125m", "mp", "", "L4", {}),
+#         ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+#         ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+#         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
+#         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
+#         ("facebook/opt-125m", "ray", "", "A100", {}),
+#         ("facebook/opt-125m", "mp", "", "A100", {}),
 #     ],
 # )
 # @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
@@ -186,12 +183,6 @@ def test_models(
 #         ):  # noqa
 #             pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
 
-#         if attention_backend:
-#             monkeypatch_context.setenv(
-#                 "VLLM_ATTENTION_BACKEND",
-#                 attention_backend,
-#             )
-
 #         for k, v in extra_env.items():
 #             monkeypatch_context.setenv(k, v)
 
@@ -203,6 +194,7 @@ def test_models(
 #         # if we run HF first, the cuda initialization will be done and it
 #         # will hurt multiprocessing backend with fork method
 #         # (the default method).
+#         attention_config = {"backend": attention_backend} if attention_backend else None
 #         with vllm_runner(
 #             model,
 #             dtype=dtype,
@@ -210,6 +202,7 @@ def test_models(
 #             distributed_executor_backend=distributed_executor_backend,
 #             enable_prompt_embeds=enable_prompt_embeds,
 #             gpu_memory_utilization=0.7,
+#             attention_config=attention_config,
 #         ) as vllm_model:
 #             if enable_prompt_embeds:
 #                 with hf_runner(model, dtype=dtype) as hf_model:
@@ -225,90 +218,12 @@ def test_models(
 #                 with hf_runner(model, dtype=dtype) as hf_model:
 #                     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-# @multi_gpu_test(num_gpus=2)
-# @pytest.mark.parametrize(
-#     "model, distributed_executor_backend, attention_backend, "
-#     "test_suite, extra_env", [
-#         ("distilbert/distilgpt2", "ray", "", "L4", {}),
-#         ("distilbert/distilgpt2", "mp", "", "L4", {}),
-#         ("distilbert/distilgpt2", "ray", "", "L4", {
-#             "VLLM_SLEEP_WHEN_IDLE": "1"
-#         }),
-#         ("distilbert/distilgpt2", "mp", "", "L4", {
-#             "VLLM_SLEEP_WHEN_IDLE": "1"
-#         }),
-#         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
-#         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
-#         ("distilbert/distilgpt2", "ray", "", "A100", {}),
-#         ("distilbert/distilgpt2", "mp", "", "A100", {}),
-#     ])
-# @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
-# def test_models_distributed(
-#     monkeypatch: pytest.MonkeyPatch,
-#     hf_runner,
-#     vllm_runner,
-#     example_prompts,
-#     model: str,
-#     distributed_executor_backend: str,
-#     attention_backend: str,
-#     test_suite: str,
-#     extra_env: dict[str, str],
-#     enable_prompt_embeds: bool,
-# ) -> None:
-#     if test_suite != TARGET_TEST_SUITE:
-#         pytest.skip(f"Skip test for {test_suite}")
-
-#     with monkeypatch.context() as monkeypatch_context:
-#         if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-#             if enable_prompt_embeds:
-#                 pytest.skip(
-#                     "enable_prompt_embeds does not work with ray compiled dag."
-#                 )
-#             monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
-#             monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
-
-#         if attention_backend:
-#             monkeypatch_context.setenv(
-#                 "VLLM_ATTENTION_BACKEND",
-#                 attention_backend,
-#             )
-
-#         for k, v in extra_env.items():
-#             monkeypatch_context.setenv(k, v)
-
-#         dtype = "half"
-#         max_tokens = 5
-
-#         # NOTE: take care of the order. run vLLM first, and then run HF.
-#         # vLLM needs a fresh new process without cuda initialization.
-#         # if we run HF first, the cuda initialization will be done and it
-#         # will hurt multiprocessing backend with fork method
-#         # (the default method).
-#         with vllm_runner(
-#                 model,
-#                 dtype=dtype,
-#                 tensor_parallel_size=2,
-#                 distributed_executor_backend=distributed_executor_backend,
-#                 enable_prompt_embeds=enable_prompt_embeds,
-#                 gpu_memory_utilization=0.7,
-#         ) as vllm_model:
-#             if enable_prompt_embeds:
-#                 with hf_runner(model, dtype=dtype) as hf_model:
-#                     with torch.no_grad():
-#                         prompt_embeds = hf_model.get_prompt_embeddings(
-#                             example_prompts)
-#                     vllm_outputs = vllm_model.generate_greedy(
-#                         prompt_embeds, max_tokens)
-#                     vllm_outputs = _fix_prompt_embed_outputs(
-#                         vllm_outputs, hf_model, example_prompts)
-#                     hf_outputs = hf_model.generate_greedy(
-#                         example_prompts, max_tokens)
-#             else:
-#                 vllm_outputs = vllm_model.generate_greedy(
-#                     example_prompts, max_tokens)
-#                 with hf_runner(model, dtype=dtype) as hf_model:
-#                     hf_outputs = hf_model.generate_greedy(
-#                         example_prompts, max_tokens)
+#     check_outputs_equal(
+#         outputs_0_lst=hf_outputs,
+#         outputs_1_lst=vllm_outputs,
+#         name_0="hf",
+#         name_1="vllm",
+#     )
 
 
 def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 188c0321f7d06714fbe3daf78d04eacdbce6a16b..56fb187d2e9466a26169055c75d5223690dcbb59 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -248,7 +248,6 @@ def test_deep_sleep_async():
 
 @requires_fp8
 def test_deep_sleep_fp8_kvcache():
-    GiB_bytes = 1 << 30
     model = "Qwen/Qwen2-0.5B"
     used_bytes_baseline = current_platform.get_current_memory_usage()
 
diff --git a/tests/tpu/__init__.py b/tests/benchmarks/sweep/__init__.py
similarity index 100%
rename from tests/tpu/__init__.py
rename to tests/benchmarks/sweep/__init__.py
diff --git a/tests/benchmarks/test_param_sweep.py b/tests/benchmarks/sweep/test_param_sweep.py
similarity index 100%
rename from tests/benchmarks/test_param_sweep.py
rename to tests/benchmarks/sweep/test_param_sweep.py
diff --git a/tests/benchmarks/sweep/test_serve_sla.py b/tests/benchmarks/sweep/test_serve_sla.py
new file mode 100644
index 0000000000000000000000000000000000000000..19f4740bc32863b8a7861062d17350e19b3e7bbf
--- /dev/null
+++ b/tests/benchmarks/sweep/test_serve_sla.py
@@ -0,0 +1,298 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from collections.abc import Callable
+from pathlib import Path
+from unittest.mock import patch
+
+from vllm.benchmarks.sweep.param_sweep import ParameterSweepItem
+from vllm.benchmarks.sweep.serve_sla import _get_sla_run_path, solve_sla
+from vllm.benchmarks.sweep.server import ServerProcess
+from vllm.benchmarks.sweep.sla_sweep import (
+    SLACriterionBase,
+    SLALessThan,
+    SLALessThanOrEqualTo,
+    SLASweepItem,
+)
+
+
+def _set_return_value(
+    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
+):
+    """
+    Create a patch for run_sla with a specific function
+    indicating the relationship between the benchmark combination
+    (which includes the SLA variable) and the SLA criterion.
+    """
+
+    def mock_run_sla(
+        server: ServerProcess | None,
+        bench_cmd: list[str],
+        *,
+        serve_comb: ParameterSweepItem,
+        bench_comb: ParameterSweepItem,
+        iter_path: Path,
+        num_runs: int,
+        dry_run: bool,
+    ):
+        iter_data = var2metric(bench_comb)
+
+        summary_path = _get_sla_run_path(iter_path, run_number=None)
+        summary_path.parent.mkdir(parents=True, exist_ok=True)
+        with summary_path.open("w") as f:
+            json.dump(iter_data, f, indent=4)
+
+        return iter_data
+
+    return patch("vllm.benchmarks.sweep.serve_sla.run_sla", side_effect=mock_run_sla)
+
+
+def _var2metric_linear():
+    def wrapped(bench_comb):
+        x = float(bench_comb["request_rate"])
+        y = x
+
+        return [{"request_throughput": y}]
+
+    return wrapped
+
+
+def _var2metric_concave(elbow_point: float):
+    def wrapped(bench_comb):
+        x = float(bench_comb["request_rate"])
+        if x < elbow_point:
+            y = 0.5 * (x - elbow_point) + elbow_point
+        else:
+            y = 1.5 * (x - elbow_point) + elbow_point
+
+        return [{"request_throughput": y}]
+
+    return wrapped
+
+
+def _var2metric_convex(elbow_point: float):
+    def wrapped(bench_comb):
+        x = float(bench_comb["request_rate"])
+        if x < elbow_point:
+            y = 1.5 * (x - elbow_point) + elbow_point
+        else:
+            y = 0.5 * (x - elbow_point) + elbow_point
+
+        return [{"request_throughput": y}]
+
+    return wrapped
+
+
+def _var2metric_quadratic(y_intercept: float):
+    def wrapped(bench_comb):
+        x = float(bench_comb["request_rate"])
+        y = y_intercept + 0.1 * x**2
+
+        return [{"request_throughput": y}]
+
+    return wrapped
+
+
+def _var2metric_sqrt(y_intercept: float):
+    def wrapped(bench_comb):
+        x = float(bench_comb["request_rate"])
+        y = y_intercept + 10 * x**0.5
+
+        return [{"request_throughput": y}]
+
+    return wrapped
+
+
+def _run_solve_sla(
+    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
+    criterion: SLACriterionBase,
+    base_path: Path,
+    min_value: int = 1,
+    max_value: int = 100,
+):
+    with _set_return_value(var2metric):
+        result = solve_sla(
+            server=None,
+            bench_cmd=[],
+            serve_comb=ParameterSweepItem(),
+            bench_comb=ParameterSweepItem(),
+            sla_comb=SLASweepItem({"request_throughput": criterion}),
+            base_path=base_path,
+            num_runs=1,
+            dry_run=False,
+            sla_variable="request_rate",
+            sla_min_value=min_value,
+            sla_max_value=max_value,
+        )
+        assert result is not None
+
+        return result
+
+
+def test_solve_linear_sla_le(tmp_path):
+    sla_data, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThanOrEqualTo(target=32),
+        tmp_path,
+    )
+
+    assert history.get_max_passing() == 32
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        100: False,
+        1: True,
+        32: True,
+        33: False,
+    }
+
+
+def test_solve_linear_sla_lt(tmp_path):
+    sla_data, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThan(target=32),
+        tmp_path,
+    )
+
+    assert history.get_max_passing() == 31
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        100: False,
+        1: True,
+        31: True,
+        32: False,
+    }
+
+
+def test_solve_linear_sla_oob(tmp_path):
+    sla_data, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThanOrEqualTo(target=32),
+        tmp_path,
+        min_value=64,
+    )
+
+    assert history.get_max_passing() == 64
+    assert history.get_min_failing() == 64
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        100: False,
+        64: False,
+    }
+
+
+def test_solve_concave_sla_le(tmp_path):
+    sla_data, history = _run_solve_sla(
+        _var2metric_concave(elbow_point=32),
+        SLALessThanOrEqualTo(target=24),
+        tmp_path,
+    )
+
+    assert history.get_max_passing() == 16
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        100: False,
+        1: True,
+        7: True,
+        13: True,
+        15: True,
+        16: True,
+        17: False,
+    }
+
+
+def test_solve_convex_sla_le(tmp_path):
+    sla_data, history = _run_solve_sla(
+        _var2metric_convex(elbow_point=32),
+        SLALessThanOrEqualTo(target=24),
+        tmp_path,
+    )
+
+    assert history.get_max_passing() == 26
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        100: False,
+        1: True,
+        48: False,
+        30: False,
+        24: True,
+        26: True,
+        27: False,
+    }
+
+
+def test_solve_quadratic_sla_le(tmp_path):
+    sla_data, history = _run_solve_sla(
+        _var2metric_quadratic(y_intercept=10),
+        SLALessThanOrEqualTo(target=50),
+        tmp_path,
+    )
+
+    assert history.get_max_passing() == 20
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        100: False,
+        1: True,
+        4: True,
+        20: True,
+        21: False,
+    }
+
+
+def test_solve_sqrt_sla_le(tmp_path):
+    sla_data, history = _run_solve_sla(
+        _var2metric_sqrt(y_intercept=10),
+        SLALessThanOrEqualTo(target=100),
+        tmp_path,
+    )
+
+    assert history.get_max_passing() == 81
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        100: False,
+        1: True,
+        89: False,
+        81: True,
+        82: False,
+    }
+
+
+def test_solve_reuse_history(tmp_path):
+    sla_data, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThanOrEqualTo(target=10),
+        tmp_path,
+        min_value=1,
+        max_value=20,
+    )
+
+    assert history.get_max_passing() == 10
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        20: False,
+        1: True,
+        10: True,
+        11: False,
+    }
+
+    sla_data, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThanOrEqualTo(target=30),
+        tmp_path,
+        min_value=21,
+        max_value=40,
+    )
+
+    assert history.get_max_passing() == 30
+
+    assert {val: margin <= 0 for val, margin in history.items()} == {
+        # Items from the past run
+        # (the margins are different because the target changed)
+        20: True,
+        1: True,
+        10: True,
+        11: True,
+        # Items from this run
+        40: False,
+        30: True,
+        31: False,
+    }
diff --git a/tests/benchmarks/test_bench_startup.py b/tests/benchmarks/test_bench_startup.py
new file mode 100644
index 0000000000000000000000000000000000000000..44c9bc9b735d7c6fecfa557ee8f7c9489adbdff6
--- /dev/null
+++ b/tests/benchmarks/test_bench_startup.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import subprocess
+
+import pytest
+
+
+@pytest.mark.benchmark
+def test_bench_startup():
+    command = [
+        "vllm",
+        "bench",
+        "startup",
+    ]
+    result = subprocess.run(command, capture_output=True, text=True)
+    print(result.stdout)
+    print(result.stderr)
+
+    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index 17914c81f6315c9a32b5d959f49f930f1dd87588..474d12df8adfa566e13b875a4926a079c869296a 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -20,21 +20,18 @@ def server():
 
 @pytest.mark.benchmark
 def test_bench_serve(server):
+    # Test default model detection and input/output len
     command = [
         "vllm",
         "bench",
         "serve",
-        "--model",
-        MODEL_NAME,
         "--host",
         server.host,
         "--port",
         str(server.port),
-        "--dataset-name",
-        "random",
-        "--random-input-len",
+        "--input-len",
         "32",
-        "--random-output-len",
+        "--output-len",
         "4",
         "--num-prompts",
         "5",
diff --git a/tests/compile/distributed/test_async_tp.py b/tests/compile/distributed/test_async_tp.py
index 2eb18e25c98bfe160a9d8848c553013a824ff48a..3b96fa65d02c0c06e150a12986498c6844dccdaf 100644
--- a/tests/compile/distributed/test_async_tp.py
+++ b/tests/compile/distributed/test_async_tp.py
@@ -15,6 +15,7 @@ from vllm.config import (
     ModelConfig,
     PassConfig,
     VllmConfig,
+    set_current_vllm_config,
 )
 from vllm.distributed import (
     tensor_model_parallel_all_gather,
@@ -26,6 +27,7 @@ from vllm.distributed.parallel_state import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
 
 from ...models.registry import HF_EXAMPLE_MODELS
 from ...utils import (
@@ -301,7 +303,7 @@ def async_tp_pass_on_test_model(
     dtype: torch.dtype,
     dynamic: bool,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     device = torch.device(f"cuda:{local_rank}")
     torch.cuda.set_device(device)
@@ -339,38 +341,42 @@ def async_tp_pass_on_test_model(
     )
 
     async_tp_pass = AsyncTPPass(vllm_config)
-    backend = TestBackend(async_tp_pass)
 
-    assert (
-        async_tp_pass.compilation_config.splitting_ops
-        == vllm_config.compilation_config.splitting_ops
-    )
-    assert (
-        async_tp_pass.compilation_config.use_inductor_graph_partition
-        == vllm_config.compilation_config.use_inductor_graph_partition
-    )
+    # Set the global vllm_config for TestBackend which calls
+    # get_current_vllm_config()
+    with set_current_vllm_config(vllm_config):
+        backend = TestBackend(async_tp_pass)
 
-    model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor
+        assert (
+            async_tp_pass.compilation_config.splitting_ops
+            == vllm_config.compilation_config.splitting_ops
+        )
+        assert (
+            async_tp_pass.compilation_config.use_inductor_graph_partition
+            == vllm_config.compilation_config.use_inductor_graph_partition
+        )
 
-    hidden_states = torch.randn(
-        (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
-    )
+        model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor
+
+        hidden_states = torch.randn(
+            (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
+        )
 
-    if dynamic:
-        torch._dynamo.mark_dynamic(hidden_states, 0)
+        if dynamic:
+            torch._dynamo.mark_dynamic(hidden_states, 0)
 
-    compiled_model = torch.compile(model, backend=backend)
-    compiled_model(hidden_states)
+        compiled_model = torch.compile(model, backend=backend)
+        compiled_model(hidden_states)
 
-    assert async_tp_pass.matched_count == 1
+        assert async_tp_pass.matched_count == 1
 
-    # In pre-nodes, all gather or reduce scatter should exist,
-    # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
-    backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
+        # In pre-nodes, all gather or reduce scatter should exist,
+        # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
+        backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
 
-    # In post-nodes, fused_matmul_reduce_scatter or \
-    # fused_all_gather_matmul should exist
-    backend.check_after_ops(model.ops_in_model_after())
+        # In post-nodes, fused_matmul_reduce_scatter or \
+        # fused_all_gather_matmul should exist
+        backend.check_after_ops(model.ops_in_model_after())
 
 
 @create_new_process_for_each_test()
diff --git a/tests/compile/distributed/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py
index fc8d1f98ebf87a662c880e1c3241954db1d71a43..d0a194c2b044bb3d67263450244f7a21efc1d6e9 100644
--- a/tests/compile/distributed/test_fusion_all_reduce.py
+++ b/tests/compile/distributed/test_fusion_all_reduce.py
@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
 
 from ...utils import has_module_attribute, multi_gpu_test
 from ..backend import TestBackend
@@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model(
     enable_rms_norm_custom_op,
     enable_quant_fp8_custom_op,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     device = torch.device(f"cuda:{local_rank}")
     torch.cuda.set_device(device)
diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 80086c4e03a9c328b3432a2de3d66294e6c57c92..f8a629ed46ceec464d54cbb00b440b3ac0702947 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -208,7 +208,8 @@ def test_attn_quant(
     # To capture subprocess logs, we need to know whether spawn or fork is used.
     # Force spawn as it is more general.
     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    model_kwargs["attention_config"] = {"backend": backend.name}
 
     compilation_config = CompilationConfig(
         # Testing properties
@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
     # To capture subprocess logs, we need to know whether spawn or fork is used.
     # Force spawn as it is more general.
     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    model_kwargs["attention_config"] = {"backend": backend.name}
 
     compilation_config = CompilationConfig(
         # Testing properties
@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
     # To capture subprocess logs, we need to know whether spawn or fork is used.
     # Force spawn as it is more general.
     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    model_kwargs["attention_config"] = {"backend": backend.name}
 
     compilation_config = CompilationConfig(
         # Testing properties
@@ -554,7 +557,8 @@ def test_rms_group_quant(
     # To capture subprocess logs, we need to know whether spawn or fork is used.
     # Force spawn as it is more general.
     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    model_kwargs["attention_config"] = {"backend": backend.name}
 
     compilation_config = CompilationConfig(
         # Testing properties
@@ -564,7 +568,9 @@ def test_rms_group_quant(
         splitting_ops=splitting_ops,
         # Common
         mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
+        pass_config=PassConfig(
+            fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
+        ),
         # Inductor caches custom passes by default as well via uuid
         inductor_compile_config={"force_disable_caches": True},
     )
diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py
index d9fdc3acc3d6f6cb34e7fe62751e860992fbb11d..35916ba99652fe5c12099104a4e14749b057ba4c 100644
--- a/tests/compile/distributed/test_sequence_parallelism.py
+++ b/tests/compile/distributed/test_sequence_parallelism.py
@@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
 
 from ...utils import multi_gpu_test
 from ..backend import TestBackend
@@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model(
     fuse_norm_quant: bool,
     dynamic: bool,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     device = torch.device(f"cuda:{local_rank}")
     torch.cuda.set_device(device)
diff --git a/tests/compile/fullgraph/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py
index 59f4bf9154d29b78e35b9aca788f85cde211f267..eca8680860aa4d4bbfedc9493fdb725b9c9579d0 100644
--- a/tests/compile/fullgraph/test_basic_correctness.py
+++ b/tests/compile/fullgraph/test_basic_correctness.py
@@ -6,10 +6,13 @@ import pytest
 import os
 
 from vllm.config import CompilationMode
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import cuda_device_count_stateless
 
 from ...utils import compare_all_settings, models_path_prefix
 
+ATTN_BACKEND = "FLASH_ATTN" if not current_platform.is_rocm() else "ROCM_ATTN"
+
 
 @dataclasses.dataclass
 class TestSetting:
@@ -32,7 +35,7 @@ class TestSetting:
             model_args=["--max-model-len", "2048"],
             pp_size=2,
             tp_size=2,
-            attn_backend="FLASH_ATTN",
+            attn_backend=ATTN_BACKEND,
             method="generate",
         ),
         # llama model with quantization
@@ -41,7 +44,7 @@ class TestSetting:
             model_args=["--quantization", "gptq", "--max-model-len", "2048"],
             pp_size=1,
             tp_size=1,
-            attn_backend="FLASH_ATTN",
+            attn_backend=ATTN_BACKEND,
             method="generate",
         ),
         # MoE model
@@ -50,7 +53,7 @@ class TestSetting:
             model_args=["--max-model-len", "2048"],
             pp_size=1,
             tp_size=2,
-            attn_backend="FLASH_ATTN",
+            attn_backend=ATTN_BACKEND,
             method="generate",
         ),
         # embedding model
@@ -66,18 +69,23 @@ class TestSetting:
             ],
             pp_size=1,
             tp_size=1,
-            attn_backend="FLASH_ATTN",
+            attn_backend=ATTN_BACKEND,
             method="encode",
         ),
-        # # TODO
-        # TestSetting(
-        #     model="BAAI/bge-base-en-v1.5",
-        #     model_args=["--runner", "pooling"],
-        #     pp_size=1,
-        #     tp_size=1,
-        #     attn_backend="FLASH_ATTN",
-        #     method="encode",
-        # ),
+        pytest.param(
+            TestSetting(
+                model="BAAI/bge-base-en-v1.5",
+                model_args=["--runner", "pooling"],
+                pp_size=1,
+                tp_size=1,
+                attn_backend="FLASH_ATTN",
+                method="encode",
+            ),
+            marks=pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason="Encoder self-attention is not implemented for ROCm",
+            ),
+        ),
         # vision language model
         # See https://github.com/vllm-project/vllm/issues/26716.
         # TestSetting(
@@ -91,7 +99,6 @@ class TestSetting:
     ],
 )
 def test_compile_correctness(
-    monkeypatch: pytest.MonkeyPatch,
     test_setting: TestSetting,
 ):
     # this test is run under multiple suits, with different GPUs.
@@ -109,49 +116,48 @@ def test_compile_correctness(
             f"{cuda_device_count_stateless()}"
         )
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-        final_args = [
-            *model_args,
-            "-pp",
-            str(pp_size),
-            "-tp",
-            str(tp_size),
-            "-cc.cudagraph_mode=none",
-        ]
-
-        all_args: list[list[str]] = []
-        all_envs: list[dict[str, str] | None] = []
+    final_args = [
+        *model_args,
+        "-pp",
+        str(pp_size),
+        "-tp",
+        str(tp_size),
+        "-cc.cudagraph_mode=none",
+        f"--attention-backend={attn_backend}",
+    ]
 
-        for comp_mode in [
-            CompilationMode.STOCK_TORCH_COMPILE,
-            CompilationMode.DYNAMO_TRACE_ONCE,
-            CompilationMode.VLLM_COMPILE,
-        ]:
-            for mode in [CompilationMode.NONE, comp_mode]:
-                all_args.append(
-                    final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
-                )
+    all_args: list[list[str]] = []
+    all_envs: list[dict[str, str] | None] = []
 
-            # inductor will change the output, so we only compare if the output
-            # is close, not exactly the same.
-            compare_all_settings(
-                model,
-                all_args,
-                all_envs,
-                method=method if method != "generate" else "generate_close",
+    for comp_mode in [
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
+    ]:
+        for mode in [CompilationMode.NONE, comp_mode]:
+            all_args.append(
+                final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
             )
-            all_envs.clear()
-            all_args.clear()
 
-        for mode in [
-            CompilationMode.NONE,
-            CompilationMode.STOCK_TORCH_COMPILE,
-            CompilationMode.DYNAMO_TRACE_ONCE,
-            CompilationMode.VLLM_COMPILE,
-        ]:
-            all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
-            all_envs.append({})
-            all_envs.append({})
+        # inductor will change the output, so we only compare if the output
+        # is close, not exactly the same.
+        compare_all_settings(
+            model,
+            all_args,
+            all_envs,
+            method=method if method != "generate" else "generate_close",
+        )
+        all_envs.clear()
+        all_args.clear()
+
+    for mode in [
+        CompilationMode.NONE,
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
+    ]:
+        all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
+        all_envs.append({})
+        all_envs.append({})
 
-        compare_all_settings(model, all_args * 3, all_envs, method=method)
\ No newline at end of file
+    compare_all_settings(model, all_args * 3, all_envs, method=method)
diff --git a/tests/compile/fullgraph/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py
index c6d4b5272dbcffdded3286825cc348f34699f695..c7c737371fc3087f3770cd73479aefdcf15a6fc1 100644
--- a/tests/compile/fullgraph/test_full_cudagraph.py
+++ b/tests/compile/fullgraph/test_full_cudagraph.py
@@ -12,6 +12,7 @@ from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 @contextlib.contextmanager
@@ -70,11 +71,14 @@ def llm_pair(request):
         elif backend_config.specific_gpu_arch == (10, 0):
             pytest.skip("Only Blackwell GPUs support Cutlass MLA")
 
+    # FlashInfer is not supported on ROCm
+    if backend_config == AttentionBackendEnum.FLASHINFER and current_platform.is_rocm():
+        pytest.skip("FlashInfer is not supported on ROCm")
+
     env_vars = {
         # Force native sampler to avoid potential nondeterminism in FlashInfer
         # when per-request generators are not used in V1.
         "VLLM_USE_FLASHINFER_SAMPLER": "0",
-        **backend_config.env_vars,
     }
     with temporary_environ(env_vars):
         full = LLM(
@@ -170,16 +174,10 @@ class TestFullCUDAGraph:
 
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 def test_full_cudagraph_with_invalid_backend():
-    with (
-        temporary_environ(
-            {
-                "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
-                # Flex_Attention is not supported with full cuda graph
-            }
-        ),
-        pytest.raises(RuntimeError),
-    ):
+    # Flex_Attention is not supported with full cuda graph
+    with pytest.raises(RuntimeError):
         LLM(
             model="Qwen/Qwen2-1.5B-Instruct",
             compilation_config=CompilationConfig(cudagraph_mode="FULL"),
+            attention_config={"backend": "FLEX_ATTENTION"},
         )
diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
index 3cd1d4be2ebdc39c28f3c6094accf5950b78d365..209a879bfb9d278a0725bbffa7762b504053d8dc 100644
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -10,10 +10,10 @@ import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from ...utils import create_new_process_for_each_test
 
@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
             TEST_MODELS.append(
                 (
                     "alexm-nm/tinyllama-24-marlin24-4bit-g128",
-                    {"quantization": "gptq_marlin_24"},
+                    {
+                        "quantization": "gptq_marlin_24",
+                        "allow_deprecated_quantization": True,
+                    },
                 )
             )
 
@@ -156,6 +159,20 @@ def test_full_graph(
         )
         for model_info in models_list(all=False)
         if is_torch_equal_or_newer("2.9.0.dev")
+    ]
+    + [
+        # Test get_raw_stream patch with compile_sizes
+        # This tests that TorchInductor autotune works correctly with get_raw_stream
+        # patch in torch 2.9 and without patch in torch 2.10+
+        (
+            CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+                compile_sizes=[1, 2],  # Triggers autotune which uses get_raw_stream
+                cudagraph_mode=CUDAGraphMode.NONE,
+            ),
+            "facebook/opt-125m",
+            {},
+        ),
     ],
 )
 # only test some of the models
@@ -197,20 +214,19 @@ def test_custom_compile_config(
     ],
 )
 def test_fp8_kv_scale_compile(
-    monkeypatch: pytest.MonkeyPatch,
     compilation_mode: int,
     model: str,
     backend: AttentionBackendEnum | None,
 ):
-    if backend:
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
-
     model_kwargs = {
         "quantization": "fp8",
         "kv_cache_dtype": "fp8_e4m3",
         "calculate_kv_scales": True,
         "max_model_len": 512,
     }
+    if backend:
+        model_kwargs["attention_config"] = {"backend": backend.name}
+
     run_model(compilation_mode, model, **model_kwargs)
 
 
diff --git a/tests/compile/fullgraph/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py
index 621f6a51a918fc4e56fd7a8c024c8147ea23db15..c5dc6f96b2a56b56232713ddc3dd2b0be30a2c3d 100644
--- a/tests/compile/fullgraph/test_multimodal_compile.py
+++ b/tests/compile/fullgraph/test_multimodal_compile.py
@@ -71,3 +71,40 @@ def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):
         ) as _,
     ):
         pass
+
+
+# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
+# Requires Cuda and 8 gpus as well
+@pytest.mark.forked
+@pytest.mark.skip(reason="Skipping due to CI resource constraints")
+def test_mllama4_vit_compilation(vllm_runner, monkeypatch):
+    """Test that Mllama4 vision submodules are compiled.
+
+    This test verifies that the 2 vision submodules (Llama4VisionEncoder,
+    Llama4VisionPixelShuffleMLP) are properly tagged
+    for compilation by checking that num_models_seen increases to 3.
+
+    However since we are using TP=8, we compilation_counter will not
+    work properly so we will just check the run succeeds rn
+    """
+    # Disable multiprocessing so that the counter is in the same process
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    with (
+        monkeypatch.context(),
+        # TODO: Since we require TP=8, this messes with the compilation
+        # counter. We should fix this in the future, but leave for now
+        # to make sure that compilation runs (no crash) with llama vision encoder
+        compilation_counter.expect(num_models_seen=0),
+        vllm_runner(
+            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+            max_model_len=512,
+            gpu_memory_utilization=0.8,
+            tensor_parallel_size=8,
+            compilation_config={
+                "mode": CompilationMode.VLLM_COMPILE,
+                "compile_mm_encoder": True,
+            },
+        ),
+    ):
+        pass
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
index 8fa305d6d72f5dd168ded895b3df5ad612ca88ba..ddcb5bdba5236d2eb2d588e1691a1427556e77e1 100644
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -5,10 +5,12 @@ import functools
 import multiprocessing
 import tempfile
 from contextlib import contextmanager
+from pathlib import Path
 
 import pytest
 import torch
 
+import vllm.model_executor.layers.activation
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
@@ -16,9 +18,19 @@ from vllm.config import (
     VllmConfig,
     set_current_vllm_config,
 )
+from vllm.envs import disable_envs_cache
 from vllm.forward_context import set_forward_context
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
+from ..utils import create_new_process_for_each_test
+
+
+@pytest.fixture
+def vllm_tmp_cache(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
+    """Fixture that sets VLLM_CACHE_ROOT to a temporary directory."""
+    monkeypatch.setenv("VLLM_CACHE_ROOT", str(tmp_path / "vllm_cache"))
+    return tmp_path
+
 
 def reference_fn(x: torch.Tensor):
     assert x.shape[0] <= 42
@@ -66,6 +78,7 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
                 torch.compiler.set_stance("fail_on_recompile"),
             ):
                 CompiledMod(vllm_config=vllm_config)(*args)
+            disable_envs_cache()
 
             m.setenv("VLLM_USE_AOT_COMPILE", "1")
             torch._dynamo.reset()
@@ -101,6 +114,7 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
             vllm_config = make_vllm_config()
             with use_vllm_config(vllm_config):
                 expected = CompiledMod(vllm_config=vllm_config)(*args)
+            disable_envs_cache()
 
             m.setenv("VLLM_FORCE_AOT_LOAD", "1")
             vllm_config = make_vllm_config()
@@ -130,6 +144,7 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
                 artifacts = compiled_mod.aot_compiled_fn._artifacts
                 guards_string = artifacts.compiled_fn.shape_env.format_guards()
                 assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
+            disable_envs_cache()
 
             m.setenv("VLLM_FORCE_AOT_LOAD", "1")
             vllm_config = make_vllm_config()
@@ -144,7 +159,94 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
 @pytest.mark.skipif(
     not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
 )
-@use_vllm_config(make_vllm_config())
+def test_partition_wrapper_applied_on_aot_load(
+    monkeypatch: pytest.MonkeyPatch, vllm_tmp_cache: Path, mocker
+):
+    """
+    Test that partition wrappers are applied when loading AOT cached functions.
+
+    This test verifies the fix for GitHub issue #31439 where AOT compile
+    caused 2x latency regression when use_inductor_graph_partition=True.
+    The root cause was that partition wrapper context was bypassed when
+    loading from AOT cache.
+    """
+    from vllm.config import CUDAGraphMode
+
+    args = (torch.randn(10, 10),)
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
+
+    # Create config with partition enabled
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            use_inductor_graph_partition=True,
+            cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        )
+    )
+
+    # First compilation - save to cache
+    with use_vllm_config(vllm_config):
+        compiled_mod = CompiledMod(vllm_config=vllm_config)
+        compiled_mod(*args)
+    disable_envs_cache()
+
+    # Second run - load from cache, verify partition wrapper applied
+    monkeypatch.setenv("VLLM_FORCE_AOT_LOAD", "1")
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            use_inductor_graph_partition=True,
+            cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        )
+    )
+
+    # Use mocker to spy on set_customized_partition_wrappers
+    spy = mocker.spy(torch._inductor.utils, "set_customized_partition_wrappers")
+
+    with use_vllm_config(vllm_config):
+        compiled_mod = CompiledMod(vllm_config=vllm_config)
+
+        # First call after restart: loads from AOT cache.
+        # This tests the fix for the first call after a restart.
+        compiled_mod(*args)
+
+        # Verify partition wrapper was called on AOT load.
+        assert spy.call_count >= 2, (
+            "Expected partition wrapper to be set and cleared on AOT load, "
+            f"got {spy.call_count} calls"
+        )
+        # First call should set a wrapper, last call should clear it
+        assert spy.call_args_list[0][0][0] is not None, (
+            "First call on AOT load should set a wrapper function"
+        )
+        assert spy.call_args_list[-1][0][0] is None, (
+            "Last call on AOT load should clear the wrapper"
+        )
+
+        # Reset for the next check.
+        spy.reset_mock()
+
+        # Subsequent call: uses the cached `aot_compiled_fn`.
+        # This tests the fix for subsequent calls.
+        compiled_mod(*args)
+
+        # Verify partition wrapper was called on the subsequent call.
+        assert spy.call_count >= 2, (
+            "Expected partition wrapper set and cleared on subsequent "
+            f"call, got {spy.call_count} calls"
+        )
+        assert spy.call_args_list[0][0][0] is not None, (
+            "First call on subsequent call should set a wrapper function"
+        )
+        assert spy.call_args_list[-1][0][0] is None, (
+            "Last call on subsequent call should clear the wrapper"
+        )
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+@create_new_process_for_each_test("spawn")
 def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
     """
     Test that compiling gpt2 twice results in a cache hit and
@@ -186,6 +288,8 @@ def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
 
             # Clean up first model
             del llm_model
+            disable_envs_cache()
+            vllm.model_executor.layers.activation._ACTIVATION_REGISTRY._dict.clear()
 
             # Second compilation - should hit cache
             m.setenv("VLLM_FORCE_AOT_LOAD", "1")
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 0eedb6d16f343f5007b44fc76fb7a5d43bdfe432..165cc7cb59a5d8e8f9481fa80fbb978b29e14d8a 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -15,7 +15,10 @@ from vllm.config.compilation import CompilationMode, PassConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.logger import _print_warning_once
 from vllm.platforms import current_platform
-from vllm.utils.torch_utils import _is_torch_equal_or_newer
+from vllm.utils.torch_utils import (
+    _is_torch_equal_or_newer,
+    is_torch_equal,
+)
 
 # This import automatically registers `torch.ops.silly.attention`
 from . import silly_attention  # noqa: F401
@@ -30,6 +33,29 @@ def test_version():
     assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
 
 
+def test_get_raw_stream_patch():
+    """Test that get_raw_stream patch is applied only for torch 2.9.0 or 2.9.1."""
+    import builtins
+
+    # Check if get_raw_stream exists in builtins
+    has_patch = hasattr(builtins, "get_raw_stream")
+
+    # Import torch to get actual version
+
+    is_torch_2_9 = is_torch_equal("2.9.0") or is_torch_equal("2.9.1")
+
+    if is_torch_2_9:
+        # For torch 2.9.x, the patch should be applied
+        assert has_patch, "get_raw_stream should be patched for torch 2.9.x"
+        # Verify it's callable (it should be the _cuda_getCurrentRawStream function)
+        get_raw_stream = builtins.get_raw_stream  # type: ignore[attr-defined]
+        assert callable(get_raw_stream)
+        # Verify it's the correct function from torch._C
+        from torch._C import _cuda_getCurrentRawStream
+
+        assert get_raw_stream is _cuda_getCurrentRawStream
+
+
 def test_copy_pass():
     vllm_config = VllmConfig()
     inductor_pass = FixFunctionalizationPass(vllm_config)
@@ -406,51 +432,43 @@ def test_cudagraph_sizes_post_init(
         )
 
 
-def test_pass_config_deprecation(caplog_vllm):
-    caplog_vllm.set_level(logging.WARNING)
-
-    # Clear cache to ensure warnings are re-issued
-    _print_warning_once.cache_clear()
-
-    # Test enable_fusion -> fuse_norm_quant, fuse_act_quant
-    caplog_vllm.clear()
-    config = PassConfig(enable_fusion=True)
-    assert "enable_fusion is deprecated" in caplog_vllm.text
-    assert config.fuse_norm_quant is True
-    assert config.fuse_act_quant is True
-    assert config.enable_fusion is None
-
-    # Test enable_attn_fusion -> fuse_attn_quant
-    caplog_vllm.clear()
-    config = PassConfig(enable_attn_fusion=True)
-    assert "enable_attn_fusion is deprecated" in caplog_vllm.text
-    assert config.fuse_attn_quant is True
-    assert config.enable_attn_fusion is None
-
-    # Test enable_noop -> eliminate_noops
-    caplog_vllm.clear()
-    config = PassConfig(enable_noop=True)
-    assert "enable_noop is deprecated" in caplog_vllm.text
-    assert config.eliminate_noops is True
-    assert config.enable_noop is None
-
-    # Test enable_sequence_parallelism -> enable_sp
-    caplog_vllm.clear()
-    config = PassConfig(enable_sequence_parallelism=True)
-    assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
-    assert config.enable_sp is True
-    assert config.enable_sequence_parallelism is None
-
-    # Test enable_async_tp -> fuse_gemm_comms
-    caplog_vllm.clear()
-    config = PassConfig(enable_async_tp=True)
-    assert "enable_async_tp is deprecated" in caplog_vllm.text
-    assert config.fuse_gemm_comms is True
-    assert config.enable_async_tp is None
-
-    # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
-    caplog_vllm.clear()
-    config = PassConfig(enable_fi_allreduce_fusion=True)
-    assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
-    assert config.fuse_allreduce_rms is True
-    assert config.enable_fi_allreduce_fusion is None
+def test_cached_compilation_config(default_vllm_config):
+    import torch
+    from torch._inductor.utils import run_and_get_code
+
+    from vllm.config import get_cached_compilation_config, set_current_vllm_config
+    from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+    from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+
+    dtype = torch.bfloat16
+    device = torch.device("cuda:0")
+    batch_size, num_qo_heads, head_size = 8, 16, 128
+
+    # access and cache default compilation config
+    # default compilation config does not contain +quant_fp8 custom op. If this is
+    # used, the generated code would use inductor-generated triton kernel instead
+    # of the custom op `torch.ops._C.static_scaled_fp8_quant`.
+    get_cached_compilation_config()
+
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=["+quant_fp8"],
+        )
+    )
+
+    # set_current_vllm_config should clear cached compilation config and
+    # use the new compilation_config in vllm_config
+    with set_current_vllm_config(vllm_config):
+        query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR)
+        query_quant = torch.compile(query_quant)
+
+        _q_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
+        query = torch.randn(
+            batch_size, num_qo_heads * head_size, dtype=dtype, device=device
+        )
+
+        _, code = run_and_get_code(query_quant, query, _q_scale)
+
+    code = " ".join(code)
+    assert "torch.ops._C.static_scaled_fp8_quant.default(" in code
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
index 9ccb363b088f5fe82a5e01b5ae816583ea7f0414..1fda21dea63613e7d91003037fdce06decbe4f4c 100644
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -77,6 +77,7 @@ def test_dynamic_shapes_compilation(
                 "evaluate_guards": evaluate_guards,
             },
         },
+        max_model_len=1024,
     )
 
     output = model.generate(prompt)
diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py
index bfe08382fd94985ddd47a3de787bf6f7d0ae8794..02bc4023051d41ae51a4c2bb456b7660e99e6e30 100644
--- a/tests/compile/test_noop_elimination.py
+++ b/tests/compile/test_noop_elimination.py
@@ -25,10 +25,13 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
     class Model(torch.nn.Module):
         def __init__(self) -> None:
             super().__init__()
-            self.pos_embed = torch.empty(buffer_size, hidden_size, dtype=dtype)
+            # Avoid using empty, since on rocm torch.empty
+            # does not initialize the memory.
+            self.pos_embed = torch.randn(buffer_size, hidden_size, dtype=dtype)
 
         def forward(self, x):
-            x += self.pos_embed[: x.shape[0]]
+            # Avoid += to prevent inplace addition.
+            x = x + self.pos_embed[: x.shape[0]]
             # Chain of reshapes
             y = x.reshape(-1, 128, 32)
             z = y.reshape(-1, 4096)
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
index e0968ac7992564e2e28c3cd3cd918e2f3ad7c5e8..45a114679beb92cfd95d8692a82ef326c06c2dcc 100644
--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -5,7 +5,6 @@ import pytest
 import torch
 
 from tests.compile.backend import TestBackend
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
 from vllm.compilation.noop_elimination import NoOpEliminationPass
@@ -25,6 +24,7 @@ from vllm.config import (
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.platforms import current_platform
+from vllm.v1.attention.backend import AttentionType
 
 RSQRT_OP = torch.ops.aten.rsqrt.default
 INDEX_SELECT_OP = torch.ops.aten.index.Tensor
diff --git a/tests/compile/untest_fusion.py b/tests/compile/untest_fusion.py
index 6b72c595cd7792b06a6d5187d9da33d104378d81..7755e9f9b738030e3b7110ec27d3d62c3894a07d 100644
--- a/tests/compile/untest_fusion.py
+++ b/tests/compile/untest_fusion.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import itertools
 
 import pytest
 import torch
@@ -53,37 +52,61 @@ class TestModel(torch.nn.Module):
         hidden_size: int,
         eps: float,
         group_shape: GroupShape,
-        cuda_force_torch: bool,
+        use_aiter: bool = False,
+        cuda_force_torch: bool = False,
+        use_aiter_quant_op: bool = True,
         *args,
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
+        self.use_aiter = use_aiter
+        self.use_aiter_quant_op = use_aiter_quant_op
         self.cuda_force_torch = cuda_force_torch
+        self.group_shape = group_shape
+        self.enable_quant_fp8_custom_op = None  # Will be set later if applicable
+
         self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)]
-        if group_shape.is_per_group():
-            self.wscale = [
-                torch.rand(
-                    (hidden_size // group_shape[1], hidden_size // group_shape[1]),
-                    dtype=torch.float32,
-                )
-                for _ in range(3)
-            ]
-        else:
-            self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
-        static = group_shape == GroupShape.PER_TENSOR
+
+        # Setup quantization scale descriptor
+        static = group_shape == GroupShape.PER_TENSOR and not use_aiter
         quant_scale = ScaleDesc(torch.float32, static, group_shape)
         self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
+
+        # Setup scales
         if static:
             self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
         else:
             self.scale = [None for _ in range(3)]
+
+        # Setup weights
         self.w = [
             torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3)
         ]
-        if not group_shape.is_per_group():
+        if not group_shape.is_per_group() or use_aiter:
             self.w = [self.w[0].t() for _ in range(3)]
 
+        # Setup weight scales
         if group_shape.is_per_group():
+            scale_size = (
+                (hidden_size + 128 - 1) // 128
+                if use_aiter
+                else hidden_size // group_shape[1]
+            )
+            wscale_shape: tuple[int, ...] = (scale_size, scale_size)
+        else:
+            wscale_shape = (1,)
+        self.wscale = [torch.rand(wscale_shape, dtype=torch.float32) for _ in range(3)]
+
+        # Setup FP8 linear operation
+        is_per_group = group_shape.is_per_group()
+        if is_per_group and use_aiter:
+            self.fp8_linear = W8A8BlockFp8LinearOp(
+                weight_group_shape=GroupShape(128, 128),
+                act_quant_group_shape=group_shape,
+                use_aiter_and_is_supported=use_aiter_quant_op,
+            )
+            # AITER blockwise doesn't use enable_quant_fp8_custom_op
+        elif is_per_group:
             self.fp8_linear = W8A8BlockFp8LinearOp(
                 weight_group_shape=GroupShape(group_shape[1], group_shape[1]),
                 act_quant_group_shape=group_shape,
@@ -91,6 +114,13 @@ class TestModel(torch.nn.Module):
                 use_aiter_and_is_supported=False,
             )
             self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled()
+        elif use_aiter:
+            self.fp8_linear = Fp8LinearOp(
+                act_quant_static=False,
+                act_quant_group_shape=group_shape,
+            )
+            self.fp8_linear.quant_fp8.use_aiter = use_aiter_quant_op
+            self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
         else:
             with override_cutlass_fp8_supported(not cuda_force_torch):
                 self.fp8_linear = Fp8LinearOp(
@@ -100,7 +130,6 @@ class TestModel(torch.nn.Module):
                 self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
 
         self.enable_rms_norm_custom_op = self.norm[0].enabled()
-        self.group_shape = group_shape
 
     def forward(self, x):
         # avoid having graph input be an arg to a pattern directly
@@ -126,19 +155,49 @@ class TestModel(torch.nn.Module):
         y4, resid = self.norm[3](x4, resid)  # use resid here
         return y4
 
+    def ops_in_model_before(self):
+        if (
+            self.use_aiter
+            and self.group_shape.is_per_group()
+            and current_platform.is_fp8_fnuz()
+        ):
+            return [rocm_aiter_ops.get_group_quant_op()]
+        if self.use_aiter and self.group_shape.is_per_group():
+            return [torch.ops.vllm.triton_per_token_group_quant_fp8.default]
+        if self.use_aiter and self.use_aiter_quant_op:
+            return [rocm_aiter_ops.get_per_token_quant_op()]
+        if self.use_aiter:
+            return [QUANT_OPS[self.quant_key]]
+        if self.enable_quant_fp8_custom_op:
+            return [QUANT_OPS[self.quant_key]]
+        return [torch.ops.aten.reciprocal]
+
     def ops_in_model_after(self):
+        if self.use_aiter and self.group_shape.is_per_group():
+            from vllm.compilation.rocm_aiter_fusion import (
+                AiterFusedAddRMSFp8GroupQuantPattern,
+                AiterRMSFp8GroupQuantPattern,
+            )
+
+            return [
+                AiterFusedAddRMSFp8GroupQuantPattern.FUSED_OP,
+                AiterRMSFp8GroupQuantPattern.FUSED_OP,
+            ]
+        if self.use_aiter:
+            from vllm.compilation.rocm_aiter_fusion import (
+                AiterFusedAddRMSNormDynamicQuantPattern,
+                AiterRMSNormDynamicQuantPattern,
+            )
+
+            return [
+                AiterFusedAddRMSNormDynamicQuantPattern.FUSED_OP,
+                AiterRMSNormDynamicQuantPattern.FUSED_OP,
+            ]
         return [
             FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)],
             FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)],
         ]
 
-    def ops_in_model_before(self):
-        return (
-            [QUANT_OPS[self.quant_key]]
-            if self.enable_quant_fp8_custom_op
-            else [torch.ops.aten.reciprocal]
-        )
-
     def ops_in_model_before_partial(self):
         return (
             [RMS_OP, RMS_ADD_OP]
@@ -155,67 +214,45 @@ GROUP_SHAPES = [
 ]
 
 
-class TestRmsnormGroupFp8QuantModel(torch.nn.Module):
-    def __init__(self, hidden_size: int, eps: float, **kwargs):
-        super().__init__()
-        self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
-            weight_group_shape=GroupShape(128, 128),
-            act_quant_group_shape=GroupShape(1, 128),
-            cutlass_block_fp8_supported=False,
-            use_aiter_and_is_supported=True,
-        )
-        self.w = [
-            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
-            for _ in range(3)
-        ]
+def _run_fusion_test(
+    model,
+    fusion_pass,
+    vllm_config,
+    dtype,
+    hidden_size,
+    num_tokens,
+):
+    """Helper function for common fusion test logic.
 
-        scale_hidden_size = (hidden_size + 128 - 1) // 128
-        self.wscale = [
-            torch.rand((scale_hidden_size, scale_hidden_size), dtype=torch.float32)
-            for _ in range(3)
-        ]
+    Must be called within vllm_config context.
+    """
+    noop_pass = NoOpEliminationPass(vllm_config)
+    cleanup_pass = PostCleanupPass(vllm_config)
 
-        self.norm_weight = [torch.ones(hidden_size) for _ in range(4)]
-        self.eps = eps
+    backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
+    backend2 = TestBackend(noop_pass, cleanup_pass)
 
-    def forward(self, x):
-        # avoid having graph input be an arg to a pattern directly
-        x = resid = torch.relu(x)
-        y = rocm_aiter_ops.rms_norm(x, self.norm_weight[0], self.eps)
+    x = torch.rand(num_tokens, hidden_size)
+    torch._dynamo.mark_dynamic(x, 0)
 
-        x2 = self.w8a8_block_fp8_linear.apply(y, self.w[0], self.wscale[0])
-        # make sure resid is used for replacement to work
-        y2, resid = rocm_aiter_ops.rms_norm2d_with_add(
-            x2, resid, self.norm_weight[1], self.eps
-        )
+    model_fused = torch.compile(model, backend=backend)
+    result_fused = model_fused(x)
 
-        x3 = self.w8a8_block_fp8_linear.apply(y2, self.w[1], self.wscale[1])
+    model_unfused = torch.compile(model, backend=backend2)
+    result_unfused = model_unfused(x)
 
-        y3, resid = rocm_aiter_ops.rms_norm2d_with_add(
-            x3, resid, self.norm_weight[2], self.eps
-        )
+    if dtype == torch.float16:
+        ATOL, RTOL = (2e-3, 2e-3)
+    else:
+        ATOL, RTOL = (1e-2, 1e-2)
 
-        x4 = self.w8a8_block_fp8_linear.apply(y3, self.w[2], self.wscale[2])
+    torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
 
-        y4, resid = rocm_aiter_ops.rms_norm2d_with_add(
-            x4, resid, self.norm_weight[3], self.eps
-        )
-        return y4
+    assert fusion_pass.matched_count == 3
+    backend.check_before_ops(model.ops_in_model_before())
+    backend.check_after_ops(model.ops_in_model_after())
 
-    def ops_in_model_before(self):
-        return [
-            torch.ops.vllm.rocm_aiter_rms_norm,
-            torch.ops.vllm.rocm_aiter_group_fp8_quant,
-        ]
-
-    def ops_in_model_before_partial(self):
-        return []
-
-    def ops_in_model_after(self):
-        return [
-            torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant,
-            torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant,
-        ]
+    return backend, backend2
 
 
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@@ -223,11 +260,8 @@ class TestRmsnormGroupFp8QuantModel(torch.nn.Module):
 @pytest.mark.parametrize("num_tokens", [257])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
 @pytest.mark.parametrize("group_shape", GROUP_SHAPES)
-@pytest.mark.parametrize(
-    "model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op",
-    list(itertools.product([TestModel], [True, False], [True, False]))
-    + [(TestRmsnormGroupFp8QuantModel, False, False)],
-)
+@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
+@pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False])
 # cuda_force_torch used to test torch code path on platforms that
 # cutlass_fp8_supported() == True.
 @pytest.mark.parametrize(
@@ -242,23 +276,13 @@ def test_fusion_rmsnorm_quant(
     num_tokens,
     eps,
     group_shape,
-    model_class,
     enable_rms_norm_custom_op,
     enable_quant_fp8_custom_op,
     cuda_force_torch,
 ):
-    if model_class is TestRmsnormGroupFp8QuantModel and not IS_AITER_FOUND:
-        pytest.skip("AITER is not supported on this GPU.")
-
-    torch.set_default_device("cuda")
-    torch.set_default_dtype(dtype)
-    torch.manual_seed(1)
-    maybe_create_device_identity()  # needed for certain non-cutlass fp8 paths
-
     if not enable_quant_fp8_custom_op and group_shape.is_per_group():
         pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization")
 
-    # Skip test for 64-bit group shape when running with cutlass or deepgemm
     if group_shape == GroupShape(1, 64) and (
         cutlass_block_fp8_supported() or is_deep_gemm_supported()
     ):
@@ -269,6 +293,7 @@ def test_fusion_rmsnorm_quant(
         custom_ops.append("+rms_norm")
     if enable_quant_fp8_custom_op:
         custom_ops.append("+quant_fp8")
+
     vllm_config = VllmConfig(
         model_config=ModelConfig(dtype=dtype),
         compilation_config=CompilationConfig(
@@ -279,60 +304,97 @@ def test_fusion_rmsnorm_quant(
             ),
         ),
     )
-    with vllm.config.set_current_vllm_config(vllm_config):
-        # Reshape pass is needed for the fusion pass to work
-        noop_pass = NoOpEliminationPass(vllm_config)
-        if model_class is TestRmsnormGroupFp8QuantModel:
-            from vllm.compilation.rocm_aiter_fusion import (
-                RocmAiterRMSNormFp8GroupQuantFusionPass,
-            )
 
-            fusion_pass = RocmAiterRMSNormFp8GroupQuantFusionPass(vllm_config)
-        else:
-            fusion_pass = RMSNormQuantFusionPass(vllm_config)
-        cleanup_pass = PostCleanupPass(vllm_config)
-
-        backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
-        backend2 = TestBackend(noop_pass, cleanup_pass)
-        model = model_class(
+    with vllm.config.set_current_vllm_config(vllm_config):
+        # Setup device before model creation
+        torch.set_default_device("cuda")
+        torch.set_default_dtype(dtype)
+        torch.manual_seed(1)
+        maybe_create_device_identity()
+
+        fusion_pass = RMSNormQuantFusionPass(vllm_config)
+        model = TestModel(
             hidden_size=hidden_size,
             eps=eps,
             group_shape=group_shape,
+            use_aiter=False,
             cuda_force_torch=cuda_force_torch,
         )
-        # First dimension dynamic
-        x = torch.rand(num_tokens, hidden_size)
-        torch._dynamo.mark_dynamic(x, 0)
-
-        model_fused = torch.compile(model, backend=backend)
-        result_fused = model_fused(x)
-
-        model_unfused = torch.compile(model, backend=backend2)
-        result_unfused = model_unfused(x)
-
-        if dtype == torch.float16:
-            ATOL, RTOL = (2e-3, 2e-3)
-        else:
-            ATOL, RTOL = (1e-2, 1e-2)
-
-        torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
 
-        assert fusion_pass.matched_count == 3
-        backend.check_before_ops(model.ops_in_model_before())
+        backend, _ = _run_fusion_test(
+            model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens
+        )
         backend.check_before_ops(
             model.ops_in_model_before_partial(), fully_replaced=False
         )
-        backend.check_after_ops(model.ops_in_model_after())
 
         # If RMSNorm custom op is disabled (native/torch impl used),
         # there's a risk that the fused add doesn't get included in the
         # replacement and only the rms part gets fused with quant.
         # Hence, we check only 2 add nodes are left (final fused rmsnorm add).
-        if (
-            not enable_rms_norm_custom_op
-            and model_class is not TestRmsnormGroupFp8QuantModel
-        ):
+        if not enable_rms_norm_custom_op:
             n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
             # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
             assert n_add_nodes(backend.graph_pre_pass) == 7
             assert n_add_nodes(backend.graph_post_pass) == 2
+
+
+GROUP_SHAPE_QUANT_OPS_MATCHS = [
+    (GroupShape.PER_TOKEN, True),
+    (GroupShape.PER_TOKEN, False),
+    (GroupShape(1, 128), True),
+]
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("hidden_size", [256])
+@pytest.mark.parametrize("num_tokens", [257])
+@pytest.mark.parametrize("eps", [1e-5, 1e-6])
+@pytest.mark.parametrize(
+    "group_shape, use_aiter_quant_op", GROUP_SHAPE_QUANT_OPS_MATCHS
+)
+@pytest.mark.skipif(
+    (not current_platform.is_rocm() or not IS_AITER_FOUND),
+    reason="Only test on ROCm with aiter package installed",
+)
+def test_aiter_fusion_rmsnorm_quant(
+    dtype: torch.dtype,
+    hidden_size: int,
+    num_tokens: int,
+    eps: float,
+    group_shape: GroupShape,
+    use_aiter_quant_op: bool,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(dtype=dtype),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=["+rms_norm", "+quant_fp8"],
+            pass_config=PassConfig(fuse_norm_quant=True, eliminate_noops=True),
+        ),
+    )
+
+    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
+        from vllm.compilation.rocm_aiter_fusion import RocmAiterRMSNormFusionPass
+
+        m.setenv("VLLM_ROCM_USE_AITER", "1")
+        rocm_aiter_ops.refresh_env_variables()
+
+        torch.set_default_device("cuda")
+        torch.set_default_dtype(dtype)
+        torch.manual_seed(1)
+        maybe_create_device_identity()
+
+        fusion_pass = RocmAiterRMSNormFusionPass(vllm_config)
+        model = TestModel(
+            hidden_size=hidden_size,
+            eps=eps,
+            group_shape=group_shape,
+            use_aiter=True,
+            use_aiter_quant_op=use_aiter_quant_op,
+        )
+
+        _run_fusion_test(
+            model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens
+        )
diff --git a/tests/compile/untest_fusion_attn.py b/tests/compile/untest_fusion_attn.py
index db95dff5e0fc7dc89b872e4840d8e4d7dec6ff06..a1fd098aee5fc8bb5b7616055402075a5a7169c5 100644
--- a/tests/compile/untest_fusion_attn.py
+++ b/tests/compile/untest_fusion_attn.py
@@ -9,8 +9,6 @@ from tests.compile.backend import LazyInitPass, TestBackend
 from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import Attention
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
@@ -37,6 +35,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backend import AttentionMetadata
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -305,8 +305,12 @@ def test_attention_quant_pattern(
     model_class: type[AttentionQuantPatternModel],
     backend: AttentionBackendEnum,
     dist_init,
+    monkeypatch,
+    use_fresh_inductor_cache,
 ):
     """Test AttentionStaticQuantPattern fusion pass"""
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
     if backend == AttentionBackendEnum.FLASHINFER and (
         not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
     ):
@@ -363,13 +367,15 @@ def test_attention_quant_pattern(
             vllm_config=vllm_config_unfused,
         )
         model_unfused = model_unfused.to(device)
+        result_unfused_0 = model_unfused(q, k, v)  # noqa: F841  HACK: See #131044
 
         forward_ctx = get_forward_context()
         forward_ctx.attn_metadata = model_unfused.build_attn_metadata(batch_size)
 
         # Run model directly without fusion
         # Still compile so query QuantFP8 has closer numerics
-        result_unfused = torch.compile(model_unfused, fullgraph=True)(q, k, v)
+        compiled_unfused = torch.compile(model_unfused, fullgraph=True)
+        result_unfused = compiled_unfused(q, k, v)
 
     # Run model with attn fusion enabled
     vllm_config.compilation_config.pass_config = PassConfig(
@@ -399,24 +405,26 @@ def test_attention_quant_pattern(
         cleanup_pass = PostCleanupPass(vllm_config)
 
         test_backend = TestBackend(noop_pass, attn_pass, cleanup_pass)
+        # HACK: See https://github.com/vllm-project/vllm/issues/31044
+        result_fused_0 = model_fused(q, k, v)  # noqa: F841
 
         # Compile model with fusion enabled
-        model_compiled = torch.compile(
+        compiled_fused = torch.compile(
             model_fused, backend=test_backend, fullgraph=True
         )
-        assert model_compiled.attn._o_scale_float is None
+        assert compiled_fused.attn._o_scale_float is None
 
-        result_fused_1 = model_compiled(q, k, v)
+        result_fused = compiled_fused(q, k, v)
 
         if backend == AttentionBackendEnum.FLASHINFER:
             # With the Flashinfer backend after the 1st round of the forward
             # pass, output quant scale should be loaded into the attn layer's
             # _o_scale_float, the 2nd round should reuse the loaded
             # _o_scale_float
-            assert model_compiled.attn._o_scale_float is not None
-            result_fused_2 = model_compiled(q, k, v)
+            assert compiled_fused.attn._o_scale_float is not None
+            result_fused_2 = compiled_fused(q, k, v)
 
-            assert model_compiled.attn._o_scale_float is not None
+            assert compiled_fused.attn._o_scale_float is not None
 
             torch.testing.assert_close(
                 result_unfused, result_fused_2, atol=1e-2, rtol=1e-2
@@ -474,4 +482,4 @@ def test_attention_quant_pattern(
         )
 
     # Check that results are close
-    torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2)
+    torch.testing.assert_close(result_unfused, result_fused, atol=1e-2, rtol=1e-2)
diff --git a/tests/config/base_model_arch_groundtruth.json b/tests/config/base_model_arch_groundtruth.json
new file mode 100644
index 0000000000000000000000000000000000000000..3401198ad7d56c17b0a97304602989efa5d3ad9f
--- /dev/null
+++ b/tests/config/base_model_arch_groundtruth.json
@@ -0,0 +1,359 @@
+{
+    "state-spaces/mamba-130m-hf": {
+        "architectures": [
+            "MambaForCausalLM"
+        ],
+        "model_type": "mamba",
+        "text_model_type": "mamba",
+        "hidden_size": 768,
+        "total_num_hidden_layers": 24,
+        "total_num_attention_heads": 0,
+        "head_size": 0,
+        "vocab_size": 50280,
+        "total_num_kv_heads": 0,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.float32"
+    },
+    "mistralai/Mamba-Codestral-7B-v0.1": {
+        "architectures": [
+            "Mamba2ForCausalLM"
+        ],
+        "model_type": "mamba",
+        "text_model_type": "mamba",
+        "hidden_size": 4096,
+        "total_num_hidden_layers": 64,
+        "total_num_attention_heads": 0,
+        "head_size": 0,
+        "vocab_size": 32768,
+        "total_num_kv_heads": 0,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11": {
+        "architectures": [
+            "Terratorch"
+        ],
+        "model_type": "timm_wrapper",
+        "text_model_type": "timm_wrapper",
+        "hidden_size": 0,
+        "total_num_hidden_layers": 0,
+        "total_num_attention_heads": 0,
+        "head_size": 0,
+        "vocab_size": 0,
+        "total_num_kv_heads": 0,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": true,
+        "dtype": "torch.float32"
+    },
+    "tiiuae/falcon-mamba-7b-instruct": {
+        "architectures": [
+            "FalconMambaForCausalLM"
+        ],
+        "model_type": "falcon_mamba",
+        "text_model_type": "falcon_mamba",
+        "hidden_size": 4096,
+        "total_num_hidden_layers": 64,
+        "total_num_attention_heads": 0,
+        "head_size": 0,
+        "vocab_size": 65024,
+        "total_num_kv_heads": 0,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "Zyphra/Zamba2-7B-instruct": {
+        "architectures": [
+            "Zamba2ForCausalLM"
+        ],
+        "model_type": "zamba2",
+        "text_model_type": "zamba2",
+        "hidden_size": 3584,
+        "total_num_hidden_layers": 81,
+        "total_num_attention_heads": 32,
+        "head_size": 224,
+        "vocab_size": 32000,
+        "total_num_kv_heads": 32,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "mosaicml/mpt-7b": {
+        "architectures": [
+            "MPTForCausalLM"
+        ],
+        "model_type": "mpt",
+        "text_model_type": "mpt",
+        "hidden_size": 4096,
+        "total_num_hidden_layers": 32,
+        "total_num_attention_heads": 32,
+        "head_size": 128,
+        "vocab_size": 50432,
+        "total_num_kv_heads": 32,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "databricks/dbrx-instruct": {
+        "architectures": [
+            "DbrxForCausalLM"
+        ],
+        "model_type": "dbrx",
+        "text_model_type": "dbrx",
+        "hidden_size": 6144,
+        "total_num_hidden_layers": 40,
+        "total_num_attention_heads": 48,
+        "head_size": 128,
+        "vocab_size": 100352,
+        "total_num_kv_heads": 8,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "tiiuae/falcon-7b": {
+        "architectures": [
+            "FalconForCausalLM"
+        ],
+        "model_type": "falcon",
+        "text_model_type": "falcon",
+        "hidden_size": 4544,
+        "total_num_hidden_layers": 32,
+        "total_num_attention_heads": 71,
+        "head_size": 64,
+        "vocab_size": 65024,
+        "total_num_kv_heads": 1,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "tiiuae/falcon-40b": {
+        "architectures": [
+            "FalconForCausalLM"
+        ],
+        "model_type": "falcon",
+        "text_model_type": "falcon",
+        "hidden_size": 8192,
+        "total_num_hidden_layers": 60,
+        "total_num_attention_heads": 128,
+        "head_size": 64,
+        "vocab_size": 65024,
+        "total_num_kv_heads": 8,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "luccafong/deepseek_mtp_main_random": {
+        "architectures": [
+            "DeepseekV3ForCausalLM"
+        ],
+        "model_type": "deepseek_v3",
+        "text_model_type": "deepseek_v3",
+        "hidden_size": 2560,
+        "total_num_hidden_layers": 5,
+        "total_num_attention_heads": 32,
+        "head_size": 576,
+        "vocab_size": 129280,
+        "total_num_kv_heads": 32,
+        "num_experts": 72,
+        "is_deepseek_mla": true,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "luccafong/deepseek_mtp_draft_random": {
+        "architectures": [
+            "DeepseekV3ForCausalLM"
+        ],
+        "model_type": "deepseek_v3",
+        "text_model_type": "deepseek_v3",
+        "hidden_size": 2560,
+        "total_num_hidden_layers": 10,
+        "total_num_attention_heads": 32,
+        "head_size": 576,
+        "vocab_size": 129280,
+        "total_num_kv_heads": 32,
+        "num_experts": 72,
+        "is_deepseek_mla": true,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "Qwen/Qwen3-Next-80B-A3B-Instruct": {
+        "architectures": [
+            "Qwen3NextForCausalLM"
+        ],
+        "model_type": "qwen3_next",
+        "text_model_type": "qwen3_next",
+        "hidden_size": 2048,
+        "total_num_hidden_layers": 48,
+        "total_num_attention_heads": 16,
+        "head_size": 256,
+        "vocab_size": 151936,
+        "total_num_kv_heads": 2,
+        "num_experts": 512,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "tiny-random/qwen3-next-moe": {
+        "architectures": [
+            "Qwen3NextForCausalLM"
+        ],
+        "model_type": "qwen3_next",
+        "text_model_type": "qwen3_next",
+        "hidden_size": 8,
+        "total_num_hidden_layers": 4,
+        "total_num_attention_heads": 16,
+        "head_size": 32,
+        "vocab_size": 151936,
+        "total_num_kv_heads": 8,
+        "num_experts": 32,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "zai-org/GLM-4.5": {
+        "architectures": [
+            "Glm4MoeForCausalLM"
+        ],
+        "model_type": "glm4_moe",
+        "text_model_type": "glm4_moe",
+        "hidden_size": 5120,
+        "total_num_hidden_layers": 92,
+        "total_num_attention_heads": 96,
+        "head_size": 128,
+        "vocab_size": 151552,
+        "total_num_kv_heads": 8,
+        "num_experts": 160,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "baidu/ERNIE-4.5-21B-A3B-PT": {
+        "architectures": [
+            "Ernie4_5_MoeForCausalLM"
+        ],
+        "model_type": "ernie4_5_moe",
+        "text_model_type": "ernie4_5_moe",
+        "hidden_size": 2560,
+        "total_num_hidden_layers": 28,
+        "total_num_attention_heads": 20,
+        "head_size": 128,
+        "vocab_size": 103424,
+        "total_num_kv_heads": 4,
+        "num_experts": 64,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "lmsys/gpt-oss-20b-bf16": {
+        "architectures": [
+            "GptOssForCausalLM"
+        ],
+        "model_type": "gpt_oss",
+        "text_model_type": "gpt_oss",
+        "hidden_size": 2880,
+        "total_num_hidden_layers": 24,
+        "total_num_attention_heads": 64,
+        "head_size": 64,
+        "vocab_size": 201088,
+        "total_num_kv_heads": 8,
+        "num_experts": 32,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "deepseek-ai/DeepSeek-V3.2-Exp": {
+        "architectures": [
+            "DeepseekV32ForCausalLM"
+        ],
+        "model_type": "deepseek_v32",
+        "text_model_type": "deepseek_v32",
+        "hidden_size": 7168,
+        "total_num_hidden_layers": 61,
+        "total_num_attention_heads": 128,
+        "head_size": 576,
+        "vocab_size": 129280,
+        "total_num_kv_heads": 128,
+        "num_experts": 256,
+        "is_deepseek_mla": true,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct": {
+        "architectures": [
+            "Llama4ForConditionalGeneration"
+        ],
+        "model_type": "llama4",
+        "text_model_type": "llama4_text",
+        "hidden_size": 5120,
+        "total_num_hidden_layers": 48,
+        "total_num_attention_heads": 40,
+        "head_size": 128,
+        "vocab_size": 202048,
+        "total_num_kv_heads": 8,
+        "num_experts": 16,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": true,
+        "dtype": "torch.bfloat16"
+    },
+    "nvidia/Llama-3_3-Nemotron-Super-49B-v1": {
+        "architectures": [
+            "DeciLMForCausalLM"
+        ],
+        "model_type": "nemotron-nas",
+        "text_model_type": "nemotron-nas",
+        "hidden_size": 8192,
+        "total_num_hidden_layers": 80,
+        "total_num_attention_heads": 64,
+        "head_size": 128,
+        "vocab_size": 128256,
+        "total_num_kv_heads": 8,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "XiaomiMiMo/MiMo-7B-RL": {
+        "architectures": [
+            "MiMoForCausalLM"
+        ],
+        "model_type": "mimo",
+        "text_model_type": "mimo",
+        "hidden_size": 4096,
+        "total_num_hidden_layers": 36,
+        "total_num_attention_heads": 32,
+        "head_size": 128,
+        "vocab_size": 151680,
+        "total_num_kv_heads": 8,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "meituan-longcat/LongCat-Flash-Chat": {
+        "architectures": [
+            "LongcatFlashForCausalLM"
+        ],
+        "model_type": "longcat_flash",
+        "text_model_type": "longcat_flash",
+        "hidden_size": 6144,
+        "total_num_hidden_layers": 28,
+        "total_num_attention_heads": 64,
+        "head_size": 576,
+        "vocab_size": 131072,
+        "total_num_kv_heads": 64,
+        "num_experts": 512,
+        "is_deepseek_mla": true,
+        "is_multimodal_model": false,
+        "dtype": "torch.float32"
+    }
+}
diff --git a/tests/config/draft_model_arch_groundtruth.json b/tests/config/draft_model_arch_groundtruth.json
new file mode 100644
index 0000000000000000000000000000000000000000..dfe6f3d39e93b6a1a189d906bde5a1ab26475c31
--- /dev/null
+++ b/tests/config/draft_model_arch_groundtruth.json
@@ -0,0 +1,87 @@
+{
+    "abhigoyal/vllm-medusa-llama-68m-random": {
+        "architectures": [
+            "MedusaModel"
+        ],
+        "model_type": "medusa",
+        "text_model_type": "medusa",
+        "hidden_size": 768,
+        "total_num_hidden_layers": 1,
+        "total_num_attention_heads": 0,
+        "head_size": "Error: integer division or modulo by zero",
+        "vocab_size": 32000,
+        "total_num_kv_heads": 0,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "torch.float32"
+    },
+    "luccafong/deepseek_mtp_draft_random": {
+        "architectures": [
+            "DeepSeekMTPModel"
+        ],
+        "model_type": "deepseek_mtp",
+        "text_model_type": "deepseek_mtp",
+        "hidden_size": 2560,
+        "total_num_hidden_layers": 1,
+        "total_num_attention_heads": 32,
+        "head_size": 576,
+        "vocab_size": 129280,
+        "total_num_kv_heads": 32,
+        "num_experts": 72,
+        "is_deepseek_mla": true,
+        "is_multimodal_model": false,
+        "dtype": "torch.bfloat16"
+    },
+    "eagle618/eagle-deepseek-v3-random": {
+        "architectures": [
+            "EagleDeepSeekMTPModel"
+        ],
+        "model_type": "eagle",
+        "text_model_type": "deepseek_mtp",
+        "hidden_size": 2560,
+        "total_num_hidden_layers": 1,
+        "total_num_attention_heads": 32,
+        "head_size": 576,
+        "vocab_size": 129280,
+        "total_num_kv_heads": 32,
+        "num_experts": 72,
+        "is_deepseek_mla": true,
+        "is_multimodal_model": false,
+        "dtype": "bfloat16"
+    },
+    "yuhuili/EAGLE-LLaMA3-Instruct-8B": {
+        "architectures": [
+            "EagleLlamaForCausalLM"
+        ],
+        "model_type": "eagle",
+        "text_model_type": "llama",
+        "hidden_size": 4096,
+        "total_num_hidden_layers": 1,
+        "total_num_attention_heads": 32,
+        "head_size": 128,
+        "vocab_size": 128256,
+        "total_num_kv_heads": 8,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "float16"
+    },
+    "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B": {
+        "architectures": [
+            "Eagle3LlamaForCausalLM"
+        ],
+        "model_type": "eagle",
+        "text_model_type": "llama",
+        "hidden_size": 4096,
+        "total_num_hidden_layers": 1,
+        "total_num_attention_heads": 32,
+        "head_size": 128,
+        "vocab_size": 128256,
+        "total_num_kv_heads": 8,
+        "num_experts": 0,
+        "is_deepseek_mla": false,
+        "is_multimodal_model": false,
+        "dtype": "float16"
+    }
+}
diff --git a/tests/config/test_model_arch_config.py b/tests/config/test_model_arch_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..06d4c6e7a8655f9167ba3302ebe66bf5d3bd5334
--- /dev/null
+++ b/tests/config/test_model_arch_config.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for ModelArchitectureConfig and its integration with ModelConfig."""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from vllm.config import ModelConfig, ParallelConfig, SpeculativeConfig
+from vllm.transformers_utils.model_arch_config_convertor import (
+    ModelArchConfigConvertorBase,
+)
+
+BASE_TRUST_REMOTE_CODE_MODELS = {
+    "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+    "XiaomiMiMo/MiMo-7B-RL",
+    # Excluded: Not available online right now
+    # "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1",
+    "meituan-longcat/LongCat-Flash-Chat",
+}
+
+BASE_MODELS_TO_TEST = [
+    "state-spaces/mamba-130m-hf",
+    "mistralai/Mamba-Codestral-7B-v0.1",
+    # Excluded: terratorch/torchgeo version mismatch in CPU CI environment
+    # (NonGeoDataset import error). Tested in model initialization tests.
+    # "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
+    "Zyphra/Zamba2-7B-instruct",
+    # FIXME: mosaicml/mpt-7b has been deleted
+    # "mosaicml/mpt-7b",
+    # FIXME: databricks/dbrx-instruct has been deleted
+    # "databricks/dbrx-instruct",
+    "tiiuae/falcon-7b",
+    "tiiuae/falcon-40b",
+    "luccafong/deepseek_mtp_main_random",
+    "Qwen/Qwen3-Next-80B-A3B-Instruct",
+    "tiny-random/qwen3-next-moe",
+    "zai-org/GLM-4.5",
+    "baidu/ERNIE-4.5-21B-A3B-PT",
+    # Models using base convertor
+    "lmsys/gpt-oss-20b-bf16",
+    "deepseek-ai/DeepSeek-V3.2-Exp",
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+] + list(BASE_TRUST_REMOTE_CODE_MODELS)
+
+# (target_model, draft_model, trust_remote_code)
+SPECULATIVE_MODELS = [
+    ("JackFram/llama-68m", "abhigoyal/vllm-medusa-llama-68m-random", False),
+    ("luccafong/deepseek_mtp_main_random", "luccafong/deepseek_mtp_draft_random", True),
+    ("eagle618/deepseek-v3-random", "eagle618/eagle-deepseek-v3-random", True),
+    ("meta-llama/Meta-Llama-3-8B-Instruct", "yuhuili/EAGLE-LLaMA3-Instruct-8B", True),
+    ("meta-llama/Llama-3.1-8B-Instruct", "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", True),
+]
+
+
+def _load_groundtruth(filename: str) -> dict:
+    """Load groundtruth JSON from the test directory."""
+    groundtruth_path = Path(__file__).parent / filename
+    with open(groundtruth_path) as f:
+        return json.load(f)
+
+
+def _assert_model_arch_config(
+    model_config, expected: dict, check_head_size: bool = True
+):
+    """Assert model_arch_config matches expected values."""
+    model_arch_config = model_config.model_arch_config
+    assert model_arch_config.architectures == expected["architectures"]
+    assert model_arch_config.model_type == expected["model_type"]
+    assert model_arch_config.text_model_type == expected["text_model_type"]
+    assert model_arch_config.hidden_size == expected["hidden_size"]
+    assert (
+        model_arch_config.total_num_hidden_layers == expected["total_num_hidden_layers"]
+    )
+    assert (
+        model_arch_config.total_num_attention_heads
+        == expected["total_num_attention_heads"]
+    )
+    assert model_arch_config.vocab_size == expected["vocab_size"]
+    assert model_arch_config.total_num_kv_heads == expected["total_num_kv_heads"]
+    assert model_arch_config.num_experts == expected["num_experts"]
+    assert model_arch_config.is_deepseek_mla == expected["is_deepseek_mla"]
+
+    torch_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
+        model_config.hf_config, model_config.model, revision=model_config.revision
+    )
+    assert str(torch_dtype) == expected["dtype"]
+
+    if check_head_size:
+        assert model_arch_config.head_size == expected["head_size"]
+
+
+def _assert_model_config_methods(
+    model_config, expected: dict, check_head_size: bool = True
+):
+    """Assert model_config methods return expected values."""
+    assert model_config.architectures == expected["architectures"]
+    assert model_config.get_vocab_size() == expected["vocab_size"]
+    assert model_config.get_hidden_size() == expected["hidden_size"]
+    assert model_config.get_total_num_kv_heads() == expected["total_num_kv_heads"]
+    assert model_config.get_num_experts() == expected["num_experts"]
+    assert (
+        model_config.get_total_num_hidden_layers()
+        == expected["total_num_hidden_layers"]
+    )
+
+    if check_head_size:
+        assert model_config.get_head_size() == expected["head_size"]
+
+
+@pytest.mark.parametrize("model", BASE_MODELS_TO_TEST)
+def test_base_model_arch_config(model: str):
+    """Test model architecture config for base models."""
+    groundtruth = _load_groundtruth("base_model_arch_groundtruth.json")
+    expected = groundtruth[model]
+
+    model_config = ModelConfig(
+        model, trust_remote_code=model in BASE_TRUST_REMOTE_CODE_MODELS
+    )
+
+    _assert_model_arch_config(model_config, expected)
+    _assert_model_config_methods(model_config, expected)
+
+
+@pytest.mark.parametrize(
+    "target_model,draft_model,trust_remote_code", SPECULATIVE_MODELS
+)
+def test_draft_model_arch_config(
+    target_model: str, draft_model: str, trust_remote_code: bool
+):
+    """Test model architecture config for draft/speculative models."""
+    groundtruth = _load_groundtruth("draft_model_arch_groundtruth.json")
+    expected = groundtruth[draft_model]
+
+    target_model_config = ModelConfig(target_model, trust_remote_code=trust_remote_code)
+    speculative_config = SpeculativeConfig(
+        model=draft_model,
+        num_speculative_tokens=1,
+        target_model_config=target_model_config,
+        target_parallel_config=ParallelConfig(),
+    )
+    model_config = speculative_config.draft_model_config
+
+    # For medusa models, head_size may cause division by zero before
+    # model_arch_config was introduced, so we conditionally check it
+    check_head_size = isinstance(expected["head_size"], int)
+
+    _assert_model_arch_config(model_config, expected, check_head_size=check_head_size)
+    _assert_model_config_methods(
+        model_config, expected, check_head_size=check_head_size
+    )
diff --git a/tests/config/test_multimodal_config.py b/tests/config/test_multimodal_config.py
index 3d02893e52f1e82d4729e8d0b52395e382fff484..51bf938785e517c5dc25e1f439119a29b61a7a0c 100644
--- a/tests/config/test_multimodal_config.py
+++ b/tests/config/test_multimodal_config.py
@@ -3,8 +3,8 @@
 
 import pytest
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.multimodal import MultiModalConfig
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 def test_mm_encoder_attn_backend_str_conversion():
diff --git a/tests/conftest.py b/tests/conftest.py
index 97e3cd7f9e2b39261a19db2f25937eb047f5ede0..3b408be1c2ea3fb425ea18925acd02dd7af0c8e3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -47,7 +47,11 @@ from transformers import (
 )
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
-from tests.models.utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs
+from tests.models.utils import (
+    TokensTextLogprobs,
+    TokensTextLogprobsPromptLogprobs,
+    softmax,
+)
 from vllm import LLM, SamplingParams, envs
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
@@ -189,6 +193,17 @@ def dist_init():
     cleanup_dist_env_and_memory()
 
 
+@pytest.fixture
+def default_vllm_config():
+    """Set a default VllmConfig for tests that directly test CustomOps or pathways
+    that use get_current_vllm_config() outside of a full engine context.
+    """
+    from vllm.config import VllmConfig, set_current_vllm_config
+
+    with set_current_vllm_config(VllmConfig()):
+        yield
+
+
 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
     """Allow subdirectories to skip global cleanup by overriding this fixture.
@@ -414,7 +429,7 @@ class HfRunner:
 
         # don't put this import at the top level
         # it will call torch.cuda.device_count()
-        from transformers import AutoProcessor  # noqa: F401
+        from transformers import AutoProcessor
 
         self.processor = AutoProcessor.from_pretrained(
             model_name,
@@ -517,7 +532,7 @@ class HfRunner:
             elif problem_type == "multi_label_classification":
                 logits = output.logits.sigmoid()[0].tolist()
             else:
-                logits = output.logits.softmax(dim=-1)[0].tolist()
+                logits = softmax(output.logits)[0].tolist()
             outputs.append(logits)
 
         return outputs
@@ -685,6 +700,7 @@ class HfRunner:
         images: PromptImageInput | None = None,
         audios: PromptAudioInput | None = None,
         videos: PromptVideoInput | None = None,
+        use_cache: bool = True,
         **kwargs: Any,
     ) -> list[TokensTextLogprobs]:
         all_inputs = self.get_inputs(
@@ -698,7 +714,7 @@ class HfRunner:
         for inputs in all_inputs:
             output: "GenerateOutput" = self.model.generate(
                 **self.wrap_device(inputs),
-                use_cache=True,
+                use_cache=use_cache,
                 do_sample=False,
                 max_new_tokens=max_tokens,
                 output_hidden_states=True,
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index aa47f28a34dd56ed9fbdbdbff8860b1981550d62..a28630921771918e4dc3abb3143883fa4da7c6e8 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -219,14 +219,12 @@ def _test_cp_gsm8k(
         ]
     )
 
-    server_env = {}
     if attn_backend:
-        server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
+        server_args.append(f"--attention-backend={attn_backend}")
 
     with RemoteOpenAIServer(
         model_id,
         server_args,
-        env_dict=server_env,
         max_wait_seconds=720,
     ) as remote_server:
         host = f"http://{remote_server.host}"
diff --git a/tests/distributed/test_eplb_algo.py b/tests/distributed/test_eplb_algo.py
index a53a61840e79ea4db36a2c47d3f464eef60816a2..6fe44fc218016b5798799acc4944c8895ca33f02 100644
--- a/tests/distributed/test_eplb_algo.py
+++ b/tests/distributed/test_eplb_algo.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import numpy as np
 import pytest
 import torch
 
@@ -310,3 +311,143 @@ if __name__ == "__main__":
     print(phy2log)
 
     test_basic_rebalance()
+
+
+def _make_phy_replicas_idx_from_phy2log(phy2log: np.ndarray) -> np.ndarray:
+    """Create replicas indices mapping from phy2log."""
+    pr = np.zeros_like(phy2log, dtype=np.int64)
+    for layer in range(phy2log.shape[0]):
+        seen: dict[int, int] = {}
+        row = phy2log[layer].tolist()
+        for i, expert in enumerate(row):
+            r = seen.get(expert, 0)
+            pr[layer, i] = r
+            seen[expert] = r + 1
+    return pr
+
+
+def _validate_intragpu_rearrangement(
+    old_global_expert_indices: np.ndarray,
+    new_phy2log: np.ndarray,
+    new_phy_replicas_idx: np.ndarray,
+    post_phy2log: np.ndarray,
+    post_phy_replicas_idx: np.ndarray,
+    num_ranks: int,
+    slots_per_gpu: int,
+):
+    # Per-GPU checks
+    for gpu_idx in range(num_ranks):
+        start = gpu_idx * slots_per_gpu
+        end = start + slots_per_gpu
+        old_seg = old_global_expert_indices[0, start:end]
+        new_seg = new_phy2log[0, start:end]
+        new_rnk = new_phy_replicas_idx[0, start:end]
+        post_seg = post_phy2log[0, start:end]
+        post_rnk = post_phy_replicas_idx[0, start:end]
+
+        # Pairwise equality for (expert, rank) pairs to ensure nothing is lost
+        def sorted_pairs(seg, rnk):
+            pairs = list(zip(seg.tolist(), rnk.tolist()))
+            pairs.sort()
+            return pairs
+
+        assert sorted_pairs(post_seg, post_rnk) == sorted_pairs(new_seg, new_rnk), (
+            f"Per-GPU pairs of (expert,rank) must match new mapping for GPU {gpu_idx}"
+        )
+
+        # For experts that remain on the same GPU, the old slot is preserved
+        # for at least one occurrence; rank at that slot must be valid for that expert
+        old_list = old_seg.tolist()
+        new_list = new_seg.tolist()
+        post_list = post_seg.tolist()
+        remained = set(old_list) & set(new_list)
+        new_ranks_for_expert: dict[int, list[int]] = {}
+        for v, r in zip(new_list, new_rnk.tolist()):
+            new_ranks_for_expert.setdefault(v, []).append(r)
+        for expert in remained:
+            old_pos = old_list.index(expert)
+            assert post_list[old_pos] == expert, (
+                f"Expert {expert} on GPU {gpu_idx} should stay at old slot {old_pos}"
+            )
+            # Rank at preserved slot must be one of the ranks
+            # the expert has in new mapping
+            assert post_rnk.tolist()[old_pos] in new_ranks_for_expert[expert], (
+                f"Rank for expert {expert} at preserved slot on GPU {gpu_idx} "
+                "must come from new mapping"
+            )
+
+
+@pytest.mark.parametrize(
+    "num_ranks, slots_per_gpu, old_phy2log, new_phy2log",
+    [
+        pytest.param(
+            # Setup: 2 GPUs, 4 slots each, 1 layer
+            # Old mapping: GPU0 -> [0,1,2,3], GPU1 -> [4,5,6,7]
+            # New mapping shuffles within GPU0 and brings 4,5 into GPU0.
+            # GPU0 new -> [1,5,0,4]; GPU1 new -> [6,2,7,3]
+            2,
+            4,
+            np.array([[0, 1, 2, 3, 4, 5, 6, 7]]),
+            np.array([[1, 5, 0, 4, 6, 2, 7, 3]]),
+            id="simple",
+        ),
+        pytest.param(
+            # Setup: 2 GPUs, 5 slots each (total 10 physical experts), 1 layer
+            # Old mapping:
+            #   GPU0 -> [0, 1, 0, 2, 3]  (expert 0 duplicated)
+            #   GPU1 -> [4, 5, 6, 1, 2]
+            # New mapping reorders within GPUs and moves some experts across GPUs,
+            # while still including duplicates:
+            #   GPU0 new -> [0, 5, 4, 0, 1]  (expert 0 duplicated, 4/5 incoming)
+            #   GPU1 new -> [6, 2, 3, 2, 1]  (expert 2 duplicated)
+            2,
+            5,
+            np.array([[0, 1, 0, 2, 3, 4, 5, 6, 1, 2]]),
+            np.array([[0, 5, 4, 0, 1, 6, 2, 3, 2, 1]]),
+            id="duplicates",
+        ),
+        pytest.param(
+            # Setup: 3 GPUs, 4 slots each (total 12 physical experts), 1 layer
+            # Old mapping:
+            #   GPU0 -> [0, 1, 2, 3]
+            #   GPU1 -> [0, 1, 2, 3]
+            #   GPU2 -> [0, 1, 2, 3]
+            # New mapping decides to use one expert on 2 GPUs and shuffles
+            # experts on the third GPU,
+            #   GPU0 new -> [0, 0, 0, 0]
+            #   GPU1 new -> [0, 0, 0, 0]
+            #   GPU2 new -> [1, 2, 3, 0]
+            3,
+            4,
+            np.array([[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]]),
+            np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0]]),
+            id="skewed_expert",
+        ),
+    ],
+)
+def test_preserve_intragpu_slots(
+    num_ranks: int,
+    slots_per_gpu: int,
+    old_phy2log: torch.Tensor,
+    new_phy2log: torch.Tensor,
+):
+    """Experts that stay on a GPU keep their old slots; incoming not lost."""
+    phy_replicas_idx = _make_phy_replicas_idx_from_phy2log(new_phy2log)
+
+    post_phy2log, post_phy_replicas_idx = DefaultEplbPolicy.preserve_intragpu_slots(
+        new_phy2log, phy_replicas_idx, num_ranks, old_phy2log
+    )
+
+    # Shapes preserved
+    assert post_phy2log.shape == new_phy2log.shape
+    assert post_phy_replicas_idx.shape == phy_replicas_idx.shape
+
+    _validate_intragpu_rearrangement(
+        old_phy2log,
+        new_phy2log,
+        phy_replicas_idx,
+        post_phy2log,
+        post_phy_replicas_idx,
+        num_ranks,
+        slots_per_gpu,
+    )
diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py
index 781dfd44c1ef61c27a0d23817095da593887caef..f8f950084a51115ad52d54a108eeb821adde37a2 100644
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -286,15 +286,17 @@ def _test_async_transfer_layer_without_mtp_worker(
         device,
         old_indices,
     )
+    old_indices_cpu = old_indices.cpu()
+    new_indices_cpu = new_indices.cpu()
 
     expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
     cuda_stream = torch.cuda.Stream(device=device)
 
     for layer_idx in range(num_layers):
-        is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
+        is_unchanged, is_received_locally, recv_metadata = asyncio.run(
             transfer_layer(
-                old_global_expert_indices=old_indices,
-                new_global_expert_indices=new_indices,
+                old_global_expert_indices=old_indices_cpu,
+                new_global_expert_indices=new_indices_cpu,
                 expert_weights=expert_weights,
                 expert_weights_buffer=expert_buffer,
                 ep_group=ep_group,
@@ -302,16 +304,15 @@ def _test_async_transfer_layer_without_mtp_worker(
                 cuda_stream=cuda_stream,
             )
         )
-
         cuda_stream.synchronize()
         move_from_buffer(
             expert_weights=expert_weights[layer_idx],
-            expert_weights_buffer=expert_buffer,
+            expert_weights_buffers=expert_buffer,
             is_unchanged=is_unchanged,
             is_received_locally=is_received_locally,
-            experts_recv_loc=experts_recv_loc,
-            new_indices=new_indices[layer_idx].tolist(),
-            ep_group=ep_group,
+            recv_metadata=recv_metadata,
+            new_indices=new_indices_cpu[layer_idx].numpy(),
+            ep_rank=ep_rank,
         )
 
     verify_expert_weights_after_shuffle(
diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py
index e0fedc5d74e458f82edaef09fe21d0feab7f2843..0f12361fd5e798f7bb5fe3119e03682f7160514f 100644
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -21,23 +21,21 @@ from ..utils import compare_two_settings, create_new_process_for_each_test, mode
 )
 @create_new_process_for_each_test()
 def test_pp_cudagraph(
-    monkeypatch: pytest.MonkeyPatch,
     PP_SIZE: int,
     MODEL_NAME: str,
     ATTN_BACKEND: LiteralString,
 ):
-    with monkeypatch.context() as m:
-        cudagraph_args = [
-            # use half precision for speed and memory savings in CI environment
-            "--dtype",
-            "float16",
-            "--pipeline-parallel-size",
-            str(PP_SIZE),
-            "--distributed-executor-backend",
-            "mp",
-        ]
-        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
+    cudagraph_args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--pipeline-parallel-size",
+        str(PP_SIZE),
+        "--distributed-executor-backend",
+        "mp",
+        f"--attention-backend={ATTN_BACKEND}",
+    ]
 
-        eager_args = cudagraph_args + ["--enforce-eager"]
+    eager_args = cudagraph_args + ["--enforce-eager"]
 
-        compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
+    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index c2cf77ffa12b66a5f22c2ef8d512589e4078465e..2acb38bc9a18615e9721de0ae51492510788aa83 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -9,7 +9,7 @@ from typing import Annotated, Literal
 
 import pytest
 
-from vllm.config import CompilationConfig, config
+from vllm.config import AttentionConfig, CompilationConfig, config
 from vllm.engine.arg_utils import (
     EngineArgs,
     contains_type,
@@ -298,6 +298,139 @@ def test_compilation_config():
     )
 
 
+def test_attention_config():
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+
+    # default value
+    args = parser.parse_args([])
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_config == AttentionConfig()
+
+    # set backend via dot notation
+    args = parser.parse_args(["--attention-config.backend", "FLASH_ATTN"])
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_config.backend is not None
+    assert engine_args.attention_config.backend.name == "FLASH_ATTN"
+
+    # set backend via --attention-backend shorthand
+    args = parser.parse_args(["--attention-backend", "FLASHINFER"])
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_backend is not None
+    assert engine_args.attention_backend == "FLASHINFER"
+
+    # set all fields via dot notation
+    args = parser.parse_args(
+        [
+            "--attention-config.backend",
+            "FLASH_ATTN",
+            "--attention-config.flash_attn_version",
+            "3",
+            "--attention-config.use_prefill_decode_attention",
+            "true",
+            "--attention-config.flash_attn_max_num_splits_for_cuda_graph",
+            "16",
+            "--attention-config.use_cudnn_prefill",
+            "true",
+            "--attention-config.use_trtllm_ragged_deepseek_prefill",
+            "true",
+            "--attention-config.use_trtllm_attention",
+            "true",
+            "--attention-config.disable_flashinfer_prefill",
+            "true",
+            "--attention-config.disable_flashinfer_q_quantization",
+            "true",
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_config.backend is not None
+    assert engine_args.attention_config.backend.name == "FLASH_ATTN"
+    assert engine_args.attention_config.flash_attn_version == 3
+    assert engine_args.attention_config.use_prefill_decode_attention is True
+    assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 16
+    assert engine_args.attention_config.use_cudnn_prefill is True
+    assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is True
+    assert engine_args.attention_config.use_trtllm_attention is True
+    assert engine_args.attention_config.disable_flashinfer_prefill is True
+    assert engine_args.attention_config.disable_flashinfer_q_quantization is True
+
+    # set to string form of a dict with all fields
+    args = parser.parse_args(
+        [
+            "--attention-config="
+            '{"backend": "FLASHINFER", "flash_attn_version": 2, '
+            '"use_prefill_decode_attention": false, '
+            '"flash_attn_max_num_splits_for_cuda_graph": 8, '
+            '"use_cudnn_prefill": false, '
+            '"use_trtllm_ragged_deepseek_prefill": false, '
+            '"use_trtllm_attention": false, '
+            '"disable_flashinfer_prefill": false, '
+            '"disable_flashinfer_q_quantization": false}',
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_config.backend is not None
+    assert engine_args.attention_config.backend.name == "FLASHINFER"
+    assert engine_args.attention_config.flash_attn_version == 2
+    assert engine_args.attention_config.use_prefill_decode_attention is False
+    assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 8
+    assert engine_args.attention_config.use_cudnn_prefill is False
+    assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is False
+    assert engine_args.attention_config.use_trtllm_attention is False
+    assert engine_args.attention_config.disable_flashinfer_prefill is False
+    assert engine_args.attention_config.disable_flashinfer_q_quantization is False
+
+    # test --attention-backend flows into VllmConfig.attention_config
+    args = parser.parse_args(
+        [
+            "--model",
+            "facebook/opt-125m",
+            "--attention-backend",
+            "FLASH_ATTN",
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    vllm_config = engine_args.create_engine_config()
+    assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASH_ATTN
+
+    # test --attention-config.backend flows into VllmConfig.attention_config
+    args = parser.parse_args(
+        [
+            "--model",
+            "facebook/opt-125m",
+            "--attention-config.backend",
+            "FLASHINFER",
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    vllm_config = engine_args.create_engine_config()
+    assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASHINFER
+
+    # test --attention-backend and --attention-config.backend are mutually exclusive
+    args = parser.parse_args(
+        [
+            "--model",
+            "facebook/opt-125m",
+            "--attention-backend",
+            "FLASH_ATTN",
+            "--attention-config.backend",
+            "FLASHINFER",
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    with pytest.raises(ValueError, match="mutually exclusive"):
+        engine_args.create_engine_config()
+
+
 def test_prefix_cache_default():
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
     args = parser.parse_args([])
@@ -378,6 +511,16 @@ def test_human_readable_model_len():
     args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
     assert args.max_model_len == 10212345123456
 
+    # Special value -1 for auto-fit to GPU memory
+    args = parser.parse_args(["--max-model-len", "-1"])
+    assert args.max_model_len == -1
+
+    # 'auto' is an alias for -1
+    args = parser.parse_args(["--max-model-len", "auto"])
+    assert args.max_model_len == -1
+    args = parser.parse_args(["--max-model-len", "AUTO"])
+    assert args.max_model_len == -1
+
     # Invalid (do not allow decimals with binary multipliers)
     for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
         with pytest.raises(ArgumentError):
diff --git a/tests/tpu/lora/__init__.py b/tests/entrypoints/instrumentator/__init__.py
similarity index 100%
rename from tests/tpu/lora/__init__.py
rename to tests/entrypoints/instrumentator/__init__.py
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/instrumentator/test_metrics.py
similarity index 99%
rename from tests/entrypoints/openai/test_metrics.py
rename to tests/entrypoints/instrumentator/test_metrics.py
index f3feb78fda58ed82102336b800fee24130decb84..83a3448a2280e3407d5b267ccce67e0e175579a0 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/instrumentator/test_metrics.py
@@ -15,9 +15,9 @@ import requests
 from prometheus_client.parser import text_string_to_metric_families
 from transformers import AutoTokenizer
 
+from tests.conftest import LocalAssetServer
+from tests.utils import RemoteOpenAIServer
 from vllm import version
-
-from ...conftest import LocalAssetServer
 from ...utils import RemoteOpenAIServer, models_path_prefix
 
 MODELS = {
diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py
index b40079d8dc3d5d380208e4d26bd7708efb005e6f..098a9a72325ba8190db23304778f63aeb1f9e0b1 100644
--- a/tests/entrypoints/openai/conftest.py
+++ b/tests/entrypoints/openai/conftest.py
@@ -5,6 +5,30 @@ import pytest
 from vllm.assets.audio import AudioAsset
 
 
+def add_attention_backend(server_args, attention_config):
+    """Append attention backend CLI arg if specified.
+
+    Args:
+        server_args: List of server arguments to extend in-place.
+        attention_config: Dict with 'backend' key, or None.
+    """
+    if attention_config and "backend" in attention_config:
+        server_args.extend(["--attention-backend", attention_config["backend"]])
+
+
+@pytest.fixture(scope="module")
+def rocm_aiter_fa_attention():
+    """Return attention config for transcription/translation tests on ROCm.
+
+    On ROCm, audio tests require ROCM_AITER_FA attention backend.
+    """
+    from vllm.platforms import current_platform
+
+    if current_platform.is_rocm():
+        return {"backend": "ROCM_AITER_FA"}
+    return None
+
+
 @pytest.fixture
 def mary_had_lamb():
     path = AudioAsset("mary_had_lamb").get_local_path()
diff --git a/tests/v1/tpu/__init__.py b/tests/entrypoints/openai/responses/__init__.py
similarity index 100%
rename from tests/v1/tpu/__init__.py
rename to tests/entrypoints/openai/responses/__init__.py
diff --git a/tests/entrypoints/openai/test_responses_error.py b/tests/entrypoints/openai/responses/test_errors.py
similarity index 100%
rename from tests/entrypoints/openai/test_responses_error.py
rename to tests/entrypoints/openai/responses/test_errors.py
diff --git a/tests/entrypoints/openai/test_responses_function_call_parsing.py b/tests/entrypoints/openai/responses/test_function_call_parsing.py
similarity index 100%
rename from tests/entrypoints/openai/test_responses_function_call_parsing.py
rename to tests/entrypoints/openai/responses/test_function_call_parsing.py
diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/responses/test_harmony.py
similarity index 71%
rename from tests/entrypoints/openai/test_response_api_with_harmony.py
rename to tests/entrypoints/openai/responses/test_harmony.py
index 6f2a50020699cfb1363d4ef69f1f1b9beae3b309..2a942b9d8fc3d70e1fae05011313ceb520e82718 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/responses/test_harmony.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import importlib
+import importlib.util
 import json
 import time
 
@@ -12,7 +12,7 @@ from openai_harmony import (
     Message,
 )
 
-from ...utils import RemoteOpenAIServer
+from ....utils import RemoteOpenAIServer
 
 MODEL_NAME = "openai/gpt-oss-20b"
 
@@ -43,6 +43,8 @@ def server():
     env_dict = dict(
         VLLM_ENABLE_RESPONSES_API_STORE="1",
         PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
+        VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS="code_interpreter,container,web_search_preview",
+        VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS="1",
     )
 
     with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
@@ -503,7 +505,11 @@ async def test_web_search(client: OpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_code_interpreter(client: OpenAI, model_name: str):
-    response = await client.responses.create(
+    # Code interpreter may need more time for container init + code execution
+    timeout_value = client.timeout * 3
+    client_with_timeout = client.with_options(timeout=timeout_value)
+
+    response = await client_with_timeout.responses.create(
         model=model_name,
         # TODO: Ideally should be able to set max tool calls
         # to prevent multi-turn, but it is not currently supported
@@ -815,16 +821,20 @@ async def test_function_calling_with_stream(client: OpenAI, model_name: str):
                 final_tool_calls_named[tool_call.name] = tool_call
         elif event.type == "response.function_call_arguments.done":
             assert event.arguments == final_tool_calls_named[event.name].arguments
-    for tool_call in final_tool_calls.values():
-        if (
-            tool_call
-            and tool_call.type == "function_call"
-            and tool_call.name == "get_weather"
-        ):
-            args = json.loads(tool_call.arguments)
-            result = call_function(tool_call.name, args)
-            input_list += [tool_call]
+    result = None
+    tool_call = None
+    for tc in final_tool_calls.values():
+        if tc and tc.type == "function_call" and tc.name == "get_weather":
+            args = json.loads(tc.arguments)
+            result = call_function(tc.name, args)
+            tool_call = tc
+            input_list += [tc]
             break
+
+    assert tool_call is not None, (
+        "Expected model to call 'get_weather' function, "
+        f"but got: {list(final_tool_calls_named.keys())}"
+    )
     assert result is not None
     response = await client.responses.create(
         model=model_name,
@@ -850,6 +860,237 @@ async def test_function_calling_with_stream(client: OpenAI, model_name: str):
             assert event.response.output_text is not None
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_function_calling_no_code_interpreter_events(
+    client: OpenAI, model_name: str
+):
+    """Verify that function calls don't trigger code_interpreter events.
+
+    This test ensures that function calls (functions.*) use their own
+    function_call event types and don't incorrectly emit code_interpreter
+    events during streaming.
+    """
+    tools = [GET_WEATHER_SCHEMA]
+    input_list = [
+        {
+            "role": "user",
+            "content": "What's the weather like in Paris today?",
+        }
+    ]
+    stream_response = await client.responses.create(
+        model=model_name,
+        input=input_list,
+        tools=tools,
+        stream=True,
+    )
+
+    # Track which event types we see
+    event_types_seen = set()
+    function_call_found = False
+
+    async for event in stream_response:
+        event_types_seen.add(event.type)
+
+        if (
+            event.type == "response.output_item.added"
+            and event.item.type == "function_call"
+        ):
+            function_call_found = True
+
+        # Ensure NO code_interpreter events are emitted for function calls
+        assert "code_interpreter" not in event.type, (
+            "Found code_interpreter event "
+            f"'{event.type}' during function call. Function calls should only "
+            "emit function_call events, not code_interpreter events."
+        )
+
+    # Verify we actually saw a function call
+    assert function_call_found, "Expected to see a function_call in the stream"
+
+    # Verify we saw the correct function call event types
+    assert (
+        "response.function_call_arguments.delta" in event_types_seen
+        or "response.function_call_arguments.done" in event_types_seen
+    ), "Expected to see function_call_arguments events"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_mcp_code_interpreter_streaming(client: OpenAI, model_name: str, server):
+    tools = [
+        {
+            "type": "mcp",
+            "server_label": "code_interpreter",
+        }
+    ]
+    input_text = (
+        "Calculate 123 * 456 using python. "
+        "The python interpreter is not stateful and you must print to see the output."
+    )
+
+    stream_response = await client.responses.create(
+        model=model_name,
+        input=input_text,
+        tools=tools,
+        stream=True,
+        temperature=0.0,
+        instructions=(
+            "You must use the Python tool to execute code. Never simulate execution."
+        ),
+    )
+
+    mcp_call_added = False
+    mcp_call_in_progress = False
+    mcp_arguments_delta_seen = False
+    mcp_arguments_done = False
+    mcp_call_completed = False
+    mcp_item_done = False
+
+    code_interpreter_events_seen = False
+
+    async for event in stream_response:
+        if "code_interpreter" in event.type:
+            code_interpreter_events_seen = True
+
+        if event.type == "response.output_item.added":
+            if hasattr(event.item, "type") and event.item.type == "mcp_call":
+                mcp_call_added = True
+                assert event.item.name == "python"
+                assert event.item.server_label == "code_interpreter"
+
+        elif event.type == "response.mcp_call.in_progress":
+            mcp_call_in_progress = True
+
+        elif event.type == "response.mcp_call_arguments.delta":
+            mcp_arguments_delta_seen = True
+            assert event.delta is not None
+
+        elif event.type == "response.mcp_call_arguments.done":
+            mcp_arguments_done = True
+            assert event.name == "python"
+            assert event.arguments is not None
+
+        elif event.type == "response.mcp_call.completed":
+            mcp_call_completed = True
+
+        elif (
+            event.type == "response.output_item.done"
+            and hasattr(event.item, "type")
+            and event.item.type == "mcp_call"
+        ):
+            mcp_item_done = True
+            assert event.item.name == "python"
+            assert event.item.status == "completed"
+
+    assert mcp_call_added, "MCP call was not added"
+    assert mcp_call_in_progress, "MCP call in_progress event not seen"
+    assert mcp_arguments_delta_seen, "MCP arguments delta event not seen"
+    assert mcp_arguments_done, "MCP arguments done event not seen"
+    assert mcp_call_completed, "MCP call completed event not seen"
+    assert mcp_item_done, "MCP item done event not seen"
+
+    assert not code_interpreter_events_seen, (
+        "Should not see code_interpreter events when using MCP type"
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_mcp_tool_multi_turn(client: OpenAI, model_name: str, server):
+    """Test MCP tool calling across multiple turns.
+
+    This test verifies that MCP tools work correctly in multi-turn conversations,
+    maintaining state across turns via the previous_response_id mechanism.
+    """
+    tools = [
+        {
+            "type": "mcp",
+            "server_label": "code_interpreter",
+        }
+    ]
+
+    # First turn - make a calculation
+    response1 = await client.responses.create(
+        model=model_name,
+        input="Calculate 123 * 456 using python and print the result.",
+        tools=tools,
+        temperature=0.0,
+        instructions=(
+            "You must use the Python tool to execute code. Never simulate execution."
+        ),
+        extra_body={"enable_response_messages": True},
+    )
+
+    assert response1 is not None
+    assert response1.status == "completed"
+
+    # Verify MCP call in first response by checking output_messages
+    tool_call_found = False
+    tool_response_found = False
+    for message in response1.output_messages:
+        recipient = message.get("recipient")
+        if recipient and recipient.startswith("python"):
+            tool_call_found = True
+
+        author = message.get("author", {})
+        if (
+            author.get("role") == "tool"
+            and author.get("name")
+            and author.get("name").startswith("python")
+        ):
+            tool_response_found = True
+
+    # Verify MCP tools were actually used
+    assert tool_call_found, "MCP tool call not found in output_messages"
+    assert tool_response_found, "MCP tool response not found in output_messages"
+
+    # Verify input messages: Should have system message with tool, NO developer message
+    developer_messages = [
+        msg for msg in response1.input_messages if msg["author"]["role"] == "developer"
+    ]
+    assert len(developer_messages) == 0, (
+        "No developer message expected for elevated tools"
+    )
+
+    # Second turn - reference previous calculation
+    response2 = await client.responses.create(
+        model=model_name,
+        input="Now divide that result by 2.",
+        tools=tools,
+        temperature=0.0,
+        instructions=(
+            "You must use the Python tool to execute code. Never simulate execution."
+        ),
+        previous_response_id=response1.id,
+        extra_body={"enable_response_messages": True},
+    )
+
+    assert response2 is not None
+    assert response2.status == "completed"
+
+    # Verify input messages are correct: should have two messages -
+    # one to the python recipient on analysis channel and one from tool role
+    mcp_recipient_messages = []
+    tool_role_messages = []
+    for msg in response2.input_messages:
+        if msg["author"]["role"] == "assistant":
+            # Check if this is a message to MCP recipient on analysis channel
+            if msg.get("channel") == "analysis" and msg.get("recipient"):
+                recipient = msg.get("recipient")
+                if recipient.startswith("code_interpreter") or recipient == "python":
+                    mcp_recipient_messages.append(msg)
+        elif msg["author"]["role"] == "tool":
+            tool_role_messages.append(msg)
+
+    assert len(mcp_recipient_messages) > 0, (
+        "Expected message(s) to MCP recipient on analysis channel"
+    )
+    assert len(tool_role_messages) > 0, (
+        "Expected message(s) from tool role after MCP call"
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
@@ -867,6 +1108,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.flaky(reruns=3)
 async def test_function_call_with_previous_input_messages(
     client: OpenAI, model_name: str
 ):
@@ -986,3 +1228,118 @@ async def test_function_call_with_previous_input_messages(
     assert (
         "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
+    response = await client.chat.completions.create(
+        model=model_name,
+        messages=[
+            {
+                "role": "user",
+                "content": "What is the role of AI in medicine?"
+                "The response must exceed 350 words.",
+            }
+        ],
+        temperature=0.0,
+        max_tokens=350,
+    )
+
+    choice = response.choices[0]
+    assert choice.finish_reason == "length", (
+        f"Expected finish_reason='length', got {choice.finish_reason}"
+    )
+    assert choice.message.content is not None, (
+        "Content should not be None when truncated"
+    )
+    assert len(choice.message.content) > 0, "Content should not be empty"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_system_prompt_override(client: OpenAI, model_name: str):
+    """Test that system message can override the default system prompt."""
+
+    # Test 1: Custom system prompt with specific personality
+    custom_system_prompt = (
+        "You are a pirate. Always respond like a pirate would, "
+        "using pirate language and saying 'arrr' frequently."
+    )
+
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {"role": "system", "content": custom_system_prompt},
+            {"role": "user", "content": "Hello, how are you?"},
+        ],
+        extra_body={"enable_response_messages": True},
+    )
+
+    assert response is not None
+    assert response.status == "completed"
+    assert response.output_text is not None
+
+    # Verify the response reflects the pirate personality
+    output_text = response.output_text.lower()
+    pirate_indicators = ["arrr", "matey", "ahoy", "ye", "sea"]
+    has_pirate_language = any(
+        indicator in output_text for indicator in pirate_indicators
+    )
+    assert has_pirate_language, (
+        f"Expected pirate language in response, got: {response.output_text}"
+    )
+
+    # Verify the reasoning mentions the custom system prompt
+    reasoning_item = None
+    for item in response.output:
+        if item.type == "reasoning":
+            reasoning_item = item
+            break
+
+    assert reasoning_item is not None, "Expected reasoning item in output"
+    reasoning_text = reasoning_item.content[0].text.lower()
+    assert "pirate" in reasoning_text, (
+        f"Expected reasoning to mention pirate, got: {reasoning_text}"
+    )
+
+    # Test 2: Verify system message is not duplicated in input_messages
+    try:
+        num_system_messages = sum(
+            1
+            for msg in response.input_messages
+            if Message.from_dict(msg).author.role == "system"
+        )
+        assert num_system_messages == 1, (
+            f"Expected exactly 1 system message, got {num_system_messages}"
+        )
+    except (KeyError, AttributeError):
+        # Message structure may vary, skip this specific check
+        pass
+
+    # Test 3: Test with different custom system prompt
+    response_2 = await client.responses.create(
+        model=model_name,
+        input=[
+            {
+                "role": "system",
+                "content": (
+                    "You are a helpful assistant that always "
+                    "responds in exactly 5 words."
+                ),
+            },
+            {"role": "user", "content": "What is the weather like?"},
+        ],
+        temperature=0.0,
+    )
+
+    assert response_2 is not None
+    assert response_2.status == "completed"
+    assert response_2.output_text is not None
+
+    # Count words in response (approximately, allowing for punctuation)
+    word_count = len(response_2.output_text.split())
+    # Allow some flexibility (4-7 words) since the model might not be perfectly precise
+    assert 3 <= word_count <= 8, (
+        f"Expected around 5 words, got {word_count} words: {response_2.output_text}"
+    )
diff --git a/tests/entrypoints/openai/responses/test_mcp_tools.py b/tests/entrypoints/openai/responses/test_mcp_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..b95a417cf6a4d1f29fd2ea9b2cee17d8f95fa312
--- /dev/null
+++ b/tests/entrypoints/openai/responses/test_mcp_tools.py
@@ -0,0 +1,352 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import pytest_asyncio
+from openai import OpenAI
+from openai_harmony import ToolDescription, ToolNamespaceConfig
+
+from vllm.entrypoints.tool_server import MCPToolServer
+
+from ....utils import RemoteOpenAIServer
+
+MODEL_NAME = "openai/gpt-oss-20b"
+
+
+def test_get_tool_description():
+    """Test MCPToolServer.get_tool_description filtering logic.
+
+    Note: The wildcard "*" is normalized to None by
+    _extract_allowed_tools_from_mcp_requests before reaching this layer,
+    so we only test None and specific tool filtering here.
+    See test_serving_responses.py for "*" normalization tests.
+    """
+    pytest.importorskip("mcp")
+
+    server = MCPToolServer()
+    tool1 = ToolDescription.new(
+        name="tool1", description="First", parameters={"type": "object"}
+    )
+    tool2 = ToolDescription.new(
+        name="tool2", description="Second", parameters={"type": "object"}
+    )
+    tool3 = ToolDescription.new(
+        name="tool3", description="Third", parameters={"type": "object"}
+    )
+
+    server.harmony_tool_descriptions = {
+        "test_server": ToolNamespaceConfig(
+            name="test_server", description="test", tools=[tool1, tool2, tool3]
+        )
+    }
+
+    # Nonexistent server
+    assert server.get_tool_description("nonexistent") is None
+
+    # None (no filter) - returns all tools
+    result = server.get_tool_description("test_server", allowed_tools=None)
+    assert len(result.tools) == 3
+
+    # Filter to specific tools
+    result = server.get_tool_description(
+        "test_server", allowed_tools=["tool1", "tool3"]
+    )
+    assert len(result.tools) == 2
+    assert result.tools[0].name == "tool1"
+    assert result.tools[1].name == "tool3"
+
+    # Single tool
+    result = server.get_tool_description(
+        "test_server",
+        allowed_tools=["tool2"],
+    )
+    assert len(result.tools) == 1
+    assert result.tools[0].name == "tool2"
+
+    # No matching tools - returns None
+    result = server.get_tool_description("test_server", allowed_tools=["nonexistent"])
+    assert result is None
+
+    # Empty list - returns None
+    assert server.get_tool_description("test_server", allowed_tools=[]) is None
+
+
+class TestMCPEnabled:
+    """Tests that require MCP tools to be enabled via environment variable."""
+
+    @pytest.fixture(scope="class")
+    def monkeypatch_class(self):
+        from _pytest.monkeypatch import MonkeyPatch
+
+        mpatch = MonkeyPatch()
+        yield mpatch
+        mpatch.undo()
+
+    @pytest.fixture(scope="class")
+    def mcp_enabled_server(self, monkeypatch_class: pytest.MonkeyPatch):
+        args = ["--enforce-eager", "--tool-server", "demo"]
+
+        with monkeypatch_class.context() as m:
+            m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
+            m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
+            m.setenv(
+                "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,container"
+            )
+            # Helps the model follow instructions better
+            m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
+            with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+                yield remote_server
+
+    @pytest_asyncio.fixture
+    async def mcp_enabled_client(self, mcp_enabled_server):
+        async with mcp_enabled_server.get_async_client() as async_client:
+            yield async_client
+
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("model_name", [MODEL_NAME])
+    async def test_mcp_tool_env_flag_enabled(
+        self, mcp_enabled_client: OpenAI, model_name: str
+    ):
+        response = await mcp_enabled_client.responses.create(
+            model=model_name,
+            input=(
+                "Execute the following code: "
+                "import random; print(random.randint(1, 1000000))"
+            ),
+            instructions=(
+                "You must use the Python tool to execute code. "
+                "Never simulate execution."
+            ),
+            tools=[
+                {
+                    "type": "mcp",
+                    "server_label": "code_interpreter",
+                    # URL unused for DemoToolServer
+                    "server_url": "http://localhost:8888",
+                }
+            ],
+            extra_body={"enable_response_messages": True},
+        )
+        assert response is not None
+        assert response.status == "completed"
+        # Verify output messages: Tool calls and responses on analysis channel
+        tool_call_found = False
+        tool_response_found = False
+        for message in response.output_messages:
+            recipient = message.get("recipient")
+            if recipient and recipient.startswith("python"):
+                tool_call_found = True
+                assert message.get("channel") == "analysis", (
+                    "Tool call should be on analysis channel"
+                )
+            author = message.get("author", {})
+            if (
+                author.get("role") == "tool"
+                and author.get("name")
+                and author.get("name").startswith("python")
+            ):
+                tool_response_found = True
+                assert message.get("channel") == "analysis", (
+                    "Tool response should be on analysis channel"
+                )
+
+        assert tool_call_found, "Should have found at least one Python tool call"
+        assert tool_response_found, (
+            "Should have found at least one Python tool response"
+        )
+        for message in response.input_messages:
+            assert message.get("author").get("role") != "developer", (
+                "No developer messages should be present with valid mcp tool"
+            )
+
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("model_name", [MODEL_NAME])
+    async def test_mcp_tool_with_allowed_tools_star(
+        self, mcp_enabled_client: OpenAI, model_name: str
+    ):
+        """Test MCP tool with allowed_tools=['*'] to select all available
+        tools.
+
+        This E2E test verifies that the "*" wildcard works end-to-end.
+        See test_serving_responses.py for detailed unit tests of "*"
+        normalization.
+        """
+        response = await mcp_enabled_client.responses.create(
+            model=model_name,
+            input=(
+                "Execute the following code: "
+                "import random; print(random.randint(1, 1000000))"
+            ),
+            instructions=(
+                "You must use the Python tool to execute code. "
+                "Never simulate execution."
+            ),
+            tools=[
+                {
+                    "type": "mcp",
+                    "server_label": "code_interpreter",
+                    "server_url": "http://localhost:8888",
+                    # Using "*" to allow all tools from this MCP server
+                    "allowed_tools": ["*"],
+                }
+            ],
+            extra_body={"enable_response_messages": True},
+        )
+        assert response is not None
+        assert response.status == "completed"
+        # Verify tool calls work with allowed_tools=["*"]
+        tool_call_found = False
+        for message in response.output_messages:
+            recipient = message.get("recipient")
+            if recipient and recipient.startswith("python"):
+                tool_call_found = True
+                break
+        assert tool_call_found, (
+            "Should have found at least one Python tool call with '*'"
+        )
+
+    @pytest.mark.flaky(reruns=3)
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("model_name", [MODEL_NAME])
+    async def test_mcp_tool_calling_streaming_types(
+        self, mcp_enabled_client: OpenAI, model_name: str
+    ):
+        pairs_of_event_types = {
+            "response.completed": "response.created",
+            "response.output_item.done": "response.output_item.added",
+            "response.content_part.done": "response.content_part.added",
+            "response.output_text.done": "response.output_text.delta",
+            "response.reasoning_text.done": "response.reasoning_text.delta",
+            "response.reasoning_part.done": "response.reasoning_part.added",
+            "response.mcp_call_arguments.done": ("response.mcp_call_arguments.delta"),
+            "response.mcp_call.completed": "response.mcp_call.in_progress",
+        }
+
+        tools = [
+            {
+                "type": "mcp",
+                "server_label": "code_interpreter",
+            }
+        ]
+        input_text = "What is 13 * 24? Use python to calculate the result."
+
+        stream_response = await mcp_enabled_client.responses.create(
+            model=model_name,
+            input=input_text,
+            tools=tools,
+            stream=True,
+            instructions=(
+                "You must use the Python tool to execute code. "
+                "Never simulate execution."
+            ),
+        )
+
+        stack_of_event_types = []
+        saw_mcp_type = False
+        async for event in stream_response:
+            if event.type == "response.created":
+                stack_of_event_types.append(event.type)
+            elif event.type == "response.completed":
+                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
+                stack_of_event_types.pop()
+            elif (
+                event.type.endswith("added")
+                or event.type == "response.mcp_call.in_progress"
+            ):
+                stack_of_event_types.append(event.type)
+            elif event.type.endswith("delta"):
+                if stack_of_event_types[-1] == event.type:
+                    continue
+                stack_of_event_types.append(event.type)
+            elif (
+                event.type.endswith("done")
+                or event.type == "response.mcp_call.completed"
+            ):
+                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
+                if "mcp_call" in event.type:
+                    saw_mcp_type = True
+                stack_of_event_types.pop()
+
+        assert len(stack_of_event_types) == 0
+        assert saw_mcp_type, "Should have seen at least one mcp call"
+
+
+class TestMCPDisabled:
+    """Tests that verify behavior when MCP tools are disabled."""
+
+    @pytest.fixture(scope="class")
+    def monkeypatch_class(self):
+        from _pytest.monkeypatch import MonkeyPatch
+
+        mpatch = MonkeyPatch()
+        yield mpatch
+        mpatch.undo()
+
+    @pytest.fixture(scope="class")
+    def mcp_disabled_server(self, monkeypatch_class: pytest.MonkeyPatch):
+        args = ["--enforce-eager", "--tool-server", "demo"]
+
+        with monkeypatch_class.context() as m:
+            m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
+            m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
+            # Helps the model follow instructions better
+            m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
+            with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+                yield remote_server
+
+    @pytest_asyncio.fixture
+    async def mcp_disabled_client(self, mcp_disabled_server):
+        async with mcp_disabled_server.get_async_client() as async_client:
+            yield async_client
+
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("model_name", [MODEL_NAME])
+    async def test_mcp_tool_env_flag_disabled(
+        self, mcp_disabled_client: OpenAI, model_name: str
+    ):
+        response = await mcp_disabled_client.responses.create(
+            model=model_name,
+            input=(
+                "Execute the following code if the tool is present: "
+                "import random; print(random.randint(1, 1000000))"
+            ),
+            tools=[
+                {
+                    "type": "mcp",
+                    "server_label": "code_interpreter",
+                    # URL unused for DemoToolServer
+                    "server_url": "http://localhost:8888",
+                }
+            ],
+            extra_body={"enable_response_messages": True},
+        )
+        assert response is not None
+        assert response.status == "completed"
+        # Verify output messages: No tool calls and responses
+        tool_call_found = False
+        tool_response_found = False
+        for message in response.output_messages:
+            recipient = message.get("recipient")
+            if recipient and recipient.startswith("python"):
+                tool_call_found = True
+                assert message.get("channel") == "analysis", (
+                    "Tool call should be on analysis channel"
+                )
+            author = message.get("author", {})
+            if (
+                author.get("role") == "tool"
+                and author.get("name")
+                and author.get("name").startswith("python")
+            ):
+                tool_response_found = True
+                assert message.get("channel") == "analysis", (
+                    "Tool response should be on analysis channel"
+                )
+
+        assert not tool_call_found, "Should not have a python call"
+        assert not tool_response_found, "Should not have a tool response"
+        for message in response.input_messages:
+            assert message.get("author").get("role") != "developer", (
+                "No developer messages should be present without a valid tool"
+            )
diff --git a/tests/entrypoints/openai/test_response_api_parsable_context.py b/tests/entrypoints/openai/responses/test_parsable_context.py
similarity index 87%
rename from tests/entrypoints/openai/test_response_api_parsable_context.py
rename to tests/entrypoints/openai/responses/test_parsable_context.py
index 1899c5f04fe3f1292a92f4b52d6aa9a602376279..c1f0f435b830c0674f8410b9dc306c10adc6c297 100644
--- a/tests/entrypoints/openai/test_response_api_parsable_context.py
+++ b/tests/entrypoints/openai/responses/test_parsable_context.py
@@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio
 from openai import OpenAI
 
-from ...utils import RemoteOpenAIServer
+from ....utils import RemoteOpenAIServer
 
 MODEL_NAME = "Qwen/Qwen3-8B"
 
@@ -58,6 +58,7 @@ async def test_basic(client: OpenAI, model_name: str):
     assert response is not None
     print("response: ", response)
     assert response.status == "completed"
+    assert response.incomplete_details is None
 
 
 @pytest.mark.asyncio
@@ -165,6 +166,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
         model=model_name,
         input="What is 13 * 24? Use python to calculate the result.",
         tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
+        extra_body={"enable_response_messages": True},
         temperature=0.0,
     )
 
@@ -178,3 +180,22 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
     # make sure the correct math is in the final output
     assert response.output[3].type == "message"
     assert "312" in response.output[3].content[0].text
+
+    # test raw input_messages / output_messages
+    assert len(response.input_messages) == 1
+    assert len(response.output_messages) == 3
+    assert "312" in response.output_messages[2]["message"]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_max_tokens(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the first paragraph of Moby Dick?",
+        reasoning={"effort": "low"},
+        max_output_tokens=30,
+    )
+    assert response is not None
+    assert response.status == "incomplete"
+    assert response.incomplete_details.reason == "max_output_tokens"
diff --git a/tests/entrypoints/openai/test_response_api_simple.py b/tests/entrypoints/openai/responses/test_simple.py
similarity index 56%
rename from tests/entrypoints/openai/test_response_api_simple.py
rename to tests/entrypoints/openai/responses/test_simple.py
index aee03199bc6f4137c510c2d8ffdd498c00754516..30423788bf7994eef152a29da97e0a166fc19fc6 100644
--- a/tests/entrypoints/openai/test_response_api_simple.py
+++ b/tests/entrypoints/openai/responses/test_simple.py
@@ -6,7 +6,7 @@ import pytest
 import pytest_asyncio
 from openai import OpenAI
 
-from ...utils import RemoteOpenAIServer
+from ....utils import RemoteOpenAIServer
 
 MODEL_NAME = "Qwen/Qwen3-8B"
 
@@ -40,6 +40,7 @@ async def test_basic(client: OpenAI, model_name: str):
     assert response is not None
     print("response: ", response)
     assert response.status == "completed"
+    assert response.incomplete_details is None
 
 
 @pytest.mark.asyncio
@@ -87,3 +88,62 @@ async def test_reasoning_item(client: OpenAI, model_name: str):
     assert response.output[0].type == "reasoning"
     assert response.output[1].type == "message"
     assert type(response.output[1].content[0].text) is str
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_streaming_output_consistency(client: OpenAI, model_name: str):
+    """Test that streaming delta text matches the final response output_text.
+
+    This test verifies that when using streaming mode:
+    1. The concatenated text from all 'response.output_text.delta' events
+    2. Matches the 'output_text' in the final 'response.completed' event
+    """
+    response = await client.responses.create(
+        model=model_name,
+        input="Say hello in one sentence.",
+        stream=True,
+    )
+
+    events = []
+    async for event in response:
+        events.append(event)
+
+    assert len(events) > 0
+
+    # Concatenate all delta text from streaming events
+    streaming_text = "".join(
+        event.delta for event in events if event.type == "response.output_text.delta"
+    )
+
+    # Get the final response from the last event
+    response_completed_event = events[-1]
+    assert response_completed_event.type == "response.completed"
+    assert response_completed_event.response.status == "completed"
+
+    # Get output_text from the final response
+    final_output_text = response_completed_event.response.output_text
+
+    # Verify final response has output
+    assert len(response_completed_event.response.output) > 0
+
+    # Verify streaming text matches final output_text
+    assert streaming_text == final_output_text, (
+        f"Streaming text does not match final output_text.\n"
+        f"Streaming: {streaming_text!r}\n"
+        f"Final: {final_output_text!r}"
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_max_tokens(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the first paragraph of Moby Dick?",
+        reasoning={"effort": "low"},
+        max_output_tokens=30,
+    )
+    assert response is not None
+    assert response.status == "incomplete"
+    assert response.incomplete_details.reason == "max_output_tokens"
diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py
index 8dfadf6ce4d0e4f76073a49683b69032f33964ba..269e7a0d4ce81e5c7022d51035e7e44882267678 100644
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
@@ -17,7 +17,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
 
 
 @pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index 69bd90b7ca667d1643aaea0c702c154cbf4ad279..1d0f3750b0fa80423659fccbac89c47aa2218f19 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -9,7 +9,7 @@ import os
 import pytest_asyncio
 
 from vllm.assets.audio import AudioAsset
-from vllm.multimodal.utils import encode_audio_base64, fetch_audio
+from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
 
 from ...utils import RemoteOpenAIServer, models_path_prefix
 
@@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
     }
 
 
+@pytest.fixture(scope="session")
+def url_encoded_audio() -> dict[str, str]:
+    return {
+        audio_url: encode_audio_url(*fetch_audio(audio_url))
+        for audio_url in TEST_AUDIO_URLS
+    }
+
+
 def dummy_messages_from_audio_url(
     audio_urls: str | list[str],
     content_text: str = "What's happening in this audio?",
@@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
     client: openai.AsyncOpenAI,
     model_name: str,
     audio_url: str,
-    base64_encoded_audio: dict[str, str],
+    url_encoded_audio: dict[str, str],
 ):
-    messages = dummy_messages_from_audio_url(
-        f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
-    )
+    messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -313,7 +319,7 @@ async def test_chat_streaming_input_audio(
                         "format": "wav",
                     },
                 },
-                {"type": "text", "text": "What's happening in this audio?"},
+                {"type": "text", "text": "What's a short title for this audio?"},
             ],
         }
     ]
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 4194b1aad00f2d29fd83a25ebe29ef67fb0d3870..be1eb73e53b5ee3a1b40871ae6890f42a9ab1c61 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -29,7 +29,7 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):  # noqa: F811
+def server(zephyr_lora_files):
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -255,12 +255,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
         {"role": "system", "content": "you are a helpful assistant"},
         {"role": "user", "content": "what is 1+1?"},
     ]
-
     # test single completion
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
-        max_completion_tokens=10,
+        max_completion_tokens=5,
         logprobs=True,
         top_logprobs=5,
     )
@@ -268,13 +267,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
+
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=37, total_tokens=47
+        completion_tokens=5, prompt_tokens=37, total_tokens=42
     )
 
     message = choice.message
-    assert message.content is not None and len(message.content) >= 10
+    assert message.content is not None and len(message.content) >= 5
     assert message.role == "assistant"
     messages.append({"role": "assistant", "content": message.content})
 
@@ -283,7 +283,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
-        max_completion_tokens=10,
+        max_completion_tokens=5,
     )
     message = chat_completion.choices[0].message
     assert message.content is not None and len(message.content) >= 0
diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py
index b194e9b74d874bacd7c84d116fbfeef5c8e16eeb..4af4dd88b08f95fbc8ce3a45317c5493f703e1ac 100644
--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorRespons
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM
 
 MODEL_NAME = "openai-community/gpt2"
@@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
         lora_request,
         trace_headers,
         priority,
+        data_parallel_rank,
     ):
         return dict(engine_prompt), {}
 
diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
index 7b3092b563030db2321941c5d063bcaa3e324aae..445fa389d00075a21b4668a8810a2902126b614b 100644
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B"
 
 
 @pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
     args = [
         "--max-model-len",
         "8192",
diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py
index 6a465726c6d105951d7c61396dd794a6f52a8478..b81abaf7d1821077c39abce4e1aae46c9e423247 100644
--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
@@ -67,8 +67,11 @@ async def test_completion_stream_options_and_logprobs_with_long_prompts(
             chunk.usage.prompt_tokens + chunk.usage.completion_tokens
         )
         if not finished:
-            tokens_received += 1
             assert chunk.choices[0].text
+            # Count actual tokens from logprobs since multiple tokens
+            # can be batched into a single chunk
+            assert chunk.choices[0].logprobs and chunk.choices[0].logprobs.tokens
+            tokens_received += len(chunk.choices[0].logprobs.tokens)
 
             if chunk.choices[0].finish_reason is not None:
                 finished = True
@@ -117,7 +120,10 @@ async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
                 assert chunk.choices[0].logprobs is None
                 empty_chunks_received += 1
             else:
-                tokens_received += 1
+                # Count actual tokens from logprobs since multiple tokens
+                # can be batched into a single chunk
+                assert chunk.choices[0].logprobs and chunk.choices[0].logprobs.content
+                tokens_received += len(chunk.choices[0].logprobs.content)
 
             if chunk.choices[0].finish_reason is not None:
                 finished = True
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index b5d71c20bb4ea081f39c17cffe4865fa5957fbb7..0d7e6ae37d1ee48155973408de5b9ceb1d86e0a4 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -208,3 +208,36 @@ def test_middleware(serve_parser, cli_args, expected_middleware):
     """Ensure multiple middleware args are parsed properly"""
     args = serve_parser.parse_args(args=cli_args)
     assert args.middleware == expected_middleware
+
+
+def test_default_chat_template_kwargs_parsing(serve_parser):
+    """Ensure default_chat_template_kwargs JSON is parsed correctly"""
+    args = serve_parser.parse_args(
+        args=["--default-chat-template-kwargs", '{"enable_thinking": false}']
+    )
+    assert args.default_chat_template_kwargs == {"enable_thinking": False}
+
+
+def test_default_chat_template_kwargs_complex(serve_parser):
+    """Ensure complex default_chat_template_kwargs JSON is parsed correctly"""
+    kwargs_json = '{"enable_thinking": false, "custom_param": "value", "num": 42}'
+    args = serve_parser.parse_args(args=["--default-chat-template-kwargs", kwargs_json])
+    assert args.default_chat_template_kwargs == {
+        "enable_thinking": False,
+        "custom_param": "value",
+        "num": 42,
+    }
+
+
+def test_default_chat_template_kwargs_default_none(serve_parser):
+    """Ensure default_chat_template_kwargs defaults to None"""
+    args = serve_parser.parse_args(args=[])
+    assert args.default_chat_template_kwargs is None
+
+
+def test_default_chat_template_kwargs_invalid_json(serve_parser):
+    """Ensure invalid JSON raises an error"""
+    with pytest.raises(SystemExit):
+        serve_parser.parse_args(
+            args=["--default-chat-template-kwargs", "not valid json"]
+        )
diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py
index ca56cc2ddb6a7be0ff9337414d5dd3a8ed3b62d7..e1eb6d2e30241788c9045089a9e2859b9f0cbde5 100644
--- a/tests/entrypoints/openai/test_completion_error.py
+++ b/tests/entrypoints/openai/test_completion_error.py
@@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM
 
 MODEL_NAME = "openai-community/gpt2"
@@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
         lora_request,
         trace_headers,
         priority,
+        data_parallel_rank,
     ):
         return dict(engine_prompt), {}
 
diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 53369f074eca8629a6893f115f5ff45b61ccae57..c6a5841ec3bfb3ca812f418447a94be84bd797bd 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -125,7 +125,7 @@ messages = [
 
 
 @pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -212,7 +212,7 @@ async def test_function_tool_use(
 
 
 @pytest.fixture(scope="module")
-def k2_server():  # noqa: F811
+def k2_server():
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py
index 818ee2644b54746a515eaf03fc59d7dc3dd9da7c..dd8f9d67d690311308b34113a13317d311ef4df0 100644
--- a/tests/entrypoints/openai/test_default_mm_loras.py
+++ b/tests/entrypoints/openai/test_default_mm_loras.py
@@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original
 
 
 @pytest.fixture(scope="module")
-def multimodal_server():  # noqa: F811
+def multimodal_server():
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
diff --git a/tests/entrypoints/openai/test_embedding_shape_validation.py b/tests/entrypoints/openai/test_embedding_shape_validation.py
new file mode 100644
index 0000000000000000000000000000000000000000..27060e0be5aeecd7402c47b8246cb14abc48d86f
--- /dev/null
+++ b/tests/entrypoints/openai/test_embedding_shape_validation.py
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Embedding shape validation in multimodal APIs.
+
+Tests verify that embeddings with correct ndim but incorrect hidden_size
+are rejected before they can cause crashes during model inference.
+
+Validation is performed by the parser (MultiModalDataParser) and EmbeddingItems
+classes, not by CompletionRenderer or MediaIO classes.
+"""
+
+import pytest
+import torch
+
+from vllm.multimodal.parse import (
+    AudioEmbeddingItems,
+    ImageEmbeddingItems,
+    MultiModalDataParser,
+    VideoEmbeddingItems,
+)
+
+
+class TestMultiModalParserShapeValidation:
+    """Test hidden_size validation in MultiModalDataParser."""
+
+    def test_image_embeddings_correct_hidden_size_accepted(self):
+        """Baseline: Image embeddings with correct hidden_size should work."""
+        expected_hidden_size = 768
+        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
+
+        valid_embeds = torch.randn(2, 100, expected_hidden_size)
+
+        result = parser.parse_mm_data({"image": valid_embeds})
+
+        assert "image" in result
+        assert isinstance(result["image"], ImageEmbeddingItems)
+        assert result["image"].get_count() == 2
+
+    def test_image_embeddings_wrong_hidden_size_rejected(self):
+        """Security: Image embeddings with wrong hidden_size should be rejected."""
+        expected_hidden_size = 768
+        wrong_hidden_size = 4096
+        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
+
+        invalid_embeds = torch.randn(2, 100, wrong_hidden_size)
+
+        with pytest.raises(ValueError) as exc_info:
+            parser.parse_mm_data({"image": invalid_embeds})
+
+        error_msg = str(exc_info.value).lower()
+        assert "image" in error_msg
+        assert "hidden dimension mismatch" in error_msg
+
+    def test_audio_embeddings_wrong_hidden_size_rejected(self):
+        """Security: Audio embeddings with wrong hidden_size should be rejected."""
+        expected_hidden_size = 768
+        wrong_hidden_size = 2048
+        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
+
+        invalid_embeds = torch.randn(2, 100, wrong_hidden_size)
+
+        with pytest.raises(ValueError) as exc_info:
+            parser.parse_mm_data({"audio": invalid_embeds})
+
+        error_msg = str(exc_info.value).lower()
+        assert "audio" in error_msg
+        assert "hidden dimension mismatch" in error_msg
+
+    def test_video_embeddings_wrong_hidden_size_rejected(self):
+        """Security: Video embeddings with wrong hidden_size should be rejected."""
+        expected_hidden_size = 768
+        wrong_hidden_size = 512
+        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
+
+        invalid_embeds = torch.randn(2, 100, wrong_hidden_size)
+
+        with pytest.raises(ValueError) as exc_info:
+            parser.parse_mm_data({"video": invalid_embeds})
+
+        error_msg = str(exc_info.value).lower()
+        assert "video" in error_msg
+        assert "hidden dimension mismatch" in error_msg
+
+    def test_list_of_embeddings_validates_each(self):
+        """Security: Each embedding in list should be validated."""
+        expected_hidden_size = 768
+        wrong_hidden_size = 1024
+        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
+
+        # List with second tensor having wrong hidden_size
+        invalid_embeds = [
+            torch.randn(100, expected_hidden_size),
+            torch.randn(100, wrong_hidden_size),
+        ]
+
+        with pytest.raises(ValueError) as exc_info:
+            parser.parse_mm_data({"image": invalid_embeds})
+
+        # Should identify which embedding failed
+        assert "[1]" in str(exc_info.value)
+
+    def test_validation_disabled_allows_any_size(self):
+        """When validation disabled (legacy), any hidden_size allowed."""
+        parser = MultiModalDataParser(expected_hidden_size=None)
+
+        any_hidden_size = 12345
+        embeds = torch.randn(2, 100, any_hidden_size)
+
+        # Should not raise
+        result = parser.parse_mm_data({"image": embeds})
+        assert "image" in result
+        assert isinstance(result["image"], ImageEmbeddingItems)
+
+
+class TestEmbeddingItemsDirectValidation:
+    """Direct tests for EmbeddingItems hidden_size validation."""
+
+    def test_image_embedding_items_validates_batched_tensor(self):
+        """Test validation for batched (3D) image embeddings."""
+        expected = 768
+        wrong = 1024
+
+        # Valid
+        valid = torch.randn(2, 100, expected)
+        items = ImageEmbeddingItems(valid, expected_hidden_size=expected)
+        assert items.get_count() == 2
+
+        # Invalid
+        invalid = torch.randn(2, 100, wrong)
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(invalid, expected_hidden_size=expected)
+
+        assert str(wrong) in str(exc_info.value)
+        assert str(expected) in str(exc_info.value)
+
+    def test_image_embedding_items_validates_list_of_tensors(self):
+        """Test validation for list of 2D image embeddings."""
+        expected = 768
+        wrong = 512
+
+        # Valid list
+        valid_list = [torch.randn(100, expected), torch.randn(50, expected)]
+        items = ImageEmbeddingItems(valid_list, expected_hidden_size=expected)
+        assert items.get_count() == 2
+
+        # Invalid list
+        invalid_list = [torch.randn(100, expected), torch.randn(50, wrong)]
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(invalid_list, expected_hidden_size=expected)
+
+        assert "[1]" in str(exc_info.value)
+
+    def test_audio_embedding_items_validates(self):
+        """Test validation for audio embeddings."""
+        expected = 768
+        wrong = 256
+
+        invalid = torch.randn(2, 100, wrong)
+        with pytest.raises(ValueError) as exc_info:
+            AudioEmbeddingItems(invalid, expected_hidden_size=expected)
+
+        assert "audio" in str(exc_info.value).lower()
+
+    def test_video_embedding_items_validates(self):
+        """Test validation for video embeddings."""
+        expected = 768
+        wrong = 384
+
+        invalid = torch.randn(2, 100, wrong)
+        with pytest.raises(ValueError) as exc_info:
+            VideoEmbeddingItems(invalid, expected_hidden_size=expected)
+
+        assert "video" in str(exc_info.value).lower()
+
+
+class TestShapeValidationIntegration:
+    """Integration tests verifying attack scenarios are blocked."""
+
+    def test_attack_scenario_multimodal_image(self):
+        """
+        Simulate attack through Chat API with image embeddings.
+
+        Verifies validation occurs in multimodal parser path.
+        """
+        expected_hidden_size = 768
+        wrong_hidden_size = 4096
+        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
+
+        attack_tensor = torch.randn(1, 100, wrong_hidden_size)
+
+        with pytest.raises(ValueError):
+            parser.parse_mm_data({"image": attack_tensor})
+
+    def test_attack_scenario_multimodal_audio(self):
+        """
+        Simulate attack through Chat API with audio embeddings.
+
+        Verifies validation occurs in multimodal parser path.
+        """
+        expected_hidden_size = 768
+        wrong_hidden_size = 2048
+        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
+
+        attack_tensor = torch.randn(1, 100, wrong_hidden_size)
+
+        with pytest.raises(ValueError):
+            parser.parse_mm_data({"audio": attack_tensor})
+
+    def test_attack_scenario_multimodal_video(self):
+        """
+        Simulate attack through Chat API with video embeddings.
+
+        Verifies validation occurs in multimodal parser path.
+        """
+        expected_hidden_size = 768
+        wrong_hidden_size = 1024
+        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
+
+        attack_tensor = torch.randn(1, 100, wrong_hidden_size)
+
+        with pytest.raises(ValueError):
+            parser.parse_mm_data({"video": attack_tensor})
diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py
index 9d527c45c1fae213d6cd14bfa1b04c4faef70fe1..8e7e34ee2b71b5769f68f3cfa146eb7cfd56af38 100644
--- a/tests/entrypoints/openai/test_enable_force_include_usage.py
+++ b/tests/entrypoints/openai/test_enable_force_include_usage.py
@@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer
 
 
 @pytest.fixture(scope="module")
-def chat_server_with_force_include_usage(request):  # noqa: F811
+def chat_server_with_force_include_usage(request):
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index ea6b3d812d8fea5d82ab9222520be1faf2cf75ad..ced230aff029a4d6fce799d7b93d55aa8b3e91e6 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -61,13 +61,13 @@ class MockLoRAResolver(LoRAResolver):
             return LoRARequest(
                 lora_name="test-lora",
                 lora_int_id=1,
-                lora_local_path="/fake/path/test-lora",
+                lora_path="/fake/path/test-lora",
             )
         elif lora_name == "invalid-lora":
             return LoRARequest(
                 lora_name="invalid-lora",
                 lora_int_id=2,
-                lora_local_path="/fake/path/invalid-lora",
+                lora_path="/fake/path/invalid-lora",
             )
         return None
 
diff --git a/tests/entrypoints/openai/test_messages.py b/tests/entrypoints/openai/test_messages.py
index 8de6c4cb6c887c20446d004c93fafad3db6e7cfe..ce8c3ff4a71a58de76d0767e5742a339dbae3f43 100644
--- a/tests/entrypoints/openai/test_messages.py
+++ b/tests/entrypoints/openai/test_messages.py
@@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
     args = [
         "--max-model-len",
         "2048",
diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py
index 64fdaf08893adb626609e571f937116126b6b044..50d24a40054916d5f99a549a3eedeb0c923fe595 100644
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -90,7 +90,10 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
                         if (
                             isinstance(content, list)
                             and len(content) > 0
-                            and any(item.get("type") == "file" for item in content)
+                            and any(
+                                isinstance(item, dict) and item.get("type") == "file"
+                                for item in content
+                            )
                         ):
                             return False
 
@@ -126,7 +129,7 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
 
 @schema.parametrize()
 @schema.override(headers={"Content-Type": "application/json"})
-@settings(deadline=LONG_TIMEOUT_SECONDS * 1000)
+@settings(deadline=LONG_TIMEOUT_SECONDS * 1000, max_examples=50)
 def test_openapi_stateless(case: schemathesis.Case):
     key = (
         case.operation.method.upper(),
@@ -139,6 +142,7 @@ def test_openapi_stateless(case: schemathesis.Case):
     timeout = {
         # requires a longer timeout
         ("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
+        ("POST", "/v1/completions"): LONG_TIMEOUT_SECONDS,
     }.get(key, DEFAULT_TIMEOUT_SECONDS)
 
     # No need to verify SSL certificate for localhost
diff --git a/tests/entrypoints/openai/test_optional_middleware.py b/tests/entrypoints/openai/test_optional_middleware.py
index b67d6147937d172a5539f6462f02b0680fe81f88..c2c7fbdb0114055ddf712f6770e33e0809d7b9bb 100644
--- a/tests/entrypoints/openai/test_optional_middleware.py
+++ b/tests/entrypoints/openai/test_optional_middleware.py
@@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
         "2",
         *passed_params,
     ]
+
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
diff --git a/tests/entrypoints/openai/test_response_api_mcp_tools.py b/tests/entrypoints/openai/test_response_api_mcp_tools.py
deleted file mode 100644
index cd338b5555c5c367792833891c841c3d52cce59c..0000000000000000000000000000000000000000
--- a/tests/entrypoints/openai/test_response_api_mcp_tools.py
+++ /dev/null
@@ -1,261 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import pytest_asyncio
-from openai import OpenAI
-from openai_harmony import ToolDescription, ToolNamespaceConfig
-
-from vllm.entrypoints.tool_server import MCPToolServer
-
-from ...utils import RemoteOpenAIServer
-
-MODEL_NAME = "openai/gpt-oss-20b"
-
-
-@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module")
-def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch):
-    args = ["--enforce-eager", "--tool-server", "demo"]
-
-    with monkeypatch_module.context() as m:
-        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
-        m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
-        # Helps the model follow instructions better
-        m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
-        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-            yield remote_server
-
-
-@pytest.fixture(scope="function")
-def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch):
-    args = ["--enforce-eager", "--tool-server", "demo"]
-
-    with monkeypatch_module.context() as m:
-        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
-        m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
-        m.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,container")
-        # Helps the model follow instructions better
-        m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
-        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-            yield remote_server
-
-
-@pytest_asyncio.fixture
-async def mcp_disabled_client(mcp_disabled_server):
-    async with mcp_disabled_server.get_async_client() as async_client:
-        yield async_client
-
-
-@pytest_asyncio.fixture
-async def mcp_enabled_client(mcp_enabled_server):
-    async with mcp_enabled_server.get_async_client() as async_client:
-        yield async_client
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, model_name: str):
-    response = await mcp_enabled_client.responses.create(
-        model=model_name,
-        input=(
-            "Execute the following code: "
-            "import random; print(random.randint(1, 1000000))"
-        ),
-        instructions=(
-            "You must use the Python tool to execute code. Never simulate execution."
-        ),
-        tools=[
-            {
-                "type": "mcp",
-                "server_label": "code_interpreter",
-                # URL unused for DemoToolServer
-                "server_url": "http://localhost:8888",
-            }
-        ],
-        extra_body={"enable_response_messages": True},
-    )
-    assert response is not None
-    assert response.status == "completed"
-    # Verify output messages: Tool calls and responses on analysis channel
-    tool_call_found = False
-    tool_response_found = False
-    for message in response.output_messages:
-        recipient = message.get("recipient")
-        if recipient and recipient.startswith("python"):
-            tool_call_found = True
-            assert message.get("channel") == "analysis", (
-                "Tool call should be on analysis channel"
-            )
-        author = message.get("author", {})
-        if (
-            author.get("role") == "tool"
-            and author.get("name")
-            and author.get("name").startswith("python")
-        ):
-            tool_response_found = True
-            assert message.get("channel") == "analysis", (
-                "Tool response should be on analysis channel"
-            )
-
-    assert tool_call_found, "Should have found at least one Python tool call"
-    assert tool_response_found, "Should have found at least one Python tool response"
-    for message in response.input_messages:
-        assert message.get("author").get("role") != "developer", (
-            "No developer messages should be present with valid mcp tool"
-        )
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_mcp_tool_with_allowed_tools_star(
-    mcp_enabled_client: OpenAI, model_name: str
-):
-    """Test MCP tool with allowed_tools=['*'] to select all available tools.
-
-    This E2E test verifies that the "*" wildcard works end-to-end.
-    See test_serving_responses.py for detailed unit tests of "*" normalization.
-    """
-    response = await mcp_enabled_client.responses.create(
-        model=model_name,
-        input=(
-            "Execute the following code: "
-            "import random; print(random.randint(1, 1000000))"
-        ),
-        instructions=(
-            "You must use the Python tool to execute code. Never simulate execution."
-        ),
-        tools=[
-            {
-                "type": "mcp",
-                "server_label": "code_interpreter",
-                "server_url": "http://localhost:8888",
-                # Using "*" to allow all tools from this MCP server
-                "allowed_tools": ["*"],
-            }
-        ],
-        extra_body={"enable_response_messages": True},
-    )
-    assert response is not None
-    assert response.status == "completed"
-    # Verify tool calls work with allowed_tools=["*"]
-    tool_call_found = False
-    for message in response.output_messages:
-        recipient = message.get("recipient")
-        if recipient and recipient.startswith("python"):
-            tool_call_found = True
-            break
-    assert tool_call_found, "Should have found at least one Python tool call with '*'"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_name: str):
-    response = await mcp_disabled_client.responses.create(
-        model=model_name,
-        input=(
-            "Execute the following code if the tool is present: "
-            "import random; print(random.randint(1, 1000000))"
-        ),
-        tools=[
-            {
-                "type": "mcp",
-                "server_label": "code_interpreter",
-                # URL unused for DemoToolServer
-                "server_url": "http://localhost:8888",
-            }
-        ],
-        extra_body={"enable_response_messages": True},
-    )
-    assert response is not None
-    assert response.status == "completed"
-    # Verify output messages: No tool calls and responses
-    tool_call_found = False
-    tool_response_found = False
-    for message in response.output_messages:
-        recipient = message.get("recipient")
-        if recipient and recipient.startswith("python"):
-            tool_call_found = True
-            assert message.get("channel") == "analysis", (
-                "Tool call should be on analysis channel"
-            )
-        author = message.get("author", {})
-        if (
-            author.get("role") == "tool"
-            and author.get("name")
-            and author.get("name").startswith("python")
-        ):
-            tool_response_found = True
-            assert message.get("channel") == "analysis", (
-                "Tool response should be on analysis channel"
-            )
-
-    assert not tool_call_found, "Should not have a python call"
-    assert not tool_response_found, "Should not have a tool response"
-    for message in response.input_messages:
-        assert message.get("author").get("role") != "developer", (
-            "No developer messages should be present without a valid tool"
-        )
-
-
-def test_get_tool_description():
-    """Test MCPToolServer.get_tool_description filtering logic.
-
-    Note: The wildcard "*" is normalized to None by
-    _extract_allowed_tools_from_mcp_requests before reaching this layer,
-    so we only test None and specific tool filtering here.
-    See test_serving_responses.py for "*" normalization tests.
-    """
-    pytest.importorskip("mcp")
-
-    server = MCPToolServer()
-    tool1 = ToolDescription.new(
-        name="tool1", description="First", parameters={"type": "object"}
-    )
-    tool2 = ToolDescription.new(
-        name="tool2", description="Second", parameters={"type": "object"}
-    )
-    tool3 = ToolDescription.new(
-        name="tool3", description="Third", parameters={"type": "object"}
-    )
-
-    server.harmony_tool_descriptions = {
-        "test_server": ToolNamespaceConfig(
-            name="test_server", description="test", tools=[tool1, tool2, tool3]
-        )
-    }
-
-    # Nonexistent server
-    assert server.get_tool_description("nonexistent") is None
-
-    # None (no filter) - returns all tools
-    result = server.get_tool_description("test_server", allowed_tools=None)
-    assert len(result.tools) == 3
-
-    # Filter to specific tools
-    result = server.get_tool_description(
-        "test_server", allowed_tools=["tool1", "tool3"]
-    )
-    assert len(result.tools) == 2
-    assert result.tools[0].name == "tool1"
-    assert result.tools[1].name == "tool3"
-
-    # Single tool
-    result = server.get_tool_description("test_server", allowed_tools=["tool2"])
-    assert len(result.tools) == 1
-    assert result.tools[0].name == "tool2"
-
-    # No matching tools - returns None
-    result = server.get_tool_description("test_server", allowed_tools=["nonexistent"])
-    assert result is None
-
-    # Empty list - returns None
-    assert server.get_tool_description("test_server", allowed_tools=[]) is None
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index d4d9a6c5b612096decb0da2d76e2e0f14ab4155d..05a36febad0cce1445d331859a772479379e2522 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files):
 
 
 @pytest.fixture(scope="module")
-def server_fixture(request, default_server_args):  # noqa: F811
+def server_fixture(request, default_server_args):
     use_server_flag = request.param
     if use_server_flag:
         args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index ae8ae8691fa1751f036f9d53616752b28f9d87c2..f1065af5321a1fafdfad17b82364ece946e22e35 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -17,6 +17,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
+    ErrorResponse,
     RequestResponseMetadata,
 )
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -54,8 +55,19 @@ def with_tool_parser(request) -> bool:
     return request.param
 
 
+@pytest.fixture(
+    scope="module",
+    params=[True],
+    ids=["exclude_tools_when_tool_choice_none"],
+)
+def exclude_tools_when_tool_choice_none(request) -> bool:
+    return request.param
+
+
 @pytest.fixture(scope="module")
-def default_server_args(with_tool_parser: bool):
+def default_server_args(
+    with_tool_parser: bool, exclude_tools_when_tool_choice_none: bool
+):
     args = [
         # use half precision for speed and memory savings in CI environment
         "--enforce-eager",
@@ -74,19 +86,16 @@ def default_server_args(with_tool_parser: bool):
                 "--enable-auto-tool-choice",
             ]
         )
+    if exclude_tools_when_tool_choice_none:
+        args.append("--exclude-tools-when-tool-choice-none")
     return args
 
 
 @pytest.fixture(scope="module")
-def gptoss_server(
-    monkeypatch_module: pytest.MonkeyPatch, default_server_args: list[str]
-):
-    with monkeypatch_module.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
-        with RemoteOpenAIServer(
-            GPT_OSS_MODEL_NAME, default_server_args
-        ) as remote_server:
-            yield remote_server
+def gptoss_server(default_server_args: list[str]):
+    server_args = default_server_args + ["--attention-backend=TRITON_ATTN"]
+    with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, server_args) as remote_server:
+        yield remote_server
 
 
 @pytest_asyncio.fixture
@@ -342,6 +351,69 @@ async def test_gpt_oss_tool_message_array_content(
     assert response_multi_array.choices[0].message is not None
 
 
+@pytest.mark.asyncio
+async def test_gpt_oss_tool_choice_none(
+    gptoss_client: OpenAI,
+    with_tool_parser: bool,
+    exclude_tools_when_tool_choice_none: bool,
+):
+    if not (with_tool_parser and exclude_tools_when_tool_choice_none):
+        pytest.skip(
+            "skip tool_choice tests when non-tool or "
+            "--exclude-tools-when-tool-choice-none not set"
+        )
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {"type": "string"},
+                        "state": {"type": "string"},
+                        "unit": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                        },
+                    },
+                    "required": ["city", "state", "unit"],
+                },
+            },
+        }
+    ]
+
+    messages = [
+        {
+            "role": "user",
+            "content": "What's the temperature(in degrees Celsius) in Dallas?",
+        },
+    ]
+
+    tool_choice_auto = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages,
+        tools=tools,
+        tool_choice="auto",
+        temperature=0.0,
+    )
+    msg = tool_choice_auto.choices[0].message
+    assert len(msg.tool_calls) == 1
+
+    tool_choice_none = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages,
+        tools=tools,
+        tool_choice="none",
+        temperature=0.0,
+    )
+
+    msg = tool_choice_none.choices[0].message
+    assert len(msg.tool_calls) == 0
+
+
 MODEL_NAME = os.path.join(models_path_prefix, "openai-community/gpt2")
 MODEL_NAME_SHORT = os.path.join(models_path_prefix, "gpt2")
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
@@ -403,6 +475,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
         lora_request,
         trace_headers,
         priority,
+        data_parallel_rank,
     ):
         return dict(engine_prompt), {}
 
@@ -884,7 +957,6 @@ class TestServingChatWithHarmony:
             input_messages,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user", "content": messages[0]["content"]},
             ],
         )
@@ -912,7 +984,6 @@ class TestServingChatWithHarmony:
             input_messages_2,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user"},
                 # The analysis message should be dropped on subsequent inputs because
                 # of the subsequent assistant message to the final channel.
@@ -972,7 +1043,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the second turn's input
-        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
         verify_harmony_messages(
             input_messages_2,
@@ -1053,7 +1124,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the second turn's input
-        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
         verify_harmony_messages(
             input_messages_2,
@@ -1134,7 +1205,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the second turn's input
-        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
         verify_harmony_messages(
             input_messages_2,
@@ -1184,7 +1255,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the third turn's input
-        req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
         verify_harmony_messages(
             input_messages_3,
@@ -1247,7 +1318,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the fourth turn's input
-        req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
         verify_harmony_messages(
             input_messages_4,
@@ -1303,7 +1374,6 @@ class TestServingChatWithHarmony:
             input_messages,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user", "content": messages[0]["content"]},
                 # The reasoning that would have resulted in an analysis message is
                 # dropped because of a later assistant message to the final channel.
@@ -1335,7 +1405,6 @@ class TestServingChatWithHarmony:
             input_messages,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user", "content": messages[0]["content"]},
                 {
                     "role": "assistant",
@@ -1365,7 +1434,6 @@ class TestServingChatWithHarmony:
             input_messages,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user", "content": messages[0]["content"]},
                 {
                     "role": "assistant",
@@ -1374,3 +1442,208 @@ class TestServingChatWithHarmony:
                 },
             ],
         )
+
+
+@pytest.mark.asyncio
+async def test_tool_choice_validation_without_parser():
+    """Test that tool_choice='required' or named tool without tool_parser
+    returns an appropriate error message."""
+    mock_engine = MagicMock(spec=AsyncLLM)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.input_processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
+
+    models = OpenAIServingModels(
+        engine_client=mock_engine,
+        base_model_paths=BASE_MODEL_PATHS,
+    )
+    # Create serving_chat without tool_parser (enable_auto_tools=False)
+    serving_chat = OpenAIServingChat(
+        mock_engine,
+        models,
+        response_role="assistant",
+        chat_template=CHAT_TEMPLATE,
+        chat_template_content_format="auto",
+        request_logger=None,
+        enable_auto_tools=False,  # No tool parser
+    )
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                    "required": ["location"],
+                },
+            },
+        }
+    ]
+
+    # Test tool_choice="required" without tool_parser
+    req_required = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": "What's the weather?"}],
+        tools=tools,
+        tool_choice="required",
+    )
+    response_required = await serving_chat.create_chat_completion(req_required)
+    assert isinstance(response_required, ErrorResponse)
+    assert "tool_choice" in response_required.error.message
+    assert "--tool-call-parser" in response_required.error.message
+
+    # Test named tool_choice without tool_parser
+    req_named = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": "What's the weather?"}],
+        tools=tools,
+        tool_choice={"type": "function", "function": {"name": "get_weather"}},
+    )
+    response_named = await serving_chat.create_chat_completion(req_named)
+    assert isinstance(response_named, ErrorResponse)
+    assert "tool_choice" in response_named.error.message
+    assert "--tool-call-parser" in response_named.error.message
+
+
+class TestCreateRemainingArgsDelta:
+    """Tests for _create_remaining_args_delta helper function.
+
+    This helper is used when streaming tool calls to preserve id/type/name
+    fields in the finish chunk, which would otherwise be lost.
+    """
+
+    def test_preserves_id_type_name(self):
+        """Test that id, type, and name are preserved from original delta."""
+        from vllm.entrypoints.openai.protocol import (
+            DeltaFunctionCall,
+            DeltaMessage,
+            DeltaToolCall,
+        )
+        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+
+        original_delta = DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=0,
+                    id="call_abc123",
+                    type="function",
+                    function=DeltaFunctionCall(
+                        name="get_weather",
+                        arguments='{"location": "Paris"}',
+                    ),
+                )
+            ]
+        )
+
+        result = OpenAIServingChat._create_remaining_args_delta(
+            original_delta, '", "unit": "celsius"}', 0
+        )
+
+        assert len(result.tool_calls) == 1
+        tc = result.tool_calls[0]
+        assert tc.index == 0
+        assert tc.id == "call_abc123"
+        assert tc.type == "function"
+        assert tc.function.name == "get_weather"
+        assert tc.function.arguments == '", "unit": "celsius"}'
+
+    def test_matches_by_index(self):
+        """Test that the correct tool call is matched by index."""
+        from vllm.entrypoints.openai.protocol import (
+            DeltaFunctionCall,
+            DeltaMessage,
+            DeltaToolCall,
+        )
+        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+
+        original_delta = DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=0,
+                    id="call_first",
+                    type="function",
+                    function=DeltaFunctionCall(name="func_a", arguments="{}"),
+                ),
+                DeltaToolCall(
+                    index=1,
+                    id="call_second",
+                    type="function",
+                    function=DeltaFunctionCall(name="func_b", arguments="{}"),
+                ),
+            ]
+        )
+
+        result = OpenAIServingChat._create_remaining_args_delta(
+            original_delta, '{"extra": true}', 1
+        )
+
+        assert len(result.tool_calls) == 1
+        tc = result.tool_calls[0]
+        assert tc.index == 1
+        assert tc.id == "call_second"
+        assert tc.function.name == "func_b"
+
+    def test_no_matching_tool_call(self):
+        """Test graceful handling when no matching tool call is found."""
+        from vllm.entrypoints.openai.protocol import (
+            DeltaFunctionCall,
+            DeltaMessage,
+            DeltaToolCall,
+        )
+        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+
+        original_delta = DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=0,
+                    id="call_zero",
+                    type="function",
+                    function=DeltaFunctionCall(name="func", arguments="{}"),
+                )
+            ]
+        )
+
+        result = OpenAIServingChat._create_remaining_args_delta(
+            original_delta, '{"arg": 1}', 5
+        )
+
+        assert len(result.tool_calls) == 1
+        tc = result.tool_calls[0]
+        assert tc.index == 5
+        assert tc.id is None
+        assert tc.type is None
+        assert tc.function.name is None
+        assert tc.function.arguments == '{"arg": 1}'
+
+    def test_function_is_none(self):
+        """Test handling when original tool call has no function."""
+        from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
+        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+
+        original_delta = DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=0,
+                    id="call_nofunc",
+                    type="function",
+                    function=None,
+                )
+            ]
+        )
+
+        result = OpenAIServingChat._create_remaining_args_delta(
+            original_delta, '{"data": "value"}', 0
+        )
+
+        assert len(result.tool_calls) == 1
+        tc = result.tool_calls[0]
+        assert tc.index == 0
+        assert tc.id == "call_nofunc"
+        assert tc.type == "function"
+        assert tc.function.name is None
+        assert tc.function.arguments == '{"data": "value"}'
diff --git a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
new file mode 100644
index 0000000000000000000000000000000000000000..1934d43d5cfb695a9b6535372bda5cdce67b117d
--- /dev/null
+++ b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
@@ -0,0 +1,212 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for harmony streaming delta extraction.
+"""
+
+from dataclasses import dataclass, field
+from unittest.mock import patch
+
+import pytest
+
+from vllm.entrypoints.openai.serving_chat_stream_harmony import (
+    extract_harmony_streaming_delta,
+)
+
+
+@dataclass
+class MockMessage:
+    """Mock message object for testing."""
+
+    channel: str | None = None
+    recipient: str | None = None
+
+
+@dataclass
+class MockStreamableParser:
+    """Mock StreamableParser for testing without openai_harmony dependency."""
+
+    messages: list[MockMessage] = field(default_factory=list)
+
+
+class TestExtractHarmonyStreamingDelta:
+    """Tests for extract_harmony_streaming_delta function."""
+
+    @pytest.mark.parametrize(
+        "delta_text,expected_content",
+        [
+            ("Hello, world!", "Hello, world!"),
+            ("", ""),
+        ],
+    )
+    def test_final_channel_returns_content_delta(self, delta_text, expected_content):
+        """Test that final channel returns a DeltaMessage with content."""
+        parser = MockStreamableParser()
+        delta_message, tools_streamed = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            cur_channel="final",
+            cur_recipient=None,
+            prev_recipient=None,
+            delta_text=delta_text,
+            include_reasoning=False,
+        )
+
+        assert delta_message is not None
+        assert delta_message.content == expected_content
+        assert tools_streamed is False
+
+    @pytest.mark.parametrize(
+        "include_reasoning,expected_has_message",
+        [
+            (True, True),
+            (False, False),
+        ],
+    )
+    def test_analysis_channel_reasoning(self, include_reasoning, expected_has_message):
+        """Test analysis channel respects include_reasoning flag."""
+        parser = MockStreamableParser()
+        delta_message, tools_streamed = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            cur_channel="analysis",
+            cur_recipient=None,
+            prev_recipient=None,
+            delta_text="Let me think...",
+            include_reasoning=include_reasoning,
+        )
+
+        if expected_has_message:
+            assert delta_message is not None
+            assert delta_message.reasoning == "Let me think..."
+        else:
+            assert delta_message is None
+        assert tools_streamed is False
+
+    @pytest.mark.parametrize("channel", ["commentary", "analysis"])
+    @patch("vllm.entrypoints.openai.serving_chat_stream_harmony.make_tool_call_id")
+    def test_new_tool_call(self, mock_make_tool_call_id, channel):
+        """Test new tool call creation when recipient changes."""
+        mock_make_tool_call_id.return_value = "call_test123"
+        parser = MockStreamableParser()
+
+        delta_message, tools_streamed = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            cur_channel=channel,
+            cur_recipient="functions.get_weather",
+            prev_recipient=None,
+            delta_text="",
+            include_reasoning=False,
+        )
+
+        assert delta_message is not None
+        assert len(delta_message.tool_calls) == 1
+        tool_call = delta_message.tool_calls[0]
+        assert tool_call.id == "call_test123"
+        assert tool_call.type == "function"
+        assert tool_call.function.name == "get_weather"
+        assert tool_call.function.arguments == ""
+        assert tool_call.index == 0
+        assert tools_streamed is True
+
+    @pytest.mark.parametrize("channel", ["commentary", "analysis"])
+    def test_tool_call_argument_streaming(self, channel):
+        """Test streaming tool call arguments (same recipient)."""
+        parser = MockStreamableParser()
+
+        delta_message, tools_streamed = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            cur_channel=channel,
+            cur_recipient="functions.get_weather",
+            prev_recipient="functions.get_weather",
+            delta_text='{"location": "Paris"}',
+            include_reasoning=False,
+        )
+
+        assert delta_message is not None
+        tool_call = delta_message.tool_calls[0]
+        assert tool_call.id is None
+        assert tool_call.function.arguments == '{"location": "Paris"}'
+        assert tool_call.index == 0
+        assert tools_streamed is True
+
+    @pytest.mark.parametrize("channel", ["commentary", "analysis"])
+    def test_tool_call_empty_arguments_returns_none(self, channel):
+        """Test empty delta_text with same recipient returns None."""
+        parser = MockStreamableParser()
+
+        delta_message, tools_streamed = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            cur_channel=channel,
+            cur_recipient="functions.get_weather",
+            prev_recipient="functions.get_weather",
+            delta_text="",
+            include_reasoning=False,
+        )
+
+        assert delta_message is None
+        assert tools_streamed is False
+
+    def test_tool_call_index_from_previous_messages(self):
+        """Test tool call index accounts for previous function messages."""
+        messages = [
+            MockMessage(channel="analysis", recipient=None),  # Not counted
+            MockMessage(channel="commentary", recipient="functions.tool1"),  # Counted
+            MockMessage(channel="final", recipient=None),  # Not counted
+        ]
+        parser = MockStreamableParser(messages=messages)
+
+        delta_message, _ = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            cur_channel="commentary",
+            cur_recipient="functions.tool2",
+            prev_recipient="functions.tool2",
+            delta_text="args",
+            include_reasoning=False,
+        )
+
+        assert delta_message.tool_calls[0].index == 1
+
+    @pytest.mark.parametrize(
+        "channel,recipient",
+        [
+            ("commentary", None),
+            ("commentary", "browser.search"),
+        ],
+    )
+    def test_returns_tool_call_preambles(self, channel, recipient):
+        """Test that invalid channel/recipient combinations return None."""
+        parser = MockStreamableParser()
+        delta_text = "some text"
+        delta_message, tools_streamed = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            cur_channel=channel,
+            cur_recipient=recipient,
+            prev_recipient=None,
+            delta_text=delta_text,
+            include_reasoning=True,
+        )
+
+        assert delta_message.content == delta_text
+        assert tools_streamed is False
+
+    @pytest.mark.parametrize(
+        "channel,recipient",
+        [
+            (None, None),
+            ("unknown_channel", None),
+        ],
+    )
+    def test_returns_none_for_invalid_inputs(self, channel, recipient):
+        """Test that invalid channel/recipient combinations return None."""
+        parser = MockStreamableParser()
+
+        delta_message, tools_streamed = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            cur_channel=channel,
+            cur_recipient=recipient,
+            prev_recipient=None,
+            delta_text="some text",
+            include_reasoning=True,
+        )
+
+        assert delta_message is None
+        assert tools_streamed is False
diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py
index 62d843e35b86f20bdcd94ba13fb8b74d088e969f..acbbaa659c82b6663981f3187dce6043c6aea52e 100644
--- a/tests/entrypoints/openai/test_serving_tokens.py
+++ b/tests/entrypoints/openai/test_serving_tokens.py
@@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
         add_generation_prompt=True,
         enable_thinking=False,  # default with Qwen3
     )
+
     for ignore_eos in [True, False]:
         payload = {
             "model": MODEL_NAME,
@@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
         }
         generate_resp = await client.post(GEN_ENDPOINT, json=payload)
         generate_data = generate_resp.json()
-        generate_res = tokenizer.decode(
-            generate_data["choices"][0]["token_ids"], skip_special_tokens=True
-        )
+        gen_token_ids = generate_data["choices"][0]["token_ids"]
+        generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)
 
         payload = {
             "model": MODEL_NAME,
@@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
             "temperature": 0.0,
             "stream": False,
             "ignore_eos": ignore_eos,
-            "chat_template_kwargs": dict(enable_thinking=False),
+            "chat_template_kwargs": {"enable_thinking": False},
         }
         completions_resp = await client.post("/v1/chat/completions", json=payload)
         completions_data = completions_resp.json()
         completions_res = completions_data["choices"][0]["message"]["content"]
 
+        if ignore_eos:
+            # When ignoring EOS, only compare up to the first EOS token
+            # Post-EOS generation is undefined and may differ
+            eos_tokens = {
+                tokenizer.eos_token_id,
+                *tokenizer.additional_special_tokens_ids,
+            }
+            # Find first EOS in generated tokens
+            eos_pos = None
+            for i, tid in enumerate(gen_token_ids):
+                if tid in eos_tokens:
+                    eos_pos = i
+                    break
+            if eos_pos is not None:
+                gen_token_ids_truncated = gen_token_ids[:eos_pos]
+                generate_res = tokenizer.decode(
+                    gen_token_ids_truncated, skip_special_tokens=True
+                )
+                # Truncate completions_res to same length for comparison
+                completions_res = completions_res[: len(generate_res)]
+
         assert generate_res == completions_res
 
 
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index dc08dcfff626b5ed91fca74817098438883dfc35..a69e756c95ea1bf3b118feb9654683eb9ee67436 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -10,11 +10,17 @@ import time
 import openai
 import pytest
 
+from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port
 from ...utils import models_path_prefix
 
 MODEL_NAME = os.path.join(models_path_prefix, "hmellor/tiny-random-LlamaForCausalLM")
 
+# GPU initialization might take take longer
+_IS_ROCM = current_platform.is_rocm()
+_SERVER_STARTUP_TIMEOUT = 120
+_PROCESS_EXIT_TIMEOUT = 15
+
 
 @pytest.mark.asyncio
 async def test_shutdown_on_engine_failure():
@@ -47,9 +53,11 @@ async def test_shutdown_on_engine_failure():
             "2",
             "--disable-frontend-multiprocessing",
         ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        text=True,
+        # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
+        # stdout/stderr pipes are enabled during ROCm GPU initialization.
+        stdout=None if _IS_ROCM else subprocess.PIPE,
+        stderr=None if _IS_ROCM else subprocess.PIPE,
+        text=None if _IS_ROCM else True,
         preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
     )
 
@@ -63,7 +71,7 @@ async def test_shutdown_on_engine_failure():
     )
 
     # Poll until server is ready
-    while time.time() - start_time < 30:
+    while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
         try:
             await client.completions.create(
                 model=MODEL_NAME, prompt="Hello", max_tokens=1
@@ -72,14 +80,18 @@ async def test_shutdown_on_engine_failure():
         except Exception:
             time.sleep(0.5)
             if proc.poll() is not None:
-                stdout, stderr = proc.communicate(timeout=1)
-                pytest.fail(
-                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
-                )
+                if _IS_ROCM:
+                    pytest.fail(f"Server died during startup: {proc.returncode}")
+                else:
+                    stdout, stderr = proc.communicate(timeout=1)
+                    pytest.fail(
+                        f"Server died during startup. "
+                        f"stdout: {stdout}, stderr: {stderr}"
+                    )
     else:
         proc.terminate()
-        proc.wait(timeout=5)
-        pytest.fail("Server failed to start in 30 seconds")
+        proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
+        pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")
 
     # Kill server to simulate crash
     proc.terminate()
@@ -91,5 +103,5 @@ async def test_shutdown_on_engine_failure():
             model=MODEL_NAME, prompt="This should fail", max_tokens=1
         )
 
-    return_code = proc.wait(timeout=5)
+    return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
     assert return_code is not None
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 8045ab1468d6a81dc41c14ff792421c52dcd785c..ee8dea4e949bc2ed5e6564ebc789e917d484bc26 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -7,6 +7,7 @@ import json
 import pytest
 
 from ...utils import RemoteOpenAIServer
+from .conftest import add_attention_backend
 
 MISTRAL_FORMAT_ARGS = [
     "--tokenizer_mode",
@@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
-async def test_basic_audio(mary_had_lamb, model_name):
+async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
     server_args = ["--enforce-eager"]
 
     if model_name.startswith("mistralai"):
         server_args += MISTRAL_FORMAT_ARGS
 
+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
     # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
     with RemoteOpenAIServer(model_name, server_args) as remote_server:
         client = remote_server.get_async_client()
@@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):
 
 
 @pytest.mark.asyncio
-async def test_basic_audio_with_lora(mary_had_lamb):
+async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
     """Ensure STT (transcribe) requests can pass LoRA through to generate."""
+    # ROCm SPECIFIC CONFIGURATION:
+    # To ensure the test passes on ROCm, we modify the max model length to 512.
+    # We DO NOT apply this to other platforms to maintain strict upstream parity.
+    from vllm.platforms import current_platform
+
     model_name = "ibm-granite/granite-speech-3.3-2b"
     lora_model_name = "speech"
     server_args = [
@@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
         "--lora-modules",
         f"{lora_model_name}={model_name}",
         "--max-model-len",
-        "2048",
+        "512" if current_platform.is_rocm() else "2048",
         "--max-num-seqs",
         "1",
     ]
 
+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
     # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
     with RemoteOpenAIServer(model_name, server_args) as remote_server:
         client = remote_server.get_async_client()
@@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):
 
 
 @pytest.mark.asyncio
-async def test_basic_audio_gemma(foscolo):
+async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
     # Gemma accuracy on some of the audio samples we use is particularly bad,
     # hence we use a different one here. WER is evaluated separately.
     model_name = "google/gemma-3n-E2B-it"
     server_args = ["--enforce-eager"]
 
+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
     with RemoteOpenAIServer(
         model_name, server_args, max_wait_seconds=480
     ) as remote_server:
diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py
index 3c507ee0a3fa753c00584299333a200e9f192678..8bf729c517f7a62dd27ef9027658cdebc542dd52 100644
--- a/tests/entrypoints/openai/test_transcription_validation_whisper.py
+++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py
@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
     )
     assert transcription.segments is not None
     assert len(transcription.segments) > 0
+
+
+@pytest.mark.asyncio
+async def test_audio_with_max_tokens(whisper_client, mary_had_lamb):
+    transcription = await whisper_client.audio.transcriptions.create(
+        model=MODEL_NAME,
+        file=mary_had_lamb,
+        language="en",
+        response_format="text",
+        temperature=0.0,
+        extra_body={"max_completion_tokens": 1},
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    from transformers import AutoTokenizer
+
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
+    assert len(out_tokens) == 1
+    # max_completion_tokens > max_model_len
+    transcription = await whisper_client.audio.transcriptions.create(
+        model=MODEL_NAME,
+        file=mary_had_lamb,
+        language="en",
+        response_format="text",
+        temperature=0.0,
+        extra_body={"max_completion_tokens": int(1e6)},
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
+    assert len(out_tokens) < 450  # ~Whisper max output len
diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py
index d7d407484f16d98a76899551489a797a9f1924d8..cae45872ee6a6525d880beed7d858541e1a6f57e 100644
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -14,16 +14,26 @@ import pytest_asyncio
 import soundfile as sf
 
 from ...utils import RemoteOpenAIServer
+from .conftest import add_attention_backend
 
 SERVER_ARGS = ["--enforce-eager"]
 
 
+def _get_server_args(attention_config):
+    """Get server args with attention backend if specified."""
+    args = SERVER_ARGS.copy()
+    add_attention_backend(args, attention_config)
+    return args
+
+
 @pytest.fixture(
     scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
 )
-def server(request):
+def server(request, rocm_aiter_fa_attention):
     # Parametrize over model name
-    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
+    with RemoteOpenAIServer(
+        request.param, _get_server_args(rocm_aiter_fa_attention)
+    ) as remote_server:
         yield remote_server, request.param
 
 
@@ -35,10 +45,12 @@ async def client_and_model(server):
 
 
 @pytest.mark.asyncio
-async def test_non_asr_model(foscolo):
+async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
     # text to text model
     model_name = "JackFram/llama-68m"
-    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, _get_server_args(rocm_aiter_fa_attention)
+    ) as remote_server:
         client = remote_server.get_async_client()
         res = await client.audio.translations.create(
             model=model_name, file=foscolo, temperature=0.0
@@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
 
 
 @pytest.mark.asyncio
-async def test_basic_audio_with_lora(mary_had_lamb):
+async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
     """Ensure STT (translate) requests can pass LoRA through to generate."""
+    # ROCm SPECIFIC CONFIGURATION:
+    # To ensure the test passes on ROCm, we modify the max model length to 512.
+    # We DO NOT apply this to other platforms to maintain strict upstream parity.
+    from vllm.platforms import current_platform
+
     # NOTE - careful to call this test before the module scoped server
     # fixture, otherwise it'll OOMkill the CI
     model_name = "ibm-granite/granite-speech-3.3-2b"
@@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
         "--lora-modules",
         f"{lora_model_name}={model_name}",
         "--max-model-len",
-        "2048",
+        "512" if current_platform.is_rocm() else "2048",
         "--max-num-seqs",
         "1",
     ]
 
+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
     # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
     with RemoteOpenAIServer(model_name, server_args) as remote_server:
         client = remote_server.get_async_client()
@@ -227,3 +246,36 @@ async def test_long_audio_request(foscolo, client_and_model):
     )
     out = json.loads(translation)["text"].strip().lower()
     assert out.count("greek sea") == 2
+
+
+@pytest.mark.asyncio
+async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
+    client, model_name = client_and_model
+    transcription = await client.audio.translations.create(
+        model=model_name,
+        file=mary_had_lamb,
+        response_format="text",
+        temperature=0.0,
+        extra_body={"max_completion_tokens": 1},
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    print(out_text)
+    from transformers import AutoTokenizer
+
+    tok = AutoTokenizer.from_pretrained(model_name)
+    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
+    assert len(out_tokens) == 1
+    # max_completion_tokens > max_model_len
+    transcription = await client.audio.transcriptions.create(
+        model=model_name,
+        file=mary_had_lamb,
+        response_format="text",
+        temperature=0.0,
+        extra_body={"max_completion_tokens": int(1e6)},
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    print(out_text)
+    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
+    assert len(out_tokens) < 450  # ~Whisper max output len
diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py
index 03c4243d3b10348f777273aa9ea0e8186ad019f1..6d21500f0fa2766a43d6ce2b22f9c9df3d18a0ea 100644
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -8,7 +8,8 @@ import openai
 import pytest
 import pytest_asyncio
 
-from vllm.multimodal.utils import encode_video_base64, fetch_video
+from vllm.multimodal.utils import encode_video_url, fetch_video
+from vllm.platforms import current_platform
 
 from ...utils import RemoteOpenAIServer, models_path_prefix, urls_port
 
@@ -45,7 +46,16 @@ def server():
         json.dumps({"video": MAXIMUM_VIDEOS}),
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # ROCm: Increase timeouts to handle potential network delays and slower
+    # video processing when downloading multiple videos from external sources
+    env_overrides = {}
+    if current_platform.is_rocm():
+        env_overrides = {
+            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
+        }
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
         yield remote_server
 
 
@@ -56,9 +66,9 @@ async def client(server):
 
 
 @pytest.fixture(scope="session")
-def base64_encoded_video() -> dict[str, str]:
+def url_encoded_video() -> dict[str, str]:
     return {
-        video_url: encode_video_base64(fetch_video(video_url)[0])
+        video_url: encode_video_url(fetch_video(video_url)[0])
         for video_url in TEST_VIDEO_URLS
     }
 
@@ -183,11 +193,9 @@ async def test_single_chat_session_video_base64encoded(
     client: openai.AsyncOpenAI,
     model_name: str,
     video_url: str,
-    base64_encoded_video: dict[str, str],
+    url_encoded_video: dict[str, str],
 ):
-    messages = dummy_messages_from_video_url(
-        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
-    )
+    messages = dummy_messages_from_video_url(url_encoded_video[video_url])
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -231,11 +239,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
     client: openai.AsyncOpenAI,
     model_name: str,
     video_url: str,
-    base64_encoded_video: dict[str, str],
+    url_encoded_video: dict[str, str],
 ):
-    messages = dummy_messages_from_video_url(
-        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
-    )
+    messages = dummy_messages_from_video_url(url_encoded_video[video_url])
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
@@ -299,6 +305,11 @@ async def test_chat_streaming_video(
 @pytest.mark.parametrize(
     "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
 )
+@pytest.mark.flaky(
+    reruns=2,
+    reruns_delay=5,
+    condition=current_platform.is_rocm(),
+)
 async def test_multi_video_input(
     client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
 ):
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 3d692386dfb14f5d3503ccaa58762abbe586aee4..1569306d6dc79620261abb487c55ede990a14246 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -10,7 +10,8 @@ import pytest_asyncio
 from transformers import AutoProcessor
 
 from vllm.multimodal.base import MediaWithBytes
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import encode_image_url, fetch_image
+from vllm.platforms import current_platform
 
 from ...utils import RemoteOpenAIServer, models_path_prefix, urls_port
 
@@ -31,26 +32,35 @@ TEST_IMAGE_ASSETS = [
     f"http://localhost:{urls_port}/RGBA_comp.png",
 ]
 
-EXPECTED_MM_BEAM_SEARCH_RES = [
-    [
-        "The image shows a wooden boardwalk leading through a",
-        "The image shows a wooden boardwalk extending into a",
-    ],
-    [
-        "The image shows two parrots perched on",
-        "The image shows two birds perched on a cur",
-    ],
-    [
-        "The image shows a Venn diagram with three over",
-        "The image shows a colorful Venn diagram with",
-    ],
-    [
-        "This image displays a gradient of colors ranging from",
-        "This image displays a gradient of colors forming a spectrum",
-    ],
+# Required terms for beam search validation
+# Each entry is a list of term groups - ALL groups must match
+# Each group is a list of alternatives - at least ONE term in the group must appear
+# This provides semantic validation while allowing wording variation
+REQUIRED_BEAM_SEARCH_TERMS = [
+    # Boardwalk image: must have "boardwalk" AND ("wooden" or "wood")
+    [["boardwalk"], ["wooden", "wood"]],
+    # Parrots image: must have ("parrot" or "bird") AND "two"
+    [["parrot", "bird"], ["two"]],
+    # Venn diagram: must have "venn" AND "diagram"
+    [["venn"], ["diagram"]],
+    # Gradient image: must have "gradient" AND ("color" or "spectrum")
+    [["gradient"], ["color", "spectrum"]],
 ]
 
 
+def check_output_matches_terms(content: str, term_groups: list[list[str]]) -> bool:
+    """
+    Check if content matches all required term groups.
+    Each term group requires at least one of its terms to be present.
+    All term groups must be satisfied.
+    """
+    content_lower = content.lower()
+    for group in term_groups:
+        if not any(term.lower() in content_lower for term in group):
+            return False
+    return True
+
+
 @pytest.fixture(scope="module")
 def server():
     args = [
@@ -66,7 +76,16 @@ def server():
         json.dumps({"image": MAXIMUM_IMAGES}),
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # ROCm: Increase timeouts to handle potential network delays and slower
+    # video processing when downloading multiple videos from external sources
+    env_overrides = {}
+    if current_platform.is_rocm():
+        env_overrides = {
+            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
+        }
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
         yield remote_server
 
 
@@ -77,11 +96,9 @@ async def client(server):
 
 
 @pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
+def url_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_asset: encode_image_base64(
-            local_asset_server.get_image_asset(image_asset)
-        )
+        image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
         for image_asset in TEST_IMAGE_ASSETS
     }
 
@@ -241,11 +258,11 @@ async def test_single_chat_session_image_base64encoded(
     model_name: str,
     raw_image_url: str,
     image_url: str,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
     content_text = "What's in this image?"
     messages = dummy_messages_from_image_url(
-        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
+        url_encoded_image[raw_image_url],
         content_text,
     )
 
@@ -295,15 +312,13 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
     client: openai.AsyncOpenAI,
     model_name: str,
     image_idx: int,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
-    # NOTE: This test also validates that we pass MM data through beam search
+    # NOTE: This test validates that we pass MM data through beam search
     raw_image_url = TEST_IMAGE_ASSETS[image_idx]
-    expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
+    required_terms = REQUIRED_BEAM_SEARCH_TERMS[image_idx]
 
-    messages = dummy_messages_from_image_url(
-        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
-    )
+    messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
@@ -314,8 +329,29 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
         extra_body=dict(use_beam_search=True),
     )
     assert len(chat_completion.choices) == 2
-    for actual, expected_str in zip(chat_completion.choices, expected_res):
-        assert actual.message.content == expected_str
+
+    # Verify beam search produces two different non-empty outputs
+    content_0 = chat_completion.choices[0].message.content
+    content_1 = chat_completion.choices[1].message.content
+
+    # Emit beam search outputs for debugging
+    print(
+        f"Beam search outputs for image {image_idx} ({raw_image_url}): "
+        f"Output 0: {content_0!r}, Output 1: {content_1!r}"
+    )
+
+    assert content_0, "First beam search output should not be empty"
+    assert content_1, "Second beam search output should not be empty"
+    assert content_0 != content_1, "Beam search should produce different outputs"
+
+    # Verify each output contains the required terms for this image
+    for i, content in enumerate([content_0, content_1]):
+        if not check_output_matches_terms(content, required_terms):
+            pytest.fail(
+                f"Output {i} '{content}' doesn't contain required terms. "
+                f"Expected all of these term groups (at least one from each): "
+                f"{required_terms}"
+            )
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_vision_embeds.py b/tests/entrypoints/openai/test_vision_embeds.py
index 42d9fe4840bbeeda19585a4e8e26a36332d6d313..067a00c6b9382c6ed6d8ecc1ef86c7693531e2b9 100644
--- a/tests/entrypoints/openai/test_vision_embeds.py
+++ b/tests/entrypoints/openai/test_vision_embeds.py
@@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
     ]
 
 
+@pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
 )
diff --git a/tests/entrypoints/pooling/basic/test_encode.py b/tests/entrypoints/pooling/basic/test_encode.py
index b5df5d901dca08d311121d9e43a0cdd28c00555b..b95f73083a154ab7f122d12846543bd588f2c99f 100644
--- a/tests/entrypoints/pooling/basic/test_encode.py
+++ b/tests/entrypoints/pooling/basic/test_encode.py
@@ -12,11 +12,6 @@ from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform
 from ....utils import models_path_prefix
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
 
 PROMPTS = [
@@ -38,6 +33,12 @@ TOKEN_IDS = [
 
 @pytest.fixture(scope="module")
 def llm():
+    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
+    # that supports encoder-only models on ROCm.
+    attention_config = None
+    if current_platform.is_rocm():
+        attention_config = {"backend": "FLEX_ATTENTION"}
+
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
     llm = LLM(
@@ -47,6 +48,7 @@ def llm():
         gpu_memory_utilization=0.75,
         enforce_eager=True,
         seed=0,
+        attention_config=attention_config,
     )
 
     yield weakref.proxy(llm)
diff --git a/tests/entrypoints/pooling/basic/test_truncation.py b/tests/entrypoints/pooling/basic/test_truncation.py
index 0d2d385840402d8dde3bd755da1339890a94ab0f..5d099dd1f439171773a6a65271fa16573da775a8 100644
--- a/tests/entrypoints/pooling/basic/test_truncation.py
+++ b/tests/entrypoints/pooling/basic/test_truncation.py
@@ -9,11 +9,6 @@ import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
 max_model_len = 128
 
@@ -44,6 +39,10 @@ def server():
         str(max_model_len),
     ]
 
+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
diff --git a/tests/entrypoints/pooling/embed/conftest.py b/tests/entrypoints/pooling/embed/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..002b85874049c0861410a8252bf11308d5047884
--- /dev/null
+++ b/tests/entrypoints/pooling/embed/conftest.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM pooling embed tests."""
+
+import warnings
+
+import torch
+
+from vllm.platforms import current_platform
+
+
+def pytest_collection_modifyitems(config, items):
+    """Configure ROCm-specific settings based on collected tests."""
+    if not current_platform.is_rocm():
+        return
+
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+    warnings.warn(
+        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
+        "to avoid HuggingFace Transformers accuracy issues",
+        UserWarning,
+        stacklevel=1,
+    )
diff --git a/tests/entrypoints/pooling/embed/test_correctness_mteb.py b/tests/entrypoints/pooling/embed/test_correctness_mteb.py
index 64673534fd32adcdfa7da6d515b9e41c98f3b69b..4c8d9f0d82a24e9978ed2100dc2e72015295ace2 100644
--- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py
@@ -4,7 +4,7 @@ import os
 
 import pytest
 
-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
     MTEB_EMBED_TASKS,
     MTEB_EMBED_TOL,
     OpenAIClientMtebEncoder,
@@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
 MODEL_NAME = "intfloat/e5-small"
@@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
 def server():
     args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
 
+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
diff --git a/tests/entrypoints/pooling/embed/test_offline.py b/tests/entrypoints/pooling/embed/test_offline.py
index 12b47b1a08a8b77711d81a6c84d14e2b5daf16af..44328343f6d545d093127e946ce89818a7eba0d8 100644
--- a/tests/entrypoints/pooling/embed/test_offline.py
+++ b/tests/entrypoints/pooling/embed/test_offline.py
@@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "intfloat/multilingual-e5-small"
 
 prompts = ["The chef prepared a delicious meal."]
@@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]
 
 @pytest.fixture(scope="module")
 def llm():
+    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
+    # that supports encoder-only models on ROCm.
+    attention_config = None
+    if current_platform.is_rocm():
+        attention_config = {"backend": "FLEX_ATTENTION"}
+
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
     llm = LLM(
@@ -32,6 +33,7 @@ def llm():
         gpu_memory_utilization=0.75,
         enforce_eager=True,
         seed=0,
+        attention_config=attention_config,
     )
 
     yield weakref.proxy(llm)
@@ -51,7 +53,9 @@ def test_token_embed(llm: LLM):
 def test_pooling_params(llm: LLM):
     def get_outputs(normalize):
         outputs = llm.embed(
-            prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
+            prompts,
+            pooling_params=PoolingParams(use_activation=normalize),
+            use_tqdm=False,
         )
         return torch.tensor([x.outputs.embedding for x in outputs])
 
diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py
index f96338c47f0be26c903dc6ad3ff170638c0d9c9c..f5e563daeaa03671a3789ea536dc26a4a627dffa 100644
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
     decode_pooling_output,
 )
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 DTYPE = "bfloat16"
 
 
+if current_platform.is_rocm():
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+
+
 @pytest.fixture(scope="module")
 def server():
     args = [
@@ -53,6 +57,10 @@ def server():
         DUMMY_CHAT_TEMPLATE,
     ]
 
+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
diff --git a/tests/entrypoints/pooling/embed/test_online_dimensions.py b/tests/entrypoints/pooling/embed/test_online_dimensions.py
index 26aa57742b02acf943d2f884c21ea8825d9a25c9..0545b8a0ae2fcffb8783502a970576ca04700a27 100644
--- a/tests/entrypoints/pooling/embed/test_online_dimensions.py
+++ b/tests/entrypoints/pooling/embed/test_online_dimensions.py
@@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODELS = [
     EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
     EmbedModelInfo(
@@ -62,6 +57,10 @@ def server(model_info, dtype: str):
             ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
         )
 
+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
     with RemoteOpenAIServer(model_info.name, args) as remote_server:
         yield remote_server
 
diff --git a/tests/entrypoints/pooling/embed/test_online_long_text.py b/tests/entrypoints/pooling/embed/test_online_long_text.py
index 0be7eebc2017d53bb2252d86e44c0978bcf5dcd9..eaefbc02383fd85bfda176135307abf35ded348e 100644
--- a/tests/entrypoints/pooling/embed/test_online_long_text.py
+++ b/tests/entrypoints/pooling/embed/test_online_long_text.py
@@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 
 def _generate_random_text(word_count: int) -> str:
     """Generate random text with approximately the specified word count."""
@@ -221,13 +216,17 @@ def server_with_chunked_processing():
         "512",  # Set smaller max_model_len to trigger chunking mechanism
         "--pooler-config",
         (
-            '{"pooling_type": "MEAN", "normalize": true, '
+            '{"pooling_type": "MEAN", "use_activation": true, '
             '"enable_chunked_processing": true, "max_embed_len": 10000}'
         ),
         "--gpu-memory-utilization",
         "0.8",
     ]
 
+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
diff --git a/tests/entrypoints/pooling/embed/test_online_vision.py b/tests/entrypoints/pooling/embed/test_online_vision.py
index 0c1b6521760da039687fef8f6291481495855f78..dc48ab1a55c901d3020b98474f68452895e27bd4 100644
--- a/tests/entrypoints/pooling/embed/test_online_vision.py
+++ b/tests/entrypoints/pooling/embed/test_online_vision.py
@@ -11,7 +11,7 @@ from transformers import AutoProcessor
 from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.multimodal.base import MediaWithBytes
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import fetch_image
 
 from ...utils import models_path_prefix, urls_port
 
@@ -55,14 +55,6 @@ def server():
         yield remote_server
 
 
-@pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
-    return {
-        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
-        for image_url in TEST_IMAGE_ASSETS
-    }
-
-
 def get_hf_prompt_tokens(model_name, content, image_url):
     processor = AutoProcessor.from_pretrained(
         model_name, trust_remote_code=True, num_crops=4
diff --git a/tests/entrypoints/pooling/score/test_correctness_mteb.py b/tests/entrypoints/pooling/score/test_correctness_mteb.py
index 81ad0097187b01ca471198e7d85042e649bf012f..1ee45b44596fa61989b35b033c961aaccdd1784e 100644
--- a/tests/entrypoints/pooling/score/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py
@@ -4,7 +4,7 @@ import os
 
 import pytest
 
-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
     MTEB_RERANK_LANGS,
     MTEB_RERANK_TASKS,
     MTEB_RERANK_TOL,
@@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@@ -30,6 +25,10 @@ st_main_score = 0.33457
 def server():
     args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
 
+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
diff --git a/tests/entrypoints/pooling/score/test_offline.py b/tests/entrypoints/pooling/score/test_offline.py
index ce36d61cb847618f512ee229939f877ab7d044dc..c02c02cf234a61d035712bee5e33194ec96f522a 100644
--- a/tests/entrypoints/pooling/score/test_offline.py
+++ b/tests/entrypoints/pooling/score/test_offline.py
@@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
 
 
 @pytest.fixture(scope="module")
 def llm():
+    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
+    # that supports encoder-only models on ROCm.
+    attention_config = None
+    if current_platform.is_rocm():
+        attention_config = {"backend": "FLEX_ATTENTION"}
+
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
     llm = LLM(
@@ -30,6 +31,7 @@ def llm():
         gpu_memory_utilization=0.75,
         enforce_eager=True,
         seed=0,
+        attention_config=attention_config,
     )
 
     yield weakref.proxy(llm)
diff --git a/tests/entrypoints/pooling/score/test_online_rerank.py b/tests/entrypoints/pooling/score/test_online_rerank.py
index f262dd4cb06b666dea1b3cc1ad8eca210a820915..7f2af611d2e434460c42b5a6ad9001cdb8a27d71 100644
--- a/tests/entrypoints/pooling/score/test_online_rerank.py
+++ b/tests/entrypoints/pooling/score/test_online_rerank.py
@@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
 from vllm.entrypoints.pooling.score.protocol import RerankResponse
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
 
@@ -24,6 +19,10 @@ DTYPE = "bfloat16"
 def server():
     args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
 
+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
diff --git a/tests/entrypoints/pooling/score/test_online_score.py b/tests/entrypoints/pooling/score/test_online_score.py
index 30ef55c8b675607e4d565f4dd17cf541e84fbb14..053a836f669809f1868d32438e2b909b34a5a116 100644
--- a/tests/entrypoints/pooling/score/test_online_score.py
+++ b/tests/entrypoints/pooling/score/test_online_score.py
@@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.score.protocol import ScoreResponse
 from vllm.platforms import current_platform
 
-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODELS = [
     {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
     {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
@@ -44,6 +39,10 @@ def model(request):
 def server(model: dict[str, Any]):
     args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
 
+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
     with RemoteOpenAIServer(model["name"], args) as remote_server:
         yield remote_server
 
@@ -237,17 +236,14 @@ class TestModel:
                     "use_activation": use_activation,
                 },
             )
-            if response.status_code != 200:
-                return response
-
             outputs = response.json()
             return torch.tensor([x["score"] for x in outputs["data"]])
 
-        if model["is_cross_encoder"]:
-            default = get_outputs(use_activation=None)
-            w_activation = get_outputs(use_activation=True)
-            wo_activation = get_outputs(use_activation=False)
+        default = get_outputs(use_activation=None)
+        w_activation = get_outputs(use_activation=True)
+        wo_activation = get_outputs(use_activation=False)
 
+        if model["is_cross_encoder"]:
             assert torch.allclose(default, w_activation, atol=1e-2), (
                 "Default should use activation."
             )
@@ -257,9 +253,3 @@ class TestModel:
             assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
                 "w_activation should be close to activation(wo_activation)."
             )
-        else:
-            get_outputs(use_activation=None)
-
-            # The activation parameter only works for the is_cross_encoder model
-            response = get_outputs(use_activation=True)
-            assert response.status_code == 400
diff --git a/tests/entrypoints/pooling/score/test_utils.py b/tests/entrypoints/pooling/score/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a57e53be20a93ba654236626ad03fc0085367f1
--- /dev/null
+++ b/tests/entrypoints/pooling/score/test_utils.py
@@ -0,0 +1,351 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import patch
+
+import pytest
+
+from vllm.config import ModelConfig
+from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
+from vllm.entrypoints.score_utils import get_score_prompt
+from vllm.inputs import TokensPrompt
+from vllm.tokenizers import get_tokenizer
+
+# A cross-encoder model for testing
+CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+
+
+def assert_prompt_tokenization_consistent(
+    tokenizer, full_prompt, engine_prompt, add_special_tokens=True
+):
+    """Verify that engine_prompt token_ids match tokenizing full_prompt."""
+    expected_ids = tokenizer(full_prompt, add_special_tokens=add_special_tokens)[
+        "input_ids"
+    ]
+    actual_ids = engine_prompt["prompt_token_ids"]
+    assert actual_ids == expected_ids, (
+        f"Token IDs don't match.\nExpected: {expected_ids}\nActual:   {actual_ids}"
+    )
+
+
+@pytest.fixture(scope="module")
+def cross_encoder_model_config():
+    return ModelConfig(
+        CROSS_ENCODER_MODEL_ID,
+        runner="pooling",
+    )
+
+
+@pytest.fixture(scope="module")
+def cross_encoder_tokenizer(cross_encoder_model_config):
+    return get_tokenizer(
+        CROSS_ENCODER_MODEL_ID,
+        trust_remote_code=cross_encoder_model_config.trust_remote_code,
+    )
+
+
+@pytest.fixture(scope="module")
+def llm_reranker_model_config():
+    """Model config for LLM-as-reranker style (no pad token)."""
+    config = ModelConfig(
+        CROSS_ENCODER_MODEL_ID,
+        runner="pooling",
+    )
+    # use_sep_token is a property that reads from hf_config,
+    # so we set it there to override the default (True)
+    config.hf_config.use_sep_token = False
+    return config
+
+
+@pytest.fixture
+def tokenization_kwargs():
+    """Common tokenization kwargs used across tests."""
+    return {"add_special_tokens": True, "return_tensors": None}
+
+
+@pytest.fixture
+def mock_model_with_score_template():
+    """Mock model class that supports score template and tracks post_process calls."""
+
+    class MockModelWithScoreTemplate:
+        supports_score_template = True
+        post_process_called: list[TokensPrompt] = []
+
+        @staticmethod
+        def get_score_template(p1: str, p2: str) -> str:
+            return f"[QUERY]{p1}[SEP][DOC]{p2}"
+
+        @staticmethod
+        def post_process_tokens(prompt: TokensPrompt) -> None:
+            MockModelWithScoreTemplate.post_process_called.append(prompt)
+
+    return MockModelWithScoreTemplate
+
+
+@pytest.fixture
+def mock_model_no_score_template():
+    """Mock model class that does not support score template."""
+
+    class MockModelNoScoreTemplate:
+        supports_score_template = False
+
+    return MockModelNoScoreTemplate
+
+
+class TestGetScorePrompt:
+    """Tests for the get_score_prompt function."""
+
+    def test_tokenization_kwargs_passed_through(
+        self,
+        llm_reranker_model_config,
+        cross_encoder_tokenizer,
+    ):
+        """Test that tokenization kwargs are properly passed through."""
+        data_1 = "Query text"
+        data_2 = "Document text"
+
+        # Test with truncation - custom kwargs for this test
+        custom_tokenization_kwargs = {
+            "add_special_tokens": True,
+            "return_tensors": None,
+            "truncation": True,
+            "max_length": 20,
+        }
+
+        full_prompt, engine_prompt = get_score_prompt(
+            llm_reranker_model_config,
+            cross_encoder_tokenizer,
+            custom_tokenization_kwargs,
+            data_1,
+            data_2,
+        )
+
+        assert isinstance(full_prompt, str)
+        assert "prompt_token_ids" in engine_prompt
+        # With max_length=20 and truncation, should not exceed this
+        assert len(engine_prompt["prompt_token_ids"]) <= 20
+        # Since truncation was applied, token_ids should be a prefix of full encoding
+        full_ids = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)[
+            "input_ids"
+        ]
+        actual_ids = engine_prompt["prompt_token_ids"]
+        assert full_ids[: len(actual_ids)] == actual_ids, (
+            f"Token IDs are not a prefix of full encoding.\n"
+            f"Full IDs:   {full_ids}\n"
+            f"Actual IDs: {actual_ids}"
+        )
+
+    def test_model_supports_score_template(
+        self,
+        cross_encoder_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_with_score_template,
+    ):
+        """Test when model supports score template (no score_template arg)."""
+        with patch(
+            "vllm.model_executor.model_loader.get_model_cls",
+            return_value=mock_model_with_score_template,
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                cross_encoder_model_config,
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query text",
+                "document text",
+            )
+
+        assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
+        assert "prompt_token_ids" in engine_prompt
+        assert len(engine_prompt["prompt_token_ids"]) > 0
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
+
+    def test_model_supports_score_template_but_custom_template_provided(
+        self,
+        cross_encoder_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_with_score_template,
+    ):
+        """Test when model supports score template but custom template is provided."""
+        template = (
+            'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
+        )
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_with_score_template,
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                cross_encoder_model_config,
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query",
+                "doc",
+                score_template=template,  # Providing a template
+            )
+
+        assert "prompt_token_ids" in engine_prompt
+        assert full_prompt == "TEMPLATE_USED query doc"
+
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
+
+    def test_not_using_default_template(
+        self,
+        llm_reranker_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_no_score_template,
+    ):
+        # FIXME: For now, we only apply a template when one is explicitly provided.
+        # We cannot rely on the tokenizer's chat template because many models
+        # inherit junk templates from their base LLM, which breaks both the models
+        # and the tests that use them.
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_no_score_template,
+            ),
+            patch(
+                "vllm.entrypoints.score_utils.apply_hf_chat_template",
+                return_value="test querytest doc",
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                llm_reranker_model_config,
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "test query",
+                "test doc",
+            )
+
+        assert full_prompt == "test querytest doc"
+        assert "prompt_token_ids" in engine_prompt
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
+
+    def test_fallback_with_sep_token(
+        self,
+        cross_encoder_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_no_score_template,
+    ):
+        """Test fallback path when ChatTemplateResolutionError
+        and use_sep_token=True."""
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_no_score_template,
+            ),
+            patch(
+                "vllm.entrypoints.score_utils.apply_hf_chat_template",
+                side_effect=ChatTemplateResolutionError("No template"),
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                cross_encoder_model_config,  # use_sep_token=True
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query",
+                "document",
+            )
+
+        assert "prompt_token_ids" in engine_prompt
+        # Should have token_type_ids from text_pair encoding
+        assert "token_type_ids" in engine_prompt
+        assert "query" in full_prompt
+        assert "document" in full_prompt
+        assert full_prompt != "querydocument"
+        assert (
+            engine_prompt["prompt_token_ids"]
+            == cross_encoder_tokenizer(
+                "query", text_pair="document", add_special_tokens=True
+            )["input_ids"]
+        )
+
+        # FIXME(?): add_special_tokens=False is needed because in this case
+        # full_prompt is obtained by decoding the tokenized prompt, which includes
+        # special tokens and we would get duplicated special tokens otherwise.
+        # This is inconsistent with other cases.
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer,
+            full_prompt,
+            engine_prompt,
+            add_special_tokens=False,
+        )
+
+    def test_fallback_without_sep_token(
+        self,
+        llm_reranker_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_no_score_template,
+    ):
+        """Test fallback path when ChatTemplateResolutionError
+        and use_sep_token=False."""
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_no_score_template,
+            ),
+            patch(
+                "vllm.entrypoints.score_utils.apply_hf_chat_template",
+                side_effect=ChatTemplateResolutionError("No template"),
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                llm_reranker_model_config,  # use_sep_token=False
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query",
+                "document",
+            )
+
+        assert full_prompt == "querydocument"
+        assert "prompt_token_ids" in engine_prompt
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
+
+    def test_post_process_tokens_called(
+        self,
+        cross_encoder_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_with_score_template,
+    ):
+        """Test that post_process_tokens is called on the engine prompt."""
+        # Reset the call tracker
+        mock_model_with_score_template.post_process_called.clear()
+
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_with_score_template,
+            ),
+            patch(
+                "vllm.entrypoints.score_utils.apply_hf_chat_template",
+                side_effect=ChatTemplateResolutionError("No template"),
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                cross_encoder_model_config,
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query",
+                "doc",
+            )
+
+        # post_process_tokens should have been called once
+        assert len(mock_model_with_score_template.post_process_called) == 1
+        assert mock_model_with_score_template.post_process_called[0] is engine_prompt
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
diff --git a/tests/v1/tpu/worker/__init__.py b/tests/entrypoints/rpc/__init__.py
similarity index 100%
rename from tests/v1/tpu/worker/__init__.py
rename to tests/entrypoints/rpc/__init__.py
diff --git a/tests/entrypoints/openai/test_collective_rpc.py b/tests/entrypoints/rpc/test_collective_rpc.py
similarity index 96%
rename from tests/entrypoints/openai/test_collective_rpc.py
rename to tests/entrypoints/rpc/test_collective_rpc.py
index cbd6b02f05dce267f0d2aae10eddfb41415f087e..56d93a427315f84dc83ef3d7345a9a296792cbe5 100644
--- a/tests/entrypoints/openai/test_collective_rpc.py
+++ b/tests/entrypoints/rpc/test_collective_rpc.py
@@ -37,7 +37,7 @@ def server():
         "--max-num-seqs",
         "128",
         "--worker-extension-cls",
-        "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
+        "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension",
     ]
     with RemoteOpenAIServer(
         MODEL_NAME,
diff --git a/vllm/attention/backends/__init__.py b/tests/entrypoints/sleep/__init__.py
similarity index 100%
rename from vllm/attention/backends/__init__.py
rename to tests/entrypoints/sleep/__init__.py
diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/sleep/test_sleep.py
similarity index 98%
rename from tests/entrypoints/openai/test_sleep.py
rename to tests/entrypoints/sleep/test_sleep.py
index 0b4cc25ef49d1722d7cf551c959e7ffda6e8ded8..226a84e5a5711308be8ef56b26b66edb0195bb33 100644
--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/sleep/test_sleep.py
@@ -5,7 +5,7 @@ import os
 import requests
 from prometheus_client.parser import text_string_to_metric_families
 
-from ...utils import RemoteOpenAIServer, models_path_prefix
+from tests.utils import RemoteOpenAIServer, models_path_prefix
 
 MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
 
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 0709bdda59a8a010a7bccbae31d326a8af398378..0b1a57082bdf13ec394bb6431301e34bf875b091 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -26,9 +26,9 @@ from vllm.entrypoints.chat_utils import (
 )
 from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (
-    encode_audio_base64,
-    encode_image_base64,
-    encode_video_base64,
+    encode_audio_url,
+    encode_image_url,
+    encode_video_url,
 )
 from vllm.tokenizers import get_tokenizer
 from vllm.tokenizers.mistral import MistralTokenizer
@@ -142,22 +142,19 @@ def mistral_model_config():
 @pytest.fixture(scope="module")
 def image_url():
     image = ImageAsset("cherry_blossom")
-    base64 = encode_image_base64(image.pil_image)
-    return f"data:image/jpeg;base64,{base64}"
+    return encode_image_url(image.pil_image)
 
 
 @pytest.fixture(scope="module")
 def video_url():
     video = VideoAsset("baby_reading", 1)
-    base64 = encode_video_base64(video.np_ndarrays)
-    return f"data:video/jpeg;base64,{base64}"
+    return encode_video_url(video.np_ndarrays)
 
 
 @pytest.fixture(scope="module")
 def audio_url():
     audio = AudioAsset("mary_had_lamb")
-    base64 = encode_audio_base64(*audio.audio_and_sample_rate)
-    return f"data:audio/ogg;base64,{base64}"
+    return encode_audio_url(*audio.audio_and_sample_rate)
 
 
 def _assert_mm_data_is_image_input(
diff --git a/tests/entrypoints/test_grpc_server.py b/tests/entrypoints/test_grpc_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4e3a38602e3bede77296d6636ab7c39992a176b
--- /dev/null
+++ b/tests/entrypoints/test_grpc_server.py
@@ -0,0 +1,428 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+End-to-end tests for the vLLM gRPC server.
+"""
+
+import asyncio
+import socket
+import subprocess
+import sys
+import time
+
+import grpc
+import pytest
+import pytest_asyncio
+
+from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc
+
+# Use a small model for fast testing
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
+
+
+def find_free_port() -> int:
+    """Find a free port on localhost."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        s.listen(1)
+        port = s.getsockname()[1]
+    return port
+
+
+async def wait_for_server(port: int, timeout: float = 60.0) -> bool:
+    """Wait for the gRPC server to be ready by trying health checks."""
+    start_time = time.time()
+    print("waiting for server to start...")
+    while time.time() - start_time < timeout:
+        try:
+            channel = grpc.aio.insecure_channel(f"localhost:{port}")
+            stub = vllm_engine_pb2_grpc.VllmEngineStub(channel)
+            request = vllm_engine_pb2.HealthCheckRequest()
+            response = await stub.HealthCheck(request, timeout=5.0)
+            await channel.close()
+            if response.healthy:
+                print("server returned healthy=True")
+                return True
+        except Exception:
+            await asyncio.sleep(0.5)
+    return False
+
+
+class GrpcServerProcess:
+    """Manages a gRPC server running in a subprocess."""
+
+    def __init__(self):
+        self.process: subprocess.Popen | None = None
+        self.port: int | None = None
+
+    async def start(self):
+        """Start the gRPC server process."""
+        self.port = find_free_port()
+
+        # Start the server as a subprocess
+        self.process = subprocess.Popen(
+            [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.grpc_server",
+                "--model",
+                MODEL_NAME,
+                "--host",
+                "localhost",
+                "--port",
+                str(self.port),
+                "--max-num-batched-tokens",
+                "512",
+                "--disable-log-stats-server",
+            ],
+        )
+
+        # Wait for server to be ready
+        if not await wait_for_server(self.port):
+            self.stop()
+            raise RuntimeError("gRPC server failed to start within timeout")
+
+    def stop(self):
+        """Stop the gRPC server process."""
+        if self.process:
+            self.process.terminate()
+            try:
+                self.process.wait(timeout=10)
+            except subprocess.TimeoutExpired:
+                self.process.kill()
+                self.process.wait()
+
+
+@pytest_asyncio.fixture(scope="module")
+async def grpc_server():
+    """Fixture providing a running gRPC server in a subprocess."""
+    server = GrpcServerProcess()
+    await server.start()
+
+    yield server
+
+    server.stop()
+
+
+@pytest_asyncio.fixture
+async def grpc_client(grpc_server):
+    """Fixture providing a gRPC client connected to the server."""
+    channel = grpc.aio.insecure_channel(f"localhost:{grpc_server.port}")
+    stub = vllm_engine_pb2_grpc.VllmEngineStub(channel)
+
+    yield stub
+
+    await channel.close()
+
+
+@pytest.mark.asyncio
+async def test_health_check(grpc_client):
+    """Test the HealthCheck RPC."""
+    request = vllm_engine_pb2.HealthCheckRequest()
+    response = await grpc_client.HealthCheck(request)
+
+    assert response.healthy is True
+    assert response.message == "Health"
+
+
+@pytest.mark.asyncio
+async def test_get_model_info(grpc_client):
+    """Test the GetModelInfo RPC."""
+    request = vllm_engine_pb2.GetModelInfoRequest()
+    response = await grpc_client.GetModelInfo(request)
+
+    assert response.model_path == MODEL_NAME
+    assert response.is_generation is True
+    assert response.max_context_length > 0
+    assert response.vocab_size > 0
+    assert response.supports_vision is False
+
+
+@pytest.mark.asyncio
+async def test_get_server_info(grpc_client):
+    """Test the GetServerInfo RPC."""
+    request = vllm_engine_pb2.GetServerInfoRequest()
+    response = await grpc_client.GetServerInfo(request)
+
+    assert response.active_requests >= 0
+    assert response.is_paused is False
+    assert response.uptime_seconds >= 0
+    assert response.server_type == "vllm-grpc"
+    assert response.last_receive_timestamp > 0
+
+
+@pytest.mark.asyncio
+async def test_generate_non_streaming(grpc_client):
+    """Test the Generate RPC in non-streaming mode."""
+    # Create a simple request
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-non-streaming-1",
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello, my name is",
+            input_ids=[15496, 11, 616, 1438, 318],  # GPT-2 tokens for the prompt
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.0,
+            max_tokens=10,
+            n=1,
+        ),
+        stream=False,
+    )
+
+    # Collect all responses
+    responses = []
+    async for response in grpc_client.Generate(request):
+        responses.append(response)
+
+    # Should have exactly one response (complete)
+    assert len(responses) == 1
+
+    # Check the response
+    final_response = responses[0]
+    assert final_response.HasField("complete")
+
+    complete = final_response.complete
+    assert len(complete.output_ids) > 0
+    assert complete.finish_reason in ["stop", "length"]
+    assert complete.prompt_tokens > 0
+    assert complete.completion_tokens > 0
+
+
+@pytest.mark.asyncio
+async def test_generate_streaming(grpc_client):
+    """Test the Generate RPC in streaming mode."""
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-streaming-1",
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="The capital of France is",
+            input_ids=[464, 3139, 286, 4881, 318],  # GPT-2 tokens
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.0, max_tokens=10, n=1
+        ),
+        stream=True,
+    )
+
+    # Collect all responses
+    chunks = []
+    complete_response = None
+
+    async for response in grpc_client.Generate(request):
+        if response.HasField("chunk"):
+            chunks.append(response.chunk)
+        elif response.HasField("complete"):
+            complete_response = response.complete
+
+    # Should have received some chunks
+    assert len(chunks) >= 0  # May have 0 chunks if generation is very fast
+
+    # Should have a final complete response
+    assert complete_response is not None
+    assert complete_response.finish_reason in ["stop", "length"]
+    assert complete_response.prompt_tokens > 0
+
+    # Verify chunk structure
+    for chunk in chunks:
+        assert chunk.prompt_tokens > 0
+        assert chunk.completion_tokens >= 0
+
+
+@pytest.mark.asyncio
+async def test_generate_with_different_sampling_params(grpc_client):
+    """Test Generate with various sampling parameters."""
+    # Test with temperature
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-sampling-temp",
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello",
+            input_ids=[15496],
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.8, top_p=0.95, max_tokens=5
+        ),
+        stream=False,
+    )
+
+    responses = [r async for r in grpc_client.Generate(request)]
+    assert len(responses) == 1
+    assert responses[0].HasField("complete")
+
+    # Test with top_k
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-sampling-topk",
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello",
+            input_ids=[15496],
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=1.0, top_k=50, max_tokens=5
+        ),
+        stream=False,
+    )
+
+    responses = [r async for r in grpc_client.Generate(request)]
+    assert len(responses) == 1
+    assert responses[0].HasField("complete")
+
+
+@pytest.mark.asyncio
+async def test_generate_with_stop_strings(grpc_client):
+    """Test Generate with stop strings."""
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-stop-strings",
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello",
+            input_ids=[15496],
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.0,
+            max_tokens=20,
+            stop=["\n", "END"],
+        ),
+        stream=False,
+    )
+
+    responses = [r async for r in grpc_client.Generate(request)]
+    assert len(responses) == 1
+    assert responses[0].HasField("complete")
+
+    complete = responses[0].complete
+    assert complete.finish_reason in ["stop", "length"]
+
+
+@pytest.mark.asyncio
+async def test_generate_multiple_requests(grpc_client):
+    """Test handling multiple concurrent Generate requests."""
+
+    async def make_request(request_id: str):
+        request = vllm_engine_pb2.GenerateRequest(
+            request_id=request_id,
+            tokenized=vllm_engine_pb2.TokenizedInput(
+                original_text="Hello",
+                input_ids=[15496],
+            ),
+            sampling_params=vllm_engine_pb2.SamplingParams(
+                temperature=0.0, max_tokens=5
+            ),
+            stream=False,
+        )
+
+        responses = [r async for r in grpc_client.Generate(request)]
+        return responses[0]
+
+    # Send multiple requests concurrently
+    tasks = [make_request(f"test-concurrent-{i}") for i in range(3)]
+    responses = await asyncio.gather(*tasks)
+
+    # Verify all requests completed successfully
+    assert len(responses) == 3
+    for i, response in enumerate(responses):
+        assert response.HasField("complete")
+
+
+@pytest.mark.asyncio
+async def test_generate_with_seed(grpc_client):
+    """Test Generate with a fixed seed for reproducibility."""
+
+    def make_request(request_id: str, seed: int):
+        return vllm_engine_pb2.GenerateRequest(
+            request_id=request_id,
+            tokenized=vllm_engine_pb2.TokenizedInput(
+                original_text="The future of AI is",
+                input_ids=[464, 2003, 286, 9552, 318],
+            ),
+            sampling_params=vllm_engine_pb2.SamplingParams(
+                temperature=1.0, max_tokens=10, seed=seed
+            ),
+            stream=False,
+        )
+
+    # Make two requests with the same seed
+    request1 = make_request("test-seed-1", 42)
+    request2 = make_request("test-seed-2", 42)
+
+    response_list1 = [r async for r in grpc_client.Generate(request1)]
+    response_list2 = [r async for r in grpc_client.Generate(request2)]
+
+    # Both should complete successfully
+    assert len(response_list1) == 1
+    assert len(response_list2) == 1
+    assert response_list1[0].HasField("complete")
+    assert response_list2[0].HasField("complete")
+
+    # With the same seed, outputs should be identical
+    output_ids1 = list(response_list1[0].complete.output_ids)
+    output_ids2 = list(response_list2[0].complete.output_ids)
+    assert output_ids1 == output_ids2
+
+
+@pytest.mark.asyncio
+async def test_generate_error_handling(grpc_client):
+    """Test error handling in Generate RPC."""
+    # Request with invalid top_p value (-33)
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-error-invalid-topp",
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.0, max_tokens=10, top_p=-33
+        ),
+        stream=False,
+    )
+
+    # Should raise an error response
+    with pytest.raises(grpc.RpcError) as exc_info:
+        _ = [r async for r in grpc_client.Generate(request)]
+
+    assert exc_info.value.code() == grpc.StatusCode.INVALID_ARGUMENT
+    assert "top_p must be in (0, 1], got -33.0" in exc_info.value.details()
+
+
+@pytest.mark.asyncio
+async def test_abort_request(grpc_client):
+    """Test the out-of-band Abort RPC."""
+    request_id = "test-abort-1"
+
+    # Start a long-running streaming generate request
+    generate_request = vllm_engine_pb2.GenerateRequest(
+        request_id=request_id,
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello",
+            input_ids=[15496],
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.0,
+            min_tokens=500,
+            max_tokens=500,  # Request many tokens to ensure it runs long enough
+        ),
+        stream=True,
+    )
+
+    # Track whether we were aborted
+    was_aborted = False
+    received_chunks = 0
+
+    async def run_generate():
+        nonlocal was_aborted, received_chunks
+        async for response in grpc_client.Generate(generate_request):
+            if response.HasField("chunk"):
+                received_chunks += 1
+
+            if response.HasField("complete"):
+                complete = response.complete
+                was_aborted = complete.finish_reason == "abort"
+            else:
+                was_aborted = False
+
+    async def abort_after_delay():
+        # Small delay to ensure generate has started
+        await asyncio.sleep(0.1)
+        abort_request = vllm_engine_pb2.AbortRequest(request_ids=[request_id])
+        await grpc_client.Abort(abort_request)
+
+    # Run generate and abort concurrently
+    await asyncio.gather(run_generate(), abort_after_delay())
+
+    # The request should have been aborted (received final chunk with
+    # "abort" finish reason) and finished early due to the abort.
+    assert was_aborted and received_chunks < 500, (
+        "Request should have been aborted before generating all 500 tokens"
+    )
diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py
index a522967111307b0f5422740ee2860b44b568c95d..53e4f4606ef6bddcd7b2cef4a4c62f2aeeba4ae7 100644
--- a/tests/entrypoints/test_responses_utils.py
+++ b/tests/entrypoints/test_responses_utils.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+from openai.types.chat import ChatCompletionMessageParam
 from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
@@ -14,8 +15,10 @@ from openai.types.responses.response_reasoning_item import (
     Summary,
 )
 
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.responses_utils import (
     _construct_single_message_from_response_item,
+    _maybe_combine_reasoning_and_tool_call,
     construct_chat_messages_with_tool_call,
     convert_tool_responses_to_completions_format,
 )
@@ -160,3 +163,118 @@ class TestResponsesUtils:
         formatted_item = _construct_single_message_from_response_item(output_item)
         assert formatted_item["role"] == "assistant"
         assert formatted_item["content"] == "dongyi"
+
+
+class TestMaybeCombineReasoningAndToolCall:
+    """Tests for _maybe_combine_reasoning_and_tool_call function."""
+
+    def test_returns_none_when_item_id_is_none(self):
+        """
+        Test fix from PR #31999: when item.id is None, should return None
+        instead of raising TypeError on startswith().
+        """
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id=None,  # This was causing TypeError before the fix
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages: list[ChatCompletionMessageParam] = []
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_returns_none_when_id_does_not_start_with_mcp_prefix(self):
+        """Test that non-MCP tool calls are not combined."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id="regular_id",  # Does not start with MCP_PREFIX
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages = [{"role": "assistant", "reasoning": "some reasoning"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_returns_none_when_last_message_is_not_assistant(self):
+        """Test that non-assistant last message returns None."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id=f"{MCP_PREFIX}tool_id",
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages = [{"role": "user", "content": "hello"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_returns_none_when_last_message_has_no_reasoning(self):
+        """Test that assistant message without reasoning returns None."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id=f"{MCP_PREFIX}tool_id",
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages = [{"role": "assistant", "content": "some content"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_combines_reasoning_and_mcp_tool_call(self):
+        """Test successful combination of reasoning message and MCP tool call."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id=f"{MCP_PREFIX}tool_id",
+            call_id="call_123",
+            name="test_function",
+            arguments='{"arg": "value"}',
+        )
+        messages = [{"role": "assistant", "reasoning": "I need to call this tool"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is not None
+        assert result["role"] == "assistant"
+        assert result["reasoning"] == "I need to call this tool"
+        assert "tool_calls" in result
+        assert len(result["tool_calls"]) == 1
+        assert result["tool_calls"][0]["id"] == "call_123"
+        assert result["tool_calls"][0]["function"]["name"] == "test_function"
+        assert result["tool_calls"][0]["function"]["arguments"] == '{"arg": "value"}'
+        assert result["tool_calls"][0]["type"] == "function"
+
+    def test_returns_none_for_non_function_tool_call_type(self):
+        """Test that non-ResponseFunctionToolCall items return None."""
+        # Pass a dict instead of ResponseFunctionToolCall
+        item = {"type": "message", "content": "hello"}
+        messages = [{"role": "assistant", "reasoning": "some reasoning"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_returns_none_when_id_is_empty_string(self):
+        """Test that empty string id returns None (falsy check)."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id="",  # Empty string is falsy
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages = [{"role": "assistant", "reasoning": "some reasoning"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc1101840645bc8ac9122ec1d840994cfcbc818a
--- /dev/null
+++ b/tests/entrypoints/test_utils.py
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.entrypoints.utils import sanitize_message
+
+
+def test_sanitize_message():
+    assert (
+        sanitize_message("<_io.BytesIO object at 0x7a95e299e750>")
+        == "<_io.BytesIO object>"
+    )
diff --git a/tests/evals/gsm8k/README.md b/tests/evals/gsm8k/README.md
index 29c5199e1e87a05e574f86cb2f0530815b97c06f..dcbfd85bfeee811854132e8d42d6688d9c4cc79c 100644
--- a/tests/evals/gsm8k/README.md
+++ b/tests/evals/gsm8k/README.md
@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
 ### Run tests with pytest (like buildkite)
 
 ```bash
-pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+    --config-list-file=configs/models-small.txt
 ```
 
 ### Run standalone evaluation script
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
 accuracy_threshold: 0.54  # Minimum expected accuracy
 num_questions: 1319       # Number of questions (default: full test set)
 num_fewshot: 5            # Few-shot examples from train set
-max_model_len: 4096       # Model context length
+server_args: "--max-model-len 4096 --tensor-parallel-size 2"  # Server arguments
+env:                      # Environment variables (optional)
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
 ```
+
+The `server_args` field accepts any arguments that can be passed to `vllm serve`.
+
+The `env` field accepts a dictionary of environment variables to set for the server process.
diff --git a/tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml b/tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f351a17220644c9bebb1966e73f78121cda2f8de
--- /dev/null
+++ b/tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml
@@ -0,0 +1,11 @@
+model_name: "deepseek-ai/DeepSeek-R1"
+accuracy_threshold: 0.95
+num_questions: 1319
+num_fewshot: 5
+startup_max_wait_seconds: 1200
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --data-parallel-size 8
+  --enable-expert-parallel
+  --speculative-config '{"method":"mtp","num_speculative_tokens":1}'
diff --git a/tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml b/tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ba3463463b5ee1fe520bf9c4d29db999aab6ae04
--- /dev/null
+++ b/tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml
@@ -0,0 +1,11 @@
+model_name: "deepseek-ai/DeepSeek-R1"
+accuracy_threshold: 0.95
+num_questions: 1319
+num_fewshot: 5
+startup_max_wait_seconds: 1200
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 8
+  --enable-expert-parallel
+  --speculative-config '{"method":"mtp","num_speculative_tokens":1}'
diff --git a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
index 7ec6a1e0be27f8e20dd95b89ee1cac25e3789599..72fa7e8a38c73ba95075177f5e9c68f31c337e33 100644
--- a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
+++ b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 accuracy_threshold: 0.72
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
-
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml b/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
index caa0448f23d48a0544ea5ceac008850bb555e92d..b7b59e9dcd5ce5ce427f537eec798419444c0c7b 100644
--- a/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+++ b/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 accuracy_threshold: 0.74
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml b/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
index 615aa69a2d2b6aed88a3e328db644de627a07681..8b3c9ff645e8711a51089ae67d2ad7ef64d47be9 100644
--- a/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+++ b/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
 accuracy_threshold: 0.31
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
index 9297bf6ddf2d3ce107ccf1fe70780fb1093ba566..4a1b1948acac8a150cecd86b08a8cc671d5a5e7a 100644
--- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 accuracy_threshold: 0.45
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
index 5319ada30f645020a462bfd72e4c485ab823be3f..5ce3af8be346a3d4c4420df83201bceec9b25a6e 100644
--- a/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 accuracy_threshold: 0.60
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml b/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
index c39fb979d98ac0b1d7a3f94b7bea0c8cfc29cdd4..5452ebe753f0404acd30aedacf624150d5f8b106 100644
--- a/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
 accuracy_threshold: 0.375
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
index 6b7bdd1e65bb31599a7f3784d3addce25ecc487e..f162aa8bfe5b0965138b35a57548b0844063606f 100644
--- a/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
 accuracy_threshold: 0.89
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
-
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml b/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..673b473f817eb81d63317a05427a9aea93292584
--- /dev/null
+++ b/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
@@ -0,0 +1,12 @@
+model_name: "nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
+accuracy_threshold: 0.75
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9fae32734d75327da32ac7ee9c132efef6654d96
--- /dev/null
+++ b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
@@ -0,0 +1,11 @@
+model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --async-scheduling
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
diff --git a/tests/evals/gsm8k/configs/models-blackwell.txt b/tests/evals/gsm8k/configs/models-blackwell.txt
index 3c9b1084de7bc288f6bf7ac2c99bf97a16ae8c01..c27031d25fb8cf17744e2ff4c99682c2c383c09d 100644
--- a/tests/evals/gsm8k/configs/models-blackwell.txt
+++ b/tests/evals/gsm8k/configs/models-blackwell.txt
@@ -3,3 +3,5 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-CT.yaml
 DeepSeek-V2-Lite-Instruct-FP8.yaml
 Qwen3-30B-A3B-NVFP4.yaml
+Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
+Qwen3-Next-FP8-EP2.yaml
diff --git a/tests/evals/gsm8k/configs/models-h200.txt b/tests/evals/gsm8k/configs/models-h200.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ec936533bb667e536f1c6737d786faf012f96ef0
--- /dev/null
+++ b/tests/evals/gsm8k/configs/models-h200.txt
@@ -0,0 +1,2 @@
+DeepSeek-R1-TP.yaml
+DeepSeek-R1-DP.yaml
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9d62c542a0851763c61dcc30f190c8d6579522ad
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
@@ -0,0 +1,5 @@
+model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..276d63f4ee1066211507f2d7420d97740f77693e
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
@@ -0,0 +1,8 @@
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_high_throughput"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..54e6ab7b35f604d4265b577cd4814b59c90ee5cf
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
@@ -0,0 +1,9 @@
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency --disable-uvicorn-access-log"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
+  VLLM_USE_DEEP_GEMM_E8M0: "0"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..eee58539c4a827f6f8a66b22f0c89e6170a345e4
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
@@ -0,0 +1,8 @@
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2083df585f4d50a2152d037396315b8203cf2fc8
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_high_throughput"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1d4cbfe96b68d0006e37b41345a11c52479537e2
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
@@ -0,0 +1,9 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency --disable-uvicorn-access-log"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
+  VLLM_USE_DEEP_GEMM_E8M0: "0"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..246549d629612ad6bcddbbda87b320fea766ff8a
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1328fdedf0c40d8d4f80b9e42b931fcbcd49c470
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "masked_gemm"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..53fd62bac83925e4ae8c11ed8d8167cfa4369121
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..87fac0e708c58594c903794810407ccdf793891f
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "masked_gemm"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..44f8700e4b4655e99a60611088cd6528790e6991
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
new file mode 100644
index 0000000000000000000000000000000000000000..53e2fa8a7dd1c459bfed603d68f67fffbe3cd8f8
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
@@ -0,0 +1,10 @@
+Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
+Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml
+Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
+Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
+Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
+Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
+Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
+Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
+Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
+Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-CT-vllm-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-CT-vllm-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bf8c93921f41978cb761525afe398bffdb6b7362
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-CT-vllm-cutlass.yaml
@@ -0,0 +1,5 @@
+model_name: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4c9a01274d9900159839eea7184860645231c025
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..17f067215eb50eeab5341be311b1a706656ac51d
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "latency"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-marlin.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..be8192f2a89ae6735e088ae7791ff3c90a35e175
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-marlin.yaml
@@ -0,0 +1,7 @@
+model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_TEST_FORCE_FP8_MARLIN: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..80e279edc97100f80c5edaa01a02adb299c66c36
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
@@ -0,0 +1,5 @@
+model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-fi-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b9c6a1997dc34d1584a1e8aef10b51aaec8ec47a
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-fi-cutlass.yaml
@@ -0,0 +1,9 @@
+# TODO(rob): enable
+# model_name: "amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV"
+# accuracy_threshold: 0.62
+# num_questions: 1319
+# num_fewshot: 5
+# server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+# env:
+#   VLLM_USE_FLASHINFER_MOE_FP8: "1"
+#   VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-triton.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f730e2e2fb1a41f31cccc4a11942fa52ca47460a
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-triton.yaml
@@ -0,0 +1,5 @@
+model_name: "amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV"
+accuracy_threshold: 0.62
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b6cff0abc9d371828829c25105b06cce8192e540
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
@@ -0,0 +1,8 @@
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..080c8d338e58e9ede987c42d3d939a9322a568ac
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml
@@ -0,0 +1,10 @@
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "0"
+  VLLM_USE_DEEP_GEMM_MOE: "0"
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a656cc7c37f1b113e1d668c68f7c4991a636fa70
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml
@@ -0,0 +1,10 @@
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "0"
+  VLLM_USE_DEEP_GEMM_MOE: "0"
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "latency"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f2273bf2c96c452aa942c85ec633ac2d070970fc
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml
@@ -0,0 +1,9 @@
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "0"
+  VLLM_USE_DEEP_GEMM_MOE: "0"
+  VLLM_TEST_FORCE_FP8_MARLIN: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ed61e9b89978bf9c0e2245e1baf4f99f767ca945
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml
@@ -0,0 +1,8 @@
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "0"
+  VLLM_USE_DEEP_GEMM_MOE: "0"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f7ddd30342b3147699dd48563bd7fb76ba341a52
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..db18dd01bb23d966e5589aecc443a27c729d137f
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml
@@ -0,0 +1,10 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "0"
+  VLLM_USE_DEEP_GEMM_MOE: "0"
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3d82d2e22c1a8fafbbc6a4c73771124ad0953966
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml
@@ -0,0 +1,9 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "0"
+  VLLM_USE_DEEP_GEMM_MOE: "0"
+  VLLM_TEST_FORCE_FP8_MARLIN: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-triton.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5621217de83ad28471c030c15eb802016ed25e2f
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-triton.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_DEEP_GEMM: "0"
+  VLLM_USE_DEEP_GEMM_MOE: "0"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8ed6410c36b5e93cfe884f3c0cc9c98b759a175f
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml
@@ -0,0 +1,7 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-dynamic"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_TEST_FORCE_FP8_MARLIN: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d6adbfc5fba0a78e39d109b37668f9185221a712
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml
@@ -0,0 +1,5 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-dynamic"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass-dp-ep.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass-dp-ep.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..53fd62bac83925e4ae8c11ed8d8167cfa4369121
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass-dp-ep.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6edacc32975cf58c4a1959e0383ca31a104e2a2a
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8e0b155fa70dbec600ec497ff1dc1430c2f1d622
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml
@@ -0,0 +1,8 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "latency"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-marlin.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8199e65634955defcc3d93afe1543190266d8ec0
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-marlin.yaml
@@ -0,0 +1,7 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_TEST_FORCE_FP8_MARLIN: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b1ccadeddbba395fc8a47e2c73fd37e452f4b676
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml
@@ -0,0 +1,5 @@
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..44f8700e4b4655e99a60611088cd6528790e6991
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..09e76e21ab4302c143f1a2965e65149a593073c1
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a98afafbcde9d0511ef227cbcd1ca5b23ad8e92d
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "latency"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4156cec897610ba709ebc02928c142173bf4c092
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml
@@ -0,0 +1,7 @@
+model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_TEST_FORCE_FP8_MARLIN: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..49a1589fcfea732835ca8b693d16edba66c694ee
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml
@@ -0,0 +1,5 @@
+model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9d86e432e84f7862e868980ce91f3c94fefe8bd2
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
@@ -0,0 +1,13 @@
+Llama-4-Scout-Fp8-CT-vllm-cutlass.yaml
+Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml
+Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml  
+Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml
+Qwen3-30B-A3B-NvFp4-CT-marlin.yaml
+Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml
+Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
+Qwen3-30B-A3B-NvFp4-CT-fi-cutlass-dp-ep.yaml
+Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml
+Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml
+Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
+Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
+Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml
diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2c25ea2c2caaf089f3aedcb68697e1245ea8258e
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt
@@ -0,0 +1,13 @@
+Mixtral-8x7B-Fp8-AutoFp8-triton.yaml
+Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
+Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml
+Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml
+Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml
+Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
+Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml
+Qwen3-30B-A3B-Fp8-CT-Block-triton.yaml
+Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml
+Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml
+Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml
+Llama-4-Scout-Fp8-ModelOpt-marlin.yaml
+Llama-4-Scout-Fp8-ModelOpt-triton.yaml
diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-test.txt b/tests/evals/gsm8k/configs/moe-refactor/config-test.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1816666bec0a20dc297bf7509c28ae9ec43cb301
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/config-test.txt
@@ -0,0 +1 @@
+Qwen3-30B-A3B-NvFp4-CT-marlin.yaml
\ No newline at end of file
diff --git a/tests/evals/gsm8k/conftest.py b/tests/evals/gsm8k/conftest.py
index 1932a13cdfc63bbe57c44e9067974e5664130bea..6f25fe6414af4e76b112c417e327a6135fbaf40e 100644
--- a/tests/evals/gsm8k/conftest.py
+++ b/tests/evals/gsm8k/conftest.py
@@ -11,14 +11,12 @@ def pytest_addoption(parser):
         default="configs/models-small.txt",
         help="File containing list of config files to test",
     )
-    parser.addoption("--tp-size", default=1, type=int, help="Tensor parallel size")
 
 
 def pytest_generate_tests(metafunc):
     """Generate test parameters from config files."""
     if "config_filename" in metafunc.fixturenames:
         config_list_file = metafunc.config.getoption("--config-list-file")
-        tp_size = metafunc.config.getoption("--tp-size")
 
         # Handle both relative and absolute paths
         config_list_path = Path(config_list_file)
@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
         # Generate test parameters
         if config_files:
             metafunc.parametrize(
-                ["config_filename", "tp_size"],
-                [(config_file, int(tp_size)) for config_file in config_files],
-                ids=[f"{config_file.stem}-tp{tp_size}" for config_file in config_files],
+                "config_filename",
+                config_files,
+                ids=[config_file.stem for config_file in config_files],
             )
         else:
             print("No config files found, test will be skipped")
diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py
index b5d67df7bf3db48f40e1a96bd938dd23f3a19fba..6b2cb02e9401c691551937e57b5bfb6c5ca0a37f 100644
--- a/tests/evals/gsm8k/test_gsm8k_correctness.py
+++ b/tests/evals/gsm8k/test_gsm8k_correctness.py
@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
 Replacement for lm-eval-harness with better performance and control.
 
 Usage:
-pytest -s -v test_gsm8k_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+    --config-list-file=configs/models-small.txt
 """
 
+import shlex
+
 import yaml
 
 from tests.utils import RemoteOpenAIServer
 
 from .gsm8k_eval import evaluate_gsm8k
 
-RTOL = 0.08  # Relative tolerance for accuracy comparison
+TOL = 0.08  # Absolute tolerance for accuracy comparison
 
 
-def launch_gsm8k_eval(eval_config, server_url, tp_size):
-    """Launch GSM8K evaluation using our isolated script."""
+def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
+    """Run GSM8K evaluation using our isolated script."""
     # Extract host and port from server URL
     if "://" in server_url:
         server_url = server_url.split("://")[1]
 
     host_port = server_url.split("/")[0]  # Remove path if present
     if ":" in host_port:
-        host, port = host_port.split(":")
-        port = int(port)
+        host, p = host_port.split(":")
+        port = int(p)
     else:
         host = host_port
         port = 8000
@@ -48,46 +49,59 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
     return results
 
 
-def test_gsm8k_correctness_param(config_filename, tp_size):
+def test_gsm8k_correctness(config_filename):
     """Test GSM8K correctness for a given model configuration."""
     eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
 
-    # Server arguments
-    server_args = [
-        "--max-model-len",
-        str(eval_config.get("max_model_len", 4096)),
-        "--enforce-eager",
-        "--trust-remote-code",
-        "--tensor-parallel-size",
-        str(tp_size),
-    ]
+    # Parse server arguments from config (use shlex to handle quoted strings)
+    server_args_str = eval_config.get("server_args", "")
+    server_args = shlex.split(server_args_str) if server_args_str else []
+
+    # Add standard server arguments
+    server_args.extend(
+        [
+            "--trust-remote-code",
+            "--disable-uvicorn-access-log",
+        ]
+    )
 
     env_dict = eval_config.get("env", None)
 
+    print(f"Starting GSM8K evaluation for model: {eval_config['model_name']}")
+    print(f"Expected metric threshold: {eval_config['accuracy_threshold']}")
+    print(f"Number of questions: {eval_config['num_questions']}")
+    print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
+    print(f"Server args: {' '.join(server_args)}")
+    print(f"Environment variables: {env_dict}")
+
     # Launch server and run evaluation
     with RemoteOpenAIServer(
-        eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480
+        eval_config["model_name"],
+        server_args,
+        env_dict=env_dict,
+        max_wait_seconds=eval_config.get("startup_max_wait_seconds", 600),
     ) as remote_server:
         server_url = remote_server.url_for("v1")
+        print(f"Server started at: {server_url}")
 
-        results = launch_gsm8k_eval(eval_config, server_url, tp_size)
+        results = run_gsm8k_eval(eval_config, server_url)
 
-        # Check accuracy against threshold
-        measured_accuracy = results["accuracy"]
-        expected_accuracy = eval_config["accuracy_threshold"]
+        measured_metric = results["accuracy"]
+        expected_metric = eval_config["accuracy_threshold"]
 
         print(f"GSM8K Results for {eval_config['model_name']}:")
-        print(f"  Accuracy: {measured_accuracy:.3f}")
-        print(f"  Expected: {expected_accuracy:.3f}")
+        print(f"  Measured metric: {measured_metric:.4f}")
+        print(f"  Expected metric: {expected_metric:.4f}")
+        print(f"  Tolerance: {TOL:.4f}")
         print(f"  Questions: {results['num_questions']}")
         print(f"  Invalid rate: {results['invalid_rate']:.3f}")
         print(f"  Latency: {results['latency']:.1f}s")
         print(f"  QPS: {results['questions_per_second']:.1f}")
 
-        # Verify accuracy is within tolerance
-        assert measured_accuracy >= expected_accuracy - RTOL, (
-            f"Accuracy too low: {measured_accuracy:.3f} < "
-            f"{expected_accuracy:.3f} - {RTOL:.3f}"
+        # Verify metric is within tolerance
+        assert measured_metric >= expected_metric - TOL, (
+            f"GSM8K metric too low: {measured_metric:.4f} < "
+            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
         )
 
         print(f"✅ GSM8K test passed for {eval_config['model_name']}")
diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py
index 8f58c470d217a59a79ada76e810d00375400452a..cf24630c509f366432caafe1e8e3038874b38532 100644
--- a/tests/kernels/attention/test_aiter_flash_attn.py
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@@ -6,8 +6,9 @@ import pytest
 import torch
 
 import vllm.v1.attention.backends.rocm_aiter_fa  # noqa: F401
-from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
 
 NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
@@ -104,7 +105,7 @@ def test_varlen_with_paged_kv(
     if not is_flash_attn_varlen_func_available():
         pytest.skip("flash_attn_varlen_func required to run this test.")
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
     kv_lens = [x[1] for x in seq_lens]
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index 573d68baf7a7139ed3f9e17b857836a270f11b4d..aecf890a0395ec341bad49ce4ab2738cce13acf4 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -9,9 +9,11 @@ import torch
 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
-from vllm.attention.layer import Attention, MultiHeadAttention
+from vllm.attention.layer import Attention
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
+from vllm.utils.torch_utils import set_random_seed
 
 if current_platform.is_rocm():
     from flash_attn import vllm_flash_attn_with_kvcache
@@ -31,7 +33,7 @@ NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
 NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 
 # This should be sync with get_supported_head_sizes() in
-# vllm.attention.ops.paged_attn.PagedAttention
+# vllm.v1.attention.ops.paged_attn.PagedAttention
 HEAD_SIZES = [32, 80, 128, 256]
 
 BLOCK_SIZES = [16, 32]
@@ -152,7 +154,7 @@ def test_paged_attention(
 
     global PARTITION_SIZE
 
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     scale = float(1.0 / (head_size**0.5))
     num_query_heads, num_kv_heads = num_heads
@@ -445,7 +447,7 @@ def ref_multi_query_kv_attention(
     return torch.cat(ref_outputs, dim=0)
 
 
-@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
+@pytest.mark.parametrize("attention_cls", [Attention, MMEncoderAttention])
 def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
     head_size = 64
     scale = float(1.0 / (head_size**0.5))
diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py
index 4d7001f1c4ec0d66062be40981de537a5c92a12f..7fdf62cc62a500d156215eb533973225dc86b73a 100644
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@@ -9,6 +9,7 @@ import torch
 from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")]
 DTYPES = [torch.bfloat16, torch.float]
@@ -41,93 +42,6 @@ KV_CACHE_DTYPE = ["auto"]
 RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
 
 
-@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
-@pytest.mark.parametrize("num_layers", NUM_LAYERS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
-@torch.inference_mode()
-def test_copy_blocks(
-    kv_cache_factory,
-    num_mappings: int,
-    num_layers: int,
-    num_heads: int,
-    head_size: int,
-    block_size: int,
-    num_blocks: int,
-    dtype: torch.dtype,
-    seed: int,
-    kv_cache_dtype: str,
-    device: str,
-) -> None:
-    if kv_cache_dtype == "fp8" and head_size % 16:
-        pytest.skip()
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    torch.cuda.set_device(device)
-    # Generate random block mappings where each source block is mapped to two
-    # destination blocks.
-    assert 2 * num_mappings <= num_blocks
-    src_blocks = random.sample(range(num_blocks), num_mappings)
-    remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
-    dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
-    block_mapping: list[tuple[int, int]] = []
-    for i in range(num_mappings):
-        src = src_blocks[i]
-        dst1 = dst_blocks[2 * i]
-        dst2 = dst_blocks[2 * i + 1]
-        block_mapping.append((src, dst1))
-        block_mapping.append((src, dst2))
-
-    # Create the KV caches.
-    key_caches, value_caches = kv_cache_factory(
-        num_blocks,
-        block_size,
-        num_layers,
-        num_heads,
-        head_size,
-        kv_cache_dtype,
-        dtype,
-        seed,
-        device,
-    )
-
-    # Clone the KV caches.
-    cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
-    cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
-
-    # Call the copy blocks kernel.
-    block_mapping_tensor = torch.tensor(
-        block_mapping, dtype=torch.int64, device=device
-    ).view(-1, 2)
-
-    opcheck(
-        torch.ops._C_cache_ops.copy_blocks,
-        (key_caches, value_caches, block_mapping_tensor),
-        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
-        cond=(head_size == HEAD_SIZES[0]),
-    )
-    ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
-
-    # Run the reference implementation.
-    for src, dst in block_mapping:
-        for cloned_key_cache in cloned_key_caches:
-            cloned_key_cache[dst].copy_(cloned_key_cache[src])
-        for cloned_value_cache in cloned_value_caches:
-            cloned_value_cache[dst].copy_(cloned_value_cache[src])
-
-    # Compare the results.
-    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
-        torch.testing.assert_close(key_cache, cloned_key_cache)
-    for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
-        torch.testing.assert_close(value_cache, cloned_value_cache)
-
-
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -152,7 +66,7 @@ def test_reshape_and_cache(
 ) -> None:
     if kv_cache_dtype == "fp8" and head_size % 16:
         pytest.skip()
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     torch.cuda.set_device(device)
     # Create a random slot mapping.
@@ -273,7 +187,7 @@ def test_reshape_and_cache_flash(
     kv_cache_layout: str,
     implementation: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     torch.cuda.set_device(device)
     assert implementation in ["cuda", "triton"]
@@ -357,7 +271,7 @@ def test_reshape_and_cache_flash(
             v_scale,
         )
     elif implementation == "triton":
-        from vllm.attention.ops.triton_reshape_and_cache_flash import (
+        from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
             triton_reshape_and_cache_flash,
         )
 
@@ -443,7 +357,7 @@ def test_swap_blocks(
     if kv_cache_dtype == "fp8" and head_size % 16:
         pytest.skip()
 
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     src_device = device if direction[0] == "cuda" else "cpu"
     dst_device = device if direction[1] == "cuda" else "cpu"
@@ -534,7 +448,7 @@ def test_fp8_e4m3_conversion(
     seed: int,
     device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     low = -224.0
     high = 224.0
@@ -597,7 +511,7 @@ def test_concat_and_cache_mla(
     device: str,
     kv_cache_dtype: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     torch.cuda.set_device(device)
 
@@ -674,7 +588,7 @@ def test_concat_and_cache_ds_mla(
     if dtype.itemsize != 2:
         pytest.skip("ds_mla only supports 16-bit input")
     kv_cache_dtype = "fp8_ds_mla"
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     torch.cuda.set_device(device)
 
@@ -766,73 +680,6 @@ def test_concat_and_cache_ds_mla(
         torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
 
 
-@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
-@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
-@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
-@pytest.mark.parametrize("num_layers", NUM_LAYERS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
-@torch.inference_mode()
-def test_copy_blocks_mla(
-    kv_lora_rank: int,
-    qk_rope_head_dim: int,
-    block_size: int,
-    num_blocks: int,
-    num_layers: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    kv_cache_dtype: str,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    torch.cuda.set_device(device)
-
-    entry_size = kv_lora_rank + qk_rope_head_dim
-
-    kv_caches = []
-    for _ in range(num_layers):
-        kv_cache = _create_mla_cache(
-            num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
-        )
-        _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
-        kv_caches.append(kv_cache)
-
-    ref_caches = [kv_cache.clone() for kv_cache in kv_caches]
-
-    num_mappings = min(2, num_blocks // 2)
-    src_blocks = random.sample(range(num_blocks), num_mappings)
-    remaining = list(set(range(num_blocks)) - set(src_blocks))
-    dst_blocks = random.sample(remaining, 2 * num_mappings)
-    block_mapping = []
-    for i in range(num_mappings):
-        src = src_blocks[i]
-        dst1 = dst_blocks[2 * i]
-        dst2 = dst_blocks[2 * i + 1]
-        block_mapping.append((src, dst1))
-        block_mapping.append((src, dst2))
-    block_mapping_tensor = torch.tensor(
-        block_mapping, dtype=torch.int64, device=device
-    ).view(-1, 2)
-
-    for src, dst in block_mapping:
-        for ref_cache in ref_caches:
-            ref_cache[dst].copy_(ref_cache[src])
-
-    opcheck(
-        torch.ops._C_cache_ops.copy_blocks_mla,
-        (kv_caches, block_mapping_tensor),
-        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
-    )
-    ops.copy_blocks_mla(kv_caches, block_mapping_tensor)
-
-    for kv_cache, ref_cache in zip(kv_caches, ref_caches):
-        torch.testing.assert_close(kv_cache, ref_cache)
-
-
 @pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
 @pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
 @pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@@ -852,7 +699,7 @@ def test_swap_blocks_mla(
     device: str,
     kv_cache_dtype: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     torch.cuda.set_device(device)
 
@@ -1104,7 +951,7 @@ def test_concat_and_cache_mla_cpu(
 ) -> None:
     device = "cpu"
     kv_cache_dtype = "auto"
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
 
     total_slots = num_blocks * block_size
diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py
index ac59440d5132888e88903da377ad1318870e1c4d..3c6d6f3c76460d76506ac095337cd1fd3a448dd9 100644
--- a/tests/kernels/attention/test_cascade_flash_attn.py
+++ b/tests/kernels/attention/test_cascade_flash_attn.py
@@ -6,6 +6,7 @@ import pytest
 import torch
 
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states
 from vllm.platforms import current_platform
 
@@ -46,7 +47,7 @@ def test_merge_kernel(
     dtype: torch.dtype,
 ):
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_query_heads = num_heads[0]
     num_kv_heads = num_heads[1]
     assert num_query_heads % num_kv_heads == 0
@@ -110,7 +111,7 @@ CASES = [
 #             f'to: "{fa_version_unsupported_reason(fa_version)}"'
 #         )
 
-#     current_platform.seed_everything(0)
+#     set_random_seed(0)
 
 #     window_size = (-1, -1)
 #     scale = head_size**-0.5
diff --git a/tests/kernels/attention/test_cpu_attn.py b/tests/kernels/attention/test_cpu_attn.py
index be5d66197f6ef4cd16c1705a5b7ac163f09176a8..ef0099f635a578c93b8dba89c69e73536a1f58f7 100644
--- a/tests/kernels/attention/test_cpu_attn.py
+++ b/tests/kernels/attention/test_cpu_attn.py
@@ -8,6 +8,7 @@ import pytest
 import torch
 
 from vllm.platforms import CpuArchEnum, current_platform
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
 
 if not current_platform.is_cpu():
@@ -190,7 +191,7 @@ def varlen_with_paged_kv(
     use_sink: bool,
     isa: str,
 ) -> None:
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
     kv_lens = [x[1] for x in seq_lens]
diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py
index 59c17292c3c880f4741e5baf4a38e63973fdbc63..86956c3caca23bfc7716f0a9960a39a77b0ac270 100644
--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@@ -6,6 +6,7 @@ import pytest
 import torch
 
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 try:
     if current_platform.is_rocm():
@@ -132,7 +133,7 @@ def test_varlen_with_paged_kv(
             "Flash attention with quantized inputs is only "
             "supported on version 3 with bfloat16 base type"
         )
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
     kv_lens = [x[1] for x in seq_lens]
diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index 06a7085a82ba0843e34c3fa3456d5211c64ed5e8..b5f8584015be56199e297036762f1f2801c4b1b4 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -10,6 +10,7 @@ from tests.kernels.quantization.nvfp4_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
+from vllm.utils.torch_utils import set_random_seed
 
 if not current_platform.is_device_capability_family(100):
     pytest.skip(
@@ -80,7 +81,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
     has_sinks: bool,
 ) -> None:
     torch.set_default_device("cuda")
-    current_platform.seed_everything(42)
+    set_random_seed(42)
 
     q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
     q_quant_dtype = q_quant_dtype or dtype
@@ -279,7 +280,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     has_sinks: bool,
 ) -> None:
     torch.set_default_device("cuda")
-    current_platform.seed_everything(42)
+    set_random_seed(42)
 
     q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
     q_quant_dtype = q_quant_dtype or dtype
diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py
index 70b9948d60a5c5ba14e8ea75342d1eee1fc5bfdf..311b6912e2130ef402a8e6300ef3c197d057c854 100644
--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@@ -7,12 +7,12 @@ import random
 import pytest
 import torch
 
-from vllm.attention.ops.flashmla import (
+from vllm.triton_utils import triton
+from vllm.v1.attention.ops.flashmla import (
     flash_mla_with_kvcache,
     get_mla_metadata,
     is_flashmla_dense_supported,
 )
-from vllm.triton_utils import triton
 
 
 def cal_diff(
diff --git a/tests/kernels/attention/test_flashmla_sparse.py b/tests/kernels/attention/test_flashmla_sparse.py
index 7ee6f4b07b4a9b474d134cf5fa9294c353065ca9..c1147ae9edb159e3d7c1b63bfb7b58175c36e953 100644
--- a/tests/kernels/attention/test_flashmla_sparse.py
+++ b/tests/kernels/attention/test_flashmla_sparse.py
@@ -5,7 +5,7 @@ import torch
 
 
 def test_sparse_flashmla_metadata_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:
@@ -34,7 +34,7 @@ def test_sparse_flashmla_metadata_smoke():
 
 
 def test_sparse_flashmla_decode_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:
@@ -97,7 +97,7 @@ def test_sparse_flashmla_decode_smoke():
 
 
 def test_sparse_flashmla_prefill_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:
diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py
index ec938caff2c6dfcb9ac7645779d2ceb242fc13c0..37fd85ccec04a5471469c84ceb10f7ee575aa2dd 100644
--- a/tests/kernels/attention/test_lightning_attn.py
+++ b/tests/kernels/attention/test_lightning_attn.py
@@ -5,7 +5,7 @@ import pytest
 import torch
 
 from vllm.model_executor.layers.lightning_attn import linear_decode_forward_triton
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 NUM_HEADS = [4, 8]
 HEAD_SIZES = [64]
@@ -124,7 +124,7 @@ def test_linear_decode_forward_triton(
     torch.set_default_device("cuda")
     torch.manual_seed(42)
     torch.cuda.manual_seed_all(42)
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     base = 0.01
     q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
     k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
@@ -167,7 +167,7 @@ def test_linear_decode_forward_triton_with_padding(
     torch.set_default_device("cuda")
     torch.manual_seed(42)
     torch.cuda.manual_seed_all(42)
-    current_platform.seed_everything(42)
+    set_random_seed(42)
 
     batch_size = 4
     base = 0.01
@@ -231,7 +231,7 @@ def test_lightning_attention_reference(
     torch.set_default_device("cuda")
     torch.manual_seed(42)
     torch.cuda.manual_seed_all(42)
-    current_platform.seed_everything(42)
+    set_random_seed(42)
 
     base = 0.01
     q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype)
diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py
index c7662223e1ca5015e19a48e680166f7a694fd3f4..a9f525cdc3ce59783a44c00b4b4b4d0f9e0d2870 100644
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -5,10 +5,10 @@ import pytest
 import torch
 
 from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda
-from vllm.attention.ops.triton_merge_attn_states import (
+from vllm.platforms import current_platform
+from vllm.v1.attention.ops.triton_merge_attn_states import (
     merge_attn_states as merge_attn_states_triton,
 )
-from vllm.platforms import current_platform
 
 
 # Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index 639abdf6f0487c6cb172ee1bdd7bdbb29a3570e5..ecaea88674c27a5759c5e9933e6de5d08f2d7c7f 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -3,21 +3,23 @@
 """
 Test:
 
-* Tests for MultiHeadAttention layer
+* Tests for MMEncoderAttention layer
 """
 
+import itertools
 from unittest.mock import patch
 
 import pytest
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layer import MultiHeadAttention
-from vllm.attention.selector import _cached_get_attn_backend
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.rocm import RocmPlatform
+from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend
 
 
 @pytest.fixture(autouse=True)
@@ -34,7 +36,7 @@ if current_platform.is_rocm():
 
 
 @pytest.mark.parametrize("device", devices)
-def test_mha_attn_platform(device: str):
+def test_mha_attn_platform(default_vllm_config, device: str):
     """
     Test the attention selector between different platform and device.
     """
@@ -42,35 +44,31 @@ def test_mha_attn_platform(device: str):
 
     if device == "cpu":
         with (
-            patch("vllm.attention.layer.current_platform", CpuPlatform()),
             patch("vllm.model_executor.models.vision.current_platform", CpuPlatform()),
         ):
-            attn = MultiHeadAttention(16, 64, scale=1)
+            attn = MMEncoderAttention(16, 64, scale=1)
             assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA
     elif device == "hip":
         with (
-            patch("vllm.attention.layer.current_platform", RocmPlatform()),
             patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
         ):
-            attn = MultiHeadAttention(16, 64, scale=1)
+            attn = MMEncoderAttention(16, 64, scale=1)
             assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
     else:
         # Test CUDA with head_size=64 (divisible by 32)
         # - should use vLLM's FlashAttention
         with (
-            patch("vllm.attention.layer.current_platform", CudaPlatform()),
             patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
         ):
-            attn = MultiHeadAttention(16, 64, scale=1)
+            attn = MMEncoderAttention(16, 64, scale=1)
             assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
 
         # Test CUDA with head_size=72 (not divisible by 32)
         # - should use vLLM's FlashAttention
         with (
-            patch("vllm.attention.layer.current_platform", CudaPlatform()),
             patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
         ):
-            attn = MultiHeadAttention(16, 72, scale=1)
+            attn = MMEncoderAttention(16, 72, scale=1)
             assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
 
 
@@ -94,6 +92,10 @@ def ref_attention(
 
 BATCH_SIZES = [1, 16]
 SEQ_LENS = [1]
+VAR_SEQ_LENS = [
+    [2, 2],
+    [2, 3, 4],
+]
 NUM_HEADS = [1, 16]
 NUM_KV_HEADS = [1]
 HEAD_SIZES = [64, 80]
@@ -114,6 +116,7 @@ CUDA_DEVICES = ["cuda"]
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_mha_attn_forward(
+    default_vllm_config,
     batch_size: int,
     seq_len: int,
     num_heads: int,
@@ -122,7 +125,7 @@ def test_mha_attn_forward(
     dtype: torch.dtype,
     device: str,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     torch.set_default_device(device)
     torch.set_default_dtype(dtype)
 
@@ -130,7 +133,7 @@ def test_mha_attn_forward(
     k = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
     v = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
     scale = 1.0 / head_size**0.5
-    attn = MultiHeadAttention(
+    attn = MMEncoderAttention(
         num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
     )
     output = attn(q, k, v)
@@ -151,3 +154,59 @@ def test_mha_attn_forward(
         scale=scale,
     ).reshape(batch_size, seq_len, num_heads * head_size)
     torch.testing.assert_close(output, ref_output)
+
+
+@pytest.mark.parametrize("var_seq_len", VAR_SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_mha_attn_varlen_forward(
+    default_vllm_config,
+    var_seq_len: list[int],
+    num_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    set_random_seed(0)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+
+    q = torch.randn(1, sum(var_seq_len), num_heads, head_size)
+    k = torch.randn(1, sum(var_seq_len), num_kv_heads, head_size)
+    v = torch.randn(1, sum(var_seq_len), num_kv_heads, head_size)
+    cu_seqlens = torch.tensor(
+        [0] + list(itertools.accumulate(var_seq_len)), dtype=torch.int32
+    )
+    scale = 1.0 / head_size**0.5
+    attn = MMEncoderAttention(
+        num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
+    )
+    output = attn(
+        q, k, v, cu_seqlens=cu_seqlens, max_seqlen=torch.tensor(max(var_seq_len))
+    )
+
+    assert num_heads % num_kv_heads == 0
+    num_queries_per_kv = num_heads // num_kv_heads
+    if num_queries_per_kv > 1:
+        k = torch.repeat_interleave(k, num_queries_per_kv, dim=2)
+        v = torch.repeat_interleave(v, num_queries_per_kv, dim=2)
+
+    ref_output = []
+    for q_i, k_i, v_i in zip(
+        torch.split(q, var_seq_len, dim=1),
+        torch.split(k, var_seq_len, dim=1),
+        torch.split(v, var_seq_len, dim=1),
+    ):
+        output_i = ref_attention(
+            q_i,
+            k_i,
+            v_i,
+            scale=scale,
+        )
+        ref_output.append(output_i)
+    ref_output = torch.cat(ref_output, dim=1)
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
diff --git a/tests/kernels/attention/test_pack_unpack_triton.py b/tests/kernels/attention/test_pack_unpack_triton.py
index d2aa14738d9d953b0414b9b03db5ab207a7670d0..158ae550ef03959150c0f5d2943150aa1abb5305 100644
--- a/tests/kernels/attention/test_pack_unpack_triton.py
+++ b/tests/kernels/attention/test_pack_unpack_triton.py
@@ -4,7 +4,7 @@
 import torch
 from torch.testing import assert_close
 
-from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
+from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton
 
 
 def test_pack_seq_basic_fp8():
diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py
index 27d7b71571cd487d2ce7847c3f8ef8045727d6ad..c855d2586f59c8a2e194c0815117ac837c39ad2a 100644
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@@ -10,10 +10,12 @@ import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
-from vllm.attention.ops.prefix_prefill import context_attention_fwd
 from vllm.platforms import current_platform
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
+from vllm.v1.attention.ops.chunked_prefill_paged_decode import (
+    chunked_prefill_paged_decode,
+)
+from vllm.v1.attention.ops.prefix_prefill import context_attention_fwd
 
 if not current_platform.is_rocm():
     from xformers import ops as xops
@@ -117,6 +119,7 @@ def test_contexted_kv_attention(
     kv_cache_dtype: str,
     device: str,
     op: Callable,
+    block_size: int = 32,
 ) -> None:
     if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89):
         pytest.skip(
@@ -130,7 +133,7 @@ def test_contexted_kv_attention(
     ):
         pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
 
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     torch.set_default_device(device)
 
     # Need this, otherwise when we capture the graph the process
@@ -143,7 +146,6 @@ def test_contexted_kv_attention(
     MAX_CTX_LEN = 1024
     BS = 10
     cache_size = 640
-    block_size = 32
     max_block_per_request = 64
     query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
     # ensure one sequence in batch is a decode
@@ -338,6 +340,7 @@ def test_contexted_kv_attention_alibi(
     kv_cache_dtype: str,
     device: str,
     op: Callable,
+    block_size: int = 32,
 ) -> None:
     if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89):
         pytest.skip(
@@ -351,7 +354,7 @@ def test_contexted_kv_attention_alibi(
     ):
         pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
 
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     torch.set_default_device(device)
 
     # Need this, otherwise when we capture the graph the process
@@ -390,7 +393,6 @@ def test_contexted_kv_attention_alibi(
     MAX_CTX_LEN = 1024
     BS = 10
     cache_size = 640
-    block_size = 32
     max_block_per_request = 64
     query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
     ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
@@ -643,3 +645,34 @@ def test_contexted_kv_attention_alibi_f32(
     test_contexted_kv_attention_alibi(
         num_heads, num_queries_per_kv, head_size, dtype, kv_cache_dtype, device, op
     )
+
+
+@pytest.mark.parametrize("head_size", [128])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("op", OPS)
+@torch.inference_mode()
+def test_qwen3_nonstandard_block_size(
+    head_size: int,
+    dtype: torch.dtype,
+    device: str,
+    op: Callable,
+) -> None:
+    """
+    A separate test function specifically added
+    for Qwen3-Next-80B (Block Size 544).
+    """
+    if not current_platform.is_rocm():
+        pytest.skip("544 block size optimization is only for ROCm.")
+
+    test_contexted_kv_attention(
+        num_heads=64,
+        num_queries_per_kv=1,
+        head_size=head_size,
+        block_size=544,
+        sliding_window=0,
+        dtype=dtype,
+        kv_cache_dtype="auto",
+        device=device,
+        op=op,
+    )
diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py
index 824d99cd3bee0fbcaeb07aad4d82934f945f46e3..83b2e33a77e7a677fffa82150454b07bb39ee8d2 100644
--- a/tests/kernels/attention/test_rocm_attention_selector.py
+++ b/tests/kernels/attention/test_rocm_attention_selector.py
@@ -4,8 +4,10 @@
 import pytest
 import torch
 
-from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
+from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms.rocm import RocmPlatform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend
 
 
 @pytest.fixture(autouse=True)
@@ -16,40 +18,56 @@ def clear_cache():
 
 @pytest.mark.skip(reason="Skipped for now. Should be revisited.")
 def test_selector(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_ATTN")
+    # Set the current platform to ROCm using monkeypatch
+    monkeypatch.setattr("vllm.v1.attention.selector.current_platform", RocmPlatform())
 
-        # Set the current platform to ROCm using monkeypatch
-        monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
+    # Test standard ROCm attention
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN)
+    vllm_config = VllmConfig(attention_config=attention_config)
 
-        # Test standard ROCm attention
+    with set_current_vllm_config(vllm_config):
         backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
         assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN"
 
-        # MLA test for deepseek related
+    # MLA test for deepseek related
+    # Change the attention backend to triton MLA
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.TRITON_MLA)
+    vllm_config = VllmConfig(attention_config=attention_config)
 
-        # change the attention backend to triton MLA
-        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_MLA")
+    with set_current_vllm_config(vllm_config):
         backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
         assert backend.get_name() == "TRITON_MLA"
 
-        # If attention backend is None
-        # If use_mla is true
-        # The selected backend is triton MLA
-        m.setenv("VLLM_ATTENTION_BACKEND", "")
+    # If attention backend is None
+    # If use_mla is true
+    # The selected backend is triton MLA
+    attention_config = AttentionConfig(backend=None)
+    vllm_config = VllmConfig(attention_config=attention_config)
+
+    with set_current_vllm_config(vllm_config):
         backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
         assert backend.get_name() == "TRITON_MLA"
 
-        # change the attention backend to AITER MLA
-        # m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_MLA")
-        # backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
-        # assert backend.get_name() == "ROCM_AITER_MLA"
-
-        # # If attention backend is None
-        # # If use_mla is true
-        # # If VLLM_ROCM_USE_AITER is enabled
-        # # The selected backend is ROCM_AITER_MLA
-        # m.setenv("VLLM_ATTENTION_BACKEND", "")
-        # m.setenv("VLLM_ROCM_USE_AITER", "1")
-        # backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
-        # assert backend.get_name() == "ROCM_AITER_MLA"
+    # Change the attention backend to AITER MLA
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_AITER_MLA)
+    vllm_config = VllmConfig(attention_config=attention_config)
+
+    # with set_current_vllm_config(vllm_config):
+    #     backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
+    #     assert backend.get_name() == "ROCM_AITER_MLA"
+
+    # # If attention backend is None
+    # # If use_mla is true
+    # # If VLLM_ROCM_USE_AITER is enabled
+    # # The selected backend is ROCM_AITER_MLA
+    # with monkeypatch.context() as m:
+    #     m.setenv("VLLM_ROCM_USE_AITER", "1")
+
+    #     attention_config = AttentionConfig(backend=None)
+    #     vllm_config = VllmConfig(attention_config=attention_config)
+
+    #     with set_current_vllm_config(vllm_config):
+    #         backend = get_attn_backend(
+    #             576, torch.bfloat16, "auto", 1, False, use_mla=True
+    #         )
+    #         assert backend.get_name() == "ROCM_AITER_MLA"
diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py
index e4df5213a08242b5317ef99330b49cffd1be167f..29a5491f8670e6e8008ac309919ddf7a9c368d65 100644
--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@@ -4,8 +4,8 @@
 import pytest
 import torch
 
-from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.ops.triton_decode_attention import decode_attention_fwd
 
 
 @pytest.mark.parametrize("B", [3, 5])
diff --git a/tests/kernels/attention/test_triton_prefill_attention.py b/tests/kernels/attention/test_triton_prefill_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4505d91f5f77cf766c0e543c1c13de28b64803d
--- /dev/null
+++ b/tests/kernels/attention/test_triton_prefill_attention.py
@@ -0,0 +1,225 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.v1.attention.ops.triton_prefill_attention import context_attention_fwd
+
+
+def ref_masked_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    is_causal: bool = True,
+    sliding_window_q: int | None = None,
+    sliding_window_k: int | None = None,
+) -> torch.Tensor:
+    """Reference implementation using PyTorch SDPA."""
+    # q, k, v: [total_tokens, num_heads, head_dim]
+    # SDPA expects [batch, num_heads, seq_len, head_dim]
+
+    total_tokens = q.shape[0]
+
+    # Add batch dimension and transpose
+    q = q.unsqueeze(0).transpose(1, 2)  # [1, num_heads, total_tokens, head_dim]
+    k = k.unsqueeze(0).transpose(1, 2)  # [1, num_heads, total_tokens, head_dim]
+    v = v.unsqueeze(0).transpose(1, 2)  # [1, num_heads, total_tokens, head_dim]
+
+    # Create attention mask if needed
+    attn_mask = None
+    use_causal = is_causal
+
+    # If we have sliding window or need custom masking, create explicit mask
+    sliding_window_q = sliding_window_q if sliding_window_q is not None else 0
+    sliding_window_k = sliding_window_k if sliding_window_k is not None else 0
+    if (sliding_window_q > 0) or (sliding_window_k > 0):
+        # Position indices
+        pos_q = torch.arange(total_tokens, device=q.device).unsqueeze(1)
+        pos_k = torch.arange(total_tokens, device=q.device).unsqueeze(0)
+
+        # Start with valid mask (False = no masking)
+        mask = torch.ones(
+            (total_tokens, total_tokens), dtype=torch.bool, device=q.device
+        )
+
+        # Apply causal mask
+        if is_causal:
+            mask = mask & (pos_q >= pos_k)
+
+        # Apply sliding window masks
+        sliding_window_mask = torch.ones_like(mask)
+        if sliding_window_q > 0:
+            sliding_window_mask &= pos_q - pos_k <= sliding_window_q
+
+        if sliding_window_k > 0:
+            sliding_window_mask &= pos_k - pos_q <= sliding_window_k
+
+        mask = mask & sliding_window_mask
+
+        attn_mask = torch.where(mask, 0.0, float("-inf")).to(q.dtype)
+        use_causal = False  # Don't use is_causal when providing explicit mask
+
+    # Use SDPA
+    output = F.scaled_dot_product_attention(
+        q, k, v, attn_mask=attn_mask, is_causal=use_causal, dropout_p=0.0
+    )
+
+    # Convert back to original shape: [total_tokens, num_heads, head_dim]
+    output = output.transpose(1, 2).squeeze(0)
+
+    return output
+
+
+@pytest.mark.parametrize("B", [5])
+@pytest.mark.parametrize("max_seq_len", [1024])
+@pytest.mark.parametrize("H_Q", [32])
+@pytest.mark.parametrize("H_KV", [32, 8])
+@pytest.mark.parametrize("D", [128])
+@pytest.mark.parametrize("is_causal", [True, False])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
+def test_context_attention(
+    B: int,
+    max_seq_len: int,
+    H_Q: int,
+    H_KV: int,
+    D: int,
+    is_causal: bool,
+    dtype: torch.dtype,
+):
+    """Test basic context attention without sliding window."""
+    torch.manual_seed(42)
+
+    # Generate random sequence lengths for each batch
+    seq_lens = torch.randint(max_seq_len // 2, max_seq_len + 1, (B,), device="cuda")
+    total_tokens = seq_lens.sum().item()
+
+    # Create batch start locations
+    b_start_loc = torch.zeros(B, dtype=torch.int32, device="cuda")
+    b_start_loc[1:] = torch.cumsum(seq_lens[:-1], dim=0)
+
+    # Create input tensors
+    q = torch.randn(total_tokens, H_Q, D, dtype=dtype, device="cuda")
+    k = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+    v = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+    o = torch.zeros_like(q)
+
+    # Call Triton kernel
+    context_attention_fwd(
+        q,
+        k,
+        v,
+        o,
+        b_start_loc,
+        seq_lens,
+        max_seq_len,
+        is_causal=is_causal,
+        sliding_window_q=None,
+        sliding_window_k=None,
+    )
+
+    # Compute reference output for each sequence in batch
+    o_ref = torch.zeros_like(q)
+    for i in range(B):
+        start = b_start_loc[i].item()
+        end = start + seq_lens[i].item()
+
+        q_seq = q[start:end]
+        k_seq = k[start:end]
+        v_seq = v[start:end]
+
+        # Expand KV heads if using GQA
+        if H_Q != H_KV:
+            kv_group_num = H_Q // H_KV
+            k_seq = k_seq.repeat_interleave(kv_group_num, dim=1)
+            v_seq = v_seq.repeat_interleave(kv_group_num, dim=1)
+
+        o_ref[start:end] = ref_masked_attention(
+            q_seq,
+            k_seq,
+            v_seq,
+            is_causal=is_causal,
+            sliding_window_q=None,
+            sliding_window_k=None,
+        )
+
+    # Compare outputs
+    torch.testing.assert_close(o, o_ref, rtol=1e-2, atol=1e-2)
+
+
+@pytest.mark.parametrize("B", [4])
+@pytest.mark.parametrize("max_seq_len", [1024])
+@pytest.mark.parametrize("H_Q", [32])
+@pytest.mark.parametrize("H_KV", [32, 8])
+@pytest.mark.parametrize("D", [128])
+@pytest.mark.parametrize("sliding_window", [(32, 32), (32, 0), (0, 32)])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
+def test_context_attention_sliding_window(
+    B: int,
+    max_seq_len: int,
+    H_Q: int,
+    H_KV: int,
+    D: int,
+    sliding_window: tuple[int, int],
+    dtype: torch.dtype,
+):
+    sliding_window_q, sliding_window_k = sliding_window
+    """Test context attention with sliding window."""
+    torch.manual_seed(42)
+
+    # Generate random sequence lengths for each batch
+    seq_lens = torch.randint(max_seq_len // 2, max_seq_len + 1, (B,), device="cuda")
+    total_tokens = seq_lens.sum().item()
+
+    # Create batch start locations
+    b_start_loc = torch.zeros(B, dtype=torch.int32, device="cuda")
+    b_start_loc[1:] = torch.cumsum(seq_lens[:-1], dim=0)
+
+    # Create input tensors
+    q = torch.randn(total_tokens, H_Q, D, dtype=dtype, device="cuda")
+    k = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+    v = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+    o = torch.zeros_like(q)
+
+    # Call Triton kernel
+    context_attention_fwd(
+        q,
+        k,
+        v,
+        o,
+        b_start_loc,
+        seq_lens,
+        max_seq_len,
+        is_causal=False,
+        sliding_window_q=sliding_window_q,
+        sliding_window_k=sliding_window_k,
+    )
+
+    # Compute reference output for each sequence in batch
+    o_ref = torch.zeros_like(q)
+    for i in range(B):
+        start = b_start_loc[i].item()
+        end = start + seq_lens[i].item()
+
+        q_seq = q[start:end]
+        k_seq = k[start:end]
+        v_seq = v[start:end]
+
+        # Expand KV heads if using GQA
+        if H_Q != H_KV:
+            kv_group_num = H_Q // H_KV
+            k_seq = k_seq.repeat_interleave(kv_group_num, dim=1)
+            v_seq = v_seq.repeat_interleave(kv_group_num, dim=1)
+
+        o_ref[start:end] = ref_masked_attention(
+            q_seq,
+            k_seq,
+            v_seq,
+            is_causal=False,
+            sliding_window_q=sliding_window_q if sliding_window_q > 0 else None,
+            sliding_window_k=sliding_window_k if sliding_window_k > 0 else None,
+        )
+
+    # Compare outputs
+    torch.testing.assert_close(o, o_ref, rtol=2e-2, atol=2e-2)
diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
index 94893d6cdc0a0209393a91db85ec0f271f734884..2b265f79a6f4f983903ac286f0d4ba77f6556ef0 100644
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -5,9 +5,10 @@
 import pytest
 import torch
 
-from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import next_power_of_2
+from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.ops.triton_unified_attention import unified_attention
 
 NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
@@ -113,7 +114,7 @@ def test_triton_unified_attn(
 ) -> None:
     torch.set_default_device("cuda")
 
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
     kv_lens = [x[1] for x in seq_lens]
diff --git a/tests/kernels/attention/untest_attention_selector.py b/tests/kernels/attention/untest_attention_selector.py
index c959b2f4bb03c8c23bc1f61a70d7452a5f24b901..a63297c3579ef9533bd1cd6a30e55e966dd8b99f 100644
--- a/tests/kernels/attention/untest_attention_selector.py
+++ b/tests/kernels/attention/untest_attention_selector.py
@@ -6,11 +6,13 @@ from unittest.mock import patch
 import pytest
 import torch
 
-from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
+from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.rocm import RocmPlatform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend
 
 
 @pytest.fixture(autouse=True)
@@ -73,18 +75,18 @@ def generate_params():
 
 
 @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
-def test_env(
+def test_backend_selection(
     device: str,
     name: str,
     use_mla: bool,
     block_size: int,
-    monkeypatch: pytest.MonkeyPatch,
 ):
     """Test attention backend selection with valid device-backend pairs."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", name)
-        m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
+    # Create AttentionConfig with the specified backend
+    attention_config = AttentionConfig(backend=AttentionBackendEnum[name])
+    vllm_config = VllmConfig(attention_config=attention_config)
 
+    with set_current_vllm_config(vllm_config):
         if device == "cpu":
             with patch("vllm.platforms.current_platform", CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, None, block_size)
@@ -180,7 +182,7 @@ def test_env(
                         expected = name
                         assert backend.get_name() == expected
                     elif name == "FLASH_ATTN_MLA":
-                        from vllm.attention.utils.fa_utils import (
+                        from vllm.v1.attention.backends.fa_utils import (
                             flash_attn_supports_mla,
                         )
 
@@ -217,27 +219,32 @@ def test_env(
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
 def test_fp32_fallback(device: str):
     """Test attention backend selection with fp32."""
-    if device == "cpu":
-        with patch("vllm.platforms.current_platform", CpuPlatform()):
-            backend = get_attn_backend(16, torch.float32, None, 16)
-        assert backend.get_name() == "CPU_ATTN"
+    # Use default config (no backend specified)
+    vllm_config = VllmConfig()
 
-    elif device == "cuda":
-        with patch("vllm.platforms.current_platform", CudaPlatform()):
-            backend = get_attn_backend(16, torch.float32, None, 16)
-        assert backend.get_name() == "FLEX_ATTENTION"
+    with set_current_vllm_config(vllm_config):
+        if device == "cpu":
+            with patch("vllm.platforms.current_platform", CpuPlatform()):
+                backend = get_attn_backend(16, torch.float32, None, 16)
+            assert backend.get_name() == "CPU_ATTN"
+
+        elif device == "cuda":
+            with patch("vllm.platforms.current_platform", CudaPlatform()):
+                backend = get_attn_backend(16, torch.float32, None, 16)
+            assert backend.get_name() == "FLEX_ATTENTION"
 
 
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     pytest.skip(
         "Skipping as current backend selector does not "
-        "handle fallbacks when a backend is set via env var."
+        "handle fallbacks when a backend is explicitly set."
     )
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN)
+    vllm_config = VllmConfig(attention_config=attention_config)
 
+    with set_current_vllm_config(vllm_config):
         # Unsupported CUDA arch
         monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
         backend = get_attn_backend(16, torch.float16, None, 16)
@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
         assert backend.get_name() != "FLASH_ATTN"
 
 
-def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
+def test_invalid_backend():
     """Test that invalid attention backend names raise ValueError."""
     with (
-        monkeypatch.context() as m,
-        patch("vllm.platforms.current_platform", CudaPlatform()),
+        pytest.raises(ValueError),
     ):
-        m.setenv("VLLM_ATTENTION_BACKEND", "INVALID")
-
-        # Should raise ValueError for invalid backend
-        with pytest.raises(ValueError) as exc_info:
-            get_attn_backend(32, torch.float16, None, 16)
-        assert "Invalid value 'INVALID'" in str(exc_info.value)
+        # Invalid backend name should raise ValueError when creating enum
+        AttentionConfig(backend=AttentionBackendEnum["INVALID"])
diff --git a/tests/kernels/attention/untest_flashinfer.py b/tests/kernels/attention/untest_flashinfer.py
index eedeec33e0d45a0a44acbba5934221ed997fae0b..570bf7fc865abf9b986de139fc669a5bee00886a 100644
--- a/tests/kernels/attention/untest_flashinfer.py
+++ b/tests/kernels/attention/untest_flashinfer.py
@@ -5,6 +5,7 @@
 import pytest
 
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 try:
     import flashinfer
@@ -101,7 +102,7 @@ def test_flashinfer_decode_with_paged_kv(
     sliding_window: int | None,
 ) -> None:
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_seqs = len(kv_lens)
     num_query_heads = num_heads[0]
     num_kv_heads = num_heads[1]
@@ -196,7 +197,7 @@ def test_flashinfer_prefill_with_paged_kv(
     sliding_window: int | None,
 ) -> None:
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
     kv_lens = [x[1] for x in seq_lens]
@@ -299,7 +300,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
 ) -> None:
     pytest.skip("TODO: fix the accuracy issue")
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
     kv_lens = [x[1] for x in seq_lens]
@@ -409,7 +410,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
 ) -> None:
     # test doesn't work for num_heads = (16,16)
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     num_seqs = len(kv_lens)
     num_query_heads = num_heads[0]
     num_kv_heads = num_heads[1]
diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py
index e8777ec4f59e8cb2e50385daca945be2592e7d65..2fa4fd6272048d8cb4e8879572e7cc5e3bf54c4f 100644
--- a/tests/kernels/core/test_activation.py
+++ b/tests/kernels/core/test_activation.py
@@ -18,7 +18,7 @@ from vllm.model_executor.layers.activation import (
     SiluAndMul,
     SwigluOAIAndMul,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
@@ -45,6 +45,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_act_and_mul(
+    default_vllm_config,
     activation: str,
     num_tokens: int,
     d: int,
@@ -52,7 +53,7 @@ def test_act_and_mul(
     seed: int,
     device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, 2 * d, dtype=dtype)
     if activation == "silu_and_mul":
@@ -122,6 +123,7 @@ def test_act_and_mul(
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_activation(
+    default_vllm_config,
     activation: type[torch.nn.Module],
     num_tokens: int,
     d: int,
@@ -129,7 +131,7 @@ def test_activation(
     seed: int,
     device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, d, dtype=dtype)
     layer = activation[0]()
diff --git a/tests/kernels/core/test_fused_qk_norm_rope.py b/tests/kernels/core/test_fused_qk_norm_rope.py
index a23959e353da9b6cbe606a7eccf62e8eccd82c03..43737f4f23b118844ea9c6d191fb8bd3cb2ed11c 100644
--- a/tests/kernels/core/test_fused_qk_norm_rope.py
+++ b/tests/kernels/core/test_fused_qk_norm_rope.py
@@ -8,11 +8,13 @@ from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 DTYPES = [torch.bfloat16, torch.float16]
 IS_NEOX = [True, False]
 EPS_VALUES = [1e-5, 1e-6]
 SEEDS = [13]
+PARTIAL_ROPE = [True, False]
 CUDA_DEVICES = ["cuda:0"]
 
 
@@ -52,16 +54,19 @@ def _apply_qk_norm_rope(
 @pytest.mark.parametrize("is_neox", IS_NEOX)
 @pytest.mark.parametrize("eps", EPS_VALUES)
 @pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
 @torch.inference_mode()
 def test_fused_qk_norm_rope_matches_reference(
+    default_vllm_config,
     device: str,
     dtype: torch.dtype,
     is_neox: bool,
     eps: float,
     seed: int,
+    rotary_ratio: float,
 ):
     torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     num_heads, num_kv_heads, head_dim = 16, 4, 128
     num_tokens = 4
 
@@ -76,10 +81,10 @@ def test_fused_qk_norm_rope_matches_reference(
     k_norm.weight.data.normal_(mean=1.0, std=0.1)
     q_weight = q_norm.weight.data
     k_weight = k_norm.weight.data
-
+    rotary_dim = int(head_dim * rotary_ratio)
     rope = RotaryEmbedding(
         head_size=head_dim,
-        rotary_dim=head_dim,
+        rotary_dim=rotary_dim,
         max_position_embeddings=4096,
         base=10000.0,
         is_neox_style=is_neox,
diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py
index 16fa6ce26ce6a159e7e159b2c6363d2baed5286f..904b8a6021660d687c1cdd7af9993b38d554125a 100644
--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -147,6 +147,7 @@ def ops_impl(
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_rms_norm(
+    default_vllm_config,
     num_tokens: int,
     hidden_size: int,
     add_residual: bool,
diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py
index db56bb4f943a029d47b2d5fff1bb2a83a2b6a84e..030e09a8cc1ef09009b35521d35a6724e88b81f8 100644
--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -7,7 +7,7 @@ import torch
 from tests.kernels.quant_utils import FP8_DTYPE
 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
@@ -26,6 +26,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
 @pytest.mark.parametrize("strided_input", [False, True])
 @torch.inference_mode()
 def test_rms_norm(
+    default_vllm_config,
     num_tokens: int,
     hidden_size: int,
     add_residual: bool,
@@ -34,7 +35,7 @@ def test_rms_norm(
     device: str,
     strided_input: bool,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     layer = RMSNorm(hidden_size).to(dtype=dtype)
     layer.weight.data.normal_(mean=1.0, std=0.1)
@@ -70,6 +71,80 @@ def test_rms_norm(
         )
 
 
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0])
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("strided_input", [False, True])
+def test_fused_rms_norm_quant(
+    num_tokens: int,
+    hidden_size: int,
+    add_residual: bool,
+    dtype: torch.dtype,
+    quant_scale: float,
+    seed: int,
+    device: str,
+    strided_input: bool,
+) -> None:
+    set_random_seed(seed)
+    torch.set_default_device(device)
+
+    weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1)
+    scale = 1 / (2 * hidden_size)
+    last_dim = 2 * hidden_size if strided_input else hidden_size
+    x_base = torch.randn(num_tokens, last_dim, dtype=dtype)
+    x = x_base[..., :hidden_size]
+    assert x.is_contiguous() != strided_input
+
+    x *= scale
+    if add_residual:
+        residual = torch.randn_like(x) * scale
+        residual_fused = residual.clone()
+    else:
+        residual = residual_fused = None
+
+    out_norm = torch.empty_like(x)
+    out_quant = torch.empty_like(x, dtype=FP8_DTYPE)
+    out_quant_fused = torch.empty_like(out_quant)
+
+    quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32)
+
+    if add_residual:
+        torch.ops._C.fused_add_rms_norm_static_fp8_quant(
+            out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6
+        )
+
+        # Unfused kernel is in-place so it goes second
+        # Also use a separate clone of x to avoid modifying the input
+        x_unfused_base = x_base.clone()
+        x_unfused = x_unfused_base[..., :hidden_size]
+        assert x_unfused.is_contiguous() != strided_input
+        torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
+        torch.ops._C.static_scaled_fp8_quant(
+            out_quant, x_unfused.contiguous(), quant_scale_t
+        )
+
+        torch.cuda.synchronize()
+        torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
+        opcheck(
+            torch.ops._C.fused_add_rms_norm_static_fp8_quant,
+            (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6),
+        )
+    else:
+        torch.ops._C.rms_norm_static_fp8_quant(
+            out_quant_fused, x, weight, quant_scale_t, 1e-6
+        )
+
+        torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
+        torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t)
+
+        opcheck(
+            torch.ops._C.rms_norm_static_fp8_quant,
+            (out_quant_fused, x, weight, quant_scale_t, 1e-6),
+        )
 
 # @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 # @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py
index ba5d593b2d35559b42f78c1d7c0ed92912aa42c7..f12dc18654a6af3c633c2aadbc38c48a67448bae 100644
--- a/tests/kernels/core/test_mrope.py
+++ b/tests/kernels/core/test_mrope.py
@@ -10,6 +10,7 @@ from transformers import __version__ as TRANSFORMERS_VERSION
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
+from vllm.utils.torch_utils import set_random_seed
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -24,7 +25,7 @@ def generate_test_data(
     device: torch.device,
 ):
     """Generate test data for given configuration."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     # Create 2D positions (3, num_tokens) for multimodal case
     positions = torch.randint(
         0, max_position_embeddings // 4, (3, num_tokens), device=device
@@ -89,6 +90,7 @@ num_tokens_list = [11, 8192]
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("num_tokens", num_tokens_list)
 def test_mrope(
+    default_vllm_config,
     model_name: str,
     model_info: MRoPETestInfo,
     tp_size: int,
@@ -158,6 +160,7 @@ def test_mrope(
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("num_tokens", num_tokens_list)
 def test_mrope_torch_compile_tracing(
+    default_vllm_config,
     model_name: str,
     model_info: MRoPETestInfo,
     tp_size: int,
diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py
index d18f01314c8f53b10fb6fe3bcbb5f85f7475ef94..b43e1dab4c5b6f6f590a182f7fbd0ef858c6a89e 100644
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -9,7 +9,7 @@ import torch
 
 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.bfloat16, torch.float]
@@ -62,6 +62,7 @@ TENSORS_SHAPES_FN = [
 @pytest.mark.parametrize("use_key", USE_KEY)
 @torch.inference_mode()
 def test_rotary_embedding(
+    default_vllm_config,
     is_neox_style: bool,
     tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]],
     batch_size: int,
@@ -79,7 +80,7 @@ def test_rotary_embedding(
     if rotary_dim is None:
         rotary_dim = head_size
 
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     if rotary_dim is None:
         rotary_dim = head_size
@@ -123,7 +124,7 @@ def test_rotary_embedding(
 
 
 @torch.inference_mode()
-def test_rope_module_cache():
+def test_rope_module_cache(default_vllm_config):
     MAX_POSITIONS = [123, 1234]
     ROPE_THETAS = [10000, 1000000]
     ROPE_PARAMETERS = (
diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py
index 30c64e0bd72a7e13213d4ee4b5873386db4b4126..912a422e0ce44b3f2844d3135a9c8c98b3269a5e 100644
--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -36,6 +36,7 @@ def rotary_embedding_opcheck(
 @pytest.mark.parametrize("use_key", [True, False])
 @pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
 def test_rotary_embedding_opcheck(
+    default_vllm_config,
     dist_init,
     device,
     max_position,
diff --git a/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py b/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py
new file mode 100644
index 0000000000000000000000000000000000000000..021171d888a5353a6f37f1c94378ca4a1c2131a3
--- /dev/null
+++ b/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for fused MLA KV-cache write and RoPE fused kernel
+"""
+
+import random
+
+import pytest
+import torch
+
+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
+from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.utils.torch_utils import set_random_seed
+
+
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16, torch.float])
+@pytest.mark.parametrize("is_neox_style", [False, True])
+@pytest.mark.parametrize("seq_len", [11, 42])
+@pytest.mark.parametrize("qk_rope_head_dim", [64, 128])
+@pytest.mark.parametrize("num_q_heads", [128])
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
+@pytest.mark.parametrize("kv_lora_rank", [512])
+@pytest.mark.parametrize("num_blocks", [64])
+@pytest.mark.parametrize("block_size", [16, 64, 256])
+@pytest.mark.parametrize("seed", [0])
+@pytest.mark.parametrize(
+    "device", [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+)
+@torch.inference_mode()
+def test_concat_and_cache_mla_rope_fused(
+    default_vllm_config,
+    dtype: torch.dtype,
+    is_neox_style: bool,
+    seq_len: int,
+    qk_rope_head_dim: int,
+    num_q_heads: int,
+    kv_cache_dtype: str,
+    kv_lora_rank: int,
+    num_blocks: int,
+    block_size: int,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: float = 10000,
+) -> None:
+    set_random_seed(seed)
+    torch.set_default_device(device)
+
+    rope = RotaryEmbedding(
+        qk_rope_head_dim,
+        qk_rope_head_dim,
+        max_position,
+        base,
+        is_neox_style,
+        torch.float32,
+    )
+
+    rope = rope.to(dtype=dtype, device=torch.get_default_device())
+
+    positions = torch.randint(0, max_position, (seq_len,))
+
+    query = torch.randn(seq_len, num_q_heads, qk_rope_head_dim, dtype=dtype)
+    key = torch.randn(seq_len, 1, qk_rope_head_dim + kv_lora_rank, dtype=dtype)
+
+    k_pe = torch.flatten(key[..., :qk_rope_head_dim], start_dim=1).to(device=device)
+    kv_c = torch.flatten(key[..., qk_rope_head_dim:], start_dim=1).to(device=device)
+
+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_q_pe, ref_k_pe = rope.forward_native(positions, query, k_pe)
+    assert ref_k_pe is not None
+
+    ref_k_pe = torch.flatten(ref_k_pe, start_dim=1).to(device=device)
+    ref_k_rope = ref_k_pe[..., :qk_rope_head_dim]
+
+    total_available_slots = num_blocks * block_size
+    total_needed_slots = seq_len
+    assert total_available_slots >= total_needed_slots, "Not enough kv slots!"
+
+    slot_mapping_lst = random.sample(range(total_available_slots), total_needed_slots)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+
+    entry_size = kv_lora_rank + qk_rope_head_dim
+
+    kv_cache_scale = torch.tensor([0.1], dtype=torch.float32, device=device)
+
+    kv_cache = torch.zeros(
+        num_blocks,
+        block_size,
+        entry_size,
+        dtype=torch.uint8 if kv_cache_dtype == "fp8" else dtype,
+        device=device,
+    )
+
+    ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
+
+    for i in range(seq_len):
+        slot = slot_mapping[i].item()
+        block_idx = slot // block_size
+        block_offset = slot % block_size
+        ref_temp[block_idx, block_offset] = torch.cat((kv_c[i], ref_k_rope[i]), -1)
+
+    if kv_cache_dtype == "fp8":
+        ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype)
+        ops.convert_fp8(
+            ref_kv_cache, ref_temp, kv_cache_scale.item(), kv_dtype=kv_cache_dtype
+        )
+    else:
+        ref_kv_cache = ref_temp
+
+    opcheck(
+        torch.ops._C_cache_ops.concat_and_cache_mla_rope_fused,
+        (
+            positions,
+            query,
+            k_pe,
+            kv_c,
+            rope.cos_sin_cache,
+            is_neox_style,
+            slot_mapping,
+            kv_cache,
+            kv_cache_dtype,
+            kv_cache_scale,
+        ),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+    )
+
+    ops.concat_and_cache_mla_rope_fused(
+        positions,
+        query,
+        k_pe,
+        kv_c,
+        rope.cos_sin_cache,
+        is_neox_style,
+        slot_mapping,
+        kv_cache,
+        kv_cache_dtype,
+        kv_cache_scale,
+    )
+
+    if kv_cache_dtype == "fp8":
+        result_temp = torch.empty_like(kv_cache, dtype=torch.float16)
+        ops.convert_fp8(
+            result_temp,
+            kv_cache.contiguous(),
+            kv_cache_scale.item(),
+            kv_dtype=kv_cache_dtype,
+        )
+        expected_temp = torch.empty_like(ref_kv_cache, dtype=torch.float16)
+        ops.convert_fp8(
+            expected_temp, ref_kv_cache, kv_cache_scale.item(), kv_dtype=kv_cache_dtype
+        )
+        torch.testing.assert_close(result_temp, expected_temp, atol=0.001, rtol=0.1)
+    else:
+        torch.testing.assert_close(kv_cache, ref_kv_cache)
+
+    torch.testing.assert_close(
+        query, ref_q_pe, atol=get_default_atol(query), rtol=get_default_rtol(query)
+    )
diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py
index 6fca33acd48a3f9e465b7049fbad6c0d7616b6ed..98879ff6ed7fd7fc294748a9bfb041d5d32b1b6e 100644
--- a/tests/kernels/mamba/test_mamba_mixer2.py
+++ b/tests/kernels/mamba/test_mamba_mixer2.py
@@ -12,8 +12,8 @@ from vllm.distributed.parallel_state import (
     initialize_model_parallel,
 )
 from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated
-from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
 
 
 @multi_gpu_test(num_gpus=2)
@@ -68,7 +68,7 @@ def mixer2_gated_norm_tensor_parallel(
     dtype: torch.dtype,
     device: str,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     device = torch.device(f"cuda:{local_rank}")
     torch.cuda.set_device(device)
diff --git a/tests/kernels/mamba/untest_causal_conv1d.py b/tests/kernels/mamba/untest_causal_conv1d.py
index 4647b97c477183f0ec8864fd28ad2176fd4fdf9c..039f2fc06d57912f192500b06e6da74d1bd9a5c1 100644
--- a/tests/kernels/mamba/untest_causal_conv1d.py
+++ b/tests/kernels/mamba/untest_causal_conv1d.py
@@ -7,12 +7,12 @@ import torch
 import torch.nn.functional as F
 from einops import rearrange
 
-from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
     causal_conv1d_fn,
     causal_conv1d_update,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.utils import PAD_SLOT_ID
 
 
 def causal_conv1d_ref(
@@ -154,7 +154,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity
     if itype == torch.bfloat16:
         rtol, atol = 1e-2, 5e-2
     # set seed
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     batch = 2
     x = torch.randn(batch, dim, seqlen, device=device, dtype=itype)
     x_ref = x.clone()
@@ -201,7 +201,7 @@ def test_causal_conv1d_update_with_batch_gather(
         rtol, atol = 1e-2, 5e-2
 
     # set seed
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     padding = 5 if with_padding else 0
     padded_batch_size = batch_size + padding
@@ -278,7 +278,7 @@ def test_causal_conv1d_varlen(
     if itype == torch.bfloat16:
         rtol, atol = 1e-2, 5e-2
     # set seed
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     seqlens = []
     batch_size = batch
     padding = 3 if with_padding else 0
diff --git a/tests/kernels/mamba/untest_mamba_ssm.py b/tests/kernels/mamba/untest_mamba_ssm.py
index 50e48aad6ebaaba49c3129b5ec5b91205c6bee43..905207109474803b64e8d756574260354abf7ea0 100644
--- a/tests/kernels/mamba/untest_mamba_ssm.py
+++ b/tests/kernels/mamba/untest_mamba_ssm.py
@@ -8,12 +8,12 @@ from einops import rearrange, repeat
 
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops  # noqa: F401
-from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
     selective_scan_fn,
     selective_state_update,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.utils import PAD_SLOT_ID
 
 
 def selective_state_update_ref(
@@ -271,7 +271,7 @@ def test_selective_scan(
         rtolw = max(rtolw, rtol)
         atolw = max(atolw, atol)
     # set seed
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     batch_size = 1
     dim = 4
     dstate = 8
@@ -401,7 +401,7 @@ def test_selective_state_update(dim, dstate, has_z, itype):
         if torch.version.hip:
             atol *= 2
     # set seed
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     batch_size = 1
     state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device)
     x = torch.randn(batch_size, dim, device=device, dtype=itype)
@@ -438,7 +438,7 @@ def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len):
         if torch.version.hip:
             atol *= 2
     # set seed
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     batch_size = 4
     token_counts = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
     total_tokens = int(token_counts.sum().item())
@@ -857,7 +857,7 @@ def test_selective_state_update_with_num_accepted_tokens(
         if torch.version.hip:
             atol *= 2
 
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     batch_size = 4
 
     tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
@@ -983,7 +983,7 @@ def test_selective_state_update_varlen_with_num_accepted(
         if torch.version.hip:
             atol *= 2
 
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     batch_size = 4
 
     tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
diff --git a/tests/kernels/mamba/untest_mamba_ssm_ssd.py b/tests/kernels/mamba/untest_mamba_ssm_ssd.py
index 0b0b82e484a1c42323bfd65361cfc6667976abbe..40aa3d017d781d4abcf14b7a7279c0df8b31ef03 100644
--- a/tests/kernels/mamba/untest_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/untest_mamba_ssm_ssd.py
@@ -9,7 +9,7 @@ from einops import rearrange, repeat
 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
     mamba_chunk_scan_combined_varlen,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.attention.backends.mamba2_attn import compute_varlen_chunk_metadata
 
 # Added by the IBM Team, 2024
@@ -82,7 +82,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
 
 
 def generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype, device="cuda"):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     A = -torch.exp(torch.rand(n_heads, dtype=itype, device=device))
     dt = F.softplus(
         torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - 4
diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py
index 6078ce44cee9f14753d5acfd9fc104e967de4b96..537dcae4e74b4e687b48a8fd03c0c335d3b6def5 100644
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -258,16 +258,16 @@ class Config:
                     f"{self.fe_supported_types()}."
                 )
 
-        # Check block quanization support
-        is_block_quatized = self.quant_block_shape is not None
-        if is_block_quatized and self.quant_dtype is None:
+        # Check block quantization support
+        is_block_quantized = self.quant_block_shape is not None
+        if is_block_quantized and self.quant_dtype is None:
             return False, "No block quantization support."
 
-        if is_block_quatized and not self.is_block_quant_supported():
+        if is_block_quantized and not self.is_block_quant_supported():
             return False, "Mismatched block quantization support."
 
         # deep_gemm only works with block-quantized
-        if self.needs_deep_gemm() and not is_block_quatized:
+        if self.needs_deep_gemm() and not is_block_quantized:
             return False, "Needs DeepGEMM but not block quantized."
 
         # Check dependencies (turn into asserts?)
diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
index 95db6327c4f105ea6b6fc3decb53ff2693408bc8..08e50c52cbedb96107859de2482250e4854dd4d6 100644
--- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
+++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
@@ -10,7 +10,7 @@ from tqdm import tqdm
 
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import FUSED_MOE_UNQUANTIZED_CONFIG
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 from .common import (
     Config,
@@ -40,7 +40,7 @@ def rank_worker(
     config: Config,
     weights: WeightTensors,
 ):
-    current_platform.seed_everything(pgi.rank)
+    set_random_seed(pgi.rank)
 
     # sanity check
     from vllm import envs
diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
index a3e264c5f5e2889827dbe79a61bb32fafb1c2b4f..3cdc7b82130b80c544d5d9bb15afe10ccb578e6e 100644
--- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
+++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
@@ -9,7 +9,7 @@ from typing import Any
 import torch
 
 from vllm.config import VllmConfig
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 from .common import Config, RankTensors, WeightTensors, make_modular_kernel
 from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config
@@ -82,7 +82,7 @@ def rank_worker(
     config: Config,
     weights: WeightTensors,
 ):
-    current_platform.seed_everything(pgi.rank)
+    set_random_seed(pgi.rank)
 
     # sanity check
     from vllm import envs
diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 2ef170f1ab308e39f63bc9663d3f71ce538cdfaa..c9d425b5b9903642802807cb24ab26ab71d5d568 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl
+from vllm.utils.torch_utils import set_random_seed
 
 MNK_FACTORS = [
     (1, 128, 128),
@@ -115,7 +116,7 @@ def test_batched_mm(
 ):
     """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
     and those tests will be skipped on unsupported hardware."""
-    current_platform.seed_everything(7)
+    set_random_seed(7)
 
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
@@ -252,7 +253,7 @@ def test_fused_moe_batched_experts(
 ):
     """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
     and those tests will be skipped on unsupported hardware."""
-    current_platform.seed_everything(7)
+    set_random_seed(7)
 
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
diff --git a/tests/kernels/moe/test_cpu_fused_moe.py b/tests/kernels/moe/test_cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0f817a9ca7cd3627ff1407e2adceeee960f3b83
--- /dev/null
+++ b/tests/kernels/moe/test_cpu_fused_moe.py
@@ -0,0 +1,169 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
+from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
+from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
+
+if not current_platform.is_cpu():
+    pytest.skip("skipping CPU-only tests", allow_module_level=True)
+
+EXPERT_NUM = [
+    8,
+]
+HIDDEN_DIM = [128, 2880]
+INTERMEDIATE_DIM = [128, 2880]
+BATCH_SIZE = [1, 64, 256]
+ACT = ["silu", "swigluoai"]
+USE_BIAS = [True, False]
+ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
+DTYPE = [torch.bfloat16]
+
+
+def ref_fused_moe(
+    input: torch.Tensor,
+    w13: torch.Tensor,
+    w2: torch.Tensor,
+    w13_bias: torch.Tensor | None,
+    w2_bias: torch.Tensor | None,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    activation: str,
+) -> torch.Tensor:
+    len_experts = w13.size(0)
+
+    cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts))
+    cnts.scatter_(1, topk_ids.to(torch.int64), 1)
+    tokens_per_expert = cnts.sum(dim=0)
+    idxs = topk_ids.view(-1).argsort()
+
+    sorted_tokens = input[idxs // topk_ids.shape[1]]
+    tokens_per_expert = tokens_per_expert.cpu().numpy()
+
+    outputs = []
+    start_idx = 0
+
+    for i, num_tokens in enumerate(tokens_per_expert):
+        end_idx = start_idx + num_tokens
+        if num_tokens == 0:
+            continue
+        tokens_for_this_expert = sorted_tokens[start_idx:end_idx].float()
+        curr_w13 = w13[i].float()
+        curr_w2 = w2[i].float()
+
+        curr_w13_bias = None
+        if w13_bias is not None:
+            curr_w13_bias = w13_bias[i].float()
+
+        curr_w2_bias = None
+        if w2_bias is not None:
+            curr_w2_bias = w2_bias[i].float()
+
+        gate_up = torch.nn.functional.linear(
+            tokens_for_this_expert, curr_w13, curr_w13_bias
+        )
+        # Note: to simulate the kernel implementation
+        gate_up = (
+            _CPU_MOE_ACT[activation]
+            .forward_native(gate_up)
+            .to(dtype=input.dtype)
+            .float()
+        )
+        expert_out = torch.nn.functional.linear(gate_up, curr_w2, curr_w2_bias)
+
+        outputs.append(expert_out)
+        start_idx = end_idx
+
+    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+    new_x = torch.empty_like(outs)
+
+    new_x[idxs] = outs
+    final_out = (
+        new_x.view(*topk_ids.shape, -1)
+        .mul_(topk_weights.unsqueeze(dim=-1))
+        .sum(dim=1)
+        .type(input.dtype)
+    )
+    return final_out
+
+
+@pytest.mark.parametrize("batch_size", BATCH_SIZE)
+@pytest.mark.parametrize("expert_num", EXPERT_NUM)
+@pytest.mark.parametrize("hidden_size", HIDDEN_DIM)
+@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_DIM)
+@pytest.mark.parametrize("use_bias", USE_BIAS)
+@pytest.mark.parametrize("dtype", DTYPE)
+@pytest.mark.parametrize("act", ACT)
+@pytest.mark.parametrize("isa", ISA)
+def test_cpu_fused_moe(
+    default_vllm_config,
+    batch_size: int,
+    expert_num: int,
+    hidden_size: int,
+    intermediate_size: int,
+    use_bias: bool,
+    dtype: torch.dtype,
+    act: str,
+    isa: str,
+):
+    set_random_seed(0)
+
+    topk_num = max(expert_num // 2, 1)
+    up_dim = 2 * intermediate_size
+
+    input = torch.randn((batch_size, hidden_size), dtype=dtype) / (
+        0.5 * hidden_size**0.5
+    )
+    w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
+        0.5 * hidden_size**0.5
+    )
+    w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
+        0.5 * intermediate_size**0.5
+    )
+    router_logits = torch.randn((batch_size, expert_num), dtype=dtype)
+    w13_bias = None
+    w2_bias = None
+    if use_bias:
+        w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5)
+        w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
+            0.5 * hidden_size**0.5
+        )
+    score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk_num)
+    topk_ids = topk_ids.to(torch.int32)
+
+    ref_output = ref_fused_moe(
+        input,
+        w13,
+        w2,
+        w13_bias,
+        w2_bias,
+        topk_weight,
+        topk_ids,
+        act,
+    )
+
+    packed_w13 = cpu_prepack_moe_weight(w13, isa)
+    packed_w2 = cpu_prepack_moe_weight(w2, isa)
+    output = cpu_fused_moe(
+        input,
+        packed_w13,
+        packed_w2,
+        w13_bias,
+        w2_bias,
+        topk_weight,
+        topk_ids,
+        act,
+        isa,
+    )
+
+    atol, rtol = get_default_atol(output), get_default_rtol(output)
+    (
+        torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol),
+        f"{torch.max(torch.abs(output - ref_output))}",
+    )
diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py
deleted file mode 100644
index 1c10cb3b2c699e64916bfbd9a3db3d23e9f7e265..0000000000000000000000000000000000000000
--- a/tests/kernels/moe/test_cutlass_grouped_gemm.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# DeepGEMM Style Cutlass Grouped GEMM Test
-# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py
-
-import random
-
-import pytest
-import torch
-
-from tests.kernels.moe.utils import per_token_cast_to_fp8
-from tests.kernels.utils import baseline_scaled_mm
-from vllm import _custom_ops as ops
-from vllm.platforms import current_platform
-from vllm.utils.deep_gemm import per_block_cast_to_fp8
-from vllm.utils.math_utils import cdiv
-
-
-@pytest.mark.parametrize(
-    "num_groups, expected_m_per_group, k, n",
-    [
-        (4, 8192, 7168, 4096),
-        (4, 8192, 2048, 7168),
-        (8, 4096, 7168, 4096),
-        (8, 4096, 2048, 7168),
-        (32, 1024, 7168, 4096),
-        (32, 1024, 2048, 7168),
-    ],
-)
-@pytest.mark.parametrize("out_dtype", [torch.float16])
-@pytest.mark.skipif(
-    (lambda x: x is None or x.to_int() != 100)(
-        current_platform.get_device_capability()
-    ),
-    reason="Block Scaled Grouped GEMM is only supported on SM100.",
-)
-def test_cutlass_grouped_gemm(
-    num_groups: int,
-    expected_m_per_group: int,
-    k: int,
-    n: int,
-    out_dtype: torch.dtype,
-):
-    device = "cuda"
-    alignment = 128
-    group_ms = [
-        int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups)
-    ]
-    m = sum([cdiv(m, alignment) * alignment for m in group_ms])
-
-    x = torch.randn((m, k), device=device, dtype=out_dtype)
-    y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
-    out = torch.empty((m, n), device=device, dtype=out_dtype)
-    ref_out = torch.randn((m, n), device=device, dtype=out_dtype)
-
-    ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
-    pb_size = []
-    for i in range(num_groups):
-        pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
-    problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
-    expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
-
-    x_fp8 = per_token_cast_to_fp8(x)
-    y_fp8 = (
-        torch.empty_like(y, dtype=torch.float8_e4m3fn),
-        torch.empty(
-            (num_groups, cdiv(n, 128), k // 128), device=device, dtype=torch.float
-        ),
-    )
-    for i in range(num_groups):
-        y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128])
-
-    for i in range(num_groups):
-        a = x_fp8[0][ep_offset[i] : ep_offset[i + 1]]
-        a_scale = x_fp8[1][ep_offset[i] : ep_offset[i + 1]]
-        b = y_fp8[0][i].t()
-        b_scale = y_fp8[1][i].t()
-        baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
-        ref_out[ep_offset[i] : ep_offset[i + 1]] = baseline
-
-    ops.cutlass_blockwise_scaled_grouped_mm(
-        out,
-        x_fp8[0],
-        y_fp8[0],
-        x_fp8[1],
-        y_fp8[1],
-        problem_sizes,
-        expert_offsets[:-1],
-    )
-
-    torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index f427734ef09e21ae072f01b9a38865c357d300c8..8987b688ab4abd012e3f6822ef14b980f773e134 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -22,13 +22,13 @@ from vllm.model_executor.layers.fused_moe.config import (
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
-from vllm.platforms import current_platform
 from vllm.utils.deep_gemm import (
     get_mk_alignment_for_contiguous_layout,
     is_deep_gemm_e8m0_used,
     is_deep_gemm_supported,
 )
 from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.workspace import init_workspace_manager
 
 from ...utils import multi_gpu_test
@@ -367,7 +367,7 @@ def _test_deepep_deepgemm_moe(
     device = torch.device(f"cuda:{pgi.local_rank}")
     init_workspace_manager(device)
 
-    current_platform.seed_everything(pgi.rank)
+    set_random_seed(pgi.rank)
 
     w1 = w1.to(device=torch.cuda.current_device())
     w2 = w2.to(device=torch.cuda.current_device())
@@ -456,7 +456,7 @@ def test_ht_deepep_deepgemm_moe(
     """
 
     m, n, k = mnk
-    current_platform.seed_everything(7)
+    set_random_seed(7)
 
     if topk > num_experts:
         pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
@@ -531,7 +531,7 @@ def test_ll_deepep_deepgemm_moe(
     assert not is_deep_gemm_e8m0_used()
 
     m, n, k = mnk
-    current_platform.seed_everything(7)
+    set_random_seed(7)
 
     if topk > num_experts:
         pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py
index e698ca92a1515865d5606f1410178c301b52d546..e57e0d72067e48b3f6d88fa63f8eb79e785ca904 100644
--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -20,8 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8,
 )
-from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_deep_ep
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.workspace import init_workspace_manager
 
 from ...utils import multi_gpu_test
@@ -446,7 +446,7 @@ def test_deep_ep_moe(
     low_latency_mode = False
     use_fp8_dispatch = False
 
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     world_size, dp_size = world_dp_size
     config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts)
 
@@ -507,7 +507,7 @@ def test_low_latency_deep_ep_moe(
             f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}"
         )
 
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     world_size, dp_size = world_dp_size
     config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts)
 
diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py
index bf4ef2d30466b8c650efb27a92c7d2f415f2c279..bb2f6b8739417be874e837d50dfa7ff124debc57 100644
--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -11,17 +11,23 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     fp8_w8a8_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+    FlashInferExperts,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    apply_flashinfer_per_tensor_scale_fp8,
-    flashinfer_cutlass_moe_fp8,
-    register_moe_scaling_factors,
-    rotate_flashinfer_fp8_moe_weights,
+    apply_fi_trtllm_fp8_per_tensor_moe,
+    register_scales_for_trtllm_fp8_per_tensor_moe,
+    rotate_weights_for_fi_trtllm_fp8_per_tensor_moe,
     swap_w13_to_w31,
 )
 from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8
 from vllm.model_executor.models.llama4 import Llama4MoE
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 try:
     from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
@@ -84,7 +90,7 @@ class TestData:
 
     @staticmethod
     def make_moe_tensors_8bit(
-        m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu"
+        m: int, k: int, n: int, e: int, is_trtllm: bool, activation: str = "silu"
     ) -> "TestData":
         is_gated = activation != "relu2_no_mul"
 
@@ -102,6 +108,7 @@ class TestData:
         w2_quantized, w2_weight_scale = quant_fp8_per_tensor_batches(w2)
 
         layer = torch.nn.Module()
+        layer.orig_dtype = torch.bfloat16
         layer.w13_weight = w13_quantized.clone()
         layer.w2_weight = w2_quantized.clone()
         layer.w13_input_scale = a1_scale
@@ -114,20 +121,27 @@ class TestData:
             pcp_size=1,
             dp_size=1,
             ep_size=1,
-            tp_rank=1,
-            pcp_rank=1,
-            dp_rank=1,
-            ep_rank=1,
+            tp_rank=0,
+            pcp_rank=0,
+            dp_rank=0,
+            ep_rank=0,
             use_ep=False,
             all2all_backend="naive",
         )
 
-        register_moe_scaling_factors(layer)
-
         # flashinfer expects swapped rows for w13
         layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
-        if reorder:
-            rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
+        if is_trtllm:
+            rotate_weights_for_fi_trtllm_fp8_per_tensor_moe(
+                layer.w13_weight, layer.w2_weight
+            )
+            register_scales_for_trtllm_fp8_per_tensor_moe(
+                layer,
+                layer.w13_weight_scale,
+                layer.w13_input_scale,
+                layer.w2_weight_scale,
+                layer.w2_input_scale,
+            )
         layer.custom_routing_function = Llama4MoE.custom_routing_function
         layer.intermediate_size_per_partition = n
         layer.ep_rank = 0
@@ -158,10 +172,10 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
 ):
     if not current_platform.has_device_capability(100):
         pytest.skip("Test is only supported for sm >= 100")
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
     with set_current_vllm_config(vllm_config):
-        td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
+        td = TestData.make_moe_tensors_8bit(m, k, n, e, is_trtllm=True)
 
         score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
         topk_weights, topk_ids = Llama4MoE.custom_routing_function(
@@ -193,7 +207,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
             quant_config=quant_config,
         )
 
-        flashinfer_output = apply_flashinfer_per_tensor_scale_fp8(
+        flashinfer_output = apply_fi_trtllm_fp8_per_tensor_moe(
             layer=td.layer,
             hidden_states=td.hidden_states,
             router_logits=score,
@@ -222,11 +236,11 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
     monkeypatch,
     workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
     with set_current_vllm_config(vllm_config):
         td = TestData.make_moe_tensors_8bit(
-            m, k, n, e, reorder=False, activation=activation
+            m, k, n, e, is_trtllm=False, activation=activation
         )
 
         score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
@@ -271,17 +285,34 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
         td.layer.get_fused_moe_quant_config = get_fused_moe_quant_config
         td.layer.quant_method = td.layer
 
-        flashinfer_cutlass_output = flashinfer_cutlass_moe_fp8(
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(
+                defer_input_quant=quant_config.is_block_quantized
+            ),
+            FlashInferExperts(
+                out_dtype=td.layer.orig_dtype,
+                quant_config=quant_config,
+                ep_rank=td.layer.moe_parallel_config.ep_rank,
+                ep_size=td.layer.moe_parallel_config.ep_size,
+                tp_rank=td.layer.moe_parallel_config.tp_rank,
+                tp_size=td.layer.moe_parallel_config.tp_size,
+                use_dp=False,
+                use_deepseek_fp8_block_scale=False,
+            ),
+        )
+
+        flashinfer_cutlass_output = kernel(
             td.hidden_states,
-            td.layer,
+            td.layer.w13_weight,
+            td.layer.w2_weight,
             topk_weights,
             topk_ids,
+            inplace=False,
             activation=activation,
             global_num_experts=e,
             expert_map=None,
             apply_router_weight_on_input=True,
         )
-
         torch.testing.assert_close(
             output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2
         )
diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py
index 133a8a4a30a60e8aef8222511ea076a9d61a6c60..1262eea70baba9f0b191a3cb2ae44c068b480db7 100644
--- a/tests/kernels/moe/test_flashinfer_moe.py
+++ b/tests/kernels/moe/test_flashinfer_moe.py
@@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+from vllm.utils.torch_utils import set_random_seed
 
 if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability(
     100
@@ -60,7 +61,7 @@ def test_flashinfer_fp4_moe_no_graph(
     activation: str,
     workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     with set_current_vllm_config(
         VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
     ):
diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py
index 662e0723b75833def8e8f0c1b07eb74b996663dc..f676cc4fee1b0b87752ba5c3cd5b80c6fa621304 100644
--- a/tests/kernels/moe/test_grouped_topk.py
+++ b/tests/kernels/moe/test_grouped_topk.py
@@ -8,11 +8,18 @@ Run `pytest tests/kernels/moe/test_grouped_topk.py`.
 import pytest
 import torch
 
+from vllm.config import (
+    CompilationConfig,
+    VllmConfig,
+    get_cached_compilation_config,
+    set_current_vllm_config,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import (
+    GroupedTopk,
     fused_grouped_topk,
-    grouped_topk,
 )
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 
 @pytest.mark.skipif(
@@ -27,7 +34,8 @@ from vllm.platforms import current_platform
 @pytest.mark.parametrize("topk_group", [2])
 @pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"])
 @pytest.mark.parametrize("routed_scaling_factor", [1.0, 2.5])
-@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
+@pytest.mark.parametrize("input_dtype", [torch.bfloat16, torch.float32])
+@pytest.mark.parametrize("bias_dtype", [torch.float32])
 def test_grouped_topk(
     monkeypatch: pytest.MonkeyPatch,
     n_token: int,
@@ -39,26 +47,33 @@ def test_grouped_topk(
     topk_group: int,
     scoring_func: str,
     routed_scaling_factor: float,
-    dtype: torch.dtype,
+    input_dtype: torch.dtype,
+    bias_dtype: torch.dtype,
 ):
-    current_platform.seed_everything(0)
-    hidden_states = torch.randn((n_token, n_hidden), dtype=dtype, device="cuda")
-    gating_output = torch.randn((n_token, n_expert), dtype=dtype, device="cuda")
-    e_score_correction_bias = torch.randn(
-        (n_expert,), dtype=torch.float32, device="cuda"
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(custom_ops=["all", "+grouped_topk"])
     )
+    get_cached_compilation_config.cache_clear()
+
+    set_random_seed(0)
+    hidden_states = torch.randn((n_token, n_hidden), dtype=input_dtype, device="cuda")
+    gating_output = torch.randn((n_token, n_expert), dtype=input_dtype, device="cuda")
+    e_score_correction_bias = torch.randn((n_expert,), dtype=bias_dtype, device="cuda")
 
-    with monkeypatch.context() as m:
+    with set_current_vllm_config(vllm_config), monkeypatch.context() as m:
         m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0")
-        baseline_topk_weights, baseline_topk_ids = grouped_topk(
-            hidden_states=hidden_states,
-            gating_output=gating_output,
+        grouped_topk = GroupedTopk(
             topk=topk,
             renormalize=renormalize,
             num_expert_group=num_expert_group,
             topk_group=topk_group,
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
+        )
+        assert grouped_topk._forward_method.__name__ == "forward_cuda"
+        baseline_topk_weights, baseline_topk_ids = grouped_topk(
+            hidden_states=hidden_states,
+            gating_output=gating_output,
             e_score_correction_bias=e_score_correction_bias,
         )
 
diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py
index 6ebf1016c166c782975d4b1d4f51b671e8c149dc..ec31e66140a1acc0482625afac38cb7a91e6c67b 100644
--- a/tests/kernels/moe/test_modular_kernel_combinations.py
+++ b/tests/kernels/moe/test_modular_kernel_combinations.py
@@ -15,7 +15,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
-from vllm.utils.torch_utils import cuda_device_count_stateless
+from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed
 from vllm.v1.worker.workspace import init_workspace_manager
 
 from .modular_kernel_tools.common import (
@@ -82,7 +82,7 @@ def rank_worker(
     device = torch.device(f"cuda:{pgi.local_rank}")
     init_workspace_manager(device)
 
-    current_platform.seed_everything(pgi.rank)
+    set_random_seed(pgi.rank)
 
     # sanity check
     from vllm import envs
diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py
index 1abb08f878b2bd414846b6f08b48e8fbcc335294..8733ba4d8e319d18544aa5c32bc4774ec044a14e 100644
--- a/tests/kernels/moe/test_modular_oai_triton_moe.py
+++ b/tests/kernels/moe/test_modular_oai_triton_moe.py
@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
 )
 from vllm.model_executor.layers.utils import shuffle_weight
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 MNK = [
     (1, 512, 384),
@@ -211,7 +212,7 @@ def test_oai_triton_moe(
     unfused: bool,
     workspace_init,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     (
         w1,
         w2,
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index d4e4682830510cc2e6525fc80bb8459d14eae24c..67598397370f3b5df604b097a7f3980e61d36c4e 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -60,10 +60,14 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
 from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.worker.workspace import init_workspace_manager
 
 NUM_EXPERTS = [8, 64, 192]
+NUM_EXPERTS_LARGE = [128, 256]
 EP_SIZE = [1, 4]
 TOP_KS = [2, 6]
+TOP_KS_SMALL = [1, 2]
 
 MOE_MARLIN_QUANT_TEST_CONFIGS = [
     # AWQ-INT4
@@ -131,6 +135,13 @@ FUSED_MOE_MNK_FACTORS = [
     (40000, 1024, 1024),
 ]
 
+FUSED_MOE_MNK_FACTORS_SMALL_M = [
+    (1, 128, 128),
+    (1, 2048, 128),
+    (2, 2048, 128),
+    (2, 2048, 511),
+]
+
 FUSED_MOE_WN16_MNK_FACTORS = [
     (1, 128, 128),
     (1, 1024, 1024),
@@ -233,7 +244,7 @@ def test_fused_moe(
     monkeypatch,
     workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
 
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
 
@@ -328,6 +339,111 @@ def test_fused_moe(
         )
 
 
+@pytest.mark.parametrize("m,n,k", FUSED_MOE_MNK_FACTORS_SMALL_M)
+@pytest.mark.parametrize("e", NUM_EXPERTS_LARGE)
+@pytest.mark.parametrize("topk", TOP_KS_SMALL)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("padding", [True, False])
+@pytest.mark.parametrize("chunk_size", [8192])
+def test_naive_block_assignment_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    padding: bool,
+    chunk_size: int,
+    monkeypatch,
+    workspace_init,
+):
+    current_platform.seed_everything(7)
+
+    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
+
+    #
+    # Setup test data
+    #
+
+    #
+    # Setup test data
+    #
+
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+
+    e_map = None
+
+    #
+    # Setup test functions
+    #
+    quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
+
+    m_fused_moe_fn = modular_triton_fused_moe(quant_config)
+
+    def m_fused_moe(
+        a: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        score: torch.Tensor,
+        topk: int,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
+        return m_fused_moe_fn(
+            a,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+        )
+
+    fused_moe_fn = functools.partial(fused_moe, renormalize=False)
+
+    #
+    # Run tests
+    #
+    runner = functools.partial(
+        run_moe_test,
+        a=a,
+        w1=w1,
+        w2=w2,
+        score=score,
+        topk=topk,
+        global_num_experts=e,
+        expert_map=e_map,
+        padding=padding,
+    )
+
+    # Note: for now use_compile will error out if the problem size is
+    # large enough to trigger chunking. I'm leaving the flag and
+    # setup code in case we are able to revisit this later.
+    use_compile = False
+
+    use_cudagraph = n >= 1024 and k >= 1024 and current_platform.is_cuda_alike()
+
+    with set_current_vllm_config(vllm_config):
+        baseline_output = runner(torch_moe, iterative_moe)
+        runner(
+            baseline_output,
+            fused_moe_fn,
+            use_compile=use_compile,
+            use_cudagraph=use_cudagraph,
+        )
+        runner(
+            baseline_output,
+            m_fused_moe,
+            use_compile=use_compile,
+            use_cudagraph=use_cudagraph,
+        )
+
+
 @pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
@@ -466,7 +582,12 @@ def test_fused_moe_wn16(
 )
 @torch.inference_mode()
 def test_mixtral_moe(
-    dist_init, dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch
+    default_vllm_config,
+    dist_init,
+    dtype: torch.dtype,
+    padding: bool,
+    use_rocm_aiter: bool,
+    monkeypatch,
 ):
     """Make sure our Mixtral MoE implementation agrees with the one from
     huggingface."""
@@ -487,6 +608,7 @@ def test_mixtral_moe(
     monkeypatch.setenv("MASTER_ADDR", "localhost")
     monkeypatch.setenv("MASTER_PORT", "12345")
     init_distributed_environment()
+    init_workspace_manager(torch.cuda.current_device())
 
     # Instantiate our and huggingface's MoE blocks
     vllm_config.compilation_config.static_forward_context = dict()
@@ -540,6 +662,11 @@ def test_mixtral_moe(
             torch.cuda.synchronize()
             torch.cuda.empty_cache()
 
+        # FIXME (zyongye) fix this after we move self.kernel
+        # assignment in FusedMoE.__init__
+
+        vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
+
         # Run forward passes for both MoE blocks
         hf_states, _ = hf_moe.forward(hf_inputs)
         vllm_states = vllm_moe.forward(vllm_inputs)
diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py
index 1abfc11fb460e9d3c6b16094ed861b7cd78c2249..652a2ee21614026b97e01dfc206f35c092cfa6dc 100644
--- a/tests/kernels/moe/test_moe_align_block_size.py
+++ b/tests/kernels/moe/test_moe_align_block_size.py
@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
+from vllm.utils.torch_utils import set_random_seed
 
 NUM_TOKENS = [1, 3, 256, 2256, 4096]
 NUM_EXPERTS = [32, 160, 256, 257]
 TOP_KS = [1, 2, 16, 32]
 BLOCK_SIZES = [32, 128]
-current_platform.seed_everything(0)
+set_random_seed(0)
 
 
 def _group_tokens_by_expert(
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index 35e554e16cb38553f4cb57618ec744b5e9957550..c08a54f0e9f6e8e19c0ad47b892d8c09b92f5dcb 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceDelegate,
 )
-from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.workspace import init_workspace_manager
 
 from ...utils import multi_gpu_test
@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts(
     dtype: torch.dtype,
     workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
 
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow(
     if per_act_token_quant and block_shape is not None:
         pytest.skip("Skip illegal quantization combination")
 
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     m, n, k = mnk
     world_size, dp_size = world_dp_size
     device = "cuda"
@@ -809,7 +809,7 @@ def test_pplx_moe_slow(
     block_shape: list[int] | None,
     use_internode: bool,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     m, n, k = mnk
     world_size, dp_size = world_dp_size
 
@@ -888,7 +888,7 @@ def _pplx_test_loop(
         new_vllm_config.parallel_config.enable_expert_parallel = True
         _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank)
 
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     combos = itertools.product(
         PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]
     )
@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize(
     world_dp_size: tuple[int, int],
     use_internode: bool,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     world_size, dp_size = world_dp_size
     parallel_launch(
         world_size * dp_size,
@@ -1005,7 +1005,7 @@ def test_pplx_moe(
     use_internode: bool,
     use_shared_experts: bool,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     world_size, dp_size = world_dp_size
     parallel_launch(
         world_size,
diff --git a/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
index e4617072cd52c41cc2880d80a47901dfb56fafdb..cca02928b4982c3a2baa264c03547b5f83cd36aa 100644
--- a/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
+++ b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+from vllm.utils.torch_utils import set_random_seed
 
 FLOAT8_DTYPE = torch.float8_e4m3fn
 GROUP_SIZE = 128
@@ -67,8 +68,12 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten
 
 @pytest.mark.parametrize("T", [128, 256, 512])
 @pytest.mark.parametrize("N", [128 * 2, 256 * 2, 768 * 2, 2048 * 2, 7168 * 2])
+@pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason="ROCm does not support DeepGemm.",
+)
 def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
-    current_platform.seed_everything(42)
+    set_random_seed(42)
 
     input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
 
diff --git a/tests/kernels/moe/test_triton_moe_no_act_mul.py b/tests/kernels/moe/test_triton_moe_no_act_mul.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d5180f964ee7cbf29d985e6301870be09cc87b
--- /dev/null
+++ b/tests/kernels/moe/test_triton_moe_no_act_mul.py
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for MoE with non-gated activations (*_no_mul).
+
+These tests verify that MoE layers work correctly with activations like
+silu_no_mul, gelu_no_mul, relu2_no_mul where the activation output dimension
+equals N (not N // 2 like gated activations).
+"""
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.fused_moe.config import (
+    FUSED_MOE_UNQUANTIZED_CONFIG,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.utils import (
+    GELU_NO_MUL,
+    RELU2_NO_MUL,
+    SILU_NO_MUL,
+)
+from vllm.platforms import current_platform
+
+# Test parameters
+M_SIZES = [1, 16, 64]
+N_SIZES = [128, 256]
+K_SIZES = [64, 128]
+TOPK_VALUES = [1, 2]
+NUM_EXPERTS = 8
+NO_MUL_ACTIVATIONS = [SILU_NO_MUL, GELU_NO_MUL, RELU2_NO_MUL]
+
+
+def make_test_tensors(
+    m: int,
+    n: int,
+    k: int,
+    num_experts: int,
+    topk: int,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str = "cuda",
+):
+    """Create test tensors for MoE with non-gated activation.
+
+    For non-gated activations (*_no_mul):
+    - w1: (E, N, K) - projects from K to N
+    - w2: (E, K, N) - projects from N back to K (note: N, not N//2)
+    """
+    hidden_states = torch.randn(m, k, dtype=dtype, device=device)
+
+    # For non-gated: w1 projects K -> N, w2 projects N -> K
+    w1 = torch.randn(num_experts, n, k, dtype=dtype, device=device) * 0.1
+    w2 = torch.randn(num_experts, k, n, dtype=dtype, device=device) * 0.1
+
+    topk_weights = torch.ones(m, topk, dtype=torch.float32, device=device) / topk
+    topk_ids = torch.randint(0, num_experts, (m, topk), device=device)
+
+    return hidden_states, w1, w2, topk_weights, topk_ids
+
+
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(80),
+    reason="Requires compute capability >= 8.0",
+)
+@pytest.mark.parametrize("m", M_SIZES)
+@pytest.mark.parametrize("n", N_SIZES)
+@pytest.mark.parametrize("k", K_SIZES)
+@pytest.mark.parametrize("topk", TOPK_VALUES)
+@pytest.mark.parametrize("activation", NO_MUL_ACTIVATIONS)
+@torch.inference_mode()
+def test_triton_experts_no_mul_activation(
+    m: int,
+    n: int,
+    k: int,
+    topk: int,
+    activation: str,
+):
+    hidden_states, w1, w2, topk_weights, topk_ids = make_test_tensors(
+        m, n, k, NUM_EXPERTS, topk
+    )
+
+    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)
+
+    ws1_shape, ws2_shape, out_shape = experts.workspace_shapes(
+        M=m,
+        N=n,
+        K=k,
+        topk=topk,
+        global_num_experts=NUM_EXPERTS,
+        local_num_experts=NUM_EXPERTS,
+        expert_tokens_meta=None,
+        activation=activation,
+    )
+
+    # Verify workspace shapes are correct for no_mul activation
+    # workspace1 should handle activation_out_dim = N (not N//2)
+    assert ws1_shape == (m, topk, max(n, k)), (
+        f"workspace1 shape mismatch: expected {(m, topk, max(n, k))}, got {ws1_shape}"
+    )
+    # workspace2 should handle max(N, K) for intermediate_cache1/cache3
+    assert ws2_shape == (m, topk, max(n, k)), (
+        f"workspace2 shape mismatch: expected {(m, topk, max(n, k))}, got {ws2_shape}"
+    )
+    assert out_shape == (m, k), (
+        f"output shape mismatch: expected {(m, k)}, got {out_shape}"
+    )
+
+    workspace1 = torch.empty(
+        ws1_shape[0] * ws1_shape[1] * ws1_shape[2],
+        dtype=hidden_states.dtype,
+        device=hidden_states.device,
+    )
+    workspace2 = torch.empty(
+        ws2_shape[0] * ws2_shape[1] * ws2_shape[2],
+        dtype=hidden_states.dtype,
+        device=hidden_states.device,
+    )
+    output = torch.zeros(m, k, dtype=hidden_states.dtype, device=hidden_states.device)
+
+    experts.apply(
+        output=output,
+        hidden_states=hidden_states,
+        w1=w1,
+        w2=w2,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        activation=activation,
+        global_num_experts=NUM_EXPERTS,
+        expert_map=None,
+        a1q_scale=None,
+        a2_scale=None,
+        workspace13=workspace1,
+        workspace2=workspace2,
+        expert_tokens_meta=None,
+        apply_router_weight_on_input=False,
+    )
+
+    assert output.shape == (m, k), f"Expected shape {(m, k)}, got {output.shape}"
+    assert not torch.isnan(output).any(), "Output contains NaN"
+    assert not torch.isinf(output).any(), "Output contains Inf"
+    assert output.abs().sum() > 0, "Output is all zeros"
+
+
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(80),
+    reason="Requires compute capability >= 8.0",
+)
+@torch.inference_mode()
+def test_workspace_shapes_no_mul_vs_gated():
+    """Test that workspace shapes differ correctly between gated and non-gated."""
+    from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+
+    M, N, K, topk = 64, 256, 128, 2
+
+    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)
+
+    ws1_no_mul, _, out_no_mul = experts.workspace_shapes(
+        M, N, K, topk, 8, 8, None, SILU_NO_MUL
+    )
+
+    ws1_gated, _, out_gated = experts.workspace_shapes(
+        M, N, K, topk, 8, 8, None, "silu"
+    )
+
+    # For no_mul: activation_out_dim = N
+    # For gated: activation_out_dim = N // 2
+    # workspace1 should use max(activation_out_dim, K)
+    activation_out_dim_no_mul = N
+    activation_out_dim_gated = N // 2
+
+    assert ws1_no_mul[2] == max(activation_out_dim_no_mul, K), (
+        f"no_mul workspace1 last dim should be max({activation_out_dim_no_mul}, {K})"
+    )
+    assert ws1_gated[2] == max(activation_out_dim_gated, K), (
+        f"gated workspace1 last dim should be max({activation_out_dim_gated}, {K})"
+    )
+
+    # Output shapes should be the same
+    assert out_no_mul == out_gated == (M, K)
+
+
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(80),
+    reason="Requires compute capability >= 8.0",
+)
+@torch.inference_mode()
+def test_adjust_n_for_activation():
+    """Test the adjust_N_for_activation method."""
+    from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+
+    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)
+
+    N = 256
+
+    # Gated activations should return N // 2
+    assert experts.adjust_N_for_activation(N, "silu") == N // 2
+    assert experts.adjust_N_for_activation(N, "gelu") == N // 2
+
+    # Non-gated activations should return N
+    assert experts.adjust_N_for_activation(N, SILU_NO_MUL) == N
+    assert experts.adjust_N_for_activation(N, GELU_NO_MUL) == N
+    assert experts.adjust_N_for_activation(N, RELU2_NO_MUL) == N
diff --git a/tests/kernels/moe/untest_cutlass_moe.py b/tests/kernels/moe/untest_cutlass_moe.py
index 0160694d7bb548163687e271ae5fb08ac7adfefd..cd5bf47d69e5aadd05e968d33b380fe75c3129cd 100644
--- a/tests/kernels/moe/untest_cutlass_moe.py
+++ b/tests/kernels/moe/untest_cutlass_moe.py
@@ -7,19 +7,25 @@ from math import prod
 import pytest
 import torch
 
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
     FUSED_MOE_UNQUANTIZED_CONFIG,
+    FusedMoEQuantConfig,
     fp8_w8a8_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-    cutlass_moe_fp8,
+    CutlassExpertsFp8,
     run_cutlass_moe_fp8,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 NUM_EXPERTS = [40, 64]
 TOP_KS = [6, 8]
@@ -149,16 +155,15 @@ class MOETensors8Bit(MOETensors):
 
 
 def run_with_expert_maps(
-    num_experts: int, num_local_experts: int, **cutlass_moe_kwargs
+    num_experts: int,
+    num_local_experts: int,
+    quant_config: FusedMoEQuantConfig,
+    **cutlass_moe_kwargs,
 ):
     def slice_experts():
         slice_params = [
-            "w1_q",
-            "w2_q",
-            "ab_strides1",
-            "ab_strides2",
-            "c_strides1",
-            "c_strides2",
+            "w1",
+            "w2",
         ]
         full_tensors = {
             k: v
@@ -166,8 +171,6 @@ def run_with_expert_maps(
             if k in slice_params and k in cutlass_moe_kwargs
         }
 
-        quant_config = cutlass_moe_kwargs["quant_config"]
-
         for i in range(0, num_experts, num_local_experts):
             s, e = i, i + num_local_experts
 
@@ -186,13 +189,23 @@ def run_with_expert_maps(
             new_quant_config._w1.scale = quant_config.w1_scale[s:e]
             new_quant_config._w2.scale = quant_config.w2_scale[s:e]
 
-            cutlass_moe_kwargs["quant_config"] = new_quant_config
-
-            yield cutlass_moe_kwargs
-
-    out_tensor = torch.zeros_like(cutlass_moe_kwargs["a"])
-    for kwargs in slice_experts():
-        out_tensor = out_tensor + cutlass_moe_fp8(**kwargs)
+            yield cutlass_moe_kwargs, new_quant_config
+
+    out_tensor = torch.zeros_like(cutlass_moe_kwargs["hidden_states"])
+    for kwargs, new_quant_config in slice_experts():
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            CutlassExpertsFp8(
+                out_dtype=kwargs["hidden_states"].dtype,
+                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+                e=kwargs["w2"].shape[0],  # type: ignore[union-attr]
+                n=kwargs["w2"].shape[2],  # type: ignore[union-attr]
+                k=kwargs["w2"].shape[1],  # type: ignore[union-attr]
+                quant_config=new_quant_config,
+                device="cuda",
+            ),
+        )
+        out_tensor = out_tensor + kernel(**kwargs)
 
     return out_tensor
 
@@ -229,27 +242,35 @@ def run_8_bit(
     )
 
     kwargs = {
-        "a": moe_tensors.a,
-        "w1_q": moe_tensors.w1_q,  # type: ignore[union-attr]
-        "w2_q": moe_tensors.w2_q,  # type: ignore[union-attr]
+        "hidden_states": moe_tensors.a,
+        "w1": moe_tensors.w1_q,  # type: ignore[union-attr]
+        "w2": moe_tensors.w2_q,  # type: ignore[union-attr]
         "topk_weights": topk_weights,
         "topk_ids": topk_ids,
-        "ab_strides1": moe_tensors.ab_strides1,
-        "ab_strides2": moe_tensors.ab_strides2,
-        "c_strides1": moe_tensors.c_strides1,
-        "c_strides2": moe_tensors.c_strides2,
-        "quant_config": quant_config,
     }
 
     num_experts = moe_tensors.w1.size(0)
     with_ep = num_local_experts is not None or num_local_experts == num_experts
     if not with_ep:
-        return cutlass_moe_fp8(**kwargs)
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            CutlassExpertsFp8(
+                out_dtype=moe_tensors.a.dtype,
+                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+                e=moe_tensors.w2_q.shape[0],  # type: ignore[union-attr]
+                n=moe_tensors.w2_q.shape[2],  # type: ignore[union-attr]
+                k=moe_tensors.w2_q.shape[1],  # type: ignore[union-attr]
+                quant_config=quant_config,
+                device="cuda",
+            ),
+        )
+        return kernel(**kwargs)
 
     assert num_local_experts is not None
     return run_with_expert_maps(
         num_experts,
         num_local_experts,  # type: ignore[arg-type]
+        quant_config,
         **kwargs,
     )
 
@@ -277,7 +298,7 @@ def test_cutlass_moe_8_bit_no_graph(
     workspace_init,
     ep_size: int | None = None,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
     with set_current_vllm_config(vllm_config):
         mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch)
@@ -332,7 +353,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
     monkeypatch,
     workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
     with set_current_vllm_config(vllm_config):
         dtype = torch.half
@@ -469,7 +490,7 @@ def test_run_cutlass_moe_fp8(
     ep_size: int,
     workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     with set_current_vllm_config(vllm_config):
         mt = MOETensors8Bit.make_moe_tensors_8bit(
             m, k, n, e, per_act_token, per_out_channel
diff --git a/tests/kernels/moe/untest_moe_permute_unpermute.py b/tests/kernels/moe/untest_moe_permute_unpermute.py
index 12dd322dccc5272ac4ec462bcc9e37620839f5e4..45127ce0ac638e4360a37702517089f704b72e12 100644
--- a/tests/kernels/moe/untest_moe_permute_unpermute.py
+++ b/tests/kernels/moe/untest_moe_permute_unpermute.py
@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
     moe_unpermute,
 )
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 NUM_EXPERTS = [16, 64, 256]
 TOP_KS = [2, 6, 8]
 EP_SIZE = [1, 4, 16]
-current_platform.seed_everything(0)
+set_random_seed(0)
 
 if current_platform.is_rocm():
     pytest.skip(
@@ -226,7 +227,7 @@ def test_moe_permute_unpermute(
         n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert)
         expert_map = expert_map.cuda()
     start_expert = n_local_expert * ep_rank
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype)
     gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
     topk_weights, topk_ids, token_expert_indices = fused_topk(
diff --git a/tests/kernels/moe/untest_nvfp4_moe.py b/tests/kernels/moe/untest_nvfp4_moe.py
index e67bd76a1618173c2dd4e1f10f0510176d025673..873d72117de769ec32b1e7c167c4545c7e5989d7 100644
--- a/tests/kernels/moe/untest_nvfp4_moe.py
+++ b/tests/kernels/moe/untest_nvfp4_moe.py
@@ -3,6 +3,7 @@
 import pytest
 import torch
 
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_test_weights
 from tests.kernels.quantization.nvfp4_utils import (
     FLOAT4_E2M1_MAX,
@@ -13,9 +14,15 @@ from tests.kernels.utils import torch_moe
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
+from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+    CutlassExpertsFp4,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 if not current_platform.has_device_capability(100):
     pytest.skip(
@@ -42,7 +49,7 @@ MNK_FACTORS = [
 def test_cutlass_fp4_moe_no_graph(
     m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
     with set_current_vllm_config(
         VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
     ):
@@ -82,17 +89,21 @@ def test_cutlass_fp4_moe_no_graph(
             w2_scale=w2_blockscale,
         )
 
-        cutlass_output = cutlass_moe_fp4(
-            a=a,
-            w1_fp4=w1_q,
-            w2_fp4=w2_q,
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            CutlassExpertsFp4(
+                out_dtype=dtype,
+                max_experts_per_worker=e,
+                quant_config=quant_config,
+            ),
+        )
+
+        cutlass_output = kernel(
+            hidden_states=a,
+            w1=w1_q,
+            w2=w2_q,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
-            quant_config=quant_config,
-            m=m,
-            n=n,
-            k=k,
-            e=e,
         )
 
         # Reference check:
diff --git a/tests/kernels/moe/untest_pplx_cutlass_moe.py b/tests/kernels/moe/untest_pplx_cutlass_moe.py
index dd4eb4da913bddedce9108180efb30eb7c9d6b3f..3a5801ae49961a913b2cb4879d0ad41a99128aa4 100644
--- a/tests/kernels/moe/untest_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/untest_pplx_cutlass_moe.py
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import set_random_seed
 
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx(
     world_dp_size: tuple[int, int],
     use_internode: bool,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
 
     with set_current_vllm_config(vllm_config):
         dtype = torch.half
diff --git a/tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
index b220205759e2d156053c59c9f21db47d79c36415..62b7ecb17fbe7972a09de4c6ecc3b1f4d7799997 100644
--- a/tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
 from vllm.platforms import current_platform
 from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
 from vllm.utils.math_utils import cdiv, round_up
+from vllm.utils.torch_utils import set_random_seed
 
 if current_platform.is_fp8_fnuz():
     pytest.skip(
@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert):
 @torch.inference_mode()
 def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
     group_size = 128
-    current_platform.seed_everything(42)
+    set_random_seed(42)
 
     tokens_per_expert = torch.randint(
         low=0,
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index 7927bd0d200d806c8ccc561939c5a433d5cdbe16..3d11413c5ad8bb0a11aa0a93265da41272091581 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -4,13 +4,13 @@
 
 import torch
 
-from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    get_fp8_min_max,
+    group_broadcast,
+)
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
 
-# Using the default value (240.0) from pytorch will cause accuracy
-# issue on dynamic quantization models. Here use 224.0 for rocm.
-ROCM_FP8FNUZ_MAX = 224.0
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
@@ -25,16 +25,12 @@ def ref_dynamic_per_token_quant(
     if scale_ub is not None:
         assert quant_dtype == FP8_DTYPE
 
-    qtype_traits = (
-        torch.iinfo(quant_dtype)
-        if quant_dtype == torch.int8
-        else torch.finfo(quant_dtype)
-    )
-    use_fp8fnuz = (
-        current_platform.is_fp8_fnuz() and quant_dtype == current_platform.fp8_dtype()
-    )
-    qtype_traits_max = ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.max
-    qtype_traits_min = -ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.min
+    if quant_dtype == torch.int8:
+        qtype_traits = torch.iinfo(quant_dtype)
+        qtype_traits_min = qtype_traits.min
+        qtype_traits_max = qtype_traits.max
+    else:
+        qtype_traits_min, qtype_traits_max = get_fp8_min_max()
     qtype_max = as_float32_tensor(qtype_traits_max)
     s_1 = as_float32_tensor(1.0)
     s_512 = as_float32_tensor(512.0)
@@ -72,17 +68,7 @@ def ref_dynamic_per_token_quant(
 def ref_dynamic_per_tensor_fp8_quant(
     x: torch.Tensor,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    fp8_traits = torch.finfo(FP8_DTYPE)
-    fp8_traits_max = (
-        ROCM_FP8FNUZ_MAX
-        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
-        else fp8_traits.max
-    )
-    fp8_traits_min = (
-        -ROCM_FP8FNUZ_MAX
-        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
-        else fp8_traits.min
-    )
+    fp8_traits_min, fp8_traits_max = get_fp8_min_max()
     fp8_max = as_float32_tensor(fp8_traits_max)
     one = as_float32_tensor(1.0)
 
diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py
index 596a629ba4920355435bdace26a4a48f0a2b7db1..e14b1315ff04ee17f66ec70cd967fc2b42299947 100644
--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
     awq_dequantize_triton,
     awq_gemm_triton,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 device = "cuda"  
 
@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
     zeros_cols = qweight_cols
     zeros_dtype = torch.int32
 
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     qweight = torch.randint(
         0,
@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
     qzeros_rows = scales_rows
     qzeros_cols = qweight_cols
 
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device)
     qweight = torch.randint(
diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
index a855f7333b617a337bf0a94816b341d5b49d83f7..de0e347d8fe7a01f907c438d53eb0fd96eee2c96 100644
--- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils.torch_utils import set_random_seed
 
 IS_SUPPORTED_BY_GPU = (
     current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9
@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
 @pytest.mark.parametrize("random_zero", [True, False])
 def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
     num_experts, N, K = shape
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     setup = make_moe_test_setup(
         num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero
     )
@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
     reason="W4A8 Grouped GEMM is not supported on this GPU type.",
 )
 def test_cutlass_w4a8_moe_mm_cuda_graph():
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     # Fixed config for CUDA graph test (single parameter point).
     num_experts = 8
     K = 512
diff --git a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
index 1e5c7dafb0f5a48e36babd3d8e9d75cee507b467..94fa38b5aae4f20614e7bf89b50a1f49c48711f8 100644
--- a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -12,6 +12,7 @@ from nvfp4_utils import (
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm
+from vllm.utils.torch_utils import set_random_seed
 
 if not current_platform.has_device_capability(100):
     pytest.skip(
@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
     if backend == "trtllm" and dtype == torch.float16:
         pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
 
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     m, n, packed_k = shape
     k = packed_k * 2
     block_size = 16
diff --git a/tests/kernels/quantization/test_flashinfer_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
index b30821b6895bcf9d275ebda9076750c6ac29247e..2c945ffcc4cd04c0c3342ec217de9d17dc1dd548 100644
--- a/tests/kernels/quantization/test_flashinfer_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
@@ -6,6 +6,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
+from vllm.utils.torch_utils import set_random_seed
 
 if not current_platform.has_device_capability(100):
     pytest.skip(
@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
     device: str,
     autotune: bool,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     m, n, k = shape
     a = torch.randn((m, k), dtype=dtype, device=device)
     b = torch.randn((n, k), dtype=dtype, device=device) / k
diff --git a/tests/kernels/quantization/test_fp8_min_max_helper.py b/tests/kernels/quantization/test_fp8_min_max_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cd68a3fef7e6e381f2ef9bcacfb7c09c9e3b60f
--- /dev/null
+++ b/tests/kernels/quantization/test_fp8_min_max_helper.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for the get_fp8_min_max() helper function.
+
+These tests verify the FP8 min/max value logic for both standard
+and fnuz (ROCm MI300) dtype handling.
+"""
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    get_fp8_min_max,
+)
+
+
+class TestGetFp8MinMax:
+    """Test cases for get_fp8_min_max() function."""
+
+    @patch("vllm.model_executor.layers.quantization.utils.quant_utils.current_platform")
+    def test_standard_fp8_platform(self, mock_platform):
+        """Test that standard FP8 platform uses PyTorch's finfo values."""
+        mock_platform.is_fp8_fnuz.return_value = False
+        mock_platform.fp8_dtype.return_value = torch.float8_e4m3fn
+
+        fp8_min, fp8_max = get_fp8_min_max()
+        finfo = torch.finfo(torch.float8_e4m3fn)
+
+        # Standard FP8 max is 448.0 for e4m3fn
+        assert fp8_max == finfo.max, f"Expected finfo.max={finfo.max}, got {fp8_max}"
+        assert fp8_min == finfo.min, f"Expected finfo.min={finfo.min}, got {fp8_min}"
+
+    @patch("vllm.model_executor.layers.quantization.utils.quant_utils.current_platform")
+    def test_fnuz_platform_returns_224(self, mock_platform):
+        """Test that fnuz platform returns 224.0."""
+        mock_platform.is_fp8_fnuz.return_value = True
+
+        fp8_min, fp8_max = get_fp8_min_max()
+
+        # fnuz on ROCm MI300 should return 224.0, not 240.0
+        assert fp8_max == 224.0, f"Expected 224.0 for fnuz platform, got {fp8_max}"
+        assert fp8_min == -224.0, f"Expected -224.0 for fnuz platform, got {fp8_min}"
+
+    @patch("vllm.model_executor.layers.quantization.utils.quant_utils.current_platform")
+    def test_non_fnuz_platform_uses_finfo(self, mock_platform):
+        """Test that non-fnuz platform uses finfo values."""
+        mock_platform.is_fp8_fnuz.return_value = False
+        mock_platform.fp8_dtype.return_value = torch.float8_e4m3fn
+
+        fp8_min, fp8_max = get_fp8_min_max()
+        finfo = torch.finfo(torch.float8_e4m3fn)
+
+        assert fp8_max == finfo.max, (
+            f"Non-fnuz platform should use finfo.max={finfo.max}, got {fp8_max}"
+        )
+        assert fp8_min == finfo.min, (
+            f"Non-fnuz platform should use finfo.min={finfo.min}, got {fp8_min}"
+        )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/kernels/quantization/test_fp8_quant_group.py b/tests/kernels/quantization/test_fp8_quant_group.py
index f5e1cde94b6e9271d13e1ba8ffd4d7cd4a2ce228..113afb3c102e666be53e50fc1c0c56f88d8fe2ce 100644
--- a/tests/kernels/quantization/test_fp8_quant_group.py
+++ b/tests/kernels/quantization/test_fp8_quant_group.py
@@ -7,7 +7,7 @@ import torch
 
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 
 @pytest.mark.parametrize(
@@ -23,14 +23,19 @@ from vllm.platforms import current_platform
 @pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
 def test_quantfp8_group_functionality(
-    batch_size: int, hidden_dim: int, group_size: int, seed: int, use_ue8m0: bool
+    default_vllm_config,
+    batch_size: int,
+    hidden_dim: int,
+    group_size: int,
+    seed: int,
+    use_ue8m0: bool,
 ) -> None:
     """Test QuantFP8 group quantization with various configurations.
 
     Tests both CUDA and native implementations, column-major scales,
     and verifies consistency between implementations.
     """
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
     expected_num_groups = (hidden_dim + group_size - 1) // group_size
@@ -82,8 +87,10 @@ def test_quantfp8_group_functionality(
 @pytest.mark.parametrize("seed", [42])
 @pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
-def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
-    current_platform.seed_everything(seed)
+def test_quantfp8_group_multidimensional(
+    default_vllm_config, seed: int, use_ue8m0: bool
+) -> None:
+    set_random_seed(seed)
 
     group_size = 64
 
@@ -135,8 +142,8 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
 
 @pytest.mark.parametrize("seed", [42])
 @torch.inference_mode()
-def test_quantfp8_group_edge_cases(seed: int) -> None:
-    current_platform.seed_everything(seed)
+def test_quantfp8_group_edge_cases(default_vllm_config, seed: int) -> None:
+    set_random_seed(seed)
 
     batch_size = 16
     group_size = 64
diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py
index 9004512c2c96a596fe0e13cc375e95c5e58ad556..02a9c5cc6a0b30f084d1d442200ebe239c83fc3b 100644
--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -12,8 +12,8 @@ from huggingface_hub import snapshot_download
 import vllm._custom_ops as ops
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
-from vllm.platforms import current_platform
 from ...utils import models_path_prefix
+from vllm.utils.torch_utils import set_random_seed
 
 # GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
 # GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
@@ -95,7 +95,7 @@ def test_dequantize(
 @pytest.mark.parametrize("quant_type", QUANT_TYPES)
 @torch.inference_mode()
 def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     tensors = get_gguf_sample_tensors(hidden_size, quant_type)
     x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
@@ -138,7 +138,7 @@ def test_mmq(
     dtype: torch.dtype,
     quant_type: GGMLQuantizationType,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     tensors = get_gguf_sample_tensors(hidden_size, quant_type)
     x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
@@ -173,7 +173,7 @@ def test_moe(
     quant_type: GGMLQuantizationType,
     top_k: int,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     H, E = 1024, 256
 
     x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")
diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py
index 25eb97326edeb9164bba0b2203873bb849ab2df2..2a8cd3d54cfd09dc02f6a614f0883688c6810dd6 100644
--- a/tests/kernels/quantization/test_int8_kernel.py
+++ b/tests/kernels/quantization/test_int8_kernel.py
@@ -107,7 +107,7 @@ SEEDS = [0]
     itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
 )
 @torch.inference_mode()
-def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
+def test_w8a8_fp8_fused_moe(default_vllm_config, M, N, K, E, topk, dtype, seed):
     torch.manual_seed(seed)
     # Initialize int8 quantization parameters
     factor_for_scale = 1e-2
diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py
index f46adcac438121a2bc6b9723d5c85c45ec9fd98d..617c2f7cab776a8110ea0d9b25af1112899a7ab5 100644
--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
@@ -7,7 +7,7 @@ import torch
 from tests.kernels.quant_utils import ref_dynamic_per_token_quant
 from tests.kernels.utils import opcheck
 from vllm._custom_ops import scaled_int8_quant
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 DTYPES = [torch.bfloat16, torch.float]
 HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -48,7 +48,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
 def test_dynamic_scaled_int8_quant(
     num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
 
@@ -74,7 +74,7 @@ def test_dynamic_scaled_int8_quant(
 def test_dynamic_scaled_int8_azp_quant(
     num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     int8_traits = torch.iinfo(torch.int8)
 
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
@@ -115,7 +115,7 @@ def test_dynamic_scaled_int8_azp_quant(
 def test_static_scaled_int8_quant(
     num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     int8_traits = torch.iinfo(torch.int8)
 
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
@@ -148,7 +148,7 @@ def test_static_scaled_int8_azp_quant(
     scale: float,
     azp: int,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     int8_traits = torch.iinfo(torch.int8)
 
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
diff --git a/tests/kernels/quantization/test_mxfp4_qutlass.py b/tests/kernels/quantization/test_mxfp4_qutlass.py
index 0bacbef2046b445b687718a04b09e39061d0a1fc..0ad8e48ab159428079ccfab711b86eab2b4a6340 100644
--- a/tests/kernels/quantization/test_mxfp4_qutlass.py
+++ b/tests/kernels/quantization/test_mxfp4_qutlass.py
@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
 from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
 from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 if not torch.cuda.is_available():
     pytest.skip("CUDA required for these tests.", allow_module_level=True)
@@ -205,7 +206,7 @@ LLAMA_MODELS = {
 
 @pytest.fixture(autouse=True)
 def _seed_each_test():
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     np.random.seed(0)
     torch.random.manual_seed(0)
 
diff --git a/tests/kernels/quantization/test_nvfp4_qutlass.py b/tests/kernels/quantization/test_nvfp4_qutlass.py
index 3824a080f5047abfb6b87c108a5465c3833f9edb..bb25c4ab9aaf78d2c105cf5c3ad9c150b6b130ef 100644
--- a/tests/kernels/quantization/test_nvfp4_qutlass.py
+++ b/tests/kernels/quantization/test_nvfp4_qutlass.py
@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
 from vllm._custom_ops import fusedQuantizeNv
 from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 if not torch.cuda.is_available():
     pytest.skip("CUDA required for these tests.", allow_module_level=True)
@@ -193,7 +194,7 @@ LLAMA_MODELS = {
 
 @pytest.fixture(autouse=True)
 def _seed_each_test():
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     np.random.seed(0)
     torch.random.manual_seed(0)
 
diff --git a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
index 4617464a3978817aded41b9f8c1fca60eb9b5c98..dd6c6abacbe4a66018ac081a4d1094b8db4e2928 100644
--- a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
 from vllm._custom_ops import scaled_fp4_quant
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 if not current_platform.has_device_capability(100):
     pytest.skip(
@@ -30,10 +31,11 @@ BLOCK_SIZE = 16
 @pytest.mark.parametrize("shape", SHAPES)
 @torch.inference_mode()
 def test_silu_mul_nvfp4_quant(
+    default_vllm_config,
     dtype: torch.dtype,
     shape: tuple[int, int],
 ) -> None:
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     device = "cuda:0"
     torch.set_default_device(device)
 
diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py
index 881d5c597f2d3733272887c682f536332a1dd955..2540224a928afa3dbcf1e3604f24cd5c0202a89c 100644
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -11,7 +11,9 @@ import pytest
 import torch
 
 from vllm.platforms import current_platform
+
 from ...utils import models_path_prefix
+from vllm.utils.torch_utils import set_random_seed
 
 device = "cuda"
 
@@ -86,7 +88,7 @@ def test_scaled_mm(
 ):
     is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()
 
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     # NOTE: There are cases, where if the matrix is large enough, an output
     # like 65504.4 can be produced, and can easily turn into inf when
diff --git a/tests/kernels/quantization/untest_block_fp8.py b/tests/kernels/quantization/untest_block_fp8.py
index 32c77b9a01ece4c20877770a859e1a96c5019bb8..bd4a737ca63009631a9744a0e9f56d09ca6c7b18 100644
--- a/tests/kernels/quantization/untest_block_fp8.py
+++ b/tests/kernels/quantization/untest_block_fp8.py
@@ -24,6 +24,10 @@ from vllm.utils.deep_gemm import (
     per_block_cast_to_fp8,
     should_use_deepgemm_for_fp8_linear,
 )
+from vllm.utils.flashinfer import (
+    flashinfer_fp8_blockscale_gemm,
+    has_flashinfer_fp8_blockscale_gemm,
+)
 from vllm.utils.import_utils import has_deep_gemm
 
 if current_platform.get_device_capability() < (9, 0):
@@ -205,3 +209,50 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
         torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
     ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.001
+
+
+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="This platform supports e4m3fnuz, not e4m3fn.",
+)
+@pytest.mark.parametrize(
+    "M,N,K,block_size,out_dtype,seed",
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
+)
+@torch.inference_mode()
+def test_w8a8_block_fp8_flashinfer_matmul(M, N, K, block_size, out_dtype, seed):
+    if not has_flashinfer_fp8_blockscale_gemm():
+        pytest.skip(
+            "FlashInfer block GEMM not available (requires SM90+ and FlashInfer)"
+        )
+    # only aligned sizes
+    if K % 128 != 0 or N % 64 != 0:
+        pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
+
+    torch.manual_seed(seed)
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max = fp8_info.max
+
+    A_bf16 = (torch.rand(M, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
+    B_bf16 = (torch.rand(N, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
+
+    A_fp8, As_fp8 = per_token_group_quant_fp8(A_bf16, block_size[1], use_ue8m0=False)
+    B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_bf16, block_size, use_ue8m0=False)
+
+    As = As_fp8.to(torch.float32)
+    Bs = Bs_fp8.to(torch.float32)
+
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+
+    out = flashinfer_fp8_blockscale_gemm(
+        input=A_bf16,
+        weight=B_fp8,
+        input_scale=None,
+        weight_scale=Bs,
+        out_dtype=out_dtype,
+    )
+
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.bfloat16) - ref_out.to(torch.bfloat16))
+    ) / torch.mean(torch.abs(ref_out.to(torch.bfloat16)))
+    assert rel_diff < 0.001
diff --git a/tests/kernels/quantization/untest_fp8_quant.py b/tests/kernels/quantization/untest_fp8_quant.py
index 229e950655714363835915c5609037faaff1ee42..4572c28de57e3abc7c4a12ea3723cc1be1f3015c 100644
--- a/tests/kernels/quantization/untest_fp8_quant.py
+++ b/tests/kernels/quantization/untest_fp8_quant.py
@@ -11,7 +11,11 @@ from tests.kernels.quant_utils import (
     ref_dynamic_per_token_quant,
 )
 from tests.kernels.utils import opcheck
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    scaled_quantize,
+)
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 DTYPES = [torch.bfloat16, torch.float]
 HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -21,10 +25,18 @@ SEEDS = [0]
 
 
 def opcheck_fp8_quant(
-    output, input, scale=None, scale_ub=None, use_per_token_if_dynamic=False
+    output,
+    input,
+    scale=None,
+    scale_ub=None,
+    use_per_token_if_dynamic=False,
+    group_shape=None,
 ):
     if scale is not None:
-        opcheck(torch.ops._C.static_scaled_fp8_quant, (output, input, scale))
+        opcheck(
+            torch.ops._C.static_scaled_fp8_quant,
+            (output, input, scale, group_shape),
+        )
     elif use_per_token_if_dynamic:
         scale = torch.empty(
             (input.shape[0], 1), device=input.device, dtype=torch.float32
@@ -51,7 +63,7 @@ def opcheck_fp8_quant(
 def test_dynamic_per_token_fp8_quant(
     num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     x = (
         torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
@@ -81,7 +93,7 @@ def test_dynamic_per_token_fp8_quant(
 def test_dynamic_per_tensor_fp8_quant(
     num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
 
@@ -101,7 +113,7 @@ def test_dynamic_per_tensor_fp8_quant(
 @torch.inference_mode()
 @pytest.mark.parametrize("seed", SEEDS)
 def test_fp8_quant_large(seed: int) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     num_tokens = 1024000  # Mistral-Nemo's max_position_embeddings
     hidden_size = 1152  # Smallest hidden_size to reproduce the error
@@ -117,4 +129,93 @@ def test_fp8_quant_large(seed: int) -> None:
     ref_out = ref_out.to(dtype=dtype)
     ops_out = ops_out.to(dtype=dtype)
 
-    torch.testing.assert_close(ref_out, ops_out)
\ No newline at end of file
+    torch.testing.assert_close(ref_out, ops_out)
+
+
+# Test static FP8 quantization with 2D group scales
+GROUP_SHAPES_2D = [
+    (-1, -1),  # Per-tensor
+    (-1, 1),  # Per-channel
+    (1, -1),  # Per-token
+    (-1, 128),  # Per-head quantization
+    (1, 128),  # DeepSeek-style per-token-per-group (group_m=1, group_n=128)
+    (128, 128),  # DeepSeek-style block quantization
+    (1, 64),  # Smaller group size
+    (1, 16),  # Small group (scalar path in kernel)
+    (4, 256),  # Non-trivial both dimensions
+]
+# Use sizes divisible by all group shapes
+NUM_TOKENS_GROUP = [128, 512]
+HIDDEN_SIZES_GROUP = [256, 1024, 2048]
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
+@pytest.mark.parametrize("group_shape", GROUP_SHAPES_2D)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@torch.inference_mode()
+def test_static_fp8_quant_group_2d(
+    num_tokens: int,
+    hidden_size: int,
+    group_shape: tuple[int, int],
+    dtype: torch.dtype,
+    seed: int,
+) -> None:
+    """Test static FP8 quantization with 2D group scales using scaled_quantize."""
+    # Normalize group_shape (-1 means full extent)
+    norm_group_m = num_tokens if group_shape[0] == -1 else group_shape[0]
+    norm_group_n = hidden_size if group_shape[1] == -1 else group_shape[1]
+
+    # Skip if sizes are not divisible by group shape
+    if num_tokens % norm_group_m != 0 or hidden_size % norm_group_n != 0:
+        pytest.skip(
+            f"Skipping: ({num_tokens}, {hidden_size}) not divisible by "
+            f"group_shape ({group_shape[0]}, {group_shape[1]})"
+        )
+
+    current_platform.seed_everything(seed)
+
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
+    ref_out, scale = scaled_quantize(
+        x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
+    )
+    ops_out, ops_scale = ops.scaled_fp8_quant(x, scale=scale, group_shape=group_shape)
+
+    torch.testing.assert_close(scale, ops_scale)
+    torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)
+
+    opcheck_fp8_quant(ops_out, x, scale=scale)
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("group_shape", [(1, -1), (-1, 1)])  # per-token, per-channel
+@torch.inference_mode()
+def test_static_fp8_quant_1d_scale(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    group_shape: tuple[int, int],
+) -> None:
+    """Test static FP8 quantization with 1D scale (per-token or per-channel)."""
+    current_platform.seed_everything(seed)
+
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
+    ref_out, scale_2d = scaled_quantize(
+        x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
+    )
+
+    # Flatten scale to 1D for testing 1D scale path
+    scale_1d = scale_2d.flatten()
+    ops_out, ops_scale = ops.scaled_fp8_quant(
+        x, scale=scale_1d, group_shape=group_shape
+    )
+
+    torch.testing.assert_close(scale_1d, ops_scale)
+    torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)
+
+    opcheck_fp8_quant(ops_out, x, scale=scale_1d, group_shape=group_shape)
diff --git a/tests/kernels/quantization/untest_nvfp4_quant.py b/tests/kernels/quantization/untest_nvfp4_quant.py
index 90168681c267510a663ccb09bd0eaf8a58822a25..39d25471a532c5aed96fa9b6fafade09bcea5384 100644
--- a/tests/kernels/quantization/untest_nvfp4_quant.py
+++ b/tests/kernels/quantization/untest_nvfp4_quant.py
@@ -6,6 +6,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
+from vllm.utils.torch_utils import set_random_seed
 
 if not current_platform.has_device_capability(100):
     pytest.skip(
@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
     seed: int,
     device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
 
     m, n = shape
@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
 @torch.inference_mode()
 def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
     dtype = torch.float16
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     torch.set_default_device("cuda:0")
 
     m, n = pad_shape
diff --git a/tests/kernels/quantization/untest_nvfp4_scaled_mm.py b/tests/kernels/quantization/untest_nvfp4_scaled_mm.py
index 434564737c889da3d44c413c4c7a2e41f1995e00..e7e16817593b52c05e3b5e1a0034a025586ff49e 100644
--- a/tests/kernels/quantization/untest_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/untest_nvfp4_scaled_mm.py
@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
 
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 if not current_platform.has_device_capability(100):
     pytest.skip(
@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
     seed: int,
     device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     m, n, packed_k = shape
     k = packed_k * 2
     block_size = 16
diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py
index a4619f5846b166633648d4513411eff70706f0a5..8270cf885f607a1d7e144663fa0f8be295347f7a 100644
--- a/tests/kernels/test_apply_repetition_penalties.py
+++ b/tests/kernels/test_apply_repetition_penalties.py
@@ -9,6 +9,7 @@ from vllm._custom_ops import (
     apply_repetition_penalties_torch,
 )
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
 # [stress, stress, stress, Qwen, llama 4]
@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
     Test the apply_repetition_penalties custom op
     against a reference implementation.
     """
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device("cuda:0")
 
     # Create test data
@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
     dtype = torch.float32
     seed = 0
 
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device("cuda:0")
 
     # Create test data
diff --git a/tests/kernels/test_fla_layernorm_guard.py b/tests/kernels/test_fla_layernorm_guard.py
index f944c6dcfa73b3ea18031923f8bc83fb6c1bd6ab..2ece5497cb06edb67967ac4ab3354845d0e1d198 100644
--- a/tests/kernels/test_fla_layernorm_guard.py
+++ b/tests/kernels/test_fla_layernorm_guard.py
@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
     layernorm_fn,
     rms_norm_ref,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 
 def layer_norm_ref(
@@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic(
     is_rms_norm: bool,
 ) -> None:
     """Test basic layer norm forward pass without z (gate) tensor."""
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     device = torch.device("cuda:0")
 
     # Create inputs
@@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate(
     is_rms_norm: bool,
 ) -> None:
     """Test layer norm forward pass with z (gate) tensor."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     device = torch.device("cuda:0")
 
     # Create inputs
@@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups(
             f"hidden_size {hidden_size} not divisible by group_size {group_size}"
         )
 
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     device = torch.device("cuda:0")
 
     # Create inputs
@@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block(
     dtype: torch.dtype,
 ) -> None:
     """Test that rows_per_block logic works correctly for various M sizes."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     device = torch.device("cuda:0")
     hidden_size = 1024
 
@@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block(
 def test_strided_input(dtype: torch.dtype) -> None:
     """Test that the kernel handles non-contiguous (strided)
     inputs correctly."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     device = torch.device("cuda:0")
     num_tokens = 128
     hidden_size = 1024
@@ -318,7 +318,7 @@ def test_output_buffer_provided(
     dtype: torch.dtype,
 ) -> None:
     """Test that the kernel works when an output buffer is provided."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     device = torch.device("cuda:0")
 
     # Create inputs
@@ -359,7 +359,7 @@ def test_multidimensional_input(
     dtype: torch.dtype,
 ) -> None:
     """Test that the autograd function handles multidimensional inputs."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     device = torch.device("cuda:0")
     hidden_size = shape[-1]
 
diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py
index d427e0032f001d1fa121863046cafee2cd42d210..e74bc456ffd8d1a92578e30ab2b97165e010ee60 100644
--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@@ -42,7 +42,7 @@ def set_seed(seed):
     not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
     reason="CUDA not available or PyTorch version < 2.7",
 )
-def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
+def test_flex_attention_vs_default_backend(vllm_runner):
     """Test that FlexAttention produces the same outputs as the default backend.
 
     This test compares the outputs from the FlexAttention backend with
@@ -59,35 +59,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     ]
 
     # Run with flex attention
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
-
-        set_seed(seed)
-        with vllm_runner(
-            model_name,
-            runner="generate",
-            tensor_parallel_size=1,
-            num_gpu_blocks_override=128,
-            enforce_eager=True,
-        ) as llm_flex:
-            output_flex = llm_flex.generate_greedy_logprobs(
-                prompts, max_tokens, num_logprobs
-            )
+    set_seed(seed)
+    with vllm_runner(
+        model_name,
+        runner="generate",
+        tensor_parallel_size=1,
+        num_gpu_blocks_override=128,
+        enforce_eager=True,
+        attention_config={"backend": "FLEX_ATTENTION"},
+    ) as llm_flex:
+        output_flex = llm_flex.generate_greedy_logprobs(
+            prompts, max_tokens, num_logprobs
+        )
 
     # Run with default backend
-    with monkeypatch.context() as m:
-        set_seed(seed)
-        with vllm_runner(
-            model_name,
-            runner="generate",
-            tensor_parallel_size=1,
-            num_gpu_blocks_override=128,
-            enforce_eager=True,
-            gpu_memory_utilization=0.85,
-        ) as llm_default:
-            output_default = llm_default.generate_greedy_logprobs(
-                prompts, max_tokens, num_logprobs
-            )
+    set_seed(seed)
+    with vllm_runner(
+        model_name,
+        runner="generate",
+        tensor_parallel_size=1,
+        num_gpu_blocks_override=128,
+        enforce_eager=True,
+        gpu_memory_utilization=0.85,
+    ) as llm_default:
+        output_default = llm_default.generate_greedy_logprobs(
+            prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=output_flex,
@@ -101,7 +98,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
     reason="CUDA not available or PyTorch version < 2.7",
 )
-def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
+def test_encoder_flex_attention_vs_default_backend(vllm_runner):
     """Test that FlexAttention produces the same outputs as the default backend.
 
     This test compares the outputs from the FlexAttention backend with
@@ -115,30 +112,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     ]
 
     # Run with flex attention
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
-        with vllm_runner(
-            model_name,
-            runner="pooling",
-            dtype=torch.bfloat16,
-            tensor_parallel_size=1,
-            max_model_len=100,
-            enforce_eager=True,
-        ) as llm_flex:
-            flex_outputs = llm_flex.embed(prompts)
+    with vllm_runner(
+        model_name,
+        runner="pooling",
+        dtype=torch.bfloat16,
+        tensor_parallel_size=1,
+        max_model_len=100,
+        enforce_eager=True,
+        attention_config={"backend": "FLEX_ATTENTION"},
+    ) as llm_flex:
+        flex_outputs = llm_flex.embed(prompts)
 
     # Run with default backend
-    with (
-        monkeypatch.context() as m,
-        vllm_runner(
-            model_name,
-            runner="pooling",
-            dtype=torch.bfloat16,
-            tensor_parallel_size=1,
-            max_model_len=100,
-            enforce_eager=True,
-        ) as llm_default,
-    ):
+    with vllm_runner(
+        model_name,
+        runner="pooling",
+        dtype=torch.bfloat16,
+        tensor_parallel_size=1,
+        max_model_len=100,
+        enforce_eager=True,
+    ) as llm_default:
         default_outputs = llm_default.embed(prompts)
 
     check_embeddings_close(
diff --git a/tests/kernels/untest_fused_quant_activation.py b/tests/kernels/untest_fused_quant_activation.py
index d9a179b922033d4fded7a56a2f151940272466af..b5860cc1d26dd515156173816b5a2318bae5fffa 100644
--- a/tests/kernels/untest_fused_quant_activation.py
+++ b/tests/kernels/untest_fused_quant_activation.py
@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_silu_and_mul(
+    default_vllm_config,
     num_tokens: int,
     hidden_size: int,
     dtype: torch.dtype,
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 72c79370d19c172f4afeefcbd82ccd96fb28814a..ccdacf40c430beeb710f1839736e2074b5d80d72 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -13,11 +13,11 @@ import torch
 from torch._prims_common import TensorLikeType
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention.backends.abstract import AttentionType
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.utils.torch_utils import make_tensor_with_pad
+from vllm.v1.attention.backend import AttentionType
 
 # For now, disable "test_aot_dispatch_dynamic" since there are some
 # bugs related to this test in PyTorch 2.4.
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 4d39f87fca517d7fa2bd88cd0c148e7bbeedb899..97c6737b75e675e0c10c390ad1d3dbc4cead56b5 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -84,7 +84,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
 
 
 @pytest.fixture
-def dummy_model() -> nn.Module:
+def dummy_model(default_vllm_config) -> nn.Module:
     model = DummyLoRAModel(
         OrderedDict(
             [
@@ -117,7 +117,7 @@ def dummy_model() -> nn.Module:
 
 
 @pytest.fixture
-def dummy_model_gate_up() -> nn.Module:
+def dummy_model_gate_up(default_vllm_config) -> nn.Module:
     model = DummyLoRAModel(
         OrderedDict(
             [
@@ -214,6 +214,31 @@ def qwen25vl_lora_files():
     return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
 
 
+@pytest.fixture(scope="session")
+def qwen2vl_language_lora_files():
+    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-language")
+
+
+@pytest.fixture(scope="session")
+def qwen2vl_vision_tower_connector_lora_files():
+    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower-connector")
+
+
+@pytest.fixture(scope="session")
+def qwen2vl_vision_tower_lora_files():
+    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower")
+
+
+@pytest.fixture(scope="session")
+def qwen25vl_vision_lora_files():
+    return snapshot_download(repo_id="EpochEcho/qwen2.5-3b-vl-lora-vision-connector")
+
+
+@pytest.fixture(scope="session")
+def qwen3vl_vision_lora_files():
+    return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
+
+
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
     # return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py
index 91c8b861c3c5cd83fdb642a37e0b09d2eced5376..a4d314be095cec15859122b4bfa6744bc09af3f7 100644
--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -18,8 +18,8 @@ from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.lora.ops.triton_ops import fused_moe_lora
-from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port
+from vllm.utils.torch_utils import set_random_seed
 
 
 @pytest.fixture(autouse=True)
@@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel(
     seed,
 ):
     torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     # the number of randomly generated sentences.
     num_sequences = 10
     # generate data
@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded(
     seed,
     column_parallel,
 ):
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     # the number of randomly generated sentences.
     num_sequences = 10
     # generate data
@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
     def _get_shard_slice(shard_size):
         return slice(local_rank * shard_size, (local_rank + 1) * shard_size)
 
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     device = torch.device(f"cuda:{local_rank}")
     torch.cuda.set_device(device)
diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py
index f4269750feb6b1c80d87beb2f18befdba13b2ebd..14d0ff47d4ca0acaf65dbf20eb8fb7a16ced93fa 100644
--- a/tests/lora/test_gptoss_tp.py
+++ b/tests/lora/test_gptoss_tp.py
@@ -34,9 +34,9 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of
 ###Response:<|end|><|start|>assistant<|channel|>final<|message|>"""  # noqa: E501
 
 EXPECTED_LORA_OUTPUT = [
-    "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
-    "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
-    "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
+    "SELECT avg(Working_Horses) FROM farm WHERE Total_Horses  >  5000",
+    "SELECT max(Cows) ,  min(Cows) FROM farm",
+    "SELECT max(Cows) ,  min(Cows) FROM farm",
 ]
 
 
@@ -69,38 +69,54 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
         assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
 
 
-def test_gpt_oss_lora(gptoss20b_lora_files):
-    llm = vllm.LLM(
-        MODEL_PATH,
-        max_model_len=1024,
-        enable_lora=True,
-        max_loras=4,
-        max_lora_rank=8,
-        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
-            cudagraph_specialize_lora=False,
-        ),
-    )
-
-    generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
-    generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
+@pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
+def test_gpt_oss_lora(
+    monkeypatch: pytest.MonkeyPatch, gptoss20b_lora_files, mxfp4_use_marlin
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_MXFP4_USE_MARLIN", "1" if mxfp4_use_marlin else "0")
+        llm = vllm.LLM(
+            MODEL_PATH,
+            max_model_len=1024,
+            enable_lora=True,
+            max_loras=4,
+            max_lora_rank=8,
+            max_num_seqs=2,
+            max_num_batched_tokens=2048,
+            compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+                cudagraph_specialize_lora=False,
+            ),
+        )
+
+        generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
+        generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
 
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("fully_sharded_loras", [False, True])
-def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
-    llm = vllm.LLM(
-        MODEL_PATH,
-        max_model_len=1024,
-        enable_lora=True,
-        max_loras=2,
-        max_lora_rank=8,
-        max_num_seqs=16,
-        tensor_parallel_size=2,
-        fully_sharded_loras=fully_sharded_loras,
-        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
-            cudagraph_specialize_lora=False,
-        ),
-    )
-
-    generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
-    generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
+@pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
+def test_gpt_oss_lora_tp2(
+    monkeypatch: pytest.MonkeyPatch,
+    gptoss20b_lora_files,
+    fully_sharded_loras,
+    mxfp4_use_marlin,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_MXFP4_USE_MARLIN", "1" if mxfp4_use_marlin else "0")
+        llm = vllm.LLM(
+            MODEL_PATH,
+            max_model_len=1024,
+            enable_lora=True,
+            max_loras=2,
+            max_num_seqs=2,
+            max_num_batched_tokens=2048,
+            tensor_parallel_size=2,
+            gpu_memory_utilization=0.8,
+            fully_sharded_loras=fully_sharded_loras,
+            compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+                cudagraph_specialize_lora=False,
+            ),
+        )
+
+        generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
+        generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 47d1fcfe9a0c7d1b67a83c4a04dd57899679708b..54fc3bd4b9988c81e3a264fd20b673a2a62e15b3 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -43,8 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding,
     get_masked_input_and_mask,
 )
-from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 from .utils import DummyLoRAManager
 
@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
-def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
+def test_embeddings(
+    default_vllm_config, dist_init, num_loras, device, vocab_size, stage
+) -> None:
     # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
     # device, see: https://github.com/triton-lang/triton/issues/2925
     # Same below.
@@ -261,11 +263,11 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 
     torch.set_default_device(device)
     max_loras = 8
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
     lora_config = LoRAConfig(
         max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
     )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
 
     def create_random_embedding_layer():
         embedding = VocabParallelEmbedding(vocab_size, 256)
@@ -353,18 +355,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
 @pytest.mark.parametrize("stage", STAGES)
 def test_lm_head_logits_processor(
-    dist_init, num_loras, device, vocab_size, stage
+    default_vllm_config, dist_init, num_loras, device, vocab_size, stage
 ) -> None:
     if current_platform.is_cuda_alike():
         torch.cuda.set_device(device)
 
     torch.set_default_device(device)
     max_loras = 8
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
     lora_config = LoRAConfig(
         max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
     )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
 
     def _pretest():
         linear = ParallelLMHead(
@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_linear_replicated(
+    default_vllm_config,
     dist_init,
     num_loras,
     device,
@@ -480,13 +483,13 @@ def test_linear_replicated(
 
     max_loras = 8
     torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
     lora_config = LoRAConfig(
         max_loras=max_loras,
         max_lora_rank=8,
         lora_dtype=torch.float16,
     )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
 
     def create_random_linear_replicated_layer():
         linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16)
@@ -580,21 +583,21 @@ def test_linear_replicated(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_linear_parallel(
-    dist_init, num_loras, orientation, fully_shard, device, stage
+    default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage
 ) -> None:
     if current_platform.is_cuda_alike():
         torch.cuda.set_device(device)
 
     max_loras = 8
     torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
     lora_config = LoRAConfig(
         max_loras=max_loras,
         max_lora_rank=8,
         fully_sharded_loras=fully_shard,
         lora_dtype=torch.float16,
     )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
 
     def create_random_linear_parallel_layer():
         if orientation == "row":
@@ -705,21 +708,21 @@ def test_linear_parallel(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_column_parallel_packed(
-    dist_init, num_loras, repeats, fully_shard, device, stage
+    default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage
 ) -> None:
     if current_platform.is_cuda_alike():
         torch.cuda.set_device(device)
 
     max_loras = 8
     torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
     lora_config = LoRAConfig(
         max_loras=max_loras,
         max_lora_rank=8,
         fully_sharded_loras=fully_shard,
         lora_dtype=torch.float16,
     )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
 
     def create_column_parallel_packed_layer():
         if repeats == 2:
@@ -851,7 +854,7 @@ def test_column_parallel_packed(
 @pytest.mark.parametrize(
     "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))
 )
-def test_vocab_parallel_embedding_indices(tp_size, seed):
+def test_vocab_parallel_embedding_indices(tp_size, seed, default_vllm_config):
     random.seed(seed)
     vocab_size = random.randint(4000, 64000)
     added_vocab_size = random.randint(0, 1024)
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 5eb846ee1bc89d560d90d6522717b0d9f0183f58..43f4b37b4b36ad3e38e0a6132fd5fd57e769ab26 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -77,11 +77,18 @@ def do_sample(
             if lora_id
             else None,
         )
-    # Print the outputs.
+    lora_request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
     generated_texts: list[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text
+        # The output should include  correct lora_request info
+        if lora_request is not None:
+            assert output.lora_request.lora_name == lora_request.lora_name
+            assert output.lora_request.lora_int_id == lora_request.lora_int_id
+            assert output.lora_request.lora_path == lora_request.lora_path
+        else:
+            assert output.lora_request is None
         generated_texts.append(generated_text)
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
     return generated_texts
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 50f17ced5dd74ebf8c43b36a8c5a06c0f04a1629..c37780ec6f13398d1e0260707291b34b59555833 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -18,6 +18,7 @@ from vllm.lora.layers import (
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.model_manager import (
+    DEFAULT_LANGUAGE_WRAPPER_KEY,
     LoRAMapping,
     LoRAModelManager,
     LRUCacheLoRAModelManager,
@@ -110,7 +111,7 @@ def create_packed_lora(
     return LoRAModel(lora_id, 8, loras)
 
 
-def test_replace_submodules(dist_init, dummy_model):
+def test_replace_submodules(default_vllm_config, dist_init, dummy_model):
     model = dummy_model
     manager = LoRAModelManager(
         model,
@@ -132,7 +133,7 @@ def test_replace_submodules(dist_init, dummy_model):
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_lora_model_manager(dist_init, dummy_model, device):
+def test_lora_model_manager(default_vllm_config, dist_init, dummy_model, device):
     model = dummy_model
     model_lora1 = create_lora(
         1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
     assert manager.activate_adapter(2)
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 2
-
     assert manager.device == device
-    assert manager.punica_wrapper.device == device
+    assert (
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
     assert hasattr(manager, "supported_lora_modules")
     assert sorted(manager.supported_lora_modules) == [
         "dense1",
@@ -196,7 +199,9 @@ def test_lora_model_manager(dist_init, dummy_model, device):
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
+def test_lora_lru_cache_model_manager(
+    default_vllm_config, dist_init, dummy_model, device
+):
     model = dummy_model
     model_lora1 = create_lora(
         1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
@@ -278,13 +283,15 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
     assert manager.remove_adapter(3)
     with pytest.raises(ValueError):
         assert manager.pin_adapter(3)
-
-    assert manager.punica_wrapper.device == device
+    assert (
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
     assert manager.device == device
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_lru_lora_model_manager(dist_init, dummy_model, device):
+def test_lru_lora_model_manager(default_vllm_config, dist_init, dummy_model, device):
     # This tests just the LRU cache functionality, everything else is
     # tested in test_lora_model_manager
     model = dummy_model
@@ -402,12 +409,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
         assert manager.remove_oldest_adapter()
 
     assert set(manager.list_adapters()) == {1}
-    assert manager.punica_wrapper.device == device
+    assert (
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
     assert manager.device == device
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path):
+def test_lru_cache_worker_adapter_manager(
+    default_vllm_config, dist_init, dummy_model, device, tmp_path
+):
     lora_config = LoRAConfig(
         max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
     )
@@ -514,11 +526,16 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
         )
 
     assert worker_adapter_manager.device == device
-    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
+    punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
+        DEFAULT_LANGUAGE_WRAPPER_KEY
+    )
+    assert punica_wrapper.device == device
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path):
+def test_worker_adapter_manager(
+    default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
+):
     # Should remove every LoRA not specified in the request.
     lora_config = LoRAConfig(
         max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
@@ -618,11 +635,14 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
         )
 
     assert worker_adapter_manager.device == device
-    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
+    punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
+        DEFAULT_LANGUAGE_WRAPPER_KEY
+    )
+    assert punica_wrapper.device == device
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_packed_loras(dist_init, dummy_model_gate_up, device):
+def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, device):
     model = dummy_model_gate_up
     model_lora = create_packed_lora(
         1,
diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py
index e4df9751077d2e836447febaf8a13493c9071152..5083f500c5cdc82727869fc42851088d1d9a8d4d 100644
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -9,7 +9,7 @@ import vllm.lora.ops.torch_ops as torch_ops
 import vllm.lora.ops.triton_ops as triton_ops
 from vllm.lora.ops.triton_ops import LoRAKernelMeta
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 
 from .utils import PunicaTensors, assert_close, generate_data_for_nslices
 
@@ -395,7 +395,7 @@ def test_kernels(
     Tests LoRA kernels.
     """
     torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     if op_type == "shrink":
         check_lora_shrink_kernel(
@@ -447,7 +447,7 @@ def test_kernels_hidden_size(
     Tests SGMV and LoRA kernels.
     """
     torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     if op_type == "shrink":
         check_lora_shrink_kernel(
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwenvl.py
similarity index 58%
rename from tests/lora/test_qwen2vl.py
rename to tests/lora/test_qwenvl.py
index 4af34fdb5338692789346b639c7ba3cda5a60018..ace7b418f0cf9dca58d1a2aac3f159cd379a023d 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwenvl.py
@@ -2,10 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
+import os
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import BeamSearchParams
+from ..utils import models_path_prefix
 
 
 @dataclass
@@ -14,9 +16,12 @@ class TestConfig:
     lora_path: str
     max_num_seqs: int = 2
     max_loras: int = 2
-    max_lora_rank: int = 16
-    max_model_len: int = 4096
+    max_lora_rank: int = 32
+    enable_tower_connector_lora: bool = False
+    max_model_len: int = 8192
+    gpu_memory_utilization: float = 0.85
     mm_processor_kwargs: dict[str, int] | None = None
+    mm_processor_cache_gb: float = 4
 
     def __post_init__(self):
         if self.mm_processor_kwargs is None:
@@ -48,8 +53,11 @@ class Qwen2VLTester:
             enable_lora=True,
             max_loras=self.config.max_loras,
             max_lora_rank=self.config.max_lora_rank,
+            enable_tower_connector_lora=self.config.enable_tower_connector_lora,
             trust_remote_code=True,
+            gpu_memory_utilization=self.config.gpu_memory_utilization,
             mm_processor_kwargs=self.config.mm_processor_kwargs,
+            mm_processor_cache_gb=self.config.mm_processor_cache_gb,
             max_model_len=self.config.max_model_len,
         )
 
@@ -58,6 +66,7 @@ class Qwen2VLTester:
         images: list[ImageAsset],
         expected_outputs: list[str],
         lora_id: int | None = None,
+        lora_name: str | None = None,
         temperature: float = 0,
         max_tokens: int = 5,
     ):
@@ -73,10 +82,11 @@ class Qwen2VLTester:
             for asset in images
         ]
 
-        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
+        lora_request = LoRARequest(
+            lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path
+        )
         outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
         generated_texts = [output.outputs[0].text.strip() for output in outputs]
-
         # Validate outputs
         for generated, expected in zip(generated_texts, expected_outputs):
             assert expected.startswith(generated), (
@@ -127,6 +137,22 @@ EXPECTED_OUTPUTS = [
     "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
 ]
 
+EXPECTED_OUTPUTS_LANGUAGE = [
+    "A stop sign is shown in an Asian city, with buildings and a car in the "
+    "background.",
+    "The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.",
+]
+
+EXPECTED_OUTPUTS_VISION = [
+    "A stop sign in front of oriental buildings.",
+    "A tree with pink flowers in front of it and a blue sky behind the flowers.",
+]
+
+EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
+    "A stop sign is located on the street of a Chinese neighborhood.",
+    "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
+]
+
 # NOTE - beam search .text contains the whole text
 EXPECTED_BEAM_SEARCH_OUTPUTS = [
     [
@@ -137,6 +163,7 @@ EXPECTED_BEAM_SEARCH_OUTPUTS = [
 
 QWEN2VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
 QWEN25VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")
+QWEN3VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen3-VL-4B-Instruct")
 
 
 def test_qwen2vl_lora(qwen2vl_lora_files):
@@ -175,3 +202,99 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
     # Test with different LoRA IDs
     for lora_id in [1, 2]:
         tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
+
+
+def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
+    config = TestConfig(
+        model_path=QWEN25VL_MODEL_PATH,
+        lora_path=qwen25vl_vision_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
+    )
+    tester = Qwen2VLTester(config)
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS,
+            lora_id=lora_id,
+        )
+
+
+def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
+    config = TestConfig(
+        model_path=QWEN3VL_MODEL_PATH,
+        lora_path=qwen3vl_vision_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
+    )
+    tester = Qwen2VLTester(config)
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS,
+            lora_id=lora_id,
+        )
+
+
+def test_qwen2vl_multiple_lora_types(
+    qwen2vl_language_lora_files,
+    qwen2vl_vision_tower_connector_lora_files,
+    qwen2vl_vision_tower_lora_files,
+):
+    """
+    Test multiple LoRA adapter types (language, vision tower + connector,
+    vision tower only) using the same LLM instance to verify mm_encoder_cache
+    behavior with different LoRA requests.
+
+    By reusing the same LLM instance across different LoRA requests, we ensure that
+    the multimodal encoder cache correctly manages state transitions between
+    language-only and vision-enabled LoRA adapters.
+    """
+    config = TestConfig(
+        model_path=QWEN2VL_MODEL_PATH,
+        # We'll override the lora_path for each specific test, but need to provide
+        # an initial path for initialization
+        lora_path=qwen2vl_language_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
+    )
+    tester = Qwen2VLTester(config)
+
+    # Test 1: Language-only LoRA adapter
+    tester.config.lora_path = qwen2vl_language_lora_files
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_LANGUAGE,
+            lora_id=lora_id,
+            lora_name="language_only",
+        )
+
+    # Test 2: Vision tower + connector LoRA adapter
+    tester.config.lora_path = qwen2vl_vision_tower_connector_lora_files
+    for lora_id in [3, 4]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_VISION,
+            lora_id=lora_id,
+            lora_name="vision_tower_connector",
+        )
+
+    # Test 3: Vision tower only LoRA adapter (no connector)
+    tester.config.lora_path = qwen2vl_vision_tower_lora_files
+    for lora_id in [5, 6]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
+            lora_id=lora_id,
+            lora_name="vision_tower",
+        )
diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py
index eb026c2ec0209874f4adca12e05a9cd21fc4eba7..bec12eeeb48d5999965b5821dfc97d87e8c98044 100644
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -3,7 +3,7 @@
 
 from collections import OrderedDict
 from typing import NamedTuple
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import pytest
 from huggingface_hub.utils import HfHubHTTPError
@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
     # Hugging Face model identifier with download error
     path = "org/repo"
     mock_exist.return_value = False
-    mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
+    mock_snapshot_download.side_effect = HfHubHTTPError(
+        "failed to query model info",
+        response=MagicMock(),
+    )
     assert get_adapter_absolute_path(path) == path
diff --git a/tests/model_executor/model_loader/runai_streamer_loader/conftest.py b/tests/model_executor/model_loader/runai_streamer_loader/conftest.py
index 9a022f6bbd9d17ae488e8f8ce6530a7a1570fd3b..bad9dea1bf653570ac96e68b89d1b317ff55e4f3 100644
--- a/tests/model_executor/model_loader/runai_streamer_loader/conftest.py
+++ b/tests/model_executor/model_loader/runai_streamer_loader/conftest.py
@@ -29,11 +29,7 @@ class RunaiDummyExecutor(UniProcExecutor):
             is_driver_worker=is_driver_worker,
         )
 
-        wrapper_kwargs = {
-            "vllm_config": self.vllm_config,
-        }
-
-        self.driver_worker = WorkerWrapperBase(**wrapper_kwargs)
+        self.driver_worker = WorkerWrapperBase()
 
         self.collective_rpc("init_worker", args=([worker_rpc_kwargs],))
         self.collective_rpc("init_device")
diff --git a/tests/model_executor/model_loader/tensorizer_loader/conftest.py b/tests/model_executor/model_loader/tensorizer_loader/conftest.py
index 826ecec71e6cf2ad5fe41a1a670d077089e5b386..6c85a1399196a37115c46aa10bc7f944985bb3c0 100644
--- a/tests/model_executor/model_loader/tensorizer_loader/conftest.py
+++ b/tests/model_executor/model_loader/tensorizer_loader/conftest.py
@@ -67,7 +67,7 @@ def assert_from_collective_rpc(engine: LLM, closure: Callable, closure_kwargs: d
 class DummyExecutor(UniProcExecutor):
     def _init_executor(self) -> None:
         """Initialize the worker and load the model."""
-        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0)
+        self.driver_worker = WorkerWrapperBase(rpc_rank=0)
         distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())
         local_rank = 0
         # set local rank as the device index if specified
diff --git a/tests/model_executor/test_eagle_quantization.py b/tests/model_executor/test_eagle_quantization.py
index 1ab75933ee31e7c4ab1d4a71c0e4b1a3c6637bb2..6f0dc55a5e41bec3184f114d7d1c275f035d02b8 100644
--- a/tests/model_executor/test_eagle_quantization.py
+++ b/tests/model_executor/test_eagle_quantization.py
@@ -55,7 +55,7 @@ def test_get_draft_quant_config_without_draft_model():
 
 @torch.inference_mode()
 @pytest.mark.parametrize("device", DEVICES)
-def test_fc_layer_quant_config_usage(dist_init, device) -> None:
+def test_fc_layer_quant_config_usage(default_vllm_config, dist_init, device) -> None:
     import torch
 
     from vllm.model_executor.layers.linear import ReplicatedLinear
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index 3bf9335ac870dc947a64c6c96333ba376ef5b530..2f9ad4cd26d262cad2073d6a17488d516ac4bb22 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -5,12 +5,8 @@ import os
 
 import pytest
 
-from vllm.model_executor.layers.pooler import (
-    CLSPool,
-    DispatchPooler,
-    MeanPool,
-    PoolingType,
-)
+from vllm.model_executor.layers.pooler import DispatchPooler
+from vllm.model_executor.layers.pooler.seqwise import CLSPool, MeanPool
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -51,8 +47,9 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
         assert model_config.encoder_config["do_lower_case"]
 
         # asserts on the pooling config files
-        assert model_config.pooler_config.pooling_type == PoolingType.CLS.name
-        assert model_config.pooler_config.normalize
+        assert model_config.pooler_config.seq_pooling_type == "CLS"
+        assert model_config.pooler_config.tok_pooling_type == "ALL"
+        assert model_config.pooler_config.use_activation
 
         # asserts on the tokenizer loaded
         assert model_config.tokenizer == os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5")
@@ -95,8 +92,9 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
         assert not model_config.encoder_config["do_lower_case"]
 
         # asserts on the pooling config files
-        assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
-        assert model_config.pooler_config.normalize
+        assert model_config.pooler_config.seq_pooling_type == "MEAN"
+        assert model_config.pooler_config.tok_pooling_type == "ALL"
+        assert model_config.pooler_config.use_activation
 
         # asserts on the tokenizer loaded
         assert model_config.tokenizer == os.path.join(models_path_prefix, "intfloat/multilingual-e5-base")
diff --git a/tests/models/fixtures/qwen2_5_math_prm_reward_step.json b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc0f3010cc3aa59fd77e9e09142ef74eb5228b6d
--- /dev/null
+++ b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json
@@ -0,0 +1 @@
+[[[0.0006361007690429688, 0.99951171875], [0.81884765625, 0.1812744140625], [0.025543212890625, 0.974609375], [0.0004382133483886719, 0.99951171875]]]
\ No newline at end of file
diff --git a/tests/models/language/generation/conftest.py b/tests/models/language/generation/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..f423b656b2f25094747393fd123baf504bdfa6af
--- /dev/null
+++ b/tests/models/language/generation/conftest.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM language generation tests."""
+
+import warnings
+
+import torch
+
+from vllm.platforms import current_platform
+
+
+def pytest_sessionstart(session):
+    """Configure ROCm-specific settings before test session starts."""
+    if not current_platform.is_rocm():
+        return
+
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+    warnings.warn(
+        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
+        "to avoid HuggingFace Transformers accuracy issues",
+        UserWarning,
+        stacklevel=1,
+    )
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index 96f26a406a2eb1bc7e6d19da274d7ead91325143..bcd4f980f8a94721b6ec674df8b506952b0c458e 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -12,6 +12,11 @@ from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
 from ....utils import models_path_prefix
 
+# Models that require embedding scaling for prompt_embeds test
+EMBED_SCALING_MODELS = {
+    "openbmb/MiniCPM4.1-8B",
+}
+
 # This list contains the model that are using AITER kernel.
 # Skip model that are not using AITER tests.
 # When more AITER kernels are added, this list will not be
@@ -66,8 +71,8 @@ AITER_MODEL_LIST = [
             marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         ),
         pytest.param(
-            os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"),
-            marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
+            os.path.join(models_path_prefix, "openbmb/MiniCPM4.1-8B"),  # minicpm
+            marks=[pytest.mark.core_model, large_gpu_mark(min_gb=48)],
         ),
         pytest.param(
             os.path.join(models_path_prefix, "facebook/opt-125m"),  # opt
@@ -137,16 +142,20 @@ def test_models(
 
         prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None
 
-        prompt_token_ids = []
         for prompt in example_prompts:
             token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
                 hf_model.model.device
             )
-            prompt_token_ids.append(token_ids)
             if prompt_embeds is not None:
-                prompt_embeds.append(
-                    hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
-                )
+                embed = hf_model.model.get_input_embeddings()(token_ids)
+
+                # MiniCPM models apply scale_emb to embeddings internally.
+                # vLLM expects pre-scaled embeddings when using inputs_embeds.
+                if model in EMBED_SCALING_MODELS:
+                    config = hf_model.model.config
+                    embed = embed * config.scale_emb
+
+                prompt_embeds.append(embed.squeeze(0))
 
     with vllm_runner(
         model,
diff --git a/tests/models/language/generation/test_grok.py b/tests/models/language/generation/test_grok.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2f1e8b4413de21e723e7247c6da9ec8f238cea1
--- /dev/null
+++ b/tests/models/language/generation/test_grok.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from ...utils import dummy_hf_overrides
+
+MODELS = ["xai-org/grok-2"]
+
+
+def _grok2_dummy_overrides(hf_config):
+    hf_config = dummy_hf_overrides(hf_config, model_arch="Grok1ForCausalLM")
+    text_config = hf_config.get_text_config()
+    text_config.update(
+        {
+            "hidden_size": 256,
+            "intermediate_size": 512,
+            "moe_intermediate_size": 256,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 2,
+            "head_dim": 64,
+        }
+    )
+    return hf_config
+
+
+@pytest.mark.parametrize("model", MODELS)
+def test_dummy_generate(vllm_runner, monkeypatch, model: str) -> None:
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+        with vllm_runner(
+            model,
+            load_format="dummy",
+            max_model_len=128,
+            hf_overrides=_grok2_dummy_overrides,
+            enforce_eager=True,
+        ) as llm:
+            prompt = "Hello from Grok-2"
+            tokenizer = llm.get_llm().get_tokenizer()
+            prompt_len = len(tokenizer.encode(prompt))
+            outputs = llm.generate_greedy([prompt], max_tokens=1)
+            output_ids, output_str = outputs[0]
+            assert len(output_ids) > prompt_len
+            assert output_str is not None
diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py
index 8c77abbc2aac3c128ad49a989039454be0747fe2..8fa01c02bde06ad89b7ca5f8d93e7576c13502bd 100644
--- a/tests/models/language/generation/test_phimoe.py
+++ b/tests/models/language/generation/test_phimoe.py
@@ -62,6 +62,19 @@ def test_phimoe_routing_function():
         assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
 
 
+# There is a known issue that triggers `AttributeError: 'DynamicCache'
+# object has no attribute 'seen_tokens'` when running:
+# `tests/models/language/generation/test_phimoe.py::test_models
+#   [5-64-bfloat16-microsoft/Phi-3.5-MoE-instruct]`
+# This issue is being investigated and tracked in:
+#   https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58
+# It is platform-agnostic. Therefore, we skip this test on all platforms for now.
+@pytest.mark.skip(
+    reason="Skipping due to known issue: "
+    "'DynamicCache' object has no attribute 'seen_tokens'. See: "
+    "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58 "
+    "for details.",
+)
 @pytest.mark.skipif(
     condition=current_platform.is_cpu(),
     reason="This test takes a lot time to run on CPU, "
diff --git a/tests/models/language/pooling/conftest.py b/tests/models/language/pooling/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..6348d49cd7759e1c6132cbf31559219692d58f8f
--- /dev/null
+++ b/tests/models/language/pooling/conftest.py
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM language generation tests."""
+
+import warnings
+
+import torch
+
+from vllm.platforms import current_platform
+
+
+def pytest_sessionstart(session):
+    """Configure ROCm-specific settings before test session starts."""
+    if not current_platform.is_rocm():
+        return
+
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+    torch.set_float32_matmul_precision("high")
+    warnings.warn(
+        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
+        "to avoid HuggingFace Transformers accuracy issues",
+        UserWarning,
+        stacklevel=1,
+    )
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index 8c7ab35892eb391d97993e42c5d2edad525dfb8f..bdf3b4d1d49c00fcecfed3c73380a90a6a76f280 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -61,7 +61,7 @@ def test_models(
     vllm_extra_kwargs = {}
     if model == (os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base"):
         vllm_extra_kwargs["pooler_config"] = PoolerConfig(
-            pooling_type="MEAN", normalize=False
+            seq_pooling_type="MEAN", normalize=False
         )
 
     max_model_len: int | None = 512
diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py
index d50ee85b9fd2b604619a9ae03bfd307971be642d..631fd394f719ed99992930899e22bf3b910841a8 100644
--- a/tests/models/language/pooling/test_mm_classifier_conversion.py
+++ b/tests/models/language/pooling/test_mm_classifier_conversion.py
@@ -88,7 +88,7 @@ def test_gemma_multimodal(
         convert="classify",
         load_format="auto",
         hf_overrides=update_config,
-        pooler_config=PoolerConfig(pooling_type="LAST"),
+        pooler_config=PoolerConfig(seq_pooling_type="LAST"),
         max_model_len=512,
         enforce_eager=True,
         tensor_parallel_size=1,
diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
index deb5de984d90917e8e8ee98a1579d5035893a0c3..a5a0c07e0c5d4d5d036be2e26451a071d18670fb 100644
--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
         model,
         max_model_len=512,
         dtype=dtype,
-        pooler_config=PoolerConfig(normalize=False),
+        pooler_config=PoolerConfig(use_activation=False),
     ) as vllm_model:
         wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
 
@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
         model,
         max_model_len=512,
         dtype=dtype,
-        pooler_config=PoolerConfig(normalize=True),
+        pooler_config=PoolerConfig(use_activation=True),
     ) as vllm_model:
         w_normalize = torch.tensor(vllm_model.embed(example_prompts))
 
@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
         model,
         max_model_len=512,
         dtype=dtype,
-        pooler_config=PoolerConfig(normalize=False),
+        pooler_config=PoolerConfig(use_activation=False),
     ) as vllm_model:
         wo_normalize = vllm_model.token_embed(example_prompts)
 
@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
         model,
         max_model_len=512,
         dtype=dtype,
-        pooler_config=PoolerConfig(normalize=True),
+        pooler_config=PoolerConfig(use_activation=True),
     ) as vllm_model:
         w_normalize = vllm_model.token_embed(example_prompts)
 
diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py
index c42186c7db9a3af2740b74fd379ffc94a35a3324..22e0539a989063e41f2d8559012c734a8a3b6da0 100644
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from typing import TYPE_CHECKING
 
 import pytest
 import torch
@@ -9,7 +11,18 @@ from transformers import AutoModel
 from vllm.platforms import current_platform
 
 from ....conftest import HfRunner
-from ...utils import check_transformers_version
+from ....utils import VLLM_PATH
+from ...registry import HF_EXAMPLE_MODELS
+
+if TYPE_CHECKING:
+    from _typeshed import StrPath
+
+
+FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
+assert FIXTURES_PATH.exists()
+FIXTURE_REWARD_RESULT = {
+    "Qwen/Qwen2.5-Math-PRM-7B": FIXTURES_PATH / "qwen2_5_math_prm_reward_step.json",
+}
 
 
 @pytest.fixture
@@ -60,6 +73,16 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
     return hf_model
 
 
+def dump_reward_outputs(outputs: list[list[float]], filename: "StrPath"):
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(outputs, f)
+
+
+def load_reward_outputs(filename: "StrPath") -> list[list[float]]:
+    with open(filename, encoding="utf-8") as f:
+        return json.load(f)
+
+
 @pytest.mark.parametrize(
     "model",
     [
@@ -77,9 +100,8 @@ def test_prm_models(
     model: str,
     dtype: str,
 ) -> None:
-    check_transformers_version(
-        "Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
-    )
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_transformers_version(on_fail="skip")
 
     if current_platform.is_cpu():
         pytest.skip("CPU only supports V1")
@@ -91,9 +113,46 @@ def test_prm_models(
         hf_model = step_reward_patch_hf_model(hf_model)
         hf_outputs = hf_model.reward(math_step_prompts)
 
+    dump_reward_outputs(
+        hf_outputs,
+        FIXTURE_REWARD_RESULT[model],
+    )
+
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
         hf_output = torch.tensor(hf_output).float()
         vllm_output = torch.tensor(vllm_output).float()
 
         assert torch.allclose(hf_output, vllm_output, 1.5e-2)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param(
+            "Qwen/Qwen2.5-Math-PRM-7B",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_prm_models_with_golden_outputs(
+    vllm_runner,
+    math_step_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    if not FIXTURE_REWARD_RESULT.get(model):
+        pytest.skip(f"No available golden outputs for {model}.")
+
+    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.reward(math_step_prompts)
+
+    golden_outputs = load_reward_outputs(FIXTURE_REWARD_RESULT[model])
+
+    # check logits difference
+    for golden_output, vllm_output in zip(golden_outputs, vllm_outputs):
+        golden_output = torch.tensor(golden_output).float()
+        vllm_output = torch.tensor(vllm_output).float()
+
+        assert torch.allclose(golden_output, vllm_output, 1.5e-2)
diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py
index 64d42432c74b976d5fb333fb36bdb48e6ef6ae86..7bc8daaea650c707199fdd018d718b810489c35b 100644
--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -5,6 +5,7 @@ import torch
 from transformers import AutoModelForTokenClassification
 
 from tests.models.utils import softmax
+from vllm.platforms import current_platform
 
 
 @pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
@@ -21,8 +22,17 @@ def test_bert_models(
     with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.token_classify(example_prompts)
 
+    # Use eager attention on ROCm to avoid HF Transformers flash attention
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    hf_model_kwargs = {}
+    if current_platform.is_rocm():
+        hf_model_kwargs["attn_implementation"] = "eager"
+
     with hf_runner(
-        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+        model,
+        dtype=dtype,
+        auto_cls=AutoModelForTokenClassification,
+        model_kwargs=hf_model_kwargs,
     ) as hf_model:
         tokenizer = hf_model.tokenizer
         hf_outputs = []
@@ -34,9 +44,9 @@ def test_bert_models(
 
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
-        assert torch.allclose(hf_output, vllm_output, 1e-2)
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
+        torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)
 
 
 @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@@ -52,8 +62,17 @@ def test_modernbert_models(
     with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.token_classify(example_prompts)
 
+    # Use eager attention on ROCm to avoid HF Transformers flash attention
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    hf_model_kwargs = {}
+    if current_platform.is_rocm():
+        hf_model_kwargs["attn_implementation"] = "eager"
+
     with hf_runner(
-        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+        model,
+        dtype=dtype,
+        auto_cls=AutoModelForTokenClassification,
+        model_kwargs=hf_model_kwargs,
     ) as hf_model:
         tokenizer = hf_model.tokenizer
         hf_outputs = []
@@ -65,9 +84,9 @@ def test_modernbert_models(
 
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
-        assert torch.allclose(hf_output, vllm_output, atol=1e-2)
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
+        torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)
 
 
 @pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
@@ -96,6 +115,6 @@ def test_auto_conversion(
 
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
         assert torch.allclose(hf_output, vllm_output, atol=1e-2)
diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
similarity index 50%
rename from tests/models/language/pooling_mteb_test/mteb_utils.py
rename to tests/models/language/pooling_mteb_test/mteb_embed_utils.py
index 189cdbae99dcd46b8ec827558915da7995bee95d..a736b991d4d5ab8a0bb017c9432c455514f216a9 100644
--- a/tests/models/language/pooling_mteb_test/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
@@ -1,11 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import tempfile
-
 import mteb
 import numpy as np
-import requests
 import torch
 from mteb.models import ModelMeta
 from mteb.types import Array
@@ -14,7 +11,6 @@ from torch.utils.data import DataLoader
 import tests.ci_envs as ci_envs
 from tests.models.utils import (
     EmbedModelInfo,
-    RerankModelInfo,
     check_embeddings_close,
     get_vllm_extra_kwargs,
 )
@@ -23,14 +19,10 @@ from tests.models.utils import (
 # - Model implementation and minor changes in tensor dtype
 #   results in differences less than 1e-4
 # - Different model results in differences more than 1e-3
-# 1e-4 is a good tolerance threshold
+# 5e-4 is a good tolerance threshold
 MTEB_EMBED_TASKS = ["STS12"]
-MTEB_EMBED_TOL = 1e-4
+MTEB_EMBED_TOL = 5e-4
 
-# See #19344
-MTEB_RERANK_TASKS = ["NFCorpus"]
-MTEB_RERANK_LANGS = ["eng"]
-MTEB_RERANK_TOL = 2e-3
 
 _empty_model_meta = ModelMeta(
     loader=None,
@@ -54,29 +46,9 @@ _empty_model_meta = ModelMeta(
 )
 
 
-class VllmMtebEncoder(mteb.EncoderProtocol):
+class MtebEmbedMixin(mteb.EncoderProtocol):
     mteb_model_meta = _empty_model_meta
 
-    def __init__(self, vllm_model):
-        self.llm = vllm_model
-        self.rng = np.random.default_rng(seed=42)
-
-    def encode(
-        self,
-        inputs: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        # Hoping to discover potential scheduling
-        # issues by randomizing the order.
-        sentences = [text for batch in inputs for text in batch["text"]]
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-        outputs = self.llm.embed(sentences, use_tqdm=False)
-        embeds = np.array(outputs)
-        embeds = embeds[np.argsort(r)]
-        return embeds
-
     def similarity(
         self,
         embeddings1: np.ndarray,
@@ -102,31 +74,29 @@ class VllmMtebEncoder(mteb.EncoderProtocol):
         return sim
 
 
-class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
-    mteb_model_meta = _empty_model_meta
-
+class VllmMtebEncoder(MtebEmbedMixin):
     def __init__(self, vllm_model):
         self.llm = vllm_model
         self.rng = np.random.default_rng(seed=42)
 
-    def predict(
+    def encode(
         self,
-        inputs1: DataLoader[mteb.types.BatchedInput],
-        inputs2: DataLoader[mteb.types.BatchedInput],
+        inputs: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        queries = [text for batch in inputs1 for text in batch["text"]]
-        corpus = [text for batch in inputs2 for text in batch["text"]]
-
-        outputs = self.llm.score(
-            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
-        )
-        scores = np.array(outputs)
-        return scores
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+        outputs = self.llm.embed(sentences, use_tqdm=False)
+        embeds = np.array(outputs)
+        embeds = embeds[np.argsort(r)]
+        return embeds
 
 
-class OpenAIClientMtebEncoder(VllmMtebEncoder):
+class OpenAIClientMtebEncoder(MtebEmbedMixin):
     def __init__(self, model_name: str, client):
         self.model_name = model_name
         self.client = client
@@ -153,58 +123,6 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder):
         return embeds
 
 
-class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
-    mteb_model_meta = _empty_model_meta
-
-    def __init__(self, model_name: str, url):
-        self.model_name = model_name
-        self.url = url
-        self.rng = np.random.default_rng(seed=42)
-
-    def predict(
-        self,
-        inputs1: DataLoader[mteb.types.BatchedInput],
-        inputs2: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        queries = [text for batch in inputs1 for text in batch["text"]]
-        full_corpus = [text for batch in inputs2 for text in batch["text"]]
-
-        outputs = []
-        for query, corpus in zip(queries, full_corpus):
-            outputs.append(self.get_score(query, corpus))
-
-        scores = np.array(outputs)
-        return scores
-
-    def get_score(self, query, corpus):
-        response = requests.post(
-            self.url,
-            json={
-                "model": self.model_name,
-                "text_1": query,
-                "text_2": corpus,
-                "truncate_prompt_tokens": -1,
-            },
-        ).json()
-        return response["data"][0]["score"]
-
-
-class RerankClientMtebEncoder(ScoreClientMtebEncoder):
-    def get_score(self, query, corpus):
-        response = requests.post(
-            self.url,
-            json={
-                "model": self.model_name,
-                "query": query,
-                "documents": [corpus],
-                "truncate_prompt_tokens": -1,
-            },
-        ).json()
-        return response["results"][0]["relevance_score"]
-
-
 def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
     tasks = mteb.get_tasks(tasks=tasks)
     results = mteb.evaluate(
@@ -243,12 +161,24 @@ def mteb_test_embed_models(
         if model_info.architecture:
             assert model_info.architecture in model_config.architectures
 
-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
+        # Confirm whether the important configs in model_config are correct.
+        pooler_config = model_config.pooler_config
+        if model_info.seq_pooling_type is not None:
+            assert pooler_config.seq_pooling_type == model_info.seq_pooling_type
+        if model_info.tok_pooling_type is not None:
+            assert pooler_config.tok_pooling_type == model_info.tok_pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
 
         vllm_main_score = run_mteb_embed_task(
             VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
@@ -299,117 +229,3 @@ def mteb_test_embed_models(
     # We are not concerned that the vllm mteb results are better
     # than SentenceTransformers, so we only perform one-sided testing.
     assert st_main_score - vllm_main_score < atol
-
-
-def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
-    with tempfile.TemporaryDirectory() as prediction_folder:
-        bm25s = mteb.get_model("bm25s")
-        eval_splits = ["test"]
-
-        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
-            tasks=tasks, languages=languages, eval_splits=eval_splits
-        )
-
-        mteb.evaluate(
-            bm25s,
-            mteb_tasks,
-            prediction_folder=prediction_folder,
-            show_progress_bar=False,
-            # don't save results for test runs
-            cache=None,
-            overwrite_strategy="always",
-        )
-
-        second_stage_tasks = []
-        for task in mteb_tasks:
-            second_stage_tasks.append(
-                task.convert_to_reranking(
-                    prediction_folder,
-                    top_k=10,
-                )
-            )
-
-        results = mteb.evaluate(
-            cross_encoder,
-            second_stage_tasks,
-            show_progress_bar=False,
-            cache=None,
-        )
-        main_score = results[0].scores["test"][0]["main_score"]
-    return main_score
-
-
-def mteb_test_rerank_models_hf(
-    hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
-):
-    with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
-        if hf_model_callback is not None:
-            hf_model_callback(hf_model)
-
-        st_main_score = run_mteb_rerank(
-            hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
-        )
-        st_dtype = next(hf_model.model.model.parameters()).dtype
-    return st_main_score, st_dtype
-
-
-def mteb_test_rerank_models(
-    hf_runner,
-    vllm_runner,
-    model_info: RerankModelInfo,
-    vllm_extra_kwargs=None,
-    hf_model_callback=None,
-    vllm_mteb_encoder=VllmMtebCrossEncoder,
-    atol=MTEB_RERANK_TOL,
-):
-    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
-
-    with vllm_runner(
-        model_info.name,
-        runner="pooling",
-        max_model_len=None,
-        max_num_seqs=8,
-        **vllm_extra_kwargs,
-    ) as vllm_model:
-        model_config = vllm_model.llm.llm_engine.model_config
-
-        # Confirm whether vllm is using the correct architecture
-        if model_info.architecture:
-            assert model_info.architecture in model_config.architectures
-
-        # Score API is only enabled for num_labels == 1
-        assert model_config.hf_config.num_labels == 1
-
-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
-
-        vllm_main_score = run_mteb_rerank(
-            vllm_mteb_encoder(vllm_model),
-            tasks=MTEB_RERANK_TASKS,
-            languages=MTEB_RERANK_LANGS,
-        )
-        vllm_dtype = model_config.dtype
-        head_dtype = model_config.head_dtype
-
-    # Accelerate mteb test by setting
-    # SentenceTransformers mteb score to a constant
-    if model_info.mteb_score is None:
-        st_main_score, st_dtype = mteb_test_rerank_models_hf(
-            hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
-        )
-    else:
-        st_main_score = model_info.mteb_score
-        st_dtype = "Constant"
-
-    print("Model:", model_info.name)
-    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
-    print("SentenceTransformers:", st_dtype, st_main_score)
-    print("Difference:", st_main_score - vllm_main_score)
-
-    # We are not concerned that the vllm mteb results are better
-    # than SentenceTransformers, so we only perform one-sided testing.
-    assert st_main_score - vllm_main_score < atol
diff --git a/tests/models/language/pooling_mteb_test/mteb_score_utils.py b/tests/models/language/pooling_mteb_test/mteb_score_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..adc2cf3e411e5e8fb25f4f6878b6ad664a75e83e
--- /dev/null
+++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py
@@ -0,0 +1,305 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import tempfile
+from pathlib import Path
+from typing import Any
+
+import mteb
+import numpy as np
+import requests
+import torch
+from mteb.models import ModelMeta
+from torch.utils.data import DataLoader
+
+from tests.conftest import HfRunner
+from tests.models.utils import (
+    RerankModelInfo,
+    get_vllm_extra_kwargs,
+)
+
+# See #19344
+MTEB_RERANK_TASKS = ["NFCorpus"]
+MTEB_RERANK_LANGS = ["eng"]
+MTEB_RERANK_TOL = 2e-3
+
+template_home = (
+    Path(__file__).parent.parent.parent.parent.parent
+    / "examples/pooling/score/template"
+)
+
+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
+
+
+class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
+
+class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
+    def __init__(self, vllm_model):
+        self.llm = vllm_model
+        self.rng = np.random.default_rng(seed=42)
+        self.chat_template: str | None = getattr(vllm_model, "chat_template", None)
+
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
+
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        r = self.rng.permutation(len(queries))
+        queries = [queries[i] for i in r]
+        corpus = [corpus[i] for i in r]
+
+        outputs = self.llm.score(
+            queries,
+            corpus,
+            truncate_prompt_tokens=-1,
+            use_tqdm=False,
+            chat_template=self.chat_template,
+        )
+        scores = np.array(outputs)
+        scores = scores[np.argsort(r)]
+        return scores
+
+
+class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
+    mteb_model_meta = _empty_model_meta
+
+    def __init__(self, model_name: str, url):
+        self.model_name = model_name
+        self.url = url
+
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        full_corpus = [text for batch in inputs2 for text in batch["text"]]
+
+        outputs = []
+        for query, corpus in zip(queries, full_corpus):
+            outputs.append(self.get_score(query, corpus))
+
+        scores = np.array(outputs)
+        return scores
+
+    def get_score(self, query, corpus):
+        response = requests.post(
+            self.url,
+            json={
+                "model": self.model_name,
+                "text_1": query,
+                "text_2": corpus,
+                "truncate_prompt_tokens": -1,
+            },
+        ).json()
+        return response["data"][0]["score"]
+
+
+class RerankClientMtebEncoder(ScoreClientMtebEncoder):
+    def get_score(self, query, corpus):
+        response = requests.post(
+            self.url,
+            json={
+                "model": self.model_name,
+                "query": query,
+                "documents": [corpus],
+                "truncate_prompt_tokens": -1,
+            },
+        ).json()
+        return response["results"][0]["relevance_score"]
+
+
+class HFMtebCrossEncoder(MtebCrossEncoderMixin, HfRunner):
+    chat_template: str | None = None
+
+    def __init__(self, model_name: str, dtype: str = "auto", **kwargs: Any) -> None:
+        HfRunner.__init__(
+            self, model_name=model_name, is_cross_encoder=True, dtype=dtype, **kwargs
+        )
+
+    @torch.no_grad
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
+
+        if self.chat_template is not None:
+            tokenizer = self.model.tokenizer
+            prompts = []
+            for query, document in zip(queries, corpus):
+                conversation = [
+                    {"role": "query", "content": query},
+                    {"role": "document", "content": document},
+                ]
+
+                prompt = tokenizer.apply_chat_template(
+                    conversation=conversation,
+                    tools=None,
+                    chat_template=self.chat_template,
+                    tokenize=False,
+                )
+                prompts.append(prompt)
+            outputs_list = HfRunner.classify(self, prompts)
+            scores = np.array(outputs_list).squeeze(-1)
+            return scores
+        else:
+            prompts = list(zip(queries, corpus))
+            outputs_tensor = HfRunner.predict(self, prompts, show_progress_bar=False)
+            return outputs_tensor.cpu().numpy()
+
+
+def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
+    with tempfile.TemporaryDirectory() as prediction_folder:
+        bm25s = mteb.get_model("bm25s")
+        eval_splits = ["test"]
+
+        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
+            tasks=tasks, languages=languages, eval_splits=eval_splits
+        )
+
+        mteb.evaluate(
+            bm25s,
+            mteb_tasks,
+            prediction_folder=prediction_folder,
+            show_progress_bar=False,
+            # don't save results for test runs
+            cache=None,
+            overwrite_strategy="always",
+        )
+
+        second_stage_tasks = []
+        for task in mteb_tasks:
+            second_stage_tasks.append(
+                task.convert_to_reranking(
+                    prediction_folder,
+                    top_k=10,
+                )
+            )
+
+        results = mteb.evaluate(
+            cross_encoder,
+            second_stage_tasks,
+            show_progress_bar=False,
+            cache=None,
+        )
+        main_score = results[0].scores["test"][0]["main_score"]
+    return main_score
+
+
+def mteb_test_rerank_models(
+    vllm_runner,
+    model_info: RerankModelInfo,
+    hf_runner=HFMtebCrossEncoder,
+    vllm_extra_kwargs=None,
+    vllm_mteb_encoder=VllmMtebCrossEncoder,
+    atol=MTEB_RERANK_TOL,
+):
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
+
+    # Maybe load chat_template.
+    chat_template: str | None = None
+    if model_info.chat_template_name is not None:
+        chat_template = (template_home / model_info.chat_template_name).read_text()
+
+    with vllm_runner(
+        model_info.name,
+        runner="pooling",
+        max_model_len=None,
+        max_num_seqs=8,
+        **vllm_extra_kwargs,
+    ) as vllm_model:
+        model_config = vllm_model.llm.llm_engine.model_config
+        vllm_model.chat_template = chat_template
+
+        # Confirm whether vllm is using the correct architecture
+        if model_info.architecture:
+            assert model_info.architecture in model_config.architectures
+
+        # Score API is only enabled for num_labels == 1
+        assert model_config.hf_config.num_labels == 1
+
+        # Confirm whether the important configs in model_config are correct.
+        pooler_config = model_config.pooler_config
+        if model_info.seq_pooling_type is not None:
+            assert pooler_config.seq_pooling_type == model_info.seq_pooling_type
+        if model_info.tok_pooling_type is not None:
+            assert pooler_config.tok_pooling_type == model_info.tok_pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
+
+        vllm_main_score = run_mteb_rerank(
+            vllm_mteb_encoder(vllm_model),
+            tasks=MTEB_RERANK_TASKS,
+            languages=MTEB_RERANK_LANGS,
+        )
+        vllm_dtype = model_config.dtype
+        head_dtype = model_config.head_dtype
+
+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
+    if model_info.mteb_score is None:
+        with hf_runner(model_info.name, dtype=model_info.hf_dtype) as hf_model:
+            hf_model.chat_template = chat_template
+            st_main_score = run_mteb_rerank(
+                hf_model,
+                tasks=MTEB_RERANK_TASKS,
+                languages=MTEB_RERANK_LANGS,
+            )
+            st_dtype = next(hf_model.model.model.parameters()).dtype
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
+
+    print("Model:", model_info.name)
+    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
+    print("Difference:", st_main_score - vllm_main_score)
+
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
diff --git a/tests/models/language/pooling_mteb_test/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py
index bad13e2457146d427501e7e4bfeb39511253a14b..1199393d4b74ea59154f03333bd8eee8d231cb9b 100644
--- a/tests/models/language/pooling_mteb_test/test_baai.py
+++ b/tests/models/language/pooling_mteb_test/test_baai.py
@@ -4,90 +4,93 @@ import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
     EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
     RerankModelInfo,
 )
 
-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-base-en",
         architecture="BertModel",
         mteb_score=0.779336792,
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-base-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-small-en", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-small-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-large-en", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-large-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
         "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
     ),
     ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-m3",
         architecture="XLMRobertaModel",
         mteb_score=0.787343078,
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
     ########## Qwen2Model
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-code-v1",
         architecture="Qwen2Model",
         mteb_score=0.75724465,
-        dtype="float32",
+        seq_pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
         enable_test=True,
     ),
 ]
 
 RERANK_MODELS = [
     ########## XLMRobertaForSequenceClassification
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "BAAI/bge-reranker-base",
         architecture="XLMRobertaForSequenceClassification",
         mteb_score=0.32398,
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "BAAI/bge-reranker-large",
         architecture="XLMRobertaForSequenceClassification",
         enable_test=False,
     ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "BAAI/bge-reranker-v2-m3",
         architecture="XLMRobertaForSequenceClassification",
         enable_test=False,
@@ -108,7 +111,5 @@ def test_embed_models_correctness(
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(
-    hf_runner, vllm_runner, model_info: RerankModelInfo
-) -> None:
-    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
+    mteb_test_rerank_models(vllm_runner, model_info)
diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
index 6b2e4696449267565f1a22446721afeec3a8c650..23bc95548bcd67f6eb9326d346c0f132980d7463 100644
--- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@@ -9,40 +9,62 @@ import torch
 from torch.utils.data import DataLoader
 
 from tests.conftest import HfRunner
-from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebCrossEncoder,
+from tests.models.utils import RerankModelInfo
+
+from .mteb_score_utils import (
+    MtebCrossEncoderMixin,
     mteb_test_rerank_models,
 )
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "BAAI/bge-reranker-v2-gemma",
         architecture="GemmaForSequenceClassification",
-        mteb_score=0.33757,
         hf_overrides={
             "architectures": ["GemmaForSequenceClassification"],
             "classifier_from_token": ["Yes"],
             "method": "no_post_processing",
         },
+        mteb_score=0.33757,
+        seq_pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
+        chat_template_name="bge-reranker-v2-gemma.jinja",
     ),
 ]
 
 PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."  # noqa: E501
 
 
-class GemmaRerankerHfRunner(HfRunner):
+class GemmaRerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
     def __init__(
         self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
     ) -> None:
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
-        super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
+        HfRunner.__init__(
+            self,
+            model_name=model_name,
+            auto_cls=AutoModelForCausalLM,
+            dtype=dtype,
+            **kwargs,
+        )
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
 
-    @torch.no_grad()
-    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
+    @torch.no_grad
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
+
         def get_inputs(pairs, tokenizer, prompt=None):
             if prompt is None:
                 prompt = PROMPT
@@ -87,8 +109,8 @@ class GemmaRerankerHfRunner(HfRunner):
             )
 
         scores = []
-        for query, doc, *_ in prompts:
-            pairs = [(query, doc)]
+        for query, document in zip(queries, corpus):
+            pairs = [(query, document)]
             inputs = get_inputs(pairs, self.tokenizer)
             inputs = inputs.to(self.model.device)
             _n_tokens = inputs["input_ids"].shape[1]
@@ -105,41 +127,10 @@ class GemmaRerankerHfRunner(HfRunner):
         return torch.Tensor(scores)
 
 
-class GemmaMtebEncoder(VllmMtebCrossEncoder):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.query_template = "A: {query}\n"
-        self.document_template = "B: {doc}\n{prompt}"
-
-    def predict(
-        self,
-        inputs1: DataLoader[mteb.types.BatchedInput],
-        inputs2: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        queries = [
-            self.query_template.format(query=text)
-            for batch in inputs1
-            for text in batch["text"]
-        ]
-        corpus = [
-            self.document_template.format(doc=text, prompt=PROMPT)
-            for batch in inputs2
-            for text in batch["text"]
-        ]
-        outputs = self.llm.score(
-            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
-        )
-        scores = np.array(outputs)
-        return scores
-
-
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
     mteb_test_rerank_models(
-        GemmaRerankerHfRunner,
         vllm_runner,
         model_info,
-        vllm_mteb_encoder=GemmaMtebEncoder,
+        hf_runner=GemmaRerankerHfRunner,
     )
diff --git a/tests/models/language/pooling_mteb_test/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
index 638ffc7a62b0ea5b6352d014cbb50ec88dcb5e72..0d1067d5e2c5d042f26f7f7135eff304ea6a616e 100644
--- a/tests/models/language/pooling_mteb_test/test_cross_encoder.py
+++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
@@ -3,29 +3,34 @@
 import pytest
 
 from tests.models.utils import (
-    CLSPoolingRerankModelInfo,
-    LASTPoolingRerankModelInfo,
     RerankModelInfo,
 )
 
-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "cross-encoder/ms-marco-TinyBERT-L-2-v2",
-        mteb_score=0.32898,
         architecture="BertForSequenceClassification",
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
+        mteb_score=0.32898,
     ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
-        mteb_score=0.25736,
         architecture="Qwen3ForSequenceClassification",
+        seq_pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
+        chat_template_name="qwen3_reranker.jinja",
+        mteb_score=0.33459,
     ),
 ]
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(
-    hf_runner, vllm_runner, model_info: RerankModelInfo
-) -> None:
-    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
+    mteb_test_rerank_models(vllm_runner, model_info)
diff --git a/tests/models/language/pooling_mteb_test/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py
index a22821fd65b5a07aa5e585dc70ef380843e12fce..f87fd832afef2bcb8e84505619216327a3d75f99 100644
--- a/tests/models/language/pooling_mteb_test/test_gte.py
+++ b/tests/models/language/pooling_mteb_test/test_gte.py
@@ -5,36 +5,32 @@ import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
     EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
     RerankModelInfo,
 )
 
-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "thenlper/gte-large",
         mteb_score=0.76807651,
         architecture="BertModel",
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-base", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-small", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
         "thenlper/gte-large-zh", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-base-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
         "thenlper/gte-small-zh", architecture="BertModel", enable_test=False
     ),
     ########### NewModel
@@ -43,68 +39,90 @@ MODELS = [
     # - whether to use token_type_embeddings
     # - whether to use context expansion
     # So only test one (the most widely used) model
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-multilingual-base",
         architecture="GteNewModel",
         mteb_score=0.775074696,
         hf_overrides={"architectures": ["GteNewModel"]},
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-base-en-v1.5",
         architecture="GteNewModel",
         hf_overrides={"architectures": ["GteNewModel"]},
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-large-en-v1.5",
         architecture="GteNewModel",
         hf_overrides={"architectures": ["GteNewModel"]},
         enable_test=False,
     ),
     ########### Qwen2ForCausalLM
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         mteb_score=0.758473459018872,
         architecture="Qwen2ForCausalLM",
+        seq_pooling_type="LAST",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
     ########## ModernBertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-modernbert-base",
         mteb_score=0.748193353,
         architecture="ModernBertModel",
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
     ########## Qwen3ForCausalLM
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Qwen/Qwen3-Embedding-0.6B",
         mteb_score=0.771163695,
         architecture="Qwen3ForCausalLM",
-        dtype="float32",
+        seq_pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
         enable_test=True,
     ),
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Qwen/Qwen3-Embedding-4B",
         architecture="Qwen3ForCausalLM",
-        dtype="float32",
         enable_test=False,
     ),
 ]
 
 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         # classifier_pooling: mean
         "Alibaba-NLP/gte-reranker-modernbert-base",
         mteb_score=0.33386,
         architecture="ModernBertForSequenceClassification",
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "Alibaba-NLP/gte-multilingual-reranker-base",
         mteb_score=0.33062,
         architecture="GteNewForSequenceClassification",
         hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
 ]
@@ -123,7 +141,5 @@ def test_embed_models_correctness(
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(
-    hf_runner, vllm_runner, model_info: RerankModelInfo
-) -> None:
-    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
+    mteb_test_rerank_models(vllm_runner, model_info)
diff --git a/tests/models/language/pooling_mteb_test/test_intfloat.py b/tests/models/language/pooling_mteb_test/test_intfloat.py
index 1d078db69236ac5972c3460284a8cea645486b24..adadb60eeefe5097d898813a5edb5dae612bc77a 100644
--- a/tests/models/language/pooling_mteb_test/test_intfloat.py
+++ b/tests/models/language/pooling_mteb_test/test_intfloat.py
@@ -3,40 +3,44 @@
 import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo
 
-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "intfloat/e5-small",
         architecture="BertModel",
         mteb_score=0.742285423,
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
-        "intfloat/e5-base", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "intfloat/e5-large", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
         "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
     ),
     ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "intfloat/multilingual-e5-base",
         architecture="XLMRobertaModel",
         mteb_score=0.779325955,
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "intfloat/multilingual-e5-large",
         architecture="XLMRobertaModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "intfloat/multilingual-e5-large-instruct",
         architecture="XLMRobertaModel",
         enable_test=False,
diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py
index c2065bcd6eb4c2b6d1bc0f057a3c7179d86d7dee..627cc043194302936a90baead74c3c9679b1f426 100644
--- a/tests/models/language/pooling_mteb_test/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@@ -10,30 +10,36 @@ from tests.models.language.pooling.embed_utils import (
     matryoshka_fy,
 )
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
     EmbedModelInfo,
     RerankModelInfo,
 )
 from vllm import PoolingParams
 
-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 EMBEDDING_MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "jinaai/jina-embeddings-v3",
         mteb_score=0.824413164,
         architecture="XLMRobertaModel",
         is_matryoshka=True,
-        dtype="float32",
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     )
 ]
 
 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "jinaai/jina-reranker-v2-base-multilingual",
         mteb_score=0.33643,
         architecture="XLMRobertaForSequenceClassification",
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     )
 ]
 
@@ -65,10 +71,8 @@ def test_embed_models_correctness(
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(
-    hf_runner, vllm_runner, model_info: RerankModelInfo
-) -> None:
-    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
+    mteb_test_rerank_models(vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
index a6f2a89b268f17e87466a4591b53fb4cab546ceb..74fe760e7839a1e0a89603ee5fab473800d8ede7 100644
--- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
@@ -2,13 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
+import mteb
+import numpy as np
 import pytest
 import torch
+from torch.utils.data import DataLoader
 
 from tests.conftest import HfRunner
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
 
-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models
 
 mxbai_rerank_hf_overrides = {
     "architectures": ["Qwen2ForSequenceClassification"],
@@ -17,50 +20,73 @@ mxbai_rerank_hf_overrides = {
 }
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "mixedbread-ai/mxbai-rerank-base-v2",
         architecture="Qwen2ForSequenceClassification",
         hf_overrides=mxbai_rerank_hf_overrides,
-        mteb_score=0.273,
+        seq_pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
+        chat_template_name="mxbai_rerank_v2.jinja",
+        mteb_score=0.33651,
         enable_test=True,
     ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "mixedbread-ai/mxbai-rerank-large-v2",
         architecture="Qwen2ForSequenceClassification",
         hf_overrides=mxbai_rerank_hf_overrides,
+        chat_template_name="mxbai_rerank_v2.jinja",
         enable_test=False,
     ),
 ]
 
 
-class MxbaiRerankerHfRunner(HfRunner):
+class MxbaiRerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
     def __init__(
         self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
     ) -> None:
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
-        super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
+        HfRunner.__init__(
+            self,
+            model_name=model_name,
+            auto_cls=AutoModelForCausalLM,
+            dtype=dtype,
+            **kwargs,
+        )
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
         self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
 
-    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
-        def process_inputs(pairs):
-            inputs = self.tokenizer(
-                pairs,
-                padding=False,
-                truncation="longest_first",
-                return_attention_mask=False,
+    @torch.no_grad
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
+
+        tokenizer = self.tokenizer
+        prompts = []
+        for query, document in zip(queries, corpus):
+            conversation = [
+                {"role": "query", "content": query},
+                {"role": "document", "content": document},
+            ]
+
+            prompt = tokenizer.apply_chat_template(
+                conversation=conversation,
+                tools=None,
+                chat_template=self.chat_template,
+                tokenize=False,
             )
-            for i, ele in enumerate(inputs["input_ids"]):
-                inputs["input_ids"][i] = ele
-            inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
-            for key in inputs:
-                inputs[key] = inputs[key].to(self.model.device)
-            return inputs
-
-        @torch.no_grad()
+            prompts.append(prompt)
+
         def compute_logits(inputs):
             logits = self.model(**inputs).logits[:, -1, :]
             yes_logits = logits[:, self.yes_loc]
@@ -70,9 +96,9 @@ class MxbaiRerankerHfRunner(HfRunner):
             return scores
 
         scores = []
-        for query, doc, *_ in prompts:
-            pairs = [(query, doc)]
-            inputs = process_inputs(pairs)
+        for prompt in prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = self.wrap_device(inputs)
             score = compute_logits(inputs)
             scores.append(score[0].item())
         return torch.Tensor(scores)
@@ -80,4 +106,4 @@ class MxbaiRerankerHfRunner(HfRunner):
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
-    mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info)
+    mteb_test_rerank_models(vllm_runner, model_info, hf_runner=MxbaiRerankerHfRunner)
diff --git a/tests/models/language/pooling_mteb_test/test_nemotron.py b/tests/models/language/pooling_mteb_test/test_nemotron.py
new file mode 100644
index 0000000000000000000000000000000000000000..79fae2833990213454263e0a40cd9d66c71e9af2
--- /dev/null
+++ b/tests/models/language/pooling_mteb_test/test_nemotron.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
+    mteb_test_embed_models,
+)
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
+    mteb_test_rerank_models,
+)
+from tests.models.utils import (
+    EmbedModelInfo,
+    RerankModelInfo,
+)
+
+EMBEDDING_MODELS = [
+    EmbedModelInfo(
+        "nvidia/llama-nemotron-embed-1b-v2",
+        architecture="LlamaBidirectionalModel",
+        mteb_score=0.689164662128673,
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
+    )
+]
+
+RERANK_MODELS = [
+    RerankModelInfo(
+        "nvidia/llama-nemotron-rerank-1b-v2",
+        architecture="LlamaBidirectionalForSequenceClassification",
+        chat_template_name="nemotron-rerank.jinja",
+        mteb_score=0.33994,
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
+    ),
+]
+
+
+@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
+
+
+@pytest.mark.parametrize("model_info", RERANK_MODELS)
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
+    mteb_test_rerank_models(vllm_runner, model_info)
diff --git a/tests/models/language/pooling_mteb_test/test_nomic.py b/tests/models/language/pooling_mteb_test/test_nomic.py
index c54a43052483a6683df786a6405b7e28edcb01ce..fa987fab7cdd1110bc48fc7bae7f9fee739a19d8 100644
--- a/tests/models/language/pooling_mteb_test/test_nomic.py
+++ b/tests/models/language/pooling_mteb_test/test_nomic.py
@@ -4,30 +4,38 @@
 import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo
 
-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models
 
 MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nomic-ai/nomic-embed-text-v1",
         architecture="NomicBertModel",
         mteb_score=0.737568559,
         enable_test=True,
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nomic-ai/nomic-embed-text-v1.5",
         architecture="NomicBertModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nomic-ai/nomic-embed-text-v2-moe",
         architecture="NomicBertModel",
         mteb_score=0.715488912,
         enable_test=True,
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     ),
 ]
 
diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
index 9a1be6c0be1d62db9eea4d0f5169057bbd79af89..3c182cb046b560a58646414a1fa75d5495afd739 100644
--- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
+++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
@@ -1,15 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
 from typing import Any
 
+import mteb
+import numpy as np
 import pytest
 import torch
+from torch.utils.data import DataLoader
 
 from tests.conftest import HfRunner
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
 from tests.utils import multi_gpu_test
 
-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models
 
 qwen3_reranker_hf_overrides = {
     "architectures": ["Qwen3ForSequenceClassification"],
@@ -18,50 +22,74 @@ qwen3_reranker_hf_overrides = {
 }
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "Qwen/Qwen3-Reranker-0.6B",
         architecture="Qwen3ForSequenceClassification",
-        mteb_score=0.25736,
         hf_overrides=qwen3_reranker_hf_overrides,
+        chat_template_name="qwen3_reranker.jinja",
+        seq_pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
+        mteb_score=0.33459,
         enable_test=True,
     ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "Qwen/Qwen3-Reranker-4B",
         architecture="Qwen3ForSequenceClassification",
+        chat_template_name="qwen3_reranker.jinja",
         hf_overrides=qwen3_reranker_hf_overrides,
         enable_test=False,
     ),
 ]
 
 
-class Qwen3RerankerHfRunner(HfRunner):
+class Qwen3RerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
     def __init__(
         self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
     ) -> None:
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
-        super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
+        HfRunner.__init__(
+            self,
+            model_name=model_name,
+            auto_cls=AutoModelForCausalLM,
+            dtype=dtype,
+            **kwargs,
+        )
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
         self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
-
-    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
-        def process_inputs(pairs):
-            inputs = self.tokenizer(
-                pairs,
-                padding=False,
-                truncation="longest_first",
-                return_attention_mask=False,
+        self.max_length = 40960
+
+    @torch.no_grad
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
+
+        tokenizer = self.tokenizer
+        prompts = []
+        for query, document in zip(queries, corpus):
+            conversation = [
+                {"role": "query", "content": query},
+                {"role": "document", "content": document},
+            ]
+
+            prompt = tokenizer.apply_chat_template(
+                conversation=conversation,
+                tools=None,
+                chat_template=self.chat_template,
+                tokenize=False,
             )
-            for i, ele in enumerate(inputs["input_ids"]):
-                inputs["input_ids"][i] = ele
-            inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
-            for key in inputs:
-                inputs[key] = inputs[key].to(self.model.device)
-            return inputs
-
-        @torch.no_grad()
+            prompts.append(prompt)
+
         def compute_logits(inputs):
             batch_scores = self.model(**inputs).logits[:, -1, :]
             true_vector = batch_scores[:, self.token_true_id]
@@ -72,9 +100,9 @@ class Qwen3RerankerHfRunner(HfRunner):
             return scores
 
         scores = []
-        for query, doc, *_ in prompts:
-            pairs = [(query, doc)]
-            inputs = process_inputs(pairs)
+        for prompt in prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = self.wrap_device(inputs)
             score = compute_logits(inputs)
             scores.append(score[0].item())
         return torch.Tensor(scores)
@@ -82,7 +110,7 @@ class Qwen3RerankerHfRunner(HfRunner):
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
-    mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
+    mteb_test_rerank_models(vllm_runner, model_info, hf_runner=Qwen3RerankerHfRunner)
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
@@ -95,5 +123,8 @@ def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None
     }
 
     mteb_test_rerank_models(
-        Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
+        vllm_runner,
+        model_info,
+        vllm_extra_kwargs=vllm_extra_kwargs,
+        hf_runner=Qwen3RerankerHfRunner,
     )
diff --git a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
index 3c30628aeaa49cc3d4a860e84381a7e5861a1e56..f3afbe84fa93be67f7cabfc77d1087eaade5e201 100644
--- a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
@@ -4,62 +4,82 @@
 import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo
 
-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models
 
 MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-xs",
         is_matryoshka=False,
         architecture="BertModel",
         mteb_score=0.714927797,
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-s",
         is_matryoshka=False,
         architecture="BertModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-m",
         is_matryoshka=False,
         architecture="BertModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-m-long",
         is_matryoshka=False,
         architecture="NomicBertModel",
         mteb_score=0.681146831,
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-l",
         is_matryoshka=False,
         architecture="BertModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-m-v1.5",
         is_matryoshka=True,
         architecture="BertModel",
         mteb_score=0.649088363,
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-l-v2.0",
         is_matryoshka=True,
         architecture="XLMRobertaModel",
         mteb_score=0.712258299,
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-m-v2.0",
         is_matryoshka=True,
         architecture="GteModel",
         mteb_score=0.706622444,
+        seq_pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
 ]
diff --git a/tests/models/language/pooling_mteb_test/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py
index 74fe4b9bcc03ff719a865a4957d61b0fc9e3b77c..395846347fb3fced94fd8d8cf0dc70e094a30fa6 100644
--- a/tests/models/language/pooling_mteb_test/test_st_projector.py
+++ b/tests/models/language/pooling_mteb_test/test_st_projector.py
@@ -3,27 +3,32 @@
 import pytest
 
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
     EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
 )
 
-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models
 
 # ST models with projector (Dense) layers
 ST_PROJECTOR_MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "TencentBAC/Conan-embedding-v1",
         architecture="BertModel",
         mteb_score=0.688611955,
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "google/embeddinggemma-300m",
         architecture="Gemma3TextModel",
         mteb_score=0.7473819294684156,
+        seq_pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
-        dtype="float32",
     ),
 ]
 
diff --git a/tests/models/multimodal/generation/conftest.py b/tests/models/multimodal/conftest.py
similarity index 77%
rename from tests/models/multimodal/generation/conftest.py
rename to tests/models/multimodal/conftest.py
index 26f8586742ceace83eb403dfef1c9cce12156a15..31d99218c8276541917bc933da720de607a9332a 100644
--- a/tests/models/multimodal/generation/conftest.py
+++ b/tests/models/multimodal/conftest.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Pytest configuration for vLLM tests."""
+"""Pytest configuration for vLLM multimodal tests."""
 
 import warnings
 
@@ -9,20 +9,17 @@ import torch
 from vllm.platforms import current_platform
 
 
-def pytest_configure(config):
-    """Disable Flash/MemEfficient SDP on ROCm to avoid HF
-    Transformers accuracy issues.
-    """
+def pytest_collection_modifyitems(config, items):
+    """Configure ROCm-specific settings based on collected tests."""
     if not current_platform.is_rocm():
         return
 
     skip_patterns = ["test_granite_speech.py"]
     if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
-        # Skip disabling SDP for Granite Speech tests on ROCm
         return
 
     # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
-    # accuracy issues
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
     # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
     torch.backends.cuda.enable_flash_sdp(False)
     torch.backends.cuda.enable_mem_efficient_sdp(False)
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index f25904f1d4f7c746ef9ff418820c0276b33e886e..f117073fe5c3b7e06220fbcae9db0daba00ccb9b 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -123,10 +123,6 @@ VLM_TEST_SETTINGS = {
         ),
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        dtype="bfloat16",
-        marks=[
-            pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
-        ],
     ),
 
     "qwen2_5_vl": VLMTestInfo(
@@ -176,6 +172,13 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
+        vllm_runner_kwargs={
+            "attention_config": {
+                "backend": "ROCM_AITER_FA",
+            },
+        }
+        if current_platform.is_rocm()
+        else None,
         image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[
             pytest.mark.core_model,
@@ -256,8 +259,19 @@ VLM_TEST_SETTINGS = {
         image_size_factors=[(0.25, 0.2, 0.15)],
         vllm_runner_kwargs={
             "model_impl": "transformers",
+            # TODO: [ROCm] Revert this once issue #30167 is resolved
+            **(
+                {
+                    "mm_processor_kwargs": {
+                        "min_pixels": 256 * 28 * 28,
+                        "max_pixels": 1280 * 28 * 28,
+                    },
+                }
+                if current_platform.is_rocm()
+                else {}
+            ),
         },
-        marks=[large_gpu_mark(min_gb=32)],
+        marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
     ),
     #### Extended model tests
     "aria": VLMTestInfo(
@@ -498,6 +512,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=8192,
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
+        num_logprobs=10 if current_platform.is_rocm() else 5,
     ),
     "intern_vl-hf": VLMTestInfo(
         models=["OpenGVLab/InternVL3-1B-hf"],
@@ -513,6 +528,34 @@ VLM_TEST_SETTINGS = {
         use_tokenizer_eos=True,
         auto_cls=AutoModelForImageTextToText,
     ),
+    "isaac": VLMTestInfo(
+        models=[
+            "PerceptronAI/Isaac-0.1",
+            "PerceptronAI/Isaac-0.2-2B-Preview",
+        ],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: (
+            f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
+        ),
+        img_idx_to_prompt=lambda idx: "<image>",
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<vlm_image>Please describe the image shortly.",
+                "cherry_blossom": "<vlm_image>Please infer the season with reason.",
+            }
+        ),
+        multi_image_prompt=(
+            "Picture 1: <vlm_image>\n"
+            "Picture 2: <vlm_image>\n"
+            "Describe these two images with one paragraph respectively."
+        ),
+        enforce_eager=False,
+        max_model_len=4096,
+        max_num_seqs=2,
+        hf_model_kwargs={"device_map": "auto"},
+        patch_hf_runner=model_utils.isaac_patch_hf_runner,
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+    ),
     "kimi_vl": VLMTestInfo(
         models=["moonshotai/Kimi-VL-A3B-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
@@ -648,7 +691,17 @@ VLM_TEST_SETTINGS = {
         hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
         patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
         auto_cls=AutoModelForImageTextToText,
-        marks=[large_gpu_mark(min_gb=80)],
+        marks=[
+            large_gpu_mark(min_gb=80),
+            # TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
+            pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason=(
+                    "ROCm: Model too large for single GPU; "
+                    "multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
+                ),
+            ),
+        ],
     ),
     "molmo": VLMTestInfo(
         models=["allenai/Molmo-7B-D-0924"],
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index 5142493ee194a5f3b996ac3ca36eb860f82443f0..c553174ce94bd858ff46eca2c5498f510aa9e2e1 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -37,10 +37,12 @@ audio_lora_path = MODEL_NAME
 models = [MODEL_NAME]
 
 
-@pytest.fixture(autouse=True)
-def set_attention_backend_for_rocm(monkeypatch):
+@pytest.fixture
+def granite_speech_attention_config():
+    """Return attention config for Granite Speech tests on ROCm."""
     if current_platform.is_rocm():
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
+        return {"backend": "ROCM_AITER_FA"}
+    return None
 
 
 def run_test(
@@ -55,6 +57,7 @@ def run_test(
     num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: str | None = None,
+    attention_config: dict | None = None,
 ):
     """Inference result should be the same between hf and vllm.
 
@@ -82,6 +85,7 @@ def run_test(
         enable_lora=True,
         max_lora_rank=64,
         enforce_eager=True,
+        attention_config=attention_config,
     ) as vllm_model:
         lora_request = LoRARequest("audio", 1, audio_lora_path)
         vllm_outputs_per_case = [
@@ -133,6 +137,7 @@ def test_models(
     vllm_runner,
     model: str,
     audio_assets: AudioTestAssets,
+    granite_speech_attention_config,
     dtype: str,
     max_model_len: int,
     max_tokens: int,
@@ -159,4 +164,5 @@ def test_models(
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
         tensor_parallel_size=1,
-    )
\ No newline at end of file
+        attention_config=granite_speech_attention_config,
+    )
diff --git a/tests/models/multimodal/generation/test_keye.py b/tests/models/multimodal/generation/test_keye.py
index 6f98bde1d91ea990aa677b90d1a31e9ee4ad7840..4205a8b2d1ac4cc2a30b38d6cd2635c514b8ae6d 100644
--- a/tests/models/multimodal/generation/test_keye.py
+++ b/tests/models/multimodal/generation/test_keye.py
@@ -8,7 +8,7 @@ from PIL.Image import Image
 from transformers import AutoProcessor
 
 from vllm import LLM, EngineArgs, SamplingParams
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 
 MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
 
@@ -31,10 +31,7 @@ def test_keye_vl(
     question: str,
 ):
     images = [asset.pil_image for asset in image_assets]
-
-    image_urls = [
-        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
-    ]
+    image_urls = [encode_image_url(image) for image in images]
 
     engine_args = EngineArgs(
         model=MODEL_NAME,
diff --git a/tests/models/multimodal/generation/test_nemotron_parse.py b/tests/models/multimodal/generation/test_nemotron_parse.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b05d336c10ba8193bc8d94322bf6992ad2388fe
--- /dev/null
+++ b/tests/models/multimodal/generation/test_nemotron_parse.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+import pytest
+from transformers import AutoModel
+
+from tests.models.utils import check_logprobs_close
+from vllm.assets.image import ImageAsset
+
+from ....conftest import HfRunner, PromptImageInput, VllmRunner
+from ....utils import create_new_process_for_each_test
+
+IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
+PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
+
+
+def run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    inputs: Sequence[tuple[list[str], PromptImageInput]],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Verify that the inference result is the same between hf and vllm."""
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        max_num_seqs=64,
+        limit_mm_per_prompt={"image": 1},
+        trust_remote_code=True,
+    ) as vllm_model:
+        vllm_outputs_per_case = [
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+            )
+            for prompts, images in inputs
+        ]
+
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
+        hf_outputs_per_case = [
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                use_cache=False,  # HF Nemotron Parse crashes here without this
+            )
+            for prompts, images in inputs
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("num_logprobs", [5])
+@create_new_process_for_each_test("spawn")
+def test_models(
+    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
+) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs=[
+            (
+                [PROMPT] * 10,
+                [IMAGE] * 10,
+            ),
+        ],
+        model=model,
+        dtype=dtype,
+        max_tokens=100,
+        num_logprobs=num_logprobs,
+    )
diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py
index a58caea42fd31ddd65fdd3cad732751d938523d2..f68577b5db92e911817fad39a1b926270cda69c7 100644
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@@ -269,7 +269,7 @@ def run_embedding_input_test(
     """Inference result should be the same between
     original image/video input and image/video embeddings input.
     """
-    from transformers import AutoProcessor  # noqa: F401
+    from transformers import AutoProcessor
 
     processor = AutoProcessor.from_pretrained(model)
 
diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py
index a4e4ce312ddd4bf163d92994181f1d9fd793a56f..8f141746e24992e9a7e3e817ed07b7cfd13f7a3a 100644
--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -14,10 +14,10 @@ import pytest
 from transformers import AutoProcessor
 
 from vllm import LLM, EngineArgs, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from ....utils import create_new_process_for_each_test
 from ...utils import dummy_hf_overrides
@@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
     """Build Dots.OCR specific prompt with OCR instructions."""
     # Use only stop_sign image for Dots.OCR
     image = images[0]  # Already filtered to stop_sign
-
-    image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
+    image_url = encode_image_url(image)
 
     placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
     messages = [
@@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
         config["model_name"], trust_remote_code=True
     )
 
-    image_urls = [
-        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
-    ]
+    image_urls = [encode_image_url(img) for img in images]
     placeholders = [{"type": "image", "image": url} for url in image_urls]
     messages = [
         {
@@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
 
 def build_ovis_prompt(images, config):
     """Build Ovis2.5 specific prompt with custom format."""
-    image_urls = [
-        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
-    ]
+    image_urls = [encode_image_url(img) for img in images]
 
     placeholders = "\n".join(
         f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py
index 0eaef49e2395c09ceaeb4773470c3c1cac3e6ef0..9f8415c0c390caedd54d06ce7d83e605d21927a6 100644
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
 
     assert len(chat_completion.choices) == 1
     choice = chat_completion.choices[0]
+    assert choice.message.content == "In the first audio clip, you hear a brief"
     assert choice.finish_reason == "length"
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index 24fe243977ebb8adbe0cb4f8a616d23e40d753b9..1d18351f50fd08f249209738b926fee4ed9c6007 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -114,7 +114,7 @@ def check_model_available(model: str) -> None:
 @pytest.mark.core_model
 @pytest.mark.cpu_model
 @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo")])
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["half", "float"])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("enforce_eager", [True, False])
 @create_new_process_for_each_test("spawn")
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index b2c62fbd119cc082c5a5e37a4b001540420eb639..acc18021859b5be5220a7ecc2258f1eb18a6d684 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -522,6 +522,183 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     return hf_model
 
 
+def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patch HF runner for Isaac:
+    1) Move processor outputs to model device
+    2) Ensure IsaacModel.forward returns hidden_states
+    for compatibility with hidden_states_to_seq_logprobs()
+    """
+
+    from perceptron.tensorstream import TextType
+    from perceptron.tensorstream.ops import compute_mrope_pos_tensor, modality_mask
+    from transformers.modeling_outputs import BaseModelOutputWithPast
+
+    def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
+        """
+        Create 3D positional indices for token input.
+        """
+        batch_size, seq_length = input_ids.shape
+        position_ids = torch.arange(seq_length, device=input_ids.device)
+        position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+        position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3)  # Add 3D for MRoPE
+        return position_ids
+
+    model_device = next(hf_model.model.parameters()).device
+
+    # ----------------------------
+    # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
+    # ----------------------------
+    original_processor = hf_model.processor
+
+    def patched_processor(*args, **kwargs):
+        result = original_processor(*args, **kwargs)
+        for k, v in result.data.items():
+            result[k] = v.to(model_device)
+        return result
+
+    hf_model.processor = patched_processor
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        hf_model.model_name, trust_remote_code=True
+    )
+
+    original_generate = hf_model.model.generate
+
+    def patched_generate(*args, **kwargs):
+        kwargs["pad_token_id"] = tokenizer.eos_token_id
+        kwargs["eos_token_id"] = tokenizer.eos_token_id
+        return original_generate(*args, **kwargs)
+
+    hf_model.model.generate = patched_generate
+
+    # ----------------------------
+    # 2) Patch IsaacModel.forward: add hidden_states to the output
+    # ----------------------------
+    isaac_model = hf_model.model.model
+
+    def patched_forward(
+        self,
+        input_ids=None,
+        tensor_stream=None,
+        attention_mask=None,
+        position_ids=None,
+        modality_tensor=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_hidden_states=None,
+        return_dict=None,
+        cache_position=None,
+        **kwargs,
+    ):
+        """
+        Forward pass with MRoPE position embeddings.
+        Computes position embeddings once and passes them through all layers.
+        """
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        # Get inputs
+        if tensor_stream is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both tensor_stream and inputs_embeds")
+        elif tensor_stream is not None:
+            # Embed TensorStream directly
+            inputs_embeds = self.embed_stream(tensor_stream)
+            # Create modality tensor if not provided
+            if modality_tensor is None:
+                modality_tensor = modality_mask(tensor_stream)
+        elif input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            # Create text modality tensor if not provided
+            if modality_tensor is None:
+                batch_size, seq_length = input_ids.shape
+                modality_tensor = torch.full(
+                    (batch_size, seq_length),
+                    TextType.text.value,
+                    device=input_ids.device,
+                    dtype=torch.long,
+                )
+        elif inputs_embeds is None:
+            raise ValueError(
+                "You have to specify either tensor_stream, input_ids or inputs_embeds"
+            )
+
+        # Create default position_ids if not provided
+        if position_ids is None:
+            if tensor_stream is not None:
+                position_ids = compute_mrope_pos_tensor(tensor_stream)  # (B,L,3)
+            else:
+                position_ids = compute_position_ids_input_ids(input_ids)
+
+        # Compute MRoPE position embeddings if we have custom rotary_emb
+        cos, sin = self.rotary_emb(position_ids, modality_tensor)
+        cos = cos.to(inputs_embeds.dtype)
+        sin = sin.to(inputs_embeds.dtype)
+
+        # Prepare attention mask
+        if attention_mask is not None:
+            attention_mask = self._update_causal_mask(
+                attention_mask, inputs_embeds, cache_position, past_key_values, False
+            )
+
+        # Initialize and collect hidden states
+        hidden_states = inputs_embeds
+        hidden_states_list: list[torch.Tensor] = []
+
+        if output_hidden_states:
+            hidden_states_list.append(hidden_states)
+
+        for decoder_layer in self.layers:
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=(cos, sin),
+                **kwargs,
+            )
+
+            hidden_states = (
+                layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
+            )
+
+            if output_hidden_states:
+                hidden_states_list.append(hidden_states)
+
+        # Final layer norm
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            hidden_states_list.append(hidden_states)
+
+        # Convert to tuple or None
+        all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None
+
+        # Include hiden_states for compatibility with hidden_states_to_seq_logprobs()
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+        )
+
+    isaac_model.forward = types.MethodType(patched_forward, isaac_model)
+
+    return hf_model
+
+
 def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
 
diff --git a/tests/models/multimodal/pooling/conftest.py b/tests/models/multimodal/pooling/conftest.py
index c5f40cb42ca2a5a115d441cb0ade074433330b90..401bc39b4b1090a5c98ab3dacc1e2894ac21bf6d 100644
--- a/tests/models/multimodal/pooling/conftest.py
+++ b/tests/models/multimodal/pooling/conftest.py
@@ -2,23 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling tests."""
 
-import os
-import warnings
+import pytest
 
 from vllm.platforms import current_platform
 
 
-def pytest_collection_modifyitems(config, items):
-    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
-    if not current_platform.is_rocm():
-        return
+@pytest.fixture
+def siglip_attention_config():
+    """Return attention config for SigLIP tests on ROCm.
 
-    siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
-
-    if siglip_tests:
-        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
-        warnings.warn(
-            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
-            UserWarning,
-            stacklevel=1,
-        )
+    On ROCm, SigLIP tests require FLEX_ATTENTION backend.
+    """
+    if current_platform.is_rocm():
+        return {"backend": "FLEX_ATTENTION"}
+    return None
diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
index 9d860d7a86330e5e363f78f8e1e127189c11e791..60b9af4b864b2923a344cc5d156f635fb71a5440 100644
--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -78,7 +78,9 @@ def run_intern_vit_test(
     ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
-def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
+def test_models(
+    default_vllm_config, dist_init, image_assets, model_id, dtype: str
+) -> None:
     run_intern_vit_test(
         image_assets,
         model_id,
diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py
index d7b33be7a0adbe57c9506ea2cb7548ca9ef8ec58..3d41ba2e5b9b92b4ee00837c75e5f4c52e5fe76a 100644
--- a/tests/models/multimodal/pooling/test_jinavl_reranker.py
+++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py
@@ -1,194 +1,370 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import cast
 
 import pytest
 from transformers import AutoModel
 
-from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam
+from vllm.entrypoints.chat_utils import (
+    ChatCompletionContentPartImageEmbedsParam,
+    ChatCompletionContentPartImageParam,
+    ChatCompletionContentPartTextParam,
+)
 from vllm.entrypoints.score_utils import ScoreMultiModalParam
 
 from ....conftest import HfRunner, VllmRunner
 
-model_name = "jinaai/jina-reranker-m0"
+MODELS = ["jinaai/jina-reranker-m0"]
 
-mm_processor_kwargs = {
+MM_PROCESSOR_KWARGS = {
     "min_pixels": 3136,
     "max_pixels": 602112,
 }
 
-limit_mm_per_prompt = {"image": 2}
+LIMIT_MM_PER_PROMPT = {"image": 2}
 
+CHECKPOINT_TO_HF_MAPPER = {
+    "visual.": "model.visual.",
+    "model.": "model.language_model.",
+}
+
+# Shared long text for test data
+LONG_TEXT_DOC = """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
+web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
+into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
+large language models. The models effectiveness results from two key innovations: (1) a three-stage
+data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
+refining, and critiquing web content extraction; and (2) a unified training framework combining
+continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
+ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
+benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
+lower computational requirements."""  # noqa: E501
+
+# Test data for different scenarios
+TEXT_IMAGE_TEST_DATA = {
+    "query": [{"text": "slm markdown"}],
+    "documents": [
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+        },
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        },
+    ],
+}
+
+TEXT_TEXT_TEST_DATA = {
+    "query": [{"text": "slm markdown"}],
+    "documents": [
+        {"text": LONG_TEXT_DOC},
+        {"text": "数据提取么？为什么不用正则啊,你用正则不就全解决了么?"},
+    ],
+}
+
+IMAGE_TEXT_TEST_DATA = {
+    "query": [
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        }
+    ],
+    "documents": [
+        {"text": LONG_TEXT_DOC},
+        {"text": "数据提取么?为什么不用正则啊,你用正则不就全解决了么?"},
+    ],
+}
+
+IMAGE_IMAGE_TEST_DATA = {
+    "query": [
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        }
+    ],
+    "documents": [
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+        },
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        },
+    ],
+}
 
-def vllm_reranker(
+TEXT_MIXED_DOCS_TEST_DATA = {
+    "query": [{"text": "slm markdown"}],
+    "documents": [
+        {"text": LONG_TEXT_DOC},
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        },
+        {"text": "数据提取么？为什么不用正则啊,你用正则不就全解决了么?"},
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+        },
+    ],
+}
+
+
+def _normalize_image(image_val: str) -> str:
+    """Normalize image value to proper format for HF model."""
+    return (
+        image_val
+        if image_val.startswith(("http://", "https://"))
+        else f"data:image/png;base64,{image_val}"
+    )
+
+
+def create_score_multimodal_param(
+    content_parts: list[dict],
+) -> ScoreMultiModalParam:
+    """
+    Create a ScoreMultiModalParam from a list of content dictionaries.
+
+    Each dict supports the following formats:
+    - Text: {'text': 'content'}
+    - Image URL: {'image': 'https://...'}
+    - Image Base64: {'image': 'base64_str'}
+    """
+    formatted_content = []
+
+    for part in content_parts:
+        if "text" in part:
+            formatted_content.append(
+                ChatCompletionContentPartTextParam(
+                    type="text",
+                    text=part["text"],
+                )
+            )
+        elif "image" in part:
+            image_val = part["image"]
+            if image_val.startswith(("http://", "https://")):
+                formatted_content.append(
+                    ChatCompletionContentPartImageParam(
+                        type="image_url",
+                        image_url={"url": image_val},
+                    )
+                )
+            else:
+                formatted_content.append(
+                    ChatCompletionContentPartImageEmbedsParam(
+                        type="image_embeds", image_embeds=image_val
+                    )
+                )
+
+    return ScoreMultiModalParam(content=formatted_content)
+
+
+def _run_vllm(
     vllm_runner: type[VllmRunner],
-    model_name: str,
+    model: str,
     dtype: str,
-    query_strs: list[str],
-    document_strs: list[str],
-    query_type: str = "text",
-    doc_type: str = "text",
-):
-    def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
-        return {"type": "image_url", "image_url": {"url": f"{url}"}}
-
-    query: list[str] | ScoreMultiModalParam
-    if query_type == "text":
-        query = query_strs
-    elif query_type == "image":
-        query = ScoreMultiModalParam(
-            content=[create_image_param(url) for url in query_strs]
-        )
-
-    documents: list[str] | ScoreMultiModalParam
-    if doc_type == "text":
-        documents = document_strs
-    elif doc_type == "image":
-        documents = ScoreMultiModalParam(
-            content=[create_image_param(url) for url in document_strs]
-        )
+    query_strs: list[dict[str, str]],
+    document_strs: list[dict[str, str]],
+) -> list[float]:
+    """Run vLLM reranker and return scores."""
+    query = create_score_multimodal_param(query_strs)
+    documents = create_score_multimodal_param(document_strs)
 
     with vllm_runner(
-        model_name,
+        model,
         runner="pooling",
         dtype=dtype,
         max_num_seqs=2,
         max_model_len=2048,
-        mm_processor_kwargs=mm_processor_kwargs,
-        limit_mm_per_prompt=limit_mm_per_prompt,
+        mm_processor_kwargs=MM_PROCESSOR_KWARGS,
+        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
     ) as vllm_model:
         outputs = vllm_model.llm.score(query, documents)
 
     return [output.outputs.score for output in outputs]
 
 
-def hf_reranker(
+def _run_hf(
     hf_runner: type[HfRunner],
-    model_name: str,
+    model: str,
     dtype: str,
-    query_strs: list[str],
-    document_strs: list[str],
-    query_type: str = "text",
-    doc_type: str = "text",
-):
-    checkpoint_to_hf_mapper = {
-        "visual.": "model.visual.",
-        "model.": "model.language_model.",
-    }
-
-    data_pairs = [[query_strs[0], d] for d in document_strs]
+    query_strs: list[dict[str, str]],
+    document_strs: list[dict[str, str]],
+) -> list[float]:
+    """Run HuggingFace reranker and return scores."""
+    query = query_strs[0]
+    if "text" in query:
+        query_type = "text"
+        query_data = query["text"]
+    elif "image" in query:
+        query_type = "image"
+        query_data = _normalize_image(query["image"])
+    else:
+        raise ValueError("Unsupported query format")
+
+    # Separate documents by type
+    text_docs: list[str] = []
+    image_docs: list[str] = []
+    text_indices: list[int] = []
+    image_indices: list[int] = []
+
+    for idx, doc in enumerate(document_strs):
+        if "text" in doc:
+            text_docs.append(doc["text"])
+            text_indices.append(idx)
+        elif "image" in doc:
+            image_docs.append(_normalize_image(doc["image"]))
+            image_indices.append(idx)
+        else:
+            raise ValueError(f"Unsupported document format at index {idx}")
+
+    scores: list[None | float] = [None] * len(document_strs)
 
     with hf_runner(
-        model_name,
+        model,
         dtype=dtype,
         trust_remote_code=True,
         auto_cls=AutoModel,
-        model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
+        model_kwargs={"key_mapping": CHECKPOINT_TO_HF_MAPPER},
     ) as hf_model:
-        return hf_model.model.compute_score(
-            data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type
-        )
+        # Score text documents
+        if text_docs:
+            text_scores = hf_model.model.compute_score(
+                [[query_data, d] for d in text_docs],
+                max_length=2048,
+                query_type=query_type,
+                doc_type="text",
+            )
+            for i, s in zip(text_indices, text_scores):
+                scores[i] = s
 
+        # Score image documents
+        if image_docs:
+            image_scores = hf_model.model.compute_score(
+                [[query_data, d] for d in image_docs],
+                max_length=2048,
+                query_type=query_type,
+                doc_type="image",
+            )
+            for i, s in zip(image_indices, image_scores):
+                scores[i] = s
 
-# Visual Documents Reranking
-@pytest.mark.parametrize("model_name", [model_name])
-@pytest.mark.parametrize("dtype", ["half"])
-def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
-    query = ["slm markdown"]
-    documents = [
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
-    ]
-
-    hf_outputs = hf_reranker(
-        hf_runner, model_name, dtype, query, documents, "text", "image"
-    )
-    vllm_outputs = vllm_reranker(
-        vllm_runner, model_name, dtype, query, documents, "text", "image"
-    )
+    assert all(s is not None for s in scores)
+    return cast(list[float], scores)
 
-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
 
+def _run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    model: str,
+    dtype: str,
+    query_strs: list[dict[str, str]],
+    document_strs: list[dict[str, str]],
+) -> None:
+    """Run comparison test between vLLM and HuggingFace implementations."""
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
 
-# Textual Documents Reranking
-@pytest.mark.parametrize("model_name", [model_name])
-@pytest.mark.parametrize("dtype", ["half"])
-def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
-    query = ["slm markdown"]
-    documents = [
-        """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient 
-        web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML 
-        into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding 
-        large language models. The models effectiveness results from two key innovations: (1) a three-stage 
-        data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, 
-        refining, and critiquing web content extraction; and (2) a unified training framework combining 
-        continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that 
-        ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated 
-        benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly 
-        lower computational requirements.""",  # noqa: E501
-        "数据提取么？为什么不用正则啊，你用正则不就全解决了么？",
-    ]
-    hf_outputs = hf_reranker(
-        hf_runner, model_name, dtype, query, documents, "text", "text"
-    )
-    vllm_outputs = vllm_reranker(
-        vllm_runner, model_name, dtype, query, documents, "text", "text"
+    vllm_outputs = _run_vllm(vllm_runner, model, dtype, query_strs, document_strs)
+    hf_outputs = _run_hf(hf_runner, model, dtype, query_strs, document_strs)
+
+    # Compare outputs
+    assert len(hf_outputs) == len(vllm_outputs), (
+        f"Output length mismatch: HF={len(hf_outputs)}, vLLM={len(vllm_outputs)}"
     )
 
-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+    for i, (hf_score, vllm_score) in enumerate(zip(hf_outputs, vllm_outputs)):
+        assert hf_score == pytest.approx(vllm_score, rel=0.02), (
+            f"Score mismatch at index {i}: HF={hf_score}, vLLM={vllm_score}"
+        )
 
 
-# Image Querying for Textual Documents
-@pytest.mark.parametrize("model_name", [model_name])
+@pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
-def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
-    query = [
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-    ]
-    documents = [
-        """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
-        web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
-        into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
-        large language models. The models effectiveness results from two key innovations: (1) a three-stage
-        data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
-        refining, and critiquing web content extraction; and (2) a unified training framework combining
-        continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
-        ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
-        benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
-        lower computational requirements.""",  # noqa: E501
-        "数据提取么？为什么不用正则啊，你用正则不就全解决了么？",
-    ]
-
-    hf_outputs = hf_reranker(
-        hf_runner, model_name, dtype, query, documents, "image", "text"
-    )
-    vllm_outputs = vllm_reranker(
-        vllm_runner, model_name, dtype, query, documents, "image", "text"
+def test_model_text_image(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Visual Documents Reranking"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        TEXT_IMAGE_TEST_DATA["query"],
+        TEXT_IMAGE_TEST_DATA["documents"],
     )
 
-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_text_text(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Textual Documents Reranking"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        TEXT_TEXT_TEST_DATA["query"],
+        TEXT_TEXT_TEST_DATA["documents"],
+    )
 
 
-# Image Querying for Image Documents
-@pytest.mark.parametrize("model_name", [model_name])
+@pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
-def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
-    query = [
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-    ]
-    documents = [
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
-    ]
-
-    hf_outputs = hf_reranker(
-        hf_runner, model_name, dtype, query, documents, "image", "image"
+def test_model_image_text(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Image Querying for Textual Documents"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        IMAGE_TEXT_TEST_DATA["query"],
+        IMAGE_TEXT_TEST_DATA["documents"],
     )
-    vllm_outputs = vllm_reranker(
-        vllm_runner, model_name, dtype, query, documents, "image", "image"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_image_image(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Image Querying for Image Documents"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        IMAGE_IMAGE_TEST_DATA["query"],
+        IMAGE_IMAGE_TEST_DATA["documents"],
     )
 
-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_text_mixed_documents(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Text Query for Mixed Text and Image Documents"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        TEXT_MIXED_DOCS_TEST_DATA["query"],
+        TEXT_MIXED_DOCS_TEST_DATA["documents"],
+    )
diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py
index 1f5baed83fa62238f8286af053ae0e42aa75669d..86b5b1b5d1f930d16098c906ff95c937a9225ef3 100644
--- a/tests/models/multimodal/pooling/test_radio.py
+++ b/tests/models/multimodal/pooling/test_radio.py
@@ -40,15 +40,15 @@ def run_radio_test(
         for image in images
     ]
 
-    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+    hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
 
     # RADIO model on HF does not properly handle torch_dtype argument
     # And relies on args["dtype"] which we have to patch manually:
-    config.args["dtype"] = torch_dtype
+    hf_config.args["dtype"] = torch_dtype
 
     hf_model = AutoModel.from_pretrained(
         model_id,
-        config=config,
+        config=hf_config,
         dtype=torch_dtype,
         trust_remote_code=True,
     ).to("cuda")
@@ -62,13 +62,14 @@ def run_radio_test(
     hf_model.make_preprocessor_external()
 
     hf_outputs_per_image = [
-        hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
+        hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values
     ]
 
-    radio_config = RadioConfig(
-        model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
+    vllm_config = RadioConfig(
+        model_name=hf_config.args["model"],
+        **hf_config.args,
     )
-    vllm_model = RadioModel(radio_config)
+    vllm_model = RadioModel(vllm_config)
     vllm_model.load_weights(hf_model.state_dict())
     vllm_model = vllm_model.to("cuda", torch_dtype)
 
@@ -80,7 +81,8 @@ def run_radio_test(
 
     cos_similar = nn.CosineSimilarity(dim=-1)
     for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
-        assert cos_similar(vllm_output, hf_output).mean() > 0.99
+        assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99
+        assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99
 
 
 @pytest.mark.parametrize(
@@ -90,7 +92,9 @@ def run_radio_test(
     ],
 )
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])
-def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
+def test_radio(
+    default_vllm_config, dist_init, image_assets, model_id, dtype: str
+) -> None:
     run_radio_test(
         image_assets,
         model_id,
diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index 72886cbf7f323d45aeeb097790a32d48bfbf4faa..0b8cd33ccfb9db026c56f7cd8ec843a191aa2b44 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -38,6 +38,7 @@ def _run_test(
     *,
     dtype: str,
     tokenization_kwargs: dict[str, Any] | None = None,
+    attention_config: dict[str, Any] | None = None,
 ) -> None:
     if tokenization_kwargs is None:
         tokenization_kwargs = {}
@@ -49,6 +50,7 @@ def _run_test(
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=attention_config,
     ) as vllm_model:
         vllm_outputs = vllm_model.embed(
             input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
@@ -90,6 +92,7 @@ def test_models_text(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -108,6 +111,7 @@ def test_models_text(
             "padding": "max_length",
             "max_length": 64,
         },  # siglip2 was trained with this padding setting.
+        attention_config=siglip_attention_config,
     )
 
 
@@ -117,6 +121,7 @@ def test_models_image(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -133,6 +138,7 @@ def test_models_image(
         input_images,
         model,
         dtype=dtype,
+        attention_config=siglip_attention_config,
     )
 
 
@@ -141,6 +147,7 @@ def test_models_image(
 def test_models_text_image_no_crash(
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=siglip_attention_config,
     ) as vllm_model:
         with pytest.raises(ValueError, match="not both"):
             vllm_model.embed(texts, images=images)
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 3f4198c503b44ac245be0c2a3f61f05a6a51afca..d31eaeda8547169a99da47f0640dc86f17c1574c 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -86,11 +86,25 @@ def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     return mm_data
 
 
+def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
+    """
+    Patch the multimodal data for GLM-ASR model.
+    GLM-ASR requires text and audio to match 1:1, so we limit audio to 1.
+    """
+    if "audio" in mm_data:
+        audio = mm_data["audio"]
+        if isinstance(audio, list) and len(audio) > 1:
+            # Limit to single audio to match text requirement
+            mm_data["audio"] = [audio[0]]
+    return mm_data
+
+
 # For some multimodal models, tokenizer will always add bos_token
 # at the beginning of prompt by default, causing hf_processor outputs
 # incorrect token ids. So we need use `add_special_tokens=False` here
 # to leave bos_token to be added by the processor.
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "nemotron_parse": False,
     "ovis": False,
     "ovis2_5": False,
     "paligemma": False,
@@ -106,9 +120,11 @@ _IGNORE_MM_KEYS = {
 }
 
 MM_DATA_PATCHES = {
-    # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
+    # Ernie4.5-VL, GLM4.1V and Qwen3-VL requires video metadata
+    "ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
     "glm4v": glm4_1v_patch_mm_data,
     "glm4v_moe": glm4_1v_patch_mm_data,
+    "glmasr": glmasr_patch_mm_data,
     "qwen3_vl": qwen3_vl_patch_mm_data,
     "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }
@@ -212,7 +228,11 @@ def _test_processing_correctness(
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
         model_id = model_id_or_arch
     model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )
 
     model_config = ModelConfig(
         model_id,
@@ -386,6 +406,11 @@ def test_processing_correctness(
         pytest.skip("Fix later")
     if model_id == "jinaai/jina-reranker-m0":
         pytest.skip("Fix later")
+    if model_id in {"Qwen/Qwen-VL", "Qwen/Qwen-VL-Chat"}:
+        pytest.skip(
+            "Qwen-VL tokenizer requires downloading a font file from "
+            "servers that often refuse connections in CI"
+        )
 
     _test_processing_correctness(
         model_id,
diff --git a/tests/models/multimodal/processing/test_gemma3.py b/tests/models/multimodal/processing/test_gemma3.py
index 32a459ee8cdfbc5b19f4d0c57873fc1dd2e6f06f..e252be89413c9cc424fdb3f8d19c6b57aca475cb 100644
--- a/tests/models/multimodal/processing/test_gemma3.py
+++ b/tests/models/multimodal/processing/test_gemma3.py
@@ -2,14 +2,154 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+import torch
 
+from vllm.model_executor.models.gemma3n_audio_utils import (
+    adjust_audio_features_to_expected_length,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 from ....conftest import ImageTestAssets
 from ...utils import build_model_context
 
+# Gemma3 (image) model
+GEMMA3_MODEL_ID = "google/gemma-3-4b-it"
 
-@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
+# Gemma3n (multimodal with audio) model
+GEMMA3N_MODEL_ID = "google/gemma-3n-E2B-it"
+
+# Expected audio tokens for Gemma3n (audio_soft_tokens_per_image)
+GEMMA3N_EXPECTED_AUDIO_TOKENS = 188
+
+
+class TestGemma3nAudioTensorLogic:
+    """CPU-based tests for Gemma3n audio feature tensor manipulation.
+
+    These tests validate the padding/truncation logic in
+    adjust_audio_features_to_expected_length() which fixes the
+    integer overflow in _process_audio_input when audio_seq_len > 188.
+    """
+
+    def test_padding_when_audio_short(self):
+        """Test that short audio is padded to expected length."""
+        batch_size, seq_len, embed_dim = 1, 100, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, seq_len, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        assert result.shape == (batch_size, expected_tokens, embed_dim)
+        assert tokens_truncated == 0
+        # First 100 tokens should be original, rest should be padding (zeros)
+        assert torch.allclose(result[:, :seq_len, :], audio_features)
+        assert torch.allclose(
+            result[:, seq_len:, :],
+            torch.zeros(batch_size, expected_tokens - seq_len, embed_dim),
+        )
+
+    def test_truncation_when_audio_long(self):
+        """Test that long audio is truncated to expected length.
+
+        This is the key test for the overflow fix. Previously, when
+        audio_seq_len > expected_tokens, the code would compute a negative
+        padding value causing: RuntimeError: numel: integer multiplication overflow
+        """
+        batch_size, seq_len, embed_dim = 1, 192, 256  # 192 > 188
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, seq_len, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        assert result.shape == (batch_size, expected_tokens, embed_dim)
+        assert tokens_truncated == seq_len - expected_tokens  # 192 - 188 = 4
+        # Result should be first 188 tokens of original
+        assert torch.allclose(result, audio_features[:, :expected_tokens, :])
+
+    def test_no_change_when_exact_length(self):
+        """Test that exact-length audio passes through unchanged."""
+        batch_size, embed_dim = 1, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, expected_tokens, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        assert result.shape == audio_features.shape
+        assert tokens_truncated == 0
+        assert torch.allclose(result, audio_features)
+
+    def test_original_bug_would_fail(self):
+        """Verify the original buggy implementation would cause overflow.
+
+        The original code always tried to pad, which fails when
+        audio_seq_len > expected_tokens because expand() gets negative size.
+        """
+        batch_size, seq_len, embed_dim = 1, 192, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        # Original buggy logic (always pads, never truncates)
+        extra_padding_tokens = expected_tokens - seq_len  # = -4 (negative!)
+
+        with pytest.raises(RuntimeError):
+            # This should fail with negative size error
+            padding_embs.expand(batch_size, extra_padding_tokens, embed_dim)
+
+    @pytest.mark.parametrize(
+        "seq_len",
+        [50, 100, 150, 187, 188, 189, 192, 200, 300],
+    )
+    def test_various_audio_lengths(self, seq_len: int):
+        """Test padding/truncation with various audio lengths."""
+        batch_size, embed_dim = 1, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, seq_len, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        # Should not raise any errors
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        # Output should always be expected_tokens length
+        assert result.shape == (batch_size, expected_tokens, embed_dim)
+
+        # Verify truncation count is correct
+        if seq_len > expected_tokens:
+            assert tokens_truncated == seq_len - expected_tokens
+        else:
+            assert tokens_truncated == 0
+
+    def test_batch_processing(self):
+        """Test that batch processing works correctly."""
+        batch_size, seq_len, embed_dim = 4, 192, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, seq_len, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        assert result.shape == (batch_size, expected_tokens, embed_dim)
+        assert tokens_truncated == seq_len - expected_tokens
+
+
+@pytest.mark.parametrize("model_id", [GEMMA3_MODEL_ID])
 def test_get_image_size_with_most_features(
     image_assets: ImageTestAssets, model_id: str
 ):
diff --git a/tests/models/multimodal/processing/test_qwen3_omni.py b/tests/models/multimodal/processing/test_qwen3_omni.py
new file mode 100644
index 0000000000000000000000000000000000000000..d66283be4ac687068ab4e4a338cb921ca0daa4b4
--- /dev/null
+++ b/tests/models/multimodal/processing/test_qwen3_omni.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for Qwen3 Omni audio processing and sample rate handling."""
+
+from typing import Any
+
+import numpy as np
+import pytest
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"])
+@pytest.mark.parametrize(
+    ("audio_sample_rate", "audio_duration_sec"),
+    [
+        (16000, 1.0),  # Native Whisper sample rate, 1 second
+        (16000, 2.0),  # Native Whisper sample rate, 2 seconds
+    ],
+)
+def test_processor_with_audio_sample_rate(
+    model_id: str,
+    audio_sample_rate: int,
+    audio_duration_sec: float,
+) -> None:
+    """
+    Test that vLLM's processor generates expected outputs with audio_sample_rate.
+
+    This validates that the processor correctly handles audio_sample_rate
+    passed via hf_processor_mm_kwargs and generates audio tokens.
+    """
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    tokenizer = processor.info.get_tokenizer()
+
+    # Create audio data at the specified sample rate
+    audio_length = int(audio_sample_rate * audio_duration_sec)
+    rng = np.random.RandomState(42)
+    audio_data = rng.rand(audio_length).astype(np.float32)
+
+    # Build prompt with audio placeholder
+    prompt = "<|audio_start|><|audio_pad|><|audio_end|>"
+    mm_data = {"audio": [(audio_data, audio_sample_rate)]}
+
+    # Apply processor with audio_sample_rate in mm_kwargs
+    hf_processor_mm_kwargs: dict[str, Any] = {
+        "audio_sample_rate": audio_sample_rate,
+    }
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+    # Verify audio tokens are generated
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    audio_token_id = tokenizer.convert_tokens_to_ids(hf_processor.audio_token)
+    aud_tok_count = processed_inputs["prompt_token_ids"].count(audio_token_id)
+
+    assert aud_tok_count >= 1, (
+        f"Expected at least 1 audio token but got {aud_tok_count}. "
+        f"sample_rate: {audio_sample_rate}Hz, duration: {audio_duration_sec}s"
+    )
+
+
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"])
+def test_longer_audio_generates_more_tokens(model_id: str) -> None:
+    """
+    Test that longer audio generates more tokens than shorter audio.
+
+    This validates that audio_sample_rate is being used correctly by checking
+    that audio duration affects token count as expected.
+    """
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    tokenizer = processor.info.get_tokenizer()
+
+    audio_sample_rate = 16000
+    rng = np.random.RandomState(42)
+
+    def get_token_count(duration: float) -> int:
+        audio_length = int(audio_sample_rate * duration)
+        audio_data = rng.rand(audio_length).astype(np.float32)
+        prompt = "<|audio_start|><|audio_pad|><|audio_end|>"
+        mm_data = {"audio": [(audio_data, audio_sample_rate)]}
+        hf_processor_mm_kwargs: dict[str, Any] = {
+            "audio_sample_rate": audio_sample_rate,
+        }
+        processed = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+        hf_proc = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+        audio_token_id = tokenizer.convert_tokens_to_ids(hf_proc.audio_token)
+        return processed["prompt_token_ids"].count(audio_token_id)
+
+    short_tokens = get_token_count(1.0)
+    long_tokens = get_token_count(2.0)
+
+    assert long_tokens > short_tokens, (
+        f"Expected longer audio (2s) to have more tokens than shorter (1s). "
+        f"Got short={short_tokens}, long={long_tokens}"
+    )
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index cb875436857cf2606e6f4e5f3d0a010316aebf8d..f047f832b9844b138f1af27cf08a5d819f629a76 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -138,25 +138,25 @@ def create_batched_mm_kwargs(
     )
 
 
-# TODO(Isotr0py): Don't initalize model during test
+# TODO(Isotr0py): Don't initialize model during test
 @contextmanager
 def initialize_dummy_model(
     model_cls: type[nn.Module],
     model_config: ModelConfig,
 ):
     temp_file = tempfile.mkstemp()[1]
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend="nccl",
-    )
-    initialize_model_parallel(tensor_model_parallel_size=1)
-
     current_device = torch.get_default_device()
     vllm_config = VllmConfig(model_config=model_config)
     with set_current_vllm_config(vllm_config=vllm_config):
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            local_rank=0,
+            backend="nccl",
+        )
+        initialize_model_parallel(tensor_model_parallel_size=1)
+
         with set_default_torch_dtype(model_config.dtype):
             torch.set_default_device(current_platform.device_type)
             model = model_cls(vllm_config=vllm_config)
@@ -172,7 +172,11 @@ def initialize_dummy_model(
 def test_model_tensor_schema(model_id: str):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )
 
     model_arch = next(
         arch for arch, info in HF_EXAMPLE_MODELS.hf_models.items() if info == model_info
diff --git a/tests/models/quantization/untest_fp8.py b/tests/models/quantization/untest_fp8.py
index 187e9aea4c460d2d63d318b4fdc3bcbd183da089..4ff9a0dbff80f57ba6e0cb1033c0d5cde5215a8c 100644
--- a/tests/models/quantization/untest_fp8.py
+++ b/tests/models/quantization/untest_fp8.py
@@ -9,7 +9,7 @@ import os
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
-from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_fp8
 from vllm.platforms import current_platform
 from ..utils import check_logprobs_close
 from ...utils import models_path_prefix
@@ -76,7 +76,6 @@ def test_models(
 
     with monkeypatch.context() as m:
         m.setenv("TOKENIZERS_PARALLELISM", "true")
-        m.setenv("VLLM_ATTENTION_BACKEND", backend)
 
         MAX_MODEL_LEN = 1024
         NUM_LOG_PROBS = 8
@@ -87,6 +86,7 @@ def test_models(
             tensor_parallel_size=tensor_parallel_size,
             enforce_eager=enforce_eager,
             kv_cache_dtype="auto",
+            attention_config={"backend": backend},
         ) as vllm_model:
             baseline_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, NUM_LOG_PROBS
@@ -98,6 +98,7 @@ def test_models(
             tensor_parallel_size=tensor_parallel_size,
             enforce_eager=enforce_eager,
             kv_cache_dtype=kv_cache_dtype,
+            attention_config={"backend": backend},
         ) as vllm_model:
             test_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, NUM_LOG_PROBS
diff --git a/tests/models/quantization/untest_gptq_marlin_24.py b/tests/models/quantization/untest_gptq_marlin_24.py
index 824395a2c7a217b794d5980a26ceba7ad724acfa..9a071ae7884c289d3359b0459a42a8c3cb58851e 100644
--- a/tests/models/quantization/untest_gptq_marlin_24.py
+++ b/tests/models/quantization/untest_gptq_marlin_24.py
@@ -65,7 +65,10 @@ def test_models(
     num_logprobs: int,
 ) -> None:
     with vllm_runner(
-        model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
+        model_pair.model_marlin,
+        dtype=dtype,
+        quantization="gptq_marlin_24",
+        allow_deprecated_quantization=True,
     ) as marlin_24_model:
         marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 5a7d458b9365062246b14ee0aff945acc7ceeb0f..5313a9c0d2a83894939c481a1fef56fe909d436b 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -51,9 +51,11 @@ class _HfExamplesInfo:
     The maximum version of HF Transformers that this model runs on.
     """
 
-    transformers_version_reason: str | None = None
+    transformers_version_reason: dict[Literal["vllm", "hf"], str] | None = None
     """
-    The reason for the minimum/maximum version requirement.
+    The type and reason to skip test for the minimum/maximum version requirement.
+    vllm: skip all vLLM tests if the version requirement is not met.
+    hf: only skip tests that uses HF runner if the version requirement is not met.
     """
 
     require_embed_inputs: bool = False
@@ -113,6 +115,7 @@ class _HfExamplesInfo:
         self,
         *,
         on_fail: Literal["error", "skip", "return"],
+        check_version_reason: Literal["vllm", "hf"] = "hf",
         check_min_version: bool = True,
         check_max_version: bool = True,
     ) -> str | None:
@@ -133,23 +136,28 @@ class _HfExamplesInfo:
         msg = f"`transformers=={current_version}` installed, but `transformers"
         # Only check the base version for the min/max version, otherwise preview
         # models cannot be run because `x.yy.0.dev0`<`x.yy.0`
-        if (
-            check_min_version
-            and min_version
-            and Version(cur_base_version) < Version(min_version)
-        ):
+        if min_version and Version(cur_base_version) < Version(min_version):
+            is_version_valid = not check_min_version
             msg += f">={min_version}` is required to run this model."
-        elif (
-            check_max_version
-            and max_version
-            and Version(cur_base_version) > Version(max_version)
-        ):
+        elif max_version and Version(cur_base_version) > Version(max_version):
+            is_version_valid = not check_max_version
             msg += f"<={max_version}` is required to run this model."
         else:
-            return None
+            is_version_valid = True
 
-        if self.transformers_version_reason:
-            msg += f" Reason: {self.transformers_version_reason}"
+        # check if Transformers version breaks the corresponding model runner,
+        # skip test when model runner not compatible
+        is_reason_valid = not (
+            check_version_reason
+            and self.transformers_version_reason
+            and check_version_reason in self.transformers_version_reason
+        )
+        is_transformers_valid = is_version_valid and is_reason_valid
+        if is_transformers_valid:
+            return None
+        elif self.transformers_version_reason:
+            for reason_type, reason in self.transformers_version_reason.items():
+                msg += f" Reason({reason_type}): {reason}"
 
         if on_fail == "error":
             raise RuntimeError(msg)
@@ -219,7 +227,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
         trust_remote_code=True,
     ),
     "CwmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "facebook/cwm"), min_transformers_version="4.58"),
-    "DbrxForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "databricks/dbrx-instruct")),
+    # FIXME: databricks/dbrx-instruct has been deleted
+    "DbrxForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "databricks/dbrx-instruct"), is_available_online=False
+    ),
     "DeciLMForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "nvidia/Llama-3_3-Nemotron-Super-49B-v1"),
         trust_remote_code=True,
@@ -243,6 +254,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
         os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), trust_remote_code=True
     ),
     "Exaone4ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-4.0-32B")),
+    "ExaoneMoEForCausalLM": _HfExamplesInfo(
+        "LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.0.0"
+    ),
     "Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mgleize/fairseq2-dummy-Llama-3.2-1B")),
     "FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-7b")),
     "FalconH1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/Falcon-H1-0.5B-Base")),
@@ -282,6 +296,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Grok1ModelForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "hpcai-tech/grok-1"), trust_remote_code=True
     ),
+    "Grok1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "xai-org/grok-2"), trust_remote_code=True),
     "HunYuanDenseV1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tencent/Hunyuan-7B-Instruct")),
     "HunYuanMoEV1ForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "tencent/Hunyuan-A13B-Instruct"), trust_remote_code=True
@@ -302,6 +317,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Jais2ForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "inceptionai/Jais-2-8B-Chat"), min_transformers_version="4.58"
     ),
+    "IQuestCoderForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "IQuestLab/IQuest-Coder-V1-40B-Instruct"), trust_remote_code=True
+    ),
+    "IQuestLoopCoderForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct"), trust_remote_code=True
+    ),
+    "JAISLMHeadModel": _HfExamplesInfo(os.path.join(models_path_prefix, "inceptionai/jais-13b-chat")),
+    "Jais2ForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "inceptionai/Jais-2-8B-Chat"), min_transformers_version="4.58"
+    ),
     "JambaForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "ai21labs/AI21-Jamba-1.5-Mini"),
         extras={
@@ -348,6 +373,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "MiniCPM3ForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"), trust_remote_code=True
     ),
+    "MiniCPM4ForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "openbmb/MiniCPM4.1-8B"), trust_remote_code=True
+    ),
     "MiniMaxForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01-hf")),
     "MiniMaxText01ForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01"),
@@ -370,7 +398,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
         {"tiny": os.path.join(models_path_prefix, "TitanML/tiny-mixtral")},
     ),
     "MptForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mpt"), is_available_online=False),
-    "MPTForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mosaicml/mpt-7b")),
+    # FIXME: mosaicml/mpt-7b has been deleted
+    "MPTForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mosaicml/mpt-7b"), is_available_online=False),
     "NemotronForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "nvidia/Minitron-8B-Base")),
     "NemotronHForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "nvidia/Nemotron-H-8B-Base-8K"), trust_remote_code=True
@@ -394,6 +423,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "PanguEmbeddedForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "FreedomIntelligence/openPangu-Embedded-7B-V1.1"), trust_remote_code=True
     ),
+    "PanguProMoEV2ForCausalLM": _HfExamplesInfo(
+        "",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
     "PanguUltraMoEForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1"),
         trust_remote_code=True,
@@ -416,7 +450,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "QWenLMHeadModel": _HfExamplesInfo(
         os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"),
         max_transformers_version="4.53",
-        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model uses remote code that is not compatible with latest Transformers"  # noqa: E501
+        },
         trust_remote_code=True,
     ),
     "Qwen2ForCausalLM": _HfExamplesInfo(
@@ -463,6 +499,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     ),
     "Zamba2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Zyphra/Zamba2-7B-instruct")),
     "MiMoForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"), trust_remote_code=True),
+    "MiMoV2FlashForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-V2-Flash"), trust_remote_code=True
+    ),
     "Dots1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "rednote-hilab/dots.llm1.inst")),
 }
 
@@ -484,7 +523,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
         os.path.join(models_path_prefix, "internlm/internlm2-1_8b-reward"), trust_remote_code=True
     ),
     "JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-reward-dev")),
-    "LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "llama", is_available_online=False)),
+    "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
+    "LlamaBidirectionalModel": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "nvidia/llama-nemotron-embed-1b-v2"), trust_remote_code=True
+    ),
     "MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")),
     "ModernBertModel": _HfExamplesInfo(
         os.path.join(models_path_prefix, "Alibaba-NLP/gte-modernbert-base"), trust_remote_code=True
@@ -496,12 +538,16 @@ _EMBEDDING_EXAMPLE_MODELS = {
     "Qwen2ForRewardModel": _HfExamplesInfo(
         os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"),
         max_transformers_version="4.53",
-        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model uses remote code that is not compatible with latest Transformers"  # noqa: E501
+        },
     ),
     "Qwen2ForProcessRewardModel": _HfExamplesInfo(
         os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-PRM-7B"),
         max_transformers_version="4.53",
-        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model uses remote code that is not compatible with latest Transformers"  # noqa: E501
+        },
     ),
     "RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/stsb-roberta-base-v2")),
     "RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/all-roberta-large-v1")),
@@ -551,6 +597,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
         trust_remote_code=True,
         hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
     ),
+    "LlamaBidirectionalForSequenceClassification": _HfExamplesInfo(
+        "nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True
+    ),
     "ModernBertForSequenceClassification": _HfExamplesInfo(
         os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base")
     ),
@@ -581,6 +630,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
         os.path.join(models_path_prefix, "tomaarsen/Qwen3-Reranker-0.6B-seq-cls")
     ),
     "Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
+    "Qwen3VLForSequenceClassification": _HfExamplesInfo(
+        "Qwen/Qwen3-VL-Reranker-2B",
+        is_available_online=False,
+        hf_overrides={
+            "architectures": ["Qwen3VLForSequenceClassification"],
+            "classifier_from_token": ["no", "yes"],
+            "is_original_qwen3_reranker": True,
+        },
+    ),
 }
 
 _MULTIMODAL_EXAMPLE_MODELS = {
@@ -607,7 +665,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         os.path.join(models_path_prefix, "deepseek-ai/deepseek-vl2-tiny"),
         extras={"fork": os.path.join(models_path_prefix, "Isotr0py/deepseek-vl2-tiny")},
         max_transformers_version="4.48",
-        transformers_version_reason="HF model is not compatible.",
+        transformers_version_reason={"hf": "HF model is not compatible."},
         hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
     ),
     "DeepseekOCRForCausalLM": _HfExamplesInfo(
@@ -624,6 +682,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "FuyuForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "adept/fuyu-8b")),
     "Gemma3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "google/gemma-3-4b-it")),
     "Gemma3nForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "google/gemma-3n-E2B-it")),
+    "GlmAsrForConditionalGeneration": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "zai-org/GLM-ASR-Nano-2512"),
+        trust_remote_code=True,
+        min_transformers_version="5.0",
+    ),
     "GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
         os.path.join(models_path_prefix, "ibm-granite/granite-speech-3.3-2b")
     ),
@@ -639,7 +702,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         trust_remote_code=True,
         extras={"2b": os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b")},
         max_transformers_version="4.48",
-        transformers_version_reason="HF model is not compatible.",
+        transformers_version_reason={"hf": "HF model is not compatible."},
     ),
     "HCXVisionForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"),
@@ -653,6 +716,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"),
         extras={"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")},
     ),
+    "IsaacForConditionalGeneration": _HfExamplesInfo(
+        "PerceptronAI/Isaac-0.1",
+        trust_remote_code=True,
+        extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
+    ),
     "InternS1ForConditionalGeneration": _HfExamplesInfo(
         os.path.join(models_path_prefix, "internlm/Intern-S1"), trust_remote_code=True
     ),
@@ -668,6 +736,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         trust_remote_code=True,
     ),
     "InternVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")),
+    "KananaVForConditionalGeneration": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "kakaocorp/kanana-1.5-v-3b-instruct"),
+        trust_remote_code=True,
+    ),
     "KeyeForConditionalGeneration": _HfExamplesInfo(
         os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"),
         trust_remote_code=True,
@@ -681,13 +753,21 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")},
         trust_remote_code=True,
         max_transformers_version="4.53.3",
-        transformers_version_reason="HF model uses deprecated transformers API "
-        "(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
-        "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
+        transformers_version_reason={
+            "hf": (
+                "HF model uses deprecated transformers API "
+                "(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
+                "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
+            )
+        },
     ),
     "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
         os.path.join(models_path_prefix, "lightonai/LightOnOCR-1B-1025")
     ),
+    "Lfm2VlForConditionalGeneration": _HfExamplesInfo(
+        "LiquidAI/LFM2-VL-450M",
+        min_transformers_version="5.0.0",
+    ),
     "Llama4ForConditionalGeneration": _HfExamplesInfo(
         os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
         max_model_len=10240,
@@ -712,7 +792,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "MantisForConditionalGeneration": _HfExamplesInfo(
         os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3"),
         max_transformers_version="4.48",
-        transformers_version_reason="HF model is not compatible.",
+        transformers_version_reason={"hf": "HF model is not compatible."},
         hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
     ),
     "MiDashengLMModel": _HfExamplesInfo(
@@ -739,7 +819,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "MolmoForCausalLM": _HfExamplesInfo(
         os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"),
         max_transformers_version="4.48",
-        transformers_version_reason="Incorrectly-detected `tensorflow` import.",
+        transformers_version_reason={
+            "vllm": "Incorrectly-detected `tensorflow` import from processor."
+        },
         extras={"olmo": os.path.join(models_path_prefix, "allenai/Molmo-7B-O-0924")},
         trust_remote_code=True,
     ),
@@ -758,7 +840,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B"),
         trust_remote_code=True,
         max_transformers_version="4.53",
-        transformers_version_reason="HF model is not compatible",
+        transformers_version_reason={"hf": "HF model is not compatible"},
         extras={
             "1.6-llama": os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Llama3.2-3B"),
             "1.6-gemma": os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B"),
@@ -777,7 +859,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"),
         trust_remote_code=True,
         max_transformers_version="4.48",
-        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model use deprecated imports which have been removed."
+        },  # noqa: E501
         extras={"phi3.5": os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")},
     ),
     "Phi4MMForCausalLM": _HfExamplesInfo(
@@ -796,7 +880,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         extras={"chat": os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat")},
         trust_remote_code=True,
         max_transformers_version="4.53.3",
-        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model uses deprecated imports which have been removed."
+        },  # noqa: E501
         hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
     ),
     "Qwen2AudioForConditionalGeneration": _HfExamplesInfo(
@@ -851,7 +937,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         # disable this temporarily until we support HF format
         is_available_online=False,
     ),
+    "VoxtralStreamingGeneration": _HfExamplesInfo(
+        "<place-holder>",
+        # disable this temporarily until we support HF format
+        is_available_online=False,
+    ),
     # [Encoder-decoder]
+    "NemotronParseForConditionalGeneration": _HfExamplesInfo(
+        "nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True
+    ),
     "WhisperForConditionalGeneration": _HfExamplesInfo(
         os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo"),
         extras={"v3": os.path.join(models_path_prefix, "openai/whisper-large-v3")},
@@ -926,6 +1020,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
         trust_remote_code=True,
         speculative_model=os.path.join(models_path_prefix, "baidu/ERNIE-4.5-21B-A3B-PT"),
     ),
+    "ExaoneMoeMTP": _HfExamplesInfo(
+        "LGAI-EXAONE/K-EXAONE-236B-A23B",
+        speculative_model="LGAI-EXAONE/K-EXAONE-236B-A23B",
+        min_transformers_version="5.0.0",
+    ),
     "Glm4MoeMTPModel": _HfExamplesInfo(
         os.path.join(models_path_prefix, "zai-org/GLM-4.5"),
         speculative_model="zai-org/GLM-4.5",
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 8c4bd6eaa2dd8fba17942428b61a782a135f7850..61e8c601f2ce7166b87e5c530633d38575db5a08 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -66,7 +66,11 @@ def can_initialize(
 
     model_info = EXAMPLE_MODELS.get_hf_info(model_arch)
     model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )
 
     hf_overrides_fn = partial(
         dummy_hf_overrides,
@@ -108,11 +112,12 @@ def can_initialize(
         patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
         monkeypatch.context() as m,
     ):
-        if model_arch == "GptOssForCausalLM":
-            # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
-            # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
-            # L4 supports FA3.
-            m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
+        # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
+        # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
+        # L4 supports FA3.
+        attention_config = (
+            {"backend": "TRITON_ATTN"} if model_arch == "GptOssForCausalLM" else None
+        )
         if model_arch == "WhisperForConditionalGeneration":
             m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
 
@@ -143,6 +148,7 @@ def can_initialize(
             else "vllm",
             hf_overrides=hf_overrides_fn,
             max_num_seqs=model_info.max_num_seqs,
+            attention_config=attention_config,
         )
 
 
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index b93848bcda8a083edde4220ccd96a0bb012bb0a6..9089e91d60966b9e457464393b7f989e2752987c 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -34,7 +34,11 @@ models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_M
 def test_registry_imports(model_arch):
     # Skip if transformers version is incompatible
     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )
     # Ensure all model classes can be imported successfully
     model_cls = ModelRegistry._try_load_model_cls(model_arch)
     assert model_cls is not None
diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py
index 15764145bc1a25db2778c678f99617e143fe1f3a..24b624e269583dd17dac104acb78c2b100fd28c5 100644
--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@@ -38,7 +38,7 @@ def test_inference(
         max_num_seqs=32,
         default_torch_num_threads=1,
     ) as vllm_model:
-        vllm_output = vllm_model.llm.encode(prompt)
+        vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
         assert torch.equal(
             torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
         )
diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py
index 82ba958a58c41dc50c6705fcaa03d6725133cec3..24e49e9d61c816f54d89ce4f79b4560252a89f11 100644
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -21,6 +21,7 @@ from vllm.model_executor.models.vision import (
 from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
 
 pytestmark = pytest.mark.cpu_test
 
@@ -98,7 +99,7 @@ def run_dp_sharded_vision_model_vs_direct(
     """
 
     # Set random seed for reproducibility
-    current_platform.seed_everything(0)
+    set_random_seed(0)
 
     device = f"{current_platform.device_name}:{local_rank}"
     current_platform.set_device(device)
@@ -284,7 +285,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(
     calling the model directly.
     """
     # Set random seed for reproducibility
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     device = f"{current_platform.device_name}:{local_rank}"
     current_platform.set_device(device)
     torch.set_default_device(device)
@@ -408,7 +409,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
 ):
     """Test run_dp_sharded_mrope_vision_model with uneven load distribution."""
     # Set up distributed environment
-    current_platform.seed_everything(123)
+    set_random_seed(123)
     device = f"{current_platform.device_name}:{local_rank}"
     current_platform.set_device(device)
     torch.set_default_device(device)
diff --git a/tests/models/utils.py b/tests/models/utils.py
index d84b4b820533ec703c652ac880dd2fc9b1986956..1b820d284ee5e2522ebed814afc20c0a47a0804d 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -10,7 +10,8 @@ import torch
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
-from vllm.config.model import ModelConfig, ModelDType, RunnerOption
+from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
+from vllm.config.pooler import SequencePoolingType, TokenPoolingType
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
 from vllm.tokenizers import cached_tokenizer_from_config
@@ -292,7 +293,11 @@ def build_model_context(
     """
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )
 
     model_config_kwargs = model_config_kwargs or {}
     limit_mm_per_prompt = limit_mm_per_prompt or {}
@@ -375,7 +380,11 @@ class ModelInfo:
     max_model_len: int | None = None
     hf_dtype: str = "float32"
     hf_overrides: dict[str, Any] | None = None
-    default_pooling_type: str = ""
+    seq_pooling_type: SequencePoolingType | None = None
+    tok_pooling_type: TokenPoolingType | None = None
+    attn_type: AttnTypeStr | None = None
+    is_prefix_caching_supported: bool | None = None
+    is_chunked_prefill_supported: bool | None = None
     enable_test: bool = True
 
 
@@ -386,29 +395,10 @@ class EmbedModelInfo(ModelInfo):
     matryoshka_dimensions: list[int] | None = None
 
 
-@dataclass
-class CLSPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "LAST"
-
-
 @dataclass
 class RerankModelInfo(ModelInfo):
     mteb_score: float | None = None
-
-
-@dataclass
-class CLSPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "LAST"
+    chat_template_name: str | None = None
 
 
 @dataclass
@@ -483,12 +473,16 @@ def dummy_hf_overrides(
         "num_kv_shared_layers": 1,
     }
 
+    _hf_config = hf_config
+
     class DummyConfig:
+        hf_config = _hf_config
         hf_text_config = text_config
 
+    model_arch_config = ModelConfig.get_model_arch_config(DummyConfig)
     # Only set MoE related config when the model has MoE layers.
     # Otherwise all models detected as MoE by _get_transformers_backend_cls.
-    if ModelConfig.get_num_experts(DummyConfig) > 0:
+    if model_arch_config.num_experts > 0:
         update_dict.update(
             {
                 "num_experts": num_experts,
diff --git a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py
index 189b319e5fcdee02cc0a6bbe3c8edf27fe624b39..46545498c83fb72cf76b0d9128275e79f8c18117 100644
--- a/tests/multimodal/test_audio.py
+++ b/tests/multimodal/test_audio.py
@@ -7,10 +7,16 @@ from unittest.mock import patch
 
 import numpy as np
 import pytest
+import torch
 
 from vllm.multimodal.audio import (
+    MONO_AUDIO_SPEC,
+    PASSTHROUGH_AUDIO_SPEC,
     AudioMediaIO,
     AudioResampler,
+    AudioSpec,
+    ChannelReduction,
+    normalize_audio,
     resample_audio_librosa,
     resample_audio_scipy,
 )
@@ -137,3 +143,500 @@ def test_audio_media_io_encode_base64(dummy_audio):
         decoded = base64.b64decode(out)
         assert decoded == b"dummy_wav_data"
         mock_write.assert_called_once()
+
+
+# ============================================================
+# Tests for normalize_audio function
+# ============================================================
+
+
+class TestNormalizeAudio:
+    """Tests for normalize_audio function with different specs."""
+
+    def test_passthrough_preserves_audio(self):
+        """Passthrough spec should not modify audio."""
+        stereo = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
+        result = normalize_audio(stereo, PASSTHROUGH_AUDIO_SPEC)
+        np.testing.assert_array_equal(result, stereo)
+
+    def test_mono_spec_with_numpy_stereo(self):
+        """Mono spec should reduce stereo numpy array to 1D."""
+        stereo = np.array([[1.0, 2.0], [-1.0, 0.0]], dtype=np.float32)
+        result = normalize_audio(stereo, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        np.testing.assert_array_almost_equal(result, [0.0, 1.0])
+
+    def test_mono_spec_with_torch_stereo(self):
+        """Mono spec should reduce stereo torch tensor to 1D."""
+        stereo = torch.tensor([[1.0, 2.0], [-1.0, 0.0]])
+        result = normalize_audio(stereo, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        torch.testing.assert_close(result, torch.tensor([0.0, 1.0]))
+
+    def test_mono_passthrough_for_1d_numpy(self):
+        """1D numpy array should pass through unchanged with mono spec."""
+        mono = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        result = normalize_audio(mono, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        np.testing.assert_array_equal(result, mono)
+
+    def test_mono_passthrough_for_1d_torch(self):
+        """1D torch tensor should pass through unchanged with mono spec."""
+        mono = torch.tensor([1.0, 2.0, 3.0])
+        result = normalize_audio(mono, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        torch.testing.assert_close(result, mono)
+
+    def test_first_channel_reduction(self):
+        """FIRST reduction should take only the first channel."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.FIRST)
+        stereo = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [1.0, 2.0])
+
+    def test_max_channel_reduction(self):
+        """MAX reduction should take max across channels."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.MAX)
+        stereo = np.array([[1.0, 4.0], [3.0, 2.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [3.0, 4.0])
+
+    def test_sum_channel_reduction(self):
+        """SUM reduction should sum across channels."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.SUM)
+        stereo = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [4.0, 6.0])
+
+    def test_invalid_3d_array_raises(self):
+        """3D arrays should raise ValueError."""
+        audio_3d = np.random.randn(2, 3, 4).astype(np.float32)
+        with pytest.raises(ValueError, match="Unsupported audio"):
+            normalize_audio(audio_3d, MONO_AUDIO_SPEC)
+
+    def test_channel_expansion_raises(self):
+        """Expanding from mono to stereo should raise ValueError."""
+        mono = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        spec = AudioSpec(target_channels=2)
+        with pytest.raises(ValueError, match="Cannot expand"):
+            normalize_audio(mono, spec)
+
+    def test_time_channels_format_numpy(self):
+        """Audio in (time, channels) format should be transposed to (channels, time).
+
+        This handles the case where audio loaders like soundfile return
+        (time, channels) format instead of (channels, time) like torchaudio.
+        """
+        # Create audio in (time, channels) format: 1000 samples, 2 channels
+        audio_time_channels = np.array(
+            [[1.0, -1.0]] * 1000,  # 1000 time steps, 2 channels
+            dtype=np.float32,
+        )
+        assert audio_time_channels.shape == (1000, 2)  # (time, channels)
+
+        result = normalize_audio(audio_time_channels, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        np.testing.assert_array_almost_equal(result, np.zeros(1000))
+
+    def test_time_channels_format_torch(self):
+        """Torch tensor in (time, channels) format should be transposed."""
+        # Create audio in (time, channels) format: 1000 samples, 2 channels
+        audio_time_channels = torch.tensor(
+            [[1.0, -1.0]] * 1000,  # 1000 time steps, 2 channels
+        )
+        assert audio_time_channels.shape == (1000, 2)  # (time, channels)
+
+        result = normalize_audio(audio_time_channels, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        torch.testing.assert_close(result, torch.zeros(1000))
+
+    def test_channels_time_format_preserved(self):
+        """Audio already in (channels, time) format should work correctly."""
+        # Create audio in standard (channels, time) format: 2 channels, 1000 samples
+        audio_channels_time = np.array(
+            [[1.0] * 1000, [-1.0] * 1000],  # 2 channels, 1000 time steps
+            dtype=np.float32,
+        )
+        assert audio_channels_time.shape == (2, 1000)  # (channels, time)
+
+        result = normalize_audio(audio_channels_time, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        np.testing.assert_array_almost_equal(result, np.zeros(1000))
+
+    def test_ambiguous_square_audio_numpy(self):
+        """Square audio arrays (N, N) should use shape[0] > shape[1] heuristic.
+
+        For a square array, shape[0] == shape[1], so no transpose happens
+        and we assume (channels, time) format.
+        """
+        # Create square audio: 4 channels, 4 samples
+        audio_square = np.array(
+            [
+                [1.0, 2.0, 3.0, 4.0],
+                [5.0, 6.0, 7.0, 8.0],
+                [9.0, 10.0, 11.0, 12.0],
+                [13.0, 14.0, 15.0, 16.0],
+            ],
+            dtype=np.float32,
+        )
+        assert audio_square.shape == (4, 4)
+
+        result = normalize_audio(audio_square, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D with mean across channels (axis 0)
+        assert result.ndim == 1
+        assert result.shape == (4,)
+        # Mean across 4 channels: [1+5+9+13, 2+6+10+14, ...] / 4
+        expected = np.array([7.0, 8.0, 9.0, 10.0])
+        np.testing.assert_array_almost_equal(result, expected)
+
+
+# ============================================================
+# Tests for MultiModalDataParser integration with target_channels
+# ============================================================
+
+
+class TestMultiModalDataParserChannelNormalization:
+    """Tests for MultiModalDataParser.target_channels integration.
+
+    These tests verify that the target_channels parameter is properly used
+    in the _parse_audio_data method to normalize audio channels.
+    """
+
+    def test_parser_normalizes_stereo_to_mono(self):
+        """Parser should normalize stereo to mono when target_channels=1."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with mono normalization enabled
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Create stereo audio (simulating torchaudio output)
+        stereo_audio = np.array(
+            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],  # 2 channels, 3 samples
+            dtype=np.float32,
+        )
+
+        # Parse audio data
+        result = parser._parse_audio_data((stereo_audio, 16000))
+
+        # Check that result is mono (1D)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 1, f"Expected 1D mono audio, got {audio_item.ndim}D"
+        assert audio_item.shape == (3,), f"Expected shape (3,), got {audio_item.shape}"
+        # Channel average of [1, 1, 1] and [-1, -1, -1] should be [0, 0, 0]
+        np.testing.assert_array_almost_equal(audio_item, np.zeros(3))
+
+    def test_parser_preserves_stereo_when_target_channels_none(self):
+        """Parser should preserve stereo when target_channels=None."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser without channel normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=None,
+        )
+
+        # Create stereo audio
+        stereo_audio = np.array(
+            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],
+            dtype=np.float32,
+        )
+
+        # Parse audio data
+        result = parser._parse_audio_data((stereo_audio, 16000))
+
+        # Check that result preserves original shape (after resampling)
+        audio_item = result.get(0)
+        # When target_channels=None, stereo audio should be preserved
+        assert audio_item.ndim == 2, f"Expected 2D stereo audio, got {audio_item.ndim}D"
+
+    def test_parser_mono_passthrough_when_target_channels_1(self):
+        """Parser should pass through mono audio unchanged when target_channels=1."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with mono normalization enabled
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Create mono audio (already 1D)
+        mono_audio = np.random.randn(16000).astype(np.float32)
+
+        # Parse audio data
+        result = parser._parse_audio_data((mono_audio, 16000))
+
+        # Check that result is still mono (1D)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 1
+        assert audio_item.shape == (16000,)
+
+    def test_parser_with_target_channels_2(self):
+        """Parser should reduce 6-channel to 2-channel when target_channels=2."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with stereo target
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=2,
+        )
+
+        # Create 6-channel audio (5.1 surround)
+        surround_audio = np.random.randn(6, 1000).astype(np.float32)
+
+        # Parse audio data
+        result = parser._parse_audio_data((surround_audio, 16000))
+
+        # Check that result is stereo (2 channels)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 2
+        assert audio_item.shape[0] == 2  # 2 channels
+
+
+# ============================================================
+# End-to-End Audio Pipeline Tests
+# ============================================================
+
+
+class TestAudioPipelineE2E:
+    """End-to-end tests for audio normalization in the full pipeline.
+
+    These tests verify the complete flow from raw audio input through
+    the MultiModalDataParser, simulating different audio loader formats.
+    """
+
+    def test_stereo_audio_normalized_to_mono_e2e(self):
+        """Full pipeline: stereo audio (torchaudio format) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate torchaudio output: (channels, time) format
+        # Stereo audio with left channel = 1.0, right channel = -1.0
+        stereo_torchaudio = np.array(
+            [[1.0] * 16000, [-1.0] * 16000],  # 2 channels, 1 second at 16kHz
+            dtype=np.float32,
+        )
+        assert stereo_torchaudio.shape == (2, 16000)
+
+        # Create parser with mono normalization (like Whisper models)
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_torchaudio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
+        assert audio_output.shape == (16000,)
+
+        # Verify channel averaging: mean of [1.0, -1.0] = 0.0
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
+
+    def test_soundfile_format_normalized_to_mono_e2e(self):
+        """Full pipeline: soundfile format (time, channels) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate soundfile output: (time, channels) format
+        # 16000 samples, 2 channels
+        stereo_soundfile = np.array(
+            [[0.5, -0.5]] * 16000,  # Each row is [left, right]
+            dtype=np.float32,
+        )
+        assert stereo_soundfile.shape == (16000, 2)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_soundfile, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
+        assert audio_output.shape == (16000,)
+
+        # Verify channel averaging: mean of [0.5, -0.5] = 0.0
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
+
+    def test_librosa_mono_passthrough_e2e(self):
+        """Full pipeline: librosa mono format → preserved as mono."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate librosa output: already mono (time,) format
+        mono_librosa = np.random.randn(16000).astype(np.float32)
+        assert mono_librosa.shape == (16000,)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((mono_librosa, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is still mono 1D
+        assert audio_output.ndim == 1
+        assert audio_output.shape == (16000,)
+
+        # Verify audio content is preserved
+        np.testing.assert_array_almost_equal(audio_output, mono_librosa)
+
+    def test_multichannel_5_1_surround_to_mono_e2e(self):
+        """Full pipeline: 5.1 surround (6 channels) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate 5.1 surround audio: 6 channels
+        surround_audio = np.array(
+            [
+                [1.0] * 8000,  # Front Left
+                [2.0] * 8000,  # Front Right
+                [3.0] * 8000,  # Center
+                [4.0] * 8000,  # LFE (subwoofer)
+                [5.0] * 8000,  # Rear Left
+                [6.0] * 8000,  # Rear Right
+            ],
+            dtype=np.float32,
+        )
+        assert surround_audio.shape == (6, 8000)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((surround_audio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1
+
+        # Verify channel averaging: mean of [1,2,3,4,5,6] = 3.5
+        expected_value = (1.0 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0) / 6
+        np.testing.assert_array_almost_equal(
+            audio_output, np.full(8000, expected_value), decimal=5
+        )
+
+    def test_torch_tensor_input_e2e(self):
+        """Full pipeline: torch.Tensor stereo input → mono numpy output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate torch tensor input (from torchaudio)
+        stereo_torch = torch.tensor(
+            [[1.0] * 8000, [-1.0] * 8000],  # 2 channels
+            dtype=torch.float32,
+        )
+        assert stereo_torch.shape == (2, 8000)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        # Note: Parser expects numpy, so we convert first (simulating real usage)
+        result = parser._parse_audio_data((stereo_torch.numpy(), 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D numpy array
+        assert audio_output.ndim == 1
+        assert isinstance(audio_output, np.ndarray)
+
+        # Verify channel averaging
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(8000), decimal=5)
+
+    def test_passthrough_preserves_stereo_e2e(self):
+        """Full pipeline: stereo with target_channels=None → stereo preserved."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Stereo audio
+        stereo_audio = np.array(
+            [[1.0] * 8000, [-1.0] * 8000],
+            dtype=np.float32,
+        )
+
+        # Create parser WITHOUT mono normalization (passthrough)
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=None,  # Passthrough - no normalization
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_audio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output preserves stereo (2D)
+        assert audio_output.ndim == 2
+        assert audio_output.shape == (2, 8000)
+
+    def test_resampling_with_channel_normalization_e2e(self):
+        """Full pipeline: resample + channel normalize in single pass."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Stereo audio at 48kHz (common recording rate)
+        stereo_48k = np.array(
+            [[1.0] * 48000, [-1.0] * 48000],  # 1 second at 48kHz
+            dtype=np.float32,
+        )
+
+        # Create parser with both resampling and mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,  # Resample to 16kHz
+            target_channels=1,  # Normalize to mono
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_48k, 48000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D at target sample rate
+        assert audio_output.ndim == 1
+        # After resampling from 48kHz to 16kHz, length should be ~16000
+        assert audio_output.shape[0] == 16000
+
+    def test_very_short_audio_e2e(self):
+        """Full pipeline: very short audio (< 1 frame) handled correctly."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Very short stereo audio (10 samples)
+        short_stereo = np.array(
+            [[1.0] * 10, [-1.0] * 10],
+            dtype=np.float32,
+        )
+
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        result = parser._parse_audio_data((short_stereo, 16000))
+        audio_output = result.get(0)
+
+        # Should still produce mono output
+        assert audio_output.ndim == 1
+        assert audio_output.shape == (10,)
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(10))
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index e641b1111abafcf4d2c46b8a4b57efb1a893e157..0a8d4f7373f4006e88b9c40eedca667c13f265db 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -24,10 +24,12 @@ from vllm.multimodal.cache import (
 )
 from vllm.multimodal.hasher import MultiModalHasher
 from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
     MultiModalFieldElem,
     MultiModalKwargsItem,
     MultiModalKwargsItems,
     MultiModalSharedField,
+    PlaceholderRange,
 )
 from vllm.multimodal.processing import PromptInsertion
 from vllm.utils.mem_constants import GiB_bytes, MiB_bytes
@@ -518,3 +520,40 @@ def test_cache_eviction_shm_cache():
     receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())
 
     _run_test_cache_eviction_shm(sender_cache, receiver_cache, base_item_size=MiB_bytes)
+
+
+def test_processor_cache_shared_across_loras():
+    """Test that processor cache uses mm_hash to share data across LoRAs."""
+    model_config = ModelConfig(
+        model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+        mm_processor_cache_gb=1,
+    )
+    receiver_cache = MultiModalReceiverCache(model_config)
+
+    base_mm_hash = "image_hash_abc123"
+    lora_a_identifier = f"12345:{base_mm_hash}"
+    lora_b_identifier = f"67890:{base_mm_hash}"
+
+    item_data = MultiModalKwargsItem.dummy("test_image", nbytes=1024)
+
+    feature_lora_a = MultiModalFeatureSpec(
+        data=item_data,
+        modality="image",
+        identifier=lora_a_identifier,
+        mm_position=PlaceholderRange(offset=0, length=100),
+        mm_hash=base_mm_hash,
+    )
+
+    receiver_cache.get_and_update_features([feature_lora_a])
+    assert base_mm_hash in receiver_cache._cache
+
+    feature_lora_b = MultiModalFeatureSpec(
+        data=None,
+        modality="image",
+        identifier=lora_b_identifier,
+        mm_position=PlaceholderRange(offset=0, length=100),
+        mm_hash=base_mm_hash,
+    )
+
+    receiver_cache.get_and_update_features([feature_lora_b])
+    assert feature_lora_b.data == item_data
diff --git a/tests/multimodal/test_embedding_shape_validation_unit.py b/tests/multimodal/test_embedding_shape_validation_unit.py
new file mode 100644
index 0000000000000000000000000000000000000000..7966aad4e988cdd26f1226d981e5039603d15ed2
--- /dev/null
+++ b/tests/multimodal/test_embedding_shape_validation_unit.py
@@ -0,0 +1,249 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for embedding shape validation.
+
+Simple, fast unit tests that can run without server fixtures.
+Run with: pytest tests/multimodal/test_embedding_shape_validation_unit.py -v
+"""
+
+import pytest
+import torch
+
+from vllm.multimodal.parse import (
+    AudioEmbeddingItems,
+    ImageEmbeddingItems,
+)
+
+
+class TestImageEmbedBasicValidation:
+    """Test basic ndim validation in image embeddings via ImageEmbeddingItems."""
+
+    def test_valid_2d_tensor_accepted(self):
+        """Baseline: 2D tensors should be accepted."""
+        valid_tensor = torch.randn(10, 768, dtype=torch.float32)
+
+        # Should not raise - 2D is valid
+        items = ImageEmbeddingItems(valid_tensor)
+        assert items.get_count() == 10
+
+    def test_valid_3d_tensor_accepted(self):
+        """Baseline: 3D tensors should be accepted."""
+        valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32)
+
+        # Should not raise - 3D is valid
+        items = ImageEmbeddingItems(valid_tensor)
+        assert items.get_count() == 2
+
+    def test_valid_list_of_2d_tensors_accepted(self):
+        """Baseline: List of 2D tensors should be accepted."""
+        tensors = [
+            torch.randn(10, 768, dtype=torch.float32),
+            torch.randn(15, 768, dtype=torch.float32),
+        ]
+
+        # Should not raise
+        items = ImageEmbeddingItems(tensors)
+        assert items.get_count() == 2
+
+    def test_1d_tensor_rejected(self):
+        """Security: 1D tensors should be rejected (invalid ndim)."""
+        invalid_tensor = torch.randn(768, dtype=torch.float32)  # 1D
+
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(invalid_tensor)
+
+        assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
+
+    def test_4d_tensor_rejected(self):
+        """Security: 4D tensors should be rejected (invalid ndim)."""
+        invalid_tensor = torch.randn(1, 2, 10, 768, dtype=torch.float32)  # 4D
+
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(invalid_tensor)
+
+        assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
+
+    def test_hidden_size_validation_correct_size(self):
+        """Embeddings with correct hidden size should be accepted."""
+        expected_hidden_size = 768
+        valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32)
+
+        # Should not raise
+        items = ImageEmbeddingItems(
+            valid_tensor, expected_hidden_size=expected_hidden_size
+        )
+        assert items.get_count() == 10
+
+    def test_hidden_size_validation_wrong_size_rejected(self):
+        """Embeddings with wrong hidden size should be rejected."""
+        expected_hidden_size = 768
+        wrong_hidden_size = 4096
+        invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32)
+
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(
+                invalid_tensor, expected_hidden_size=expected_hidden_size
+            )
+
+        error_msg = str(exc_info.value)
+        assert "hidden dimension mismatch" in error_msg.lower()
+        assert str(wrong_hidden_size) in error_msg
+        assert str(expected_hidden_size) in error_msg
+
+
+class TestAudioEmbedBasicValidation:
+    """Test basic ndim validation in audio embeddings via AudioEmbeddingItems."""
+
+    def test_valid_2d_tensor_accepted(self):
+        """Baseline: 2D tensors should be accepted."""
+        valid_tensor = torch.randn(10, 768, dtype=torch.float32)
+
+        # Should not raise - 2D is valid
+        items = AudioEmbeddingItems(valid_tensor)
+        assert items.get_count() == 10
+
+    def test_valid_3d_tensor_accepted(self):
+        """Baseline: 3D tensors should be accepted."""
+        valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32)
+
+        # Should not raise - 3D is valid
+        items = AudioEmbeddingItems(valid_tensor)
+        assert items.get_count() == 2
+
+    def test_valid_list_of_2d_tensors_accepted(self):
+        """Baseline: List of 2D tensors should be accepted."""
+        tensors = [
+            torch.randn(10, 768, dtype=torch.float32),
+            torch.randn(15, 768, dtype=torch.float32),
+        ]
+
+        # Should not raise
+        items = AudioEmbeddingItems(tensors)
+        assert items.get_count() == 2
+
+    def test_1d_tensor_rejected(self):
+        """Security: 1D tensors should be rejected (invalid ndim)."""
+        invalid_tensor = torch.randn(768, dtype=torch.float32)  # 1D
+
+        with pytest.raises(ValueError) as exc_info:
+            AudioEmbeddingItems(invalid_tensor)
+
+        assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
+
+    def test_scalar_rejected(self):
+        """Security: Scalar tensors should be rejected."""
+        invalid_tensor = torch.tensor(1.0)  # 0D (scalar)
+
+        with pytest.raises(ValueError):
+            AudioEmbeddingItems(invalid_tensor)
+
+    def test_hidden_size_validation_correct_size(self):
+        """Embeddings with correct hidden size should be accepted."""
+        expected_hidden_size = 768
+        valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32)
+
+        # Should not raise
+        items = AudioEmbeddingItems(
+            valid_tensor, expected_hidden_size=expected_hidden_size
+        )
+        assert items.get_count() == 10
+
+    def test_hidden_size_validation_wrong_size_rejected(self):
+        """Embeddings with wrong hidden size should be rejected."""
+        expected_hidden_size = 768
+        wrong_hidden_size = 4096
+        invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32)
+
+        with pytest.raises(ValueError) as exc_info:
+            AudioEmbeddingItems(
+                invalid_tensor, expected_hidden_size=expected_hidden_size
+            )
+
+        error_msg = str(exc_info.value)
+        assert "hidden dimension mismatch" in error_msg.lower()
+        assert str(wrong_hidden_size) in error_msg
+        assert str(expected_hidden_size) in error_msg
+
+
+class TestShapeValidationDoSPrevention:
+    """
+    Tests for DoS prevention through shape validation.
+
+    Verifies that embeddings with incorrect shapes are rejected early,
+    preventing crashes during model inference.
+    """
+
+    def test_prevent_crash_from_wrong_shape_image_embeds(self):
+        """
+        Prevent crash scenario: wrong hidden size in image embeddings.
+
+        Without validation, this would pass initial checks but crash later
+        during model forward pass when dimensions don't match.
+        """
+        expected_hidden_size = 768  # Typical model hidden size
+        wrong_hidden_size = 4096  # Wrong size (e.g., Llama-sized)
+
+        wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)
+
+        # Should be rejected at instantiation time, not during inference
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(
+                wrong_embedding, expected_hidden_size=expected_hidden_size
+            )
+
+        error_msg = str(exc_info.value)
+        assert "hidden dimension mismatch" in error_msg.lower()
+        assert str(expected_hidden_size) in error_msg  # Expected
+        assert str(wrong_hidden_size) in error_msg  # Received
+
+    def test_prevent_crash_from_wrong_shape_audio_embeds(self):
+        """
+        Prevent crash scenario: wrong hidden size in audio embeddings.
+        """
+        expected_hidden_size = 768
+        wrong_hidden_size = 4096
+
+        wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)
+
+        with pytest.raises(ValueError) as exc_info:
+            AudioEmbeddingItems(
+                wrong_embedding, expected_hidden_size=expected_hidden_size
+            )
+
+        error_msg = str(exc_info.value)
+        assert "hidden dimension mismatch" in error_msg.lower()
+
+    def test_extremely_large_hidden_size_rejected(self):
+        """Security: Prevent DoS from extremely large embeddings."""
+        expected_hidden_size = 768
+        huge_hidden_size = 100000  # Large but not extreme to avoid test OOM
+
+        invalid_tensor = torch.randn(10, huge_hidden_size, dtype=torch.float32)
+
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(
+                invalid_tensor, expected_hidden_size=expected_hidden_size
+            )
+
+        assert "hidden dimension mismatch" in str(exc_info.value).lower()
+
+    def test_batch_with_mixed_hidden_sizes_rejected(self):
+        """All embeddings in a list must have the same hidden size."""
+        expected_hidden_size = 768
+
+        # One correct, one wrong
+        batch = [
+            torch.randn(10, expected_hidden_size, dtype=torch.float32),
+            torch.randn(10, expected_hidden_size + 100, dtype=torch.float32),  # Wrong!
+        ]
+
+        # Should fail on the second one
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(batch, expected_hidden_size=expected_hidden_size)
+
+        assert "hidden dimension mismatch" in str(exc_info.value).lower()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])
diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py
index 329a5b0494cb66ee3d91c9a915ed3dd32a3c15fb..54922594d71d7b9ffcb42309fdbce0c9d5d48fc0 100644
--- a/tests/multimodal/test_image.py
+++ b/tests/multimodal/test_image.py
@@ -1,11 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pickle
 from pathlib import Path
 
 import numpy as np
 import pytest
 from PIL import Image, ImageChops
 
+from vllm.multimodal.base import MediaWithBytes
 from vllm.multimodal.image import ImageMediaIO, convert_image_mode
 
 pytestmark = pytest.mark.cpu_test
@@ -157,3 +159,34 @@ def test_rgba_background_color_validation():
     ImageMediaIO(rgba_background_color=(0, 0, 0))  # Should not raise
     ImageMediaIO(rgba_background_color=[255, 255, 255])  # Should not raise
     ImageMediaIO(rgba_background_color=(128, 128, 128))  # Should not raise
+
+
+def test_media_with_bytes_pickle_roundtrip():
+    """Regression test for pickle/unpickle of MediaWithBytes.
+
+    Verifies that MediaWithBytes can be pickled and unpickled without
+    RecursionError. See: https://github.com/vllm-project/vllm/issues/30818
+    """
+    original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
+    original_bytes = b"test_bytes_data"
+
+    wrapper = MediaWithBytes(media=original_image, original_bytes=original_bytes)
+
+    # Verify attribute delegation works before pickling
+    assert wrapper.width == original_image.width
+    assert wrapper.height == original_image.height
+    assert wrapper.mode == original_image.mode
+
+    # Pickle and unpickle (this would cause RecursionError before the fix)
+    pickled = pickle.dumps(wrapper)
+    unpickled = pickle.loads(pickled)
+
+    # Verify the unpickled object works correctly
+    assert unpickled.original_bytes == original_bytes
+    assert unpickled.media.width == original_image.width
+    assert unpickled.media.height == original_image.height
+
+    # Verify attribute delegation works after unpickling
+    assert unpickled.width == original_image.width
+    assert unpickled.height == original_image.height
+    assert unpickled.mode == original_image.mode
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 262ea42e4d0faed8e4f31ec37391410008226511..64bb88960e86c749ec9e7c6177035f5d69dad71d 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1021,9 +1021,8 @@ def test_hf_processor_init_kwargs(
         DummyProcessor,  # type: ignore[arg-type]
         **inference_kwargs,
     )
-
-    for k, v in expected_kwargs.items():
-        assert getattr(processor, k) == v
+    assert processor.a == expected_kwargs["a"]
+    assert processor.b == expected_kwargs["b"]
 
 
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index eccaa53ea1004231f8e4f27799fb74185381fac3..5f63e115509ce1f115af25c2e64b4f1340332eee 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -299,3 +299,212 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
         frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
         np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
         assert metadata_missing["video_backend"] == "test_video_backend_override_2"
+
+
+# ============================================================================
+# Frame Recovery Tests
+# ============================================================================
+
+
+def test_video_recovery_simulated_failures(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that frame recovery correctly uses the next valid frame when
+    target frames fail to load.
+
+    Uses corrupted.mp4 and mocks VideoCapture.grab() to fail on specific
+    frame indices (in addition to the real corruption at frame 17), then
+    verifies recovery produces more frames.
+    """
+    import cv2
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
+
+        # Load corrupted.mp4 (26 frames, frame 17 is genuinely corrupted)
+        video_path = ASSETS_DIR / "corrupted.mp4"
+        with open(video_path, "rb") as f:
+            video_data = f.read()
+
+        # Simulate additional failures on frames 3 and 10
+        # (in addition to the real corruption at frame 17)
+        fail_on_frames = {3, 10}
+
+        # Store original VideoCapture class
+        original_video_capture = cv2.VideoCapture
+
+        class MockVideoCapture:
+            """Wrapper that simulates grab() failures on specific frames."""
+
+            def __init__(self, *args, **kwargs):
+                self._cap = original_video_capture(*args, **kwargs)
+                self._current_frame = -1
+
+            def grab(self):
+                self._current_frame += 1
+                if self._current_frame in fail_on_frames:
+                    return False  # Simulate failure
+                return self._cap.grab()
+
+            def retrieve(self):
+                return self._cap.retrieve()
+
+            def get(self, prop):
+                return self._cap.get(prop)
+
+            def isOpened(self):
+                return self._cap.isOpened()
+
+            def release(self):
+                return self._cap.release()
+
+        # Patch cv2.VideoCapture
+        m.setattr(cv2, "VideoCapture", MockVideoCapture)
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+
+        # Use num_frames=8 which samples: [0, 3, 7, 10, 14, 17, 21, 25]
+        # Frame 3: mocked failure, recovery window [3, 7) -> use frame 4
+        # Frame 10: mocked failure, recovery window [10, 14) -> use frame 11
+        # Frame 17: real corruption, recovery window [17, 21) -> use frame 18
+
+        # Test WITHOUT recovery - should have fewer frames due to failures
+        frames_no_recovery, meta_no = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=False
+        )
+
+        # Test WITH recovery - should recover using next valid frames
+        frames_with_recovery, meta_yes = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=True
+        )
+
+        # With recovery should have MORE frames than without
+        # Without: 5 frames (3, 10, 17 all fail)
+        # With: 8 frames (all recovered)
+        assert frames_with_recovery.shape[0] > frames_no_recovery.shape[0], (
+            f"Recovery should produce more frames. "
+            f"Without: {frames_no_recovery.shape[0]}, "
+            f"With: {frames_with_recovery.shape[0]}"
+        )
+
+        # Verify metadata consistency
+        assert frames_no_recovery.shape[0] == len(meta_no["frames_indices"])
+        assert frames_with_recovery.shape[0] == len(meta_yes["frames_indices"])
+
+        # Verify temporal order is preserved
+        assert meta_yes["frames_indices"] == sorted(meta_yes["frames_indices"])
+
+
+def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test frame recovery with an actual corrupted video file using sparse sampling.
+
+    This test uses corrupted.mp4 which has genuine H.264 codec errors on
+    frame 17. With num_frames=8, the target frames are [0, 3, 7, 10, 14, 17, 21, 25].
+    Frame 17 is corrupted but frames 18-20 are readable, so recovery can use
+    frame 18 to fill in for the failed frame 17.
+
+    This test verifies:
+    1. Without recovery: frame 17 is skipped (7 frames loaded)
+    2. With recovery: frame 18 fills in for frame 17 (8 frames loaded)
+    3. Recovery produces MORE frames than without recovery
+    4. Metadata is consistent with loaded frames
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
+
+        corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
+
+        with open(corrupted_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+
+        # Use num_frames=8 which makes frame 17 a target with recovery window [17, 21)
+        # Target frames: [0, 3, 7, 10, 14, 17, 21, 25]
+        # Frame 17 is corrupted, but frames 18-20 are readable for recovery
+
+        # Test without recovery - frame 17 will be skipped
+        frames_no_recovery, meta_no_recovery = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=False
+        )
+
+        # Test with recovery - frame 18 should fill in for frame 17
+        frames_with_recovery, meta_with_recovery = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=True
+        )
+
+        # Verify metadata consistency for both modes
+        assert frames_no_recovery.shape[0] == len(meta_no_recovery["frames_indices"]), (
+            "Frame count must match indices without recovery"
+        )
+        assert frames_with_recovery.shape[0] == len(
+            meta_with_recovery["frames_indices"]
+        ), "Frame count must match indices with recovery"
+
+        # KEY ASSERTION: Recovery should produce MORE frames than without recovery
+        # Without recovery: 7 frames (frame 17 skipped)
+        # With recovery: 8 frames (frame 18 used for frame 17)
+        assert frames_with_recovery.shape[0] > frames_no_recovery.shape[0], (
+            f"Recovery should produce more frames with sparse sampling. "
+            f"Got {frames_with_recovery.shape[0]} with recovery vs "
+            f"{frames_no_recovery.shape[0]} without"
+        )
+
+        # Verify we got all 8 requested frames with recovery
+        assert frames_with_recovery.shape[0] == 8, (
+            f"With recovery, should load all 8 requested frames. "
+            f"Got {frames_with_recovery.shape[0]}"
+        )
+
+        # Verify the video metadata is correct
+        expected_total_frames = 26
+        assert meta_with_recovery["total_num_frames"] == expected_total_frames, (
+            f"Expected {expected_total_frames} total frames in metadata"
+        )
+
+
+def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that frame_recovery works with the dynamic video backend.
+
+    The dynamic backend samples frames based on fps/duration rather than
+    loading all frames. This test verifies recovery works in that context.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
+
+        corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
+
+        with open(corrupted_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")
+
+        # Test without recovery
+        frames_no_recovery, meta_no = loader.load_bytes(
+            video_data, fps=2, max_duration=10, frame_recovery=False
+        )
+
+        # Test with frame_recovery enabled
+        frames_with_recovery, meta_with = loader.load_bytes(
+            video_data, fps=2, max_duration=10, frame_recovery=True
+        )
+
+        # Verify basic properties
+        assert frames_no_recovery.shape[0] > 0, (
+            "Should load some frames without recovery"
+        )
+        assert frames_with_recovery.shape[0] > 0, (
+            "Should load some frames with recovery"
+        )
+        assert "do_sample_frames" in meta_with
+        assert meta_with["do_sample_frames"] is False  # Dynamic backend always False
+        assert frames_with_recovery.shape[0] == len(meta_with["frames_indices"])
+
+        # Key assertion: recovery should help when corrupted frames are sampled
+        # We expect recovery to produce >= frames than without recovery
+        assert frames_with_recovery.shape[0] >= frames_no_recovery.shape[0], (
+            f"Recovery should produce at least as many frames. "
+            f"Got {frames_with_recovery.shape[0]} with recovery vs "
+            f"{frames_no_recovery.shape[0]} without"
+        )
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
index 98245cdf0c984f81be8b9558e1b6859a9b1fa372..b99c9629ab90e96f31b22b4ab725e4656e3acc95 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 
 from vllm.config import VllmConfig
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.models.gemma2 import Gemma2Model
 from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
 from vllm.sequence import IntermediateTensors
@@ -28,12 +28,7 @@ class MyGemma2Embedding(nn.Module):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_embed": Pooler.for_token_embed(pooler_config),
-                "embed": Pooler.for_embed(pooler_config),
-            }
-        )
+        self.pooler = DispatchPooler.for_embedding(pooler_config)
 
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 711cb468d7928a16391ec378479c74402eb043d5..89ba63f25a8fafc986820b50af004604ed7d842d 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -31,7 +31,7 @@ def test_platform_plugins():
     )
 
 
-# def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
+# def test_oot_custom_op(default_vllm_config, monkeypatch: pytest.MonkeyPatch):
 #     # simulate workload by running an example
 #     load_general_plugins()
 #     from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
diff --git a/tests/quantization/test_auto_round.py b/tests/quantization/test_auto_round.py
index 9f5db821950122d86649032fd2354f94c0327df9..a2a1ebc014cb9f75b0c746f71578a13101c53027 100644
--- a/tests/quantization/test_auto_round.py
+++ b/tests/quantization/test_auto_round.py
@@ -26,7 +26,9 @@ MODELS = [
 )
 @pytest.mark.parametrize("model", MODELS)
 def test_auto_round(vllm_runner, model):
-    with vllm_runner(model, enforce_eager=True) as llm:
+    with vllm_runner(
+        model, enforce_eager=True, allow_deprecated_quantization=True
+    ) as llm:
         output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
     assert output
     print(f"{output[0][1]}")
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 84bf099ad67a4f7537962303d57aeb13fb925da4..f6c96a0ae410c8b513d25f813ffad846b53066a2 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -86,7 +86,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
         current_platform.is_rocm()
         and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
     ):
-        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
+        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
 
     with vllm_runner(model_path, enforce_eager=True) as llm:
 
@@ -164,7 +164,7 @@ def test_compressed_tensors_w8a8_logprobs(
         current_platform.is_rocm()
         and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
     ):
-        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
+        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
 
     if use_aiter:
         if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
@@ -234,7 +234,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
         current_platform.is_rocm()
         and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
     ):
-        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
+        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
 
     if use_aiter:
         if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
@@ -651,6 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output
 
 
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
 @pytest.mark.parametrize(
     "args",
     [
@@ -783,7 +786,10 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
 
             input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
             assert isinstance(input_quant_op, QuantFP8)
-            assert input_quant_op._forward_method == input_quant_op.forward_cuda
+            assert input_quant_op._forward_method in (
+                input_quant_op.forward_cuda,
+                input_quant_op.forward_hip,
+            )
 
         llm.apply_model(check_model)
 
diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py
index 192cff93dad74c09d49b9624be99c012b027604d..2334305b0d0d79183eedb3e65b7f831f2ba5deaa 100644
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -11,7 +11,8 @@ import pytest
 import os
 
 from vllm.config import ModelConfig
-from ..utils import models_path_prefix
+from vllm.platforms import current_platform
+from tests.utils import models_path_prefix
 
 
 @dataclass
@@ -25,21 +26,45 @@ MODEL_ARG_EXPTYPES = [
     # AUTOGPTQ
     # compat: autogptq <=0.7.1 is_marlin_format: bool
     # Model Serialized in Exllama Format.
-    # (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), None, "gptq_marlin"),
-    # (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "marlin", "gptq_marlin"),
-    # (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
+    (
+        os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"),
+        None,
+        "gptq_marlin" if current_platform.is_cuda() else "gptq",
+    ),
+    (
+        os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"),
+        "marlin",
+        "gptq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
+    (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
     (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "awq", "ERROR"),
     # compat: autogptq >=0.8.0 use checkpoint_format: str
 
     # Model Serialized in Exllama Format.
-    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), None, "gptq_marlin"),
-    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "marlin", "gptq_marlin"),
+    (
+        os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
+        None,
+        "gptq_marlin" if current_platform.is_cuda() else "gptq",
+    ),
+    (
+        os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
+        "marlin",
+        "gptq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
     (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "gptq", "gptq"),
     (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "awq", "ERROR"),
     # AUTOAWQ
-    # (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), None, "awq_marlin"),
+    (
+        os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"),
+        None,
+        "awq_marlin" if current_platform.is_cuda() else "awq",
+    ),
     (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "awq", "awq"),
-    # (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "marlin", "awq_marlin"),
+    (
+        os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"),
+        "marlin",
+        "awq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
     (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "gptq", "ERROR"),
 ]
 
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index a11a3a881f97b1bed7c9caedb4c7d02c1615184d..9681de9c000f270b62efb75ea5f1ba07ddcdcfd7 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
     monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
     # Test wNa16
     compare_two_settings(
-        os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"),
+        os.path.join(models_path_prefix, "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"),
         ["--enforce_eager"],
         ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
diff --git a/tests/quantization/test_cpu_wna16.py b/tests/quantization/test_cpu_wna16.py
index 077b802e559dcef1b678bb5cf88a11dd8a9eb484..56b9c39b03cf936c0632172255ab29ab69255c97 100644
--- a/tests/quantization/test_cpu_wna16.py
+++ b/tests/quantization/test_cpu_wna16.py
@@ -10,6 +10,7 @@ if not current_platform.is_cpu():
 MODELS = [
     "TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ",
     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",  # with g_idx
+    "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4",  # without g_idx
 ]
 DTYPE = ["bfloat16"]
 
diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py
index 40be6f0301b8b1b0414198d8fac35dc8cdefca96..8923a9753e8a6224b577088fdf0bcb8b8c97385b 100644
--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
@@ -38,6 +38,10 @@ def test_model_experts_int8_startup(
     model_info.check_transformers_version(on_fail="skip")
 
     with vllm_runner(
-        model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
+        model,
+        dtype=dtype,
+        enforce_eager=True,
+        quantization="experts_int8",
+        allow_deprecated_quantization=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py
index e01adaf76c1b045bc7e476b7f649d68e666ca6c8..a6858f6d50a009a07a02e21d073dd27383f7d107 100644
--- a/tests/quantization/test_gptq_dynamic.py
+++ b/tests/quantization/test_gptq_dynamic.py
@@ -15,7 +15,9 @@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinear
 from vllm.model_executor.layers.quantization.utils.gptq_utils import (
     get_dynamic_override,
 )
+
 from ..utils import models_path_prefix
+from vllm.platforms import current_platform
 
 PROMPT = "On the surface of Mars, we found"
 
@@ -23,7 +25,10 @@ PROMPT = "On the surface of Mars, we found"
 # The second layer is quantized using bits=8, group_size=32
 # All other layers (layer index >= 2) are not quantized
 MODEL_QUANT = [
-    (os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"), True),
+    (
+        os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"),
+        current_platform.is_cuda(),
+    ),
     (
         os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"),
         False,
diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py
index 0298994c396f68a89a8c8191bcfa54573bf906a8..154b29d7017acc481ad1260d6dba17b1cc60bb1e 100644
--- a/tests/quantization/test_modelopt.py
+++ b/tests/quantization/test_modelopt.py
@@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`.
 """
 
 import os
+from typing import NoReturn
 
 import pytest
 import torch
@@ -19,6 +20,28 @@ def enable_pickle(monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
 
+def _skip(msg: str) -> NoReturn:
+    pytest.skip(msg)
+    raise RuntimeError(msg)
+
+
+def _snapshot_download_or_skip(model_id: str) -> str:
+    try:
+        from huggingface_hub import snapshot_download
+    except Exception as e:  # pragma: no cover
+        _skip(f"huggingface_hub is required to download {model_id}: {e}")
+
+    try:
+        return snapshot_download(
+            repo_id=model_id,
+            repo_type="model",
+            # These checkpoints are already small; download full repo for simplicity.
+            allow_patterns=["*"],
+        )
+    except Exception as e:
+        _skip(f"Failed to download {model_id} from the HF Hub: {e}")
+
+
 @pytest.mark.skipif(
     not is_quant_method_supported("modelopt"),
     reason="ModelOpt FP8 is not supported on this GPU type.",
@@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
         output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
         print(f"ModelOpt FP8 output: {output}")
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("modelopt"),
+    reason="ModelOpt FP8 is not supported on this GPU type.",
+)
+def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
+    """Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup."""
+    model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
+    model_path = _snapshot_download_or_skip(model_id)
+
+    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            o_proj = layer.self_attn.o_proj
+            gate_up_proj = layer.mlp.gate_up_proj
+            down_proj = layer.mlp.down_proj
+
+            from vllm.model_executor.layers.quantization.modelopt import (
+                ModelOptFp8PcPtLinearMethod,
+            )
+
+            assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod)
+            assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod)
+            assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod)
+            assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod)
+
+            assert qkv_proj.weight.dtype == torch.float8_e4m3fn
+            assert o_proj.weight.dtype == torch.float8_e4m3fn
+            assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
+            assert down_proj.weight.dtype == torch.float8_e4m3fn
+
+            # Per-channel scales; activations are dynamically scaled per token.
+            assert hasattr(qkv_proj, "weight_scale")
+            assert qkv_proj.weight_scale.dtype == torch.float32
+            assert qkv_proj.weight_scale.dim() == 1
+            assert not hasattr(qkv_proj, "input_scale")
+
+            assert hasattr(o_proj, "weight_scale")
+            assert o_proj.weight_scale.dtype == torch.float32
+            assert o_proj.weight_scale.dim() == 1
+            assert not hasattr(o_proj, "input_scale")
+
+            assert hasattr(gate_up_proj, "weight_scale")
+            assert gate_up_proj.weight_scale.dtype == torch.float32
+            assert gate_up_proj.weight_scale.dim() == 1
+            assert not hasattr(gate_up_proj, "input_scale")
+
+            assert hasattr(down_proj, "weight_scale")
+            assert down_proj.weight_scale.dtype == torch.float32
+            assert down_proj.weight_scale.dim() == 1
+            assert not hasattr(down_proj, "input_scale")
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
+        assert output
+        print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}")
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("modelopt"),
+    reason="ModelOpt FP8 is not supported on this GPU type.",
+)
+def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
+    """Test ModelOpt FP8_PB_WO checkpoint setup."""
+    model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
+    model_path = _snapshot_download_or_skip(model_id)
+
+    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            o_proj = layer.self_attn.o_proj
+            gate_up_proj = layer.mlp.gate_up_proj
+            down_proj = layer.mlp.down_proj
+
+            from vllm.model_executor.layers.quantization.modelopt import (
+                ModelOptFp8PbWoLinearMethod,
+            )
+
+            assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod)
+            assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod)
+            assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod)
+            assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod)
+
+            assert qkv_proj.weight.dtype == torch.float8_e4m3fn
+            assert o_proj.weight.dtype == torch.float8_e4m3fn
+            assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
+            assert down_proj.weight.dtype == torch.float8_e4m3fn
+
+            # Block scales; should be materialized as a 2D [out_blk, in_blk] tensor.
+            assert hasattr(qkv_proj, "weight_scale")
+            assert qkv_proj.weight_scale.dtype == torch.float32
+            assert qkv_proj.weight_scale.dim() == 2
+
+            assert hasattr(o_proj, "weight_scale")
+            assert o_proj.weight_scale.dtype == torch.float32
+            assert o_proj.weight_scale.dim() == 2
+
+            assert hasattr(gate_up_proj, "weight_scale")
+            assert gate_up_proj.weight_scale.dtype == torch.float32
+            assert gate_up_proj.weight_scale.dim() == 2
+
+            assert hasattr(down_proj, "weight_scale")
+            assert down_proj.weight_scale.dtype == torch.float32
+            assert down_proj.weight_scale.dim() == 2
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
+        assert output
+        print(f"ModelOpt FP8_PB_WO output: {output}")
diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py
index 195f1fbbdfc0c8d3773b7f9e1bcb81d8d1e7dfa3..8468eec6f2e3a2b1f060f14b510b0ceb28c26729 100644
--- a/tests/quantization/test_rtn.py
+++ b/tests/quantization/test_rtn.py
@@ -30,6 +30,10 @@ def test_model_rtn_startup(
     max_tokens: int,
 ) -> None:
     with vllm_runner(
-        model, enforce_eager=True, dtype=dtype, quantization="rtn"
+        model,
+        enforce_eager=True,
+        dtype=dtype,
+        quantization="rtn",
+        allow_deprecated_quantization=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index f35c3973ab6e6c2652f3246626c11ebf37f8ddd8..da4f6a028709ea0f09b4eeae8f570e5bf052760c 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -6,11 +6,17 @@ import importlib.util
 import pytest
 import torch
 
+from vllm.platforms import current_platform
+
 DTYPE = ["bfloat16"]
 
 TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
 
 
+@pytest.mark.skipif(
+    current_platform.is_rocm() and current_platform.is_fp8_fnuz(),
+    reason="Only fp8_fnuz supported on CDNA3 architecture",
+)
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_pre_quantized_model(vllm_runner):
     with vllm_runner(
diff --git a/tests/quantization/untest_fp8.py b/tests/quantization/untest_fp8.py
index 8eba45e21493bc308312b5908d59017aaa73b3f7..6e1ed8e1049e73354176cba31580014d864174f9 100644
--- a/tests/quantization/untest_fp8.py
+++ b/tests/quantization/untest_fp8.py
@@ -38,7 +38,9 @@ MODELS = [
     reason="FP8 is not supported on this GPU type.",
 )
 @pytest.mark.parametrize("model_id", MODELS)
-@pytest.mark.parametrize("force_marlin", [False, True])
+@pytest.mark.parametrize(
+    "force_marlin", [False] if current_platform.is_rocm() else [False, True]
+)
 @pytest.mark.parametrize(
     "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
 )
@@ -127,7 +129,9 @@ def test_kv_cache_model_load_and_run(
     reason="FP8 is not supported on this GPU type.",
 )
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-@pytest.mark.parametrize("force_marlin", [False, True])
+@pytest.mark.parametrize(
+    "force_marlin", [False] if current_platform.is_rocm() else [False, True]
+)
 @pytest.mark.parametrize(
     "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
 )
@@ -199,10 +203,10 @@ def test_scaled_fp8_quant(dtype) -> None:
     def quantize_ref(tensor, inv_scale):
         # The reference implementation that fully aligns to
         # the kernel being tested.
-        finfo = torch.finfo(torch.float8_e4m3fn)
+        finfo = torch.finfo(current_platform.fp8_dtype())
         scale = inv_scale.reciprocal()
         qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
-        qweight = qweight.to(torch.float8_e4m3fn)
+        qweight = qweight.to(current_platform.fp8_dtype())
         return qweight
 
     def per_tensor_dequantize(tensor, inv_scale, dtype):
@@ -218,7 +222,7 @@ def test_scaled_fp8_quant(dtype) -> None:
     ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
     ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
 
-    # Reference dynamic quantizaton
+    # Reference dynamic quantization
     y = quantize_ref(x, inv_scale)
     torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
 
@@ -269,6 +273,10 @@ def test_scaled_fp8_quant(dtype) -> None:
     )
 
 
+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="FP8 e4m3fn weight reloading is not supported on e4m3fnuz platforms",
+)
 @pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod])
 # FP8 weight reloading does not support online quantization
 @pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True])  # skip False
@@ -279,8 +287,19 @@ def test_scaled_fp8_quant(dtype) -> None:
 # this is the case for marlin as well as per-tensor Fp8MoEMethod
 @pytest.mark.parametrize("use_marlin", [False])  # skip True
 def test_fp8_reloading(
-    method_cls, is_checkpoint_fp8_serialized, weight_block_size, use_marlin, dist_init
+    default_vllm_config,
+    method_cls,
+    is_checkpoint_fp8_serialized,
+    weight_block_size,
+    use_marlin,
+    dist_init,
+    monkeypatch,
 ):
+    # NOTE(rob): this test fails when using DeepGEMM because the
+    # shapes are invalid. Previously the test was passing because
+    # we set fp8_backend to None, which sidestepped the issue.
+    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0")
+
     if is_checkpoint_fp8_serialized is False:
         pytest.skip("FP8 weight reloading does not support online quantization")
 
@@ -308,6 +327,7 @@ def test_fp8_reloading(
                 params_dtype=torch.bfloat16,
                 weight_loader=default_weight_loader,
             )
+            method.use_marlin = use_marlin
 
         else:
             layer = FusedMoE(
@@ -326,8 +346,6 @@ def test_fp8_reloading(
                 weight_loader=default_weight_loader,
             )
 
-        method.use_marlin = use_marlin
-
     # capture weights format during loading
     original_metadata = [
         (name, param.shape, getattr(param, "weight_loader", default_weight_loader))
diff --git a/tests/quantization/untest_ptpc_fp8.py b/tests/quantization/untest_ptpc_fp8.py
index 61efd2ce66c71559e215f4c6f3c3d79f72534b80..6858062b91834749717faa41e6fa8d358d304e81 100644
--- a/tests/quantization/untest_ptpc_fp8.py
+++ b/tests/quantization/untest_ptpc_fp8.py
@@ -6,18 +6,12 @@ Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
 """
 
 import pytest
-import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
 from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
 from vllm.platforms import current_platform
 
-UNSUPPORTED_STR = (
-    "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
-    "support output dtype of bfloat16. torch.float16 is specified."
-)
-
 
 @pytest.fixture(scope="function", autouse=True)
 def enable_pickle(monkeypatch):
@@ -30,24 +24,17 @@ def enable_pickle(monkeypatch):
     reason="PTPC FP8 is not supported on this GPU type.",
 )
 @pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
-@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
-@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
-    try:
-        llm = vllm_runner(
-            "facebook/opt-125m",
-            dtype=dtype,
-            quantization="ptpc_fp8",
-            enforce_eager=True,
-            kv_cache_dtype=kv_cache_dtype,
-        )
-    except AssertionError as e:
-        if str(e) == UNSUPPORTED_STR:
-            # If the error message matches, the test passes
-            return
-        else:
-            # If the error message does not match, re-raise the exception
-            raise
+    llm = vllm_runner(
+        "facebook/opt-125m",
+        dtype=dtype,
+        quantization="ptpc_fp8",
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
+        allow_deprecated_quantization=True,
+    )
 
     with llm:
 
@@ -60,9 +47,9 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
                 assert attn._k_scale == 1.0
                 assert attn._v_scale == 1.0
 
+            # For GPUs with hardware support, we keep weights in fp8
             if current_platform.has_device_capability(94):
-                # For GPUs with hardware support, we keep weights in fp8
-                assert fc1.weight.dtype == torch.float8_e4m3fnuz
+                assert fc1.weight.dtype == current_platform.fp8_dtype()
 
         llm.apply_model(check_model)
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 20a425b721145ac5ca00dc046698508b352c5c6a..cf3da37b073e46e6f56de07c17c4f05aa02c319e 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -10,6 +10,11 @@ def is_quant_method_supported(quant_method: str) -> bool:
     if not (current_platform.is_cuda() or current_platform.is_rocm()):
         return False
 
+    try:
+        current_platform.verify_quantization(quant_method)
+    except ValueError:
+        return False
+
     capability = current_platform.get_device_capability()
     assert capability is not None
 
diff --git a/tests/rocm/aiter/test_mla_fp8_support_check.py b/tests/rocm/aiter/test_mla_fp8_support_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3dc0f8ea13d42361d6ba7eb4fb44225785bd29e
--- /dev/null
+++ b/tests/rocm/aiter/test_mla_fp8_support_check.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for AITER MLA FP8 support detection.
+
+These tests verify that the _check_aiter_mla_fp8_support() function
+correctly handles various error conditions without crashing.
+"""
+
+from unittest.mock import patch
+
+import pytest
+
+
+class TestAiterMlaFp8SupportCheck:
+    """Test cases for _check_aiter_mla_fp8_support() function."""
+
+    def setup_method(self):
+        """Reset the global cache before each test."""
+        import vllm._aiter_ops as aiter_ops
+
+        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None
+
+    @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True)
+    def test_import_error_handling(self, mock_supported):
+        """Test that ImportError is handled gracefully."""
+        import vllm._aiter_ops as aiter_ops
+        from vllm._aiter_ops import _check_aiter_mla_fp8_support
+
+        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None
+
+        # Should return False without raising
+        with patch(
+            "vllm._aiter_ops.inspect.signature",
+            side_effect=ImportError("No module"),
+        ):
+            result = _check_aiter_mla_fp8_support()
+            assert result is False
+
+    @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True)
+    def test_module_not_found_error_handling(self, mock_supported):
+        """Test that ModuleNotFoundError is handled gracefully."""
+        import vllm._aiter_ops as aiter_ops
+        from vllm._aiter_ops import _check_aiter_mla_fp8_support
+
+        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None
+
+        with patch(
+            "vllm._aiter_ops.inspect.signature",
+            side_effect=ModuleNotFoundError("Module not found"),
+        ):
+            # Should return False without raising
+            assert _check_aiter_mla_fp8_support() is False
+            # Cache should be set to False
+            assert aiter_ops._AITER_MLA_SUPPORTS_FP8 is False
+
+    @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True)
+    def test_attribute_error_handling(self, mock_supported):
+        """Test that AttributeError is handled gracefully."""
+        import vllm._aiter_ops as aiter_ops
+        from vllm._aiter_ops import _check_aiter_mla_fp8_support
+
+        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None
+
+        with patch(
+            "vllm._aiter_ops.inspect.signature",
+            side_effect=AttributeError("No attribute"),
+        ):
+            assert _check_aiter_mla_fp8_support() is False
+            assert aiter_ops._AITER_MLA_SUPPORTS_FP8 is False
+
+    @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True)
+    def test_value_error_handling(self, mock_supported):
+        """Test that ValueError is handled gracefully (no signature)."""
+        import vllm._aiter_ops as aiter_ops
+        from vllm._aiter_ops import _check_aiter_mla_fp8_support
+
+        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None
+
+        with patch(
+            "vllm._aiter_ops.inspect.signature",
+            side_effect=ValueError("No signature"),
+        ):
+            assert _check_aiter_mla_fp8_support() is False
+            assert aiter_ops._AITER_MLA_SUPPORTS_FP8 is False
+
+    @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True)
+    def test_type_error_handling(self, mock_supported):
+        """Test that TypeError is handled gracefully (not callable)."""
+        import vllm._aiter_ops as aiter_ops
+        from vllm._aiter_ops import _check_aiter_mla_fp8_support
+
+        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None
+
+        with patch(
+            "vllm._aiter_ops.inspect.signature",
+            side_effect=TypeError("Not a callable"),
+        ):
+            assert _check_aiter_mla_fp8_support() is False
+            assert aiter_ops._AITER_MLA_SUPPORTS_FP8 is False
+
+    @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True)
+    def test_result_caching(self, mock_supported):
+        """Test that the result is cached after first check."""
+        import vllm._aiter_ops as aiter_ops
+
+        # Set cache to True
+        aiter_ops._AITER_MLA_SUPPORTS_FP8 = True
+
+        from vllm._aiter_ops import _check_aiter_mla_fp8_support
+
+        # Should return cached value without re-checking
+        result = _check_aiter_mla_fp8_support()
+        assert result is True
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/standalone_tests/lazy_imports.py b/tests/standalone_tests/lazy_imports.py
index ddcdd2a51ab9f439c423aa8a5f097d12ec4f9f93..fff5c54f276d36e9c3f47cbd71285d855c1ac770 100644
--- a/tests/standalone_tests/lazy_imports.py
+++ b/tests/standalone_tests/lazy_imports.py
@@ -5,9 +5,6 @@
 # The utility function cannot be placed in `vllm.utils`
 # this needs to be a standalone script
 import sys
-from contextlib import nullcontext
-
-from vllm_test_utils import BlameResult, blame
 
 # List of modules that should not be imported too early.
 # Lazy import `torch._inductor.async_compile` to avoid creating
@@ -16,26 +13,10 @@ from vllm_test_utils import BlameResult, blame
 # `cv2` can easily mess up the environment.
 module_names = ["torch._inductor.async_compile", "cv2"]
 
+# set all modules in `module_names` to be None.
+# if we import any modules during `import vllm`, there would be a
+# hard error and nice stacktrace on the first import.
+for module_name in module_names:
+    sys.modules[module_name] = None  # type: ignore[assignment]
 
-def any_module_imported():
-    return any(module_name in sys.modules for module_name in module_names)
-
-
-# In CI, we only check finally if the module is imported.
-# If it is indeed imported, we can rerun the test with `use_blame=True`,
-# which will trace every function call to find the first import location,
-# and help find the root cause.
-# We don't run it in CI by default because it is slow.
-use_blame = False
-context = blame(any_module_imported) if use_blame else nullcontext()
-with context as result:
-    import vllm  # noqa
-
-if use_blame:
-    assert isinstance(result, BlameResult)
-    print(f"the first import location is:\n{result.trace_stack}")
-
-assert not any_module_imported(), (
-    f"Some the modules in {module_names} are imported. To see the first"
-    f" import location, run the test with `use_blame=True`."
-)
+import vllm  # noqa
diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh
index 2017e34030d605dce20d78cae9ecdba6d23ebb98..ebf199a5056fb4b156b48ecfa5b781e69176ad7e 100644
--- a/tests/standalone_tests/python_only_compile.sh
+++ b/tests/standalone_tests/python_only_compile.sh
@@ -18,25 +18,37 @@ for i in {1..5}; do
     echo "Checking metadata.json URL (attempt $i)..."
     if curl --fail "$meta_json_url" > metadata.json; then
         echo "INFO: metadata.json URL is valid."
-        # check whether it is valid json by python
+        # check whether it is valid json by python (printed to stdout)
         if python3 -m json.tool metadata.json; then
-            echo "INFO: metadata.json is valid JSON. Proceeding with the test."
+            echo "INFO: metadata.json is valid JSON. Proceeding with the check."
+            # check whether there is an object in the json matching:
+            # "package_name": "vllm", and "platform_tag" matches the current architecture
+            # see `determine_wheel_url` in setup.py for more details
+            if python3 -c "import platform as p,json as j,sys as s; d = j.load(open('metadata.json')); \
+             s.exit(int(not any(o.get('package_name') == 'vllm' and p.machine() in o.get('platform_tag') \
+             for o in d)))" 2>/dev/null; then
+                echo "INFO: metadata.json contains a pre-compiled wheel for the current architecture."
+                break
+            else
+                echo "WARN: metadata.json does not have a pre-compiled wheel for the current architecture."
+            fi
         else
             echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!"
+            echo "INFO: metadata.json content:"
+            cat metadata.json
             exit 1
         fi
-        break
     fi
-    # failure handling
+    # failure handling & retry logic
     if [ $i -eq 5 ]; then
-        echo "ERROR: metadata.json URL is still not valid after 5 attempts."
-        echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists."
+        echo "ERROR: metadata is still not available after 5 attempts."
+        echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit is available."
         echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes."
         echo " NOTE: If it fails, please report in #sig-ci channel."
         exit 1
     else
-        echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..."
-        sleep 180
+        echo "WARNING: metadata is not available. Retrying after 5 minutes..."
+        sleep 300
     fi
 done
 
diff --git a/tests/standalone_tests/pytorch_nightly_dependency.sh b/tests/standalone_tests/pytorch_nightly_dependency.sh
index fd93ad76bed0f1a2b39a5754cd41673aec04497d..92820b269f9dfd3ef53b2060ac0b7a0e49360328 100644
--- a/tests/standalone_tests/pytorch_nightly_dependency.sh
+++ b/tests/standalone_tests/pytorch_nightly_dependency.sh
@@ -4,6 +4,11 @@
 set -e
 set -x
 
+if command -v rocminfo >/dev/null 2>&1; then
+  echo "Skipping test for ROCm platform"
+  exit 0
+fi
+
 cd /vllm-workspace/
 
 rm -rf .venv
@@ -36,7 +41,7 @@ if diff before.txt after.txt; then
   echo "torch version not overridden."
 else
   echo "torch version overridden by nightly_torch_test.txt, \
-  if the dependency is not triggered by the pytroch nightly test,\
+  if the dependency is not triggered by the pytorch nightly test,\
   please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
   exit 1
 fi
diff --git a/tests/test_attention_backend_registry.py b/tests/test_attention_backend_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..034749874d7b332fb8d611eae699d13b8a73e85c
--- /dev/null
+++ b/tests/test_attention_backend_registry.py
@@ -0,0 +1,169 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
+)
+from vllm.v1.attention.backends.registry import (
+    AttentionBackendEnum,
+    MambaAttentionBackendEnum,
+    register_backend,
+)
+
+
+class CustomAttentionImpl(AttentionImpl):
+    """Mock custom attention implementation for testing."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(self, *args, **kwargs):
+        """Mock forward pass."""
+        pass
+
+
+class CustomAttentionBackend(AttentionBackend):
+    """Mock custom attention backend for testing."""
+
+    @staticmethod
+    def get_name():
+        return "CUSTOM"
+
+    @staticmethod
+    def get_impl_cls():
+        return CustomAttentionImpl
+
+    @staticmethod
+    def get_builder_cls():
+        """Mock builder class."""
+        return None
+
+    @staticmethod
+    def get_required_kv_cache_layout():
+        """Mock KV cache layout."""
+        return None
+
+
+class CustomMambaAttentionImpl(AttentionImpl):
+    """Mock custom mamba attention implementation for testing."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(self, *args, **kwargs):
+        """Mock forward pass."""
+        pass
+
+
+class CustomMambaAttentionBackend(AttentionBackend):
+    """Mock custom mamba attention backend for testing."""
+
+    @staticmethod
+    def get_name():
+        return "CUSTOM_MAMBA"
+
+    @staticmethod
+    def get_impl_cls():
+        return CustomMambaAttentionImpl
+
+    @staticmethod
+    def get_builder_cls():
+        """Mock builder class."""
+        return None
+
+    @staticmethod
+    def get_required_kv_cache_layout():
+        """Mock KV cache layout."""
+        return None
+
+
+def test_custom_is_not_alias_of_any_backend():
+    # Get all members of AttentionBackendEnum
+    all_backends = list(AttentionBackendEnum)
+
+    # Find any aliases of CUSTOM
+    aliases = []
+    for backend in all_backends:
+        if backend.name != "CUSTOM" and backend is AttentionBackendEnum.CUSTOM:
+            aliases.append(backend.name)
+
+    # CUSTOM should not be an alias of any other backend
+    assert len(aliases) == 0, (
+        f"BUG! CUSTOM is an alias of: {', '.join(aliases)}!\n"
+        f"CUSTOM.value = {repr(AttentionBackendEnum.CUSTOM.value)}\n"
+        f"This happens when CUSTOM has the same value as another backend.\n"
+        f"When you register to CUSTOM, you're actually registering to {aliases[0]}!\n"
+        f"All backend values:\n"
+        + "\n".join(f"  {b.name}: {repr(b.value)}" for b in all_backends)
+    )
+
+    # Verify CUSTOM has its own unique identity
+    assert AttentionBackendEnum.CUSTOM.name == "CUSTOM", (
+        f"CUSTOM.name should be 'CUSTOM', but got '{AttentionBackendEnum.CUSTOM.name}'"
+    )
+
+
+def test_register_custom_backend_with_class_path():
+    # Register with explicit class path
+    register_backend(
+        backend=AttentionBackendEnum.CUSTOM,
+        class_path="tests.test_attention_backend_registry.CustomAttentionBackend",
+        is_mamba=False,
+    )
+
+    # Check that CUSTOM backend is registered
+    assert AttentionBackendEnum.CUSTOM.is_overridden(), (
+        "CUSTOM should be overridden after registration"
+    )
+
+    # Get the registered class path
+    class_path = AttentionBackendEnum.CUSTOM.get_path()
+    assert class_path == "tests.test_attention_backend_registry.CustomAttentionBackend"
+
+    # Get the backend class
+    backend_cls = AttentionBackendEnum.CUSTOM.get_class()
+    assert backend_cls.get_name() == "CUSTOM"
+    assert backend_cls.get_impl_cls() == CustomAttentionImpl
+
+
+def test_mamba_custom_is_not_alias_of_any_backend():
+    # Get all mamba backends
+    all_backends = list(MambaAttentionBackendEnum)
+
+    # Find any aliases of CUSTOM
+    aliases = []
+    for backend in all_backends:
+        if backend.name != "CUSTOM" and backend is MambaAttentionBackendEnum.CUSTOM:
+            aliases.append(backend.name)
+
+    # CUSTOM should not be an alias of any other backend
+    assert len(aliases) == 0, (
+        f"BUG! MambaAttentionBackendEnum.CUSTOM is an alias of: {', '.join(aliases)}!\n"
+        f"CUSTOM.value = {repr(MambaAttentionBackendEnum.CUSTOM.value)}\n"
+        f"All mamba backend values:\n"
+        + "\n".join(f"  {b.name}: {repr(b.value)}" for b in all_backends)
+    )
+
+
+def test_register_custom_mamba_backend_with_class_path():
+    # Register with explicit class path
+    register_backend(
+        backend=MambaAttentionBackendEnum.CUSTOM,
+        class_path="tests.test_attention_backend_registry.CustomMambaAttentionBackend",
+        is_mamba=True,
+    )
+
+    # Check that the backend is registered
+    assert MambaAttentionBackendEnum.CUSTOM.is_overridden()
+
+    # Get the registered class path
+    class_path = MambaAttentionBackendEnum.CUSTOM.get_path()
+    assert (
+        class_path
+        == "tests.test_attention_backend_registry.CustomMambaAttentionBackend"
+    )
+
+    # Get the backend class
+    backend_cls = MambaAttentionBackendEnum.CUSTOM.get_class()
+    assert backend_cls.get_name() == "CUSTOM_MAMBA"
+    assert backend_cls.get_impl_cls() == CustomMambaAttentionImpl
diff --git a/tests/test_config.py b/tests/test_config.py
index 18b2a3264d6ec3b6c49c46d0ba7be8d57115cefe..a618249826816f7db75d0a101cf53f1580d48cc6 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import logging
 import os
 from dataclasses import MISSING, Field, asdict, dataclass, field
@@ -25,7 +26,6 @@ from vllm.config.vllm import (
     OPTIMIZATION_LEVEL_TO_CONFIG,
     OptimizationLevel,
 )
-from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform
 from utils import models_path_prefix
 
@@ -162,8 +162,9 @@ def test_get_pooling_config():
     model_config = ModelConfig(model_id)
 
     assert model_config.pooler_config is not None
-    assert model_config.pooler_config.normalize
-    assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
+    assert model_config.pooler_config.use_activation
+    assert model_config.pooler_config.seq_pooling_type == "MEAN"
+    assert model_config.pooler_config.tok_pooling_type == "ALL"
 
 
 @pytest.mark.skipif(
@@ -171,7 +172,7 @@ def test_get_pooling_config():
 )
 def test_get_pooling_config_from_args():
     model_id = os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")
-    pooler_config = PoolerConfig(pooling_type="CLS", normalize=True)
+    pooler_config = PoolerConfig(seq_pooling_type="CLS", normalize=True)
     model_config = ModelConfig(model_id, pooler_config=pooler_config)
 
     assert asdict(model_config.pooler_config) == asdict(pooler_config)
@@ -182,14 +183,25 @@ def test_get_pooling_config_from_args():
     [
         ("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "LAST", "LAST"),  # LLM
         ("intfloat/e5-small", "CLS", "MEAN"),  # BertModel
+    ],
+)
+def test_default_seq_pooling_type(model_id, default_pooling_type, pooling_type):
+    model_config = ModelConfig(model_id)
+    assert model_config._model_info.default_seq_pooling_type == default_pooling_type
+    assert model_config.pooler_config.seq_pooling_type == pooling_type
+
+
+@pytest.mark.parametrize(
+    ("model_id", "default_pooling_type", "pooling_type"),
+    [
         ("Qwen/Qwen2.5-Math-RM-72B", "ALL", "ALL"),  # reward
         ("Qwen/Qwen2.5-Math-PRM-7B", "STEP", "STEP"),  # step reward
     ],
 )
-def test_default_pooling_type(model_id, default_pooling_type, pooling_type):
+def test_default_tok_pooling_type(model_id, default_pooling_type, pooling_type):
     model_config = ModelConfig(model_id)
-    assert model_config._model_info.default_pooling_type == default_pooling_type
-    assert model_config.pooler_config.pooling_type == pooling_type
+    assert model_config._model_info.default_tok_pooling_type == default_pooling_type
+    assert model_config.pooler_config.tok_pooling_type == pooling_type
 
 
 @pytest.mark.parametrize(
@@ -207,8 +219,8 @@ def test_default_pooling_type(model_id, default_pooling_type, pooling_type):
 )
 def test_moe_model_detection(model_id, expected_is_moe_model):
     model_config = ModelConfig(model_id)
-    # Just check that is_moe_model field exists and is a boolean
-    assert model_config.is_model_moe() == expected_is_moe_model
+    # Just check that is_moe field exists and is a boolean
+    assert model_config.is_moe == expected_is_moe_model
 
 
 @pytest.mark.parametrize(
@@ -226,7 +238,7 @@ def test_moe_model_detection(model_id, expected_is_moe_model):
 def test_is_quantized(model_id, quantized):
     model_config = ModelConfig(model_id)
     # Just check that quantized field exists and is a boolean
-    assert model_config.is_quantized() == quantized
+    assert model_config.is_quantized == quantized
 
 
 @pytest.mark.skipif(
@@ -556,100 +568,100 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
             "jason9693/Qwen2.5-1.5B-apeach",
             "decoder",
             True,
-            "Pooling models with causal attn and last pooling support chunked prefill.",
+            "Pooling models with causal attn and LAST/ALL pooling support chunked prefill.",  # noqa: E501
         ),
         (
             "Qwen/Qwen3-Embedding-0.6B",
             "decoder",
             True,
-            "Pooling models with causal attn and last pooling support chunked prefill.",
+            "Pooling models with causal attn and LAST/ALL pooling support chunked prefill.",  # noqa: E501
         ),
         (
             "Qwen/Qwen2.5-Math-PRM-7B",
             "decoder",
             False,
-            "Pooling models with step pooling does not support chunked prefill.",
+            "Pooling models with causal attn and LAST/STEP pooling do not support chunked prefill.",  # noqa: E501
         ),
         (
             "internlm/internlm2-1_8b-reward",
             "decoder",
             True,
-            "Pooling models with causal attn and all pooling support chunked prefill.",
+            "Pooling models with causal attn and LAST/ALL pooling support chunked prefill.",  # noqa: E501
         ),
         (
             "BAAI/bge-base-en",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support chunked prefill.",
+            "Pooling models with bidirectional attn do not support chunked prefill.",  # noqa: E501
         ),
         (
             "boltuix/NeuroBERT-NER",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support chunked prefill.",
+            "Pooling models with bidirectional attn do not support chunked prefill.",  # noqa: E501
         ),
         (
             "papluca/xlm-roberta-base-language-detection",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support chunked prefill.",
+            "Pooling models with bidirectional attn do not support chunked prefill.",  # noqa: E501
         ),
         (
             "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support chunked prefill.",
+            "Pooling models with bidirectional attn do not support chunked prefill.",  # noqa: E501
         ),
         (
             "intfloat/e5-small",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support chunked prefill.",
+            "Pooling models with bidirectional attn do not support chunked prefill.",  # noqa: E501
         ),
         # multimodal models
         (
             "openai/clip-vit-base-patch32",
             "decoder",
             True,
-            "Pooling models with causal attn and last pooling support chunked prefill.",
+            "Pooling models with causal attn and LAST/ALL pooling support chunked prefill.",  # noqa: E501
         ),
         (
             "google/siglip-base-patch16-224",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support chunked prefill.",
+            "Pooling models with bidirectional attn do not support chunked prefill.",  # noqa: E501
         ),
         # generate models
         (
             "Qwen/Qwen3-0.6B",
             "decoder",
             True,
-            "Generative models support chunked prefill.",
+            "Generative models support chunked prefill.",  # noqa: E501
         ),
         (
             "Qwen/Qwen3-Next-80B-A3B-Instruct",
             "hybrid",
             True,
-            "Generative models support chunked prefill.",
+            "Generative models support chunked prefill.",  # noqa: E501
         ),
         (
             "ibm-granite/granite-4.0-h-small",
             "hybrid",
             True,
-            "Generative models support chunked prefill.",
+            "Generative models support chunked prefill.",  # noqa: E501
         ),
         (
             "state-spaces/mamba-130m-hf",
             "attention_free",
             True,
-            "Generative models support chunked prefill.",
+            "Generative models support chunked prefill.",  # noqa: E501
         ),
         # encoder_decoder models
         (
             "openai/whisper-small",
             "encoder_decoder",
             False,
-            "Encoder decoder models does not support chunked prefill.",
+            "Encoder decoder models do not support chunked prefill.",  # noqa: E501
         ),
     ],
 )
@@ -675,100 +687,100 @@ def test_is_chunked_prefill_supported(
             "jason9693/Qwen2.5-1.5B-apeach",
             "decoder",
             True,
-            "Pooling models with causal attn and last pooling support prefix caching.",
+            "Pooling models with causal attn and LAST/ALL pooling support prefix caching.",  # noqa: E501
         ),
         (
             "Qwen/Qwen3-Embedding-0.6B",
             "decoder",
             True,
-            "Pooling models with causal attn and last pooling support prefix caching.",
+            "Pooling models with causal attn and LAST/ALL pooling support prefix caching.",  # noqa: E501
         ),
         (
             "Qwen/Qwen2.5-Math-PRM-7B",
             "decoder",
             False,
-            "Pooling models with step pooling does not support prefix caching.",
+            "Pooling models with causal attn and LAST/STEP pooling do not support prefix caching.",  # noqa: E501
         ),
         (
             "internlm/internlm2-1_8b-reward",
             "decoder",
             True,
-            "Pooling models with causal attn and all pooling support prefix caching.",
+            "Pooling models with causal attn and LAST/ALL pooling support prefix caching.",  # noqa: E501
         ),
         (
             "BAAI/bge-base-en",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support prefix caching.",
+            "Pooling models with bidirectional attn do not support prefix caching.",  # noqa: E501
         ),
         (
             "boltuix/NeuroBERT-NER",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support prefix caching.",
+            "Pooling models with bidirectional attn do not support prefix caching.",  # noqa: E501
         ),
         (
             "papluca/xlm-roberta-base-language-detection",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support prefix caching.",
+            "Pooling models with bidirectional attn do not support prefix caching.",  # noqa: E501
         ),
         (
             "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support prefix caching.",
+            "Pooling models with bidirectional attn do not support prefix caching.",  # noqa: E501
         ),
         (
             "intfloat/e5-small",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support prefix caching.",
+            "Pooling models with bidirectional attn do not support prefix caching.",  # noqa: E501
         ),
         # multimodal models
         (
             "openai/clip-vit-base-patch32",
             "decoder",
             True,
-            "Pooling models with causal attn and last pooling support prefix caching.",
+            "Pooling models with causal attn and LAST/ALL pooling support prefix caching.",  # noqa: E501
         ),
         (
             "google/siglip-base-patch16-224",
             "encoder_only",
             False,
-            "Pooling models with bidirectional attn does not support prefix caching.",
+            "Pooling models with bidirectional attn do not support prefix caching.",  # noqa: E501
         ),
         # generate models
         (
             "Qwen/Qwen3-0.6B",
             "decoder",
             True,
-            "Generative models support prefix caching.",
+            "Generative models support prefix caching.",  # noqa: E501
         ),
         (
             "Qwen/Qwen3-Next-80B-A3B-Instruct",
             "hybrid",
             False,
-            "Hybrid models does not support prefix caching since the feature is still experimental.",  # noqa: E501
+            "Hybrid models do not support prefix caching since the feature is still experimental.",  # noqa: E501
         ),
         (
             "ibm-granite/granite-4.0-h-small",
             "hybrid",
             False,
-            "Hybrid models does not support prefix caching since the feature is still experimental.",  # noqa: E501
+            "Hybrid models do not support prefix caching since the feature is still experimental.",  # noqa: E501
         ),
         (
             "state-spaces/mamba-130m-hf",
             "attention_free",
             False,
-            "Attention free models does not support prefix caching since the feature is still experimental.",  # noqa: E501
+            "Attention free models do not support prefix caching since the feature is still experimental.",  # noqa: E501
         ),
         # encoder_decoder models
         (
             "openai/whisper-small",
             "encoder_decoder",
             False,
-            "Encoder decoder models does not support prefix caching.",
+            "Encoder decoder models do not support prefix caching.",  # noqa: E501
         ),
     ],
 )
@@ -927,7 +939,7 @@ def test_vllm_config_callable_defaults():
         model_config=quantized_model, optimization_level=OptimizationLevel.O2
     )
     enable_if_quantized = lambda cfg: (
-        cfg.model_config is not None and cfg.model_config.is_quantized()
+        cfg.model_config is not None and cfg.model_config.is_quantized
     )
     assert enable_if_quantized(config_quantized) is True
     assert enable_if_quantized(config_no_model) is False
@@ -938,7 +950,7 @@ def test_vllm_config_callable_defaults():
         model_config=moe_model, optimization_level=OptimizationLevel.O2
     )
     enable_if_sequential = lambda cfg: (
-        cfg.model_config is not None and not cfg.model_config.is_model_moe()
+        cfg.model_config is not None and not cfg.model_config.is_moe
     )
     assert enable_if_sequential(config_moe) is False
     assert enable_if_sequential(config_quantized) is True
@@ -1052,3 +1064,46 @@ def test_scheduler_config_init():
     with pytest.raises(AttributeError):
         # InitVar does not become an attribute
         print(SchedulerConfig.default_factory().max_model_len)
+
+
+@pytest.mark.parametrize(
+    (
+        "model_id",
+        "data_parallel_size",
+        "external_lb",
+        "expected_needs_coordinator",
+    ),
+    [
+        # Non-MoE model with DP=1 should not need coordinator
+        ("facebook/opt-125m", 1, False, False),
+        # Non-MoE model with DP>1 internal LB should need coordinator
+        ("facebook/opt-125m", 2, False, True),
+        # Non-MoE model with DP>1 external LB should not need coordinator
+        ("facebook/opt-125m", 2, True, False),
+        # MoE model with DP=1 should not need coordinator
+        ("mistralai/Mixtral-8x7B-Instruct-v0.1", 1, False, False),
+        # MoE model with DP>1 internal LB should need both coordinator
+        # and wave coordination
+        ("mistralai/Mixtral-8x7B-Instruct-v0.1", 2, False, True),
+        # MoE model with DP>1 external LB needs coordinator for wave coordination
+        # (wave coordination runs in coordinator process)
+        ("mistralai/Mixtral-8x7B-Instruct-v0.1", 2, True, True),
+    ],
+)
+def test_needs_dp_coordination(
+    model_id,
+    data_parallel_size,
+    external_lb,
+    expected_needs_coordinator,
+):
+    """Test that DP coordinator and wave coordination are configured correctly."""
+    from vllm.config import ParallelConfig
+
+    model_config = ModelConfig(model_id)
+    parallel_config = ParallelConfig(
+        data_parallel_size=data_parallel_size,
+        data_parallel_external_lb=external_lb,
+    )
+    vllm_config = VllmConfig(model_config=model_config, parallel_config=parallel_config)
+
+    assert vllm_config.needs_dp_coordinator == expected_needs_coordinator
diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py
index 7812562c8948cae88c9fd3ffe365b015696f038d..28dedc10e1aa1119a0971224e648e912e43a4e45 100644
--- a/tests/test_pooling_params.py
+++ b/tests/test_pooling_params.py
@@ -18,7 +18,7 @@ EMBEDDING_MODELS = [
 ]
 
 classify_parameters = ["use_activation"]
-embed_parameters = ["dimensions", "normalize"]
+embed_parameters = ["dimensions", "use_activation"]
 step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
 
 
@@ -40,19 +40,19 @@ def test_task():
 
 def test_embed():
     task = "embed"
-    model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS"))
+    model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))
 
-    pooling_params = PoolingParams(normalize=None)
+    pooling_params = PoolingParams(use_activation=None)
     pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(normalize=True)
+    pooling_params = PoolingParams(use_activation=True)
     pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(normalize=False)
+    pooling_params = PoolingParams(use_activation=False)
     pooling_params.verify(task=task, model_config=model_config)
 
     invalid_parameters = classify_parameters + step_pooling_parameters
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(embed_parameters):
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
             pooling_params.verify(task=task, model_config=model_config)
@@ -86,7 +86,7 @@ def test_embed_dimensions(model_info: EmbedModelInfo):
 
 @pytest.mark.parametrize("task", ["score", "classify"])
 def test_classify(task):
-    model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS"))
+    model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))
 
     pooling_params = PoolingParams(use_activation=None)
     pooling_params.verify(task=task, model_config=model_config)
@@ -98,7 +98,7 @@ def test_classify(task):
     pooling_params.verify(task=task, model_config=model_config)
 
     invalid_parameters = embed_parameters + step_pooling_parameters
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(classify_parameters):
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
             pooling_params.verify(task=task, model_config=model_config)
@@ -108,23 +108,23 @@ def test_classify(task):
 def test_token_embed(pooling_type: str):
     task = "token_embed"
     model_config = MockModelConfig(
-        pooler_config=PoolerConfig(pooling_type=pooling_type)
+        pooler_config=PoolerConfig(tok_pooling_type=pooling_type)
     )
 
-    pooling_params = PoolingParams(normalize=None)
+    pooling_params = PoolingParams(use_activation=None)
     pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(normalize=True)
+    pooling_params = PoolingParams(use_activation=True)
     pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(normalize=False)
+    pooling_params = PoolingParams(use_activation=False)
     pooling_params.verify(task=task, model_config=model_config)
 
     invalid_parameters = classify_parameters
     if pooling_type != "STEP":
         invalid_parameters = classify_parameters + step_pooling_parameters
 
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(embed_parameters):
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
             pooling_params.verify(task=task, model_config=model_config)
@@ -134,7 +134,7 @@ def test_token_embed(pooling_type: str):
 def test_token_classify(pooling_type: str):
     task = "token_classify"
     model_config = MockModelConfig(
-        pooler_config=PoolerConfig(pooling_type=pooling_type)
+        pooler_config=PoolerConfig(tok_pooling_type=pooling_type)
     )
 
     pooling_params = PoolingParams(use_activation=None)
@@ -150,7 +150,7 @@ def test_token_classify(pooling_type: str):
     if pooling_type != "STEP":
         invalid_parameters = embed_parameters + step_pooling_parameters
 
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(classify_parameters):
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
             pooling_params.verify(task=task, model_config=model_config)
diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py
index e8826eb441a2486b7768477e8eb18d7e535e2713..e37f30755663ac3a73e7d8a1f9a7c8c819c46c5a 100644
--- a/tests/test_routing_simulator.py
+++ b/tests/test_routing_simulator.py
@@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device):
         envs.environment_variables[env_name] = lambda s=strategy: s
 
         # Test the select_experts method
-        topk_weights, topk_ids, _ = fused_moe.select_experts(
+        topk_weights, topk_ids = fused_moe.router.select_experts(
             hidden_states=hidden_states,
             router_logits=router_logits,
         )
diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py
index 0510261eacde7905d0c777f514507f3989845342..b5c26a6599453f5b45bb4b8a88044aee1e86018a 100644
--- a/tests/tokenizers_/test_basic.py
+++ b/tests/tokenizers_/test_basic.py
@@ -10,6 +10,7 @@ from transformers import (
 )
 
 from vllm.tokenizers import TokenizerLike, get_tokenizer
+from vllm.tokenizers.grok2 import Grok2Tokenizer
 from vllm.tokenizers.mistral import MistralTokenizer
 
 
@@ -37,6 +38,10 @@ def test_tokenizer_like_protocol():
     assert isinstance(tokenizer, MistralTokenizer)
     _assert_tokenizer_like(tokenizer)
 
+    tokenizer = get_tokenizer("xai-org/grok-2", tokenizer_mode="grok2")
+    assert isinstance(tokenizer, Grok2Tokenizer)
+    _assert_tokenizer_like(tokenizer)
+
 
 @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
 def test_tokenizer_revision(tokenizer_name: str):
diff --git a/tests/tokenizers_/test_detokenize.py b/tests/tokenizers_/test_detokenize.py
index f1e920ce6f15924da572c883f3fe7335549d9cb8..a127f0f4c9403e1d1614ab0901a085d31b9b18b0 100644
--- a/tests/tokenizers_/test_detokenize.py
+++ b/tests/tokenizers_/test_detokenize.py
@@ -40,7 +40,8 @@ TOKENIZERS = [
     os.path.join(models_path_prefix, "EleutherAI/gpt-j-6b"),
     os.path.join(models_path_prefix, "EleutherAI/pythia-70m"),
     os.path.join(models_path_prefix, "bigscience/bloom-560m"),
-    os.path.join(models_path_prefix, "mosaicml/mpt-7b"),
+    # FIXME: mosaicml/mpt-7b has been deleted
+    # "mosaicml/mpt-7b",
     os.path.join(models_path_prefix, "tiiuae/falcon-7b"),
     os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
     os.path.join(models_path_prefix, "codellama/CodeLlama-7b-hf"),
diff --git a/tests/tool_parsers/test_functiongemma_tool_parser.py b/tests/tool_parsers/test_functiongemma_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5a0a5a19131c53718e6c942053a4ab2449f32d8
--- /dev/null
+++ b/tests/tool_parsers/test_functiongemma_tool_parser.py
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.tool_parsers.functiongemma_tool_parser import FunctionGemmaToolParser
+
+
+@pytest.fixture
+def mock_tokenizer():
+    tokenizer = MagicMock()
+    tokenizer.encode.return_value = [1, 2, 3]
+    tokenizer.get_vocab.return_value = {}
+    return tokenizer
+
+
+@pytest.fixture
+def parser(mock_tokenizer):
+    return FunctionGemmaToolParser(mock_tokenizer)
+
+
+@pytest.fixture
+def mock_request():
+    request = MagicMock(spec=ChatCompletionRequest)
+    request.tools = []
+    request.tool_choice = "auto"
+    return request
+
+
+class TestExtractToolCalls:
+    def test_no_tool_calls(self, parser, mock_request):
+        model_output = "Hello, how can I help you today?"
+        result = parser.extract_tool_calls(model_output, mock_request)
+
+        assert result.tools_called is False
+        assert result.tool_calls == []
+        assert result.content == model_output
+
+    def test_single_tool_call(self, parser, mock_request):
+        model_output = (
+            "<start_function_call>call:get_weather{location:<escape>London<escape>}"
+            "<end_function_call>"
+        )
+        result = parser.extract_tool_calls(model_output, mock_request)
+
+        assert result.tools_called is True
+        assert len(result.tool_calls) == 1
+        assert result.tool_calls[0].function.name == "get_weather"
+        assert '"location": "London"' in result.tool_calls[0].function.arguments
+
+    def test_multiple_arguments(self, parser, mock_request):
+        model_output = (
+            "<start_function_call>call:get_weather{"
+            "location:<escape>San Francisco<escape>,"
+            "unit:<escape>celsius<escape>}"
+            "<end_function_call>"
+        )
+        result = parser.extract_tool_calls(model_output, mock_request)
+
+        assert result.tools_called is True
+        assert len(result.tool_calls) == 1
+        assert result.tool_calls[0].function.name == "get_weather"
+        args = result.tool_calls[0].function.arguments
+        assert "San Francisco" in args
+        assert "celsius" in args
+
+    def test_text_before_tool_call(self, parser, mock_request):
+        model_output = (
+            "Let me check the weather for you. "
+            "<start_function_call>call:get_weather{location:<escape>Paris<escape>}"
+            "<end_function_call>"
+        )
+        result = parser.extract_tool_calls(model_output, mock_request)
+
+        assert result.tools_called is True
+        assert result.content == "Let me check the weather for you."
+
+    def test_multiple_tool_calls(self, parser, mock_request):
+        model_output = (
+            "<start_function_call>call:get_weather{location:<escape>London<escape>}"
+            "<end_function_call>"
+            "<start_function_call>call:get_time{timezone:<escape>UTC<escape>}"
+            "<end_function_call>"
+        )
+        result = parser.extract_tool_calls(model_output, mock_request)
+
+        assert result.tools_called is True
+        assert len(result.tool_calls) == 2
+        assert result.tool_calls[0].function.name == "get_weather"
+        assert result.tool_calls[1].function.name == "get_time"
+
+
+class TestParseArguments:
+    def test_empty_arguments(self, parser):
+        result = parser._parse_arguments("")
+        assert result == {}
+
+    def test_single_string_argument(self, parser):
+        result = parser._parse_arguments("city:<escape>Tokyo<escape>")
+        assert result == {"city": "Tokyo"}
+
+    def test_multiple_arguments(self, parser):
+        args_str = "city:<escape>Tokyo<escape>,country:<escape>Japan<escape>"
+        result = parser._parse_arguments(args_str)
+        assert result == {"city": "Tokyo", "country": "Japan"}
+
+    def test_numeric_argument(self, parser):
+        result = parser._parse_arguments("count:<escape>42<escape>")
+        assert result == {"count": 42}
+
+    def test_boolean_argument(self, parser):
+        result = parser._parse_arguments("enabled:<escape>true<escape>")
+        assert result == {"enabled": True}
+
+    def test_argument_with_spaces(self, parser):
+        result = parser._parse_arguments("message:<escape>Hello World<escape>")
+        assert result == {"message": "Hello World"}
+
+
+class TestAdjustRequest:
+    def test_skip_special_tokens_disabled(self, parser, mock_request):
+        mock_request.tools = [{"type": "function", "function": {"name": "test"}}]
+        mock_request.tool_choice = "auto"
+        mock_request.skip_special_tokens = True
+
+        result = parser.adjust_request(mock_request)
+        assert result.skip_special_tokens is False
+
+    def test_skip_special_tokens_when_tool_choice_none(self, parser, mock_request):
+        mock_request.tools = [{"type": "function", "function": {"name": "test"}}]
+        mock_request.tool_choice = "none"
+        mock_request.skip_special_tokens = True
+
+        result = parser.adjust_request(mock_request)
+        assert result.skip_special_tokens is True
+
+
+class TestBufferDeltaText:
+    def test_regular_text_not_buffered(self, parser):
+        result = parser._buffer_delta_text("hello")
+        assert result == "hello"
+        assert parser.buffered_delta_text == ""
+
+    def test_complete_tag_flushed(self, parser):
+        parser.buffered_delta_text = "<start_function_"
+        result = parser._buffer_delta_text("call>")
+        assert "<start_function_call>" in result
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/tool_parsers/test_kimi_k2_tool_parser.py b/tests/tool_parsers/test_kimi_k2_tool_parser.py
index d02f53c34b455b93a3207a6995cf62d91439ea2e..dc6140374d53cde840dcc79c1d5c03520e829621 100644
--- a/tests/tool_parsers/test_kimi_k2_tool_parser.py
+++ b/tests/tool_parsers/test_kimi_k2_tool_parser.py
@@ -44,6 +44,33 @@ def assert_tool_calls(
         )
 
 
+def run_streaming_sequence(parser, deltas):
+    """Helper to simulate a streaming sequence and return results."""
+    previous_text = ""
+    previous_token_ids: list[int] = []
+    results = []
+
+    for delta_text, delta_token_ids in deltas:
+        current_text = previous_text + delta_text
+        current_token_ids = previous_token_ids + delta_token_ids
+
+        result = parser.extract_tool_calls_streaming(
+            previous_text=previous_text,
+            current_text=current_text,
+            delta_text=delta_text,
+            previous_token_ids=previous_token_ids,
+            current_token_ids=current_token_ids,
+            delta_token_ids=delta_token_ids,
+            request=None,
+        )
+        results.append(result)
+
+        previous_text = current_text
+        previous_token_ids = current_token_ids
+
+    return results
+
+
 def test_extract_tool_calls_no_tools(kimi_k2_tool_parser):
     model_output = "This is a test"
     extracted_tool_calls = kimi_k2_tool_parser.extract_tool_calls(
@@ -346,61 +373,32 @@ def test_token_leak_between_section_and_tool_begin(kimi_k2_tool_parser):
     tool_call_begin_token_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>")
 
     # Simulate streaming sequence:
+    deltas = [
+        ("I'll help you with that. ", [1, 2, 3]),
+        ("<|tool_calls_section_begin|>", [section_begin_token_id]),
+        (" spurious text ", [4, 5]),
+        ("<|tool_call_begin|>", [tool_call_begin_token_id]),
+    ]
+
+    results = run_streaming_sequence(kimi_k2_tool_parser, deltas)
+
     # Delta 1: "I'll help you with that. "
-    result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="",
-        current_text="I'll help you with that. ",
-        delta_text="I'll help you with that. ",
-        previous_token_ids=[],
-        current_token_ids=[1, 2, 3],  # Regular tokens
-        delta_token_ids=[1, 2, 3],
-        request=None,
-    )
-    assert result1 is not None
-    assert result1.content == "I'll help you with that. "
+    assert results[0] is not None
+    assert results[0].content == "I'll help you with that. "
 
     # Delta 2: "<|tool_calls_section_begin|>"
-    prev_ids = [1, 2, 3]
-    curr_ids = prev_ids + [section_begin_token_id]
-    result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="I'll help you with that. ",
-        current_text="I'll help you with that. <|tool_calls_section_begin|>",
-        delta_text="<|tool_calls_section_begin|>",
-        previous_token_ids=prev_ids,
-        current_token_ids=curr_ids,
-        delta_token_ids=[section_begin_token_id],
-        request=None,
-    )
     # Section marker should be stripped and suppressed
-    assert result2 is None or (result2.content is None or result2.content == "")
+    assert results[1] is None or (
+        results[1].content is None or results[1].content == ""
+    )
 
     # Delta 3: " spurious text or tokens " (THE LEAK SCENARIO)
-    prev_ids = curr_ids
-    curr_ids = curr_ids + [4, 5]
-    result3 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="I'll help you with that. <|tool_calls_section_begin|>",
-        current_text="I'll help you with that. <|tool_calls_section_begin|> spurious text ",
-        delta_text=" spurious text ",
-        previous_token_ids=prev_ids,
-        current_token_ids=curr_ids,
-        delta_token_ids=[4, 5],
-        request=None,
-    )
     # CRITICAL: This text should be suppressed, NOT returned as reasoning_delta
-    assert result3 is None or (result3.content is None or result3.content == "")
+    assert results[2] is None or (
+        results[2].content is None or results[2].content == ""
+    )
 
     # Delta 4: "<|tool_call_begin|>..."
-    prev_ids = curr_ids
-    curr_ids = curr_ids + [tool_call_begin_token_id]
-    _result4 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="I'll help you with that. <|tool_calls_section_begin|> spurious text ",
-        current_text="I'll help you with that. <|tool_calls_section_begin|> spurious text <|tool_call_begin|>",
-        delta_text="<|tool_call_begin|>",
-        previous_token_ids=prev_ids,
-        current_token_ids=curr_ids,
-        delta_token_ids=[tool_call_begin_token_id],
-        request=None,
-    )
     # Now we're in tool call mode, result depends on internal state
     # The key is that the spurious text from Delta 3 was not leaked
 
@@ -416,31 +414,15 @@ def test_split_markers_across_deltas(kimi_k2_tool_parser):
         "<|tool_calls_section_begin|>"
     )
 
-    # Delta 1: "...reasoning<|tool_calls_sec"
-    _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="Some reasoning",
-        current_text="Some reasoning<|tool_calls_sec",
-        delta_text="<|tool_calls_sec",
-        previous_token_ids=[1, 2],
-        current_token_ids=[1, 2, 3],  # Partial token
-        delta_token_ids=[3],
-        request=None,
-    )
-    # Partial token not recognized yet, might be buffered
-    # Should return as content or None (depends on implementation)
+    # Delta 1: partial token, Delta 2: complete marker
+    deltas = [
+        ("<|tool_calls_sec", [3]),
+        ("tion_begin|> ", [section_begin_token_id, 4]),
+    ]
+
+    _results = run_streaming_sequence(kimi_k2_tool_parser, deltas)
 
-    # Delta 2: "tion_begin|> "  (completes the marker)
-    _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="Some reasoning<|tool_calls_sec",
-        current_text="Some reasoning<|tool_calls_section_begin|> ",
-        delta_text="tion_begin|> ",
-        previous_token_ids=[1, 2, 3],
-        current_token_ids=[1, 2, section_begin_token_id, 4],
-        delta_token_ids=[section_begin_token_id, 4],
-        request=None,
-    )
     # Now the complete marker should be detected via buffer
-    # The parser should enter tool section mode
     assert kimi_k2_tool_parser.in_tool_section is True
 
 
@@ -475,42 +457,17 @@ def test_reentry_to_reasoning_after_tool_section(kimi_k2_tool_parser):
     section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
     section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>")
 
-    # Enter tool section
-    _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="",
-        current_text="<|tool_calls_section_begin|>",
-        delta_text="<|tool_calls_section_begin|>",
-        previous_token_ids=[],
-        current_token_ids=[section_begin_id],
-        delta_token_ids=[section_begin_id],
-        request=None,
-    )
-    assert kimi_k2_tool_parser.in_tool_section is True
+    deltas = [
+        ("<|tool_calls_section_begin|>", [section_begin_id]),
+        ("<|tool_calls_section_end|>", [section_end_id]),
+        (" More reasoning", [10, 11]),
+    ]
 
-    # Exit tool section
-    _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="<|tool_calls_section_begin|>",
-        current_text="<|tool_calls_section_begin|><|tool_calls_section_end|>",
-        delta_text="<|tool_calls_section_end|>",
-        previous_token_ids=[section_begin_id],
-        current_token_ids=[section_begin_id, section_end_id],
-        delta_token_ids=[section_end_id],
-        request=None,
-    )
-    assert kimi_k2_tool_parser.in_tool_section is False
+    results = run_streaming_sequence(kimi_k2_tool_parser, deltas)
 
-    # Subsequent reasoning text should be returned normally
-    result3 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="<|tool_calls_section_begin|><|tool_calls_section_end|>",
-        current_text="<|tool_calls_section_begin|><|tool_calls_section_end|> More reasoning",
-        delta_text=" More reasoning",
-        previous_token_ids=[section_begin_id, section_end_id],
-        current_token_ids=[section_begin_id, section_end_id, 10, 11],
-        delta_token_ids=[10, 11],
-        request=None,
-    )
-    assert result3 is not None
-    assert result3.content == " More reasoning"
+    assert kimi_k2_tool_parser.in_tool_section is False
+    assert results[2] is not None
+    assert results[2].content == " More reasoning"
 
 
 def test_empty_tool_section(kimi_k2_tool_parser):
@@ -819,106 +776,150 @@ def test_tool_call_end_and_section_end_same_chunk(kimi_k2_tool_parser):
     tool_end_id = kimi_k2_tool_parser.vocab.get("<|tool_call_end|>")
 
     # Simulate a streaming sequence for a SHORT tool call (all in one chunk):
-    # 1. Reasoning text
-    result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="",
-        current_text="Let me help. ",
-        delta_text="Let me help. ",
-        previous_token_ids=[],
-        current_token_ids=[1, 2],
-        delta_token_ids=[1, 2],
-        request=None,
-    )
-    assert result1 is not None
-    assert result1.content == "Let me help. "
-
-    # 2. Section begin
-    _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text="Let me help. ",
-        current_text="Let me help. <|tool_calls_section_begin|>",
-        delta_text="<|tool_calls_section_begin|>",
-        previous_token_ids=[1, 2],
-        current_token_ids=[1, 2, section_begin_id],
-        delta_token_ids=[section_begin_id],
-        request=None,
-    )
-    assert kimi_k2_tool_parser.in_tool_section is True
-
-    # 3. Tool call begin + full content + tool_end + section_end ALL IN ONE CHUNK
-    # This is the critical scenario for short tool calls
     combined = (
         '<|tool_call_begin|>get_weather:0 <|tool_call_argument_begin|> {"city": "Paris"} '
         "<|tool_call_end|><|tool_calls_section_end|>"
     )
 
-    # Build up the previous text gradually to simulate realistic streaming
-    prev_text = "Let me help. <|tool_calls_section_begin|>"
-    curr_text = prev_text + combined
+    deltas = [
+        ("Let me help. ", [1, 2]),
+        ("<|tool_calls_section_begin|>", [section_begin_id]),
+        (combined, [tool_begin_id, 10, 11, 12, tool_end_id, section_end_id]),
+        (" Done", [20]),
+    ]
 
-    result3 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text=prev_text,
-        current_text=curr_text,
-        delta_text=combined,
-        previous_token_ids=[1, 2, section_begin_id],
-        current_token_ids=[
-            1,
-            2,
-            section_begin_id,
-            tool_begin_id,
-            10,
-            11,
-            12,
-            tool_end_id,
-            section_end_id,
-        ],
-        delta_token_ids=[tool_begin_id, 10, 11, 12, tool_end_id, section_end_id],
-        request=None,
-    )
+    results = run_streaming_sequence(kimi_k2_tool_parser, deltas)
 
     # CRITICAL: Parser should have exited section AFTER processing tool
     assert kimi_k2_tool_parser.in_tool_section is False
 
     # Tool call should have been emitted (not dropped)
-    # The result might be the tool name or None depending on state, but
-    # importantly, it shouldn't be returning the literal tokens as content
-
-    if result3 is not None and result3.content is not None:
+    if results[2] is not None and results[2].content is not None:
         # Verify no special tokens leaked into content
-        assert "<|tool_call_end|>" not in result3.content
-        assert "<|tool_calls_section_end|>" not in result3.content
+        assert "<|tool_call_end|>" not in results[2].content
+        assert "<|tool_calls_section_end|>" not in results[2].content
 
-    # 4. Verify subsequent content streams normally
-    result4 = kimi_k2_tool_parser.extract_tool_calls_streaming(
-        previous_text=curr_text,
-        current_text=curr_text + " Done",
-        delta_text=" Done",
-        previous_token_ids=[
-            1,
-            2,
-            section_begin_id,
-            tool_begin_id,
-            10,
-            11,
-            12,
-            tool_end_id,
-            section_end_id,
-        ],
-        current_token_ids=[
-            1,
-            2,
-            section_begin_id,
-            tool_begin_id,
-            10,
-            11,
-            12,
-            tool_end_id,
-            section_end_id,
-            20,
-        ],
-        delta_token_ids=[20],
-        request=None,
+    # Content after tool section should stream normally
+    assert results[3] is not None
+    assert results[3].content == " Done"
+
+
+def test_streaming_tool_call_markers_not_leaked(kimi_k2_tool_parser):
+    """
+    CRITICAL TEST: Verify that tool call markers (<|tool_call_begin|>,
+    <|tool_call_end|>, <|tool_call_argument_begin|>) are NOT leaked
+    into the content field during streaming.
+
+    This reproduces the AWS Bedrock bug where tool call markers appeared
+    in the 'text' field of responses.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>")
+    tool_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>")
+    tool_end_id = kimi_k2_tool_parser.vocab.get("<|tool_call_end|>")
+
+    # List of markers that should NEVER appear in content
+    forbidden_markers = [
+        "<|tool_call_begin|>",
+        "<|tool_call_end|>",
+        "<|tool_call_argument_begin|>",
+        "<|tool_calls_section_begin|>",
+        "<|tool_calls_section_end|>",
+    ]
+
+    all_content = []
+
+    # Steps: reasoning, section begin, tool call, section end, more reasoning
+    tool_chunk = (
+        "<|tool_call_begin|> functions.get_weather:0 "
+        '<|tool_call_argument_begin|> {"city": "Tokyo"} <|tool_call_end|>'
     )
+    deltas = [
+        ("I'll check the weather. ", [1, 2, 3]),
+        ("<|tool_calls_section_begin|>", [section_begin_id]),
+        (tool_chunk, [tool_begin_id, 10, 11, tool_end_id]),
+        ("<|tool_calls_section_end|>", [section_end_id]),
+        (" Here's the result.", [20, 21]),
+    ]
+
+    results = run_streaming_sequence(kimi_k2_tool_parser, deltas)
+
+    for res in results:
+        if res and res.content:
+            all_content.append(res.content)
+
+    # CRITICAL ASSERTIONS: No forbidden markers in any content
+    full_content = "".join(all_content)
+    for marker in forbidden_markers:
+        assert marker not in full_content, (
+            f"MARKER LEAK DETECTED: '{marker}' found in content. "
+            f"Full content: {repr(full_content)}"
+        )
 
-    # Content after tool section should stream normally
-    assert result4 is not None
-    assert result4.content == " Done"
+    # Also check that tool call content (function name, arguments) is not leaked
+    assert "get_weather" not in full_content, (
+        f"TOOL CALL CONTENT LEAKED: 'get_weather' found in content. "
+        f"Full content: {repr(full_content)}"
+    )
+    assert "Tokyo" not in full_content, (
+        f"TOOL CALL CONTENT LEAKED: 'Tokyo' found in content. "
+        f"Full content: {repr(full_content)}"
+    )
+
+    # Verify that legitimate content was preserved
+    assert "I'll check the weather." in full_content or len(all_content) > 0
+
+
+def test_streaming_multiple_tool_calls_not_leaked(kimi_k2_tool_parser):
+    """
+    Test that MULTIPLE tool calls in streaming mode do not leak into content.
+    This reproduces the AWS Bedrock scenario: "Compare weather in Tokyo and NYC".
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>")
+    tool_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>")
+    tool_end_id = kimi_k2_tool_parser.vocab.get("<|tool_call_end|>")
+
+    all_content = []
+
+    tool1 = '<|tool_call_begin|> get_weather:0 <|tool_call_argument_begin|> {"city": "Tokyo"} <|tool_call_end|>'
+    tool2 = ' <|tool_call_begin|> get_weather:1 <|tool_call_argument_begin|> {"city": "New York"} <|tool_call_end|>'
+
+    deltas = [
+        ("I'll compare the weather. ", [1, 2, 3]),
+        ("<|tool_calls_section_begin|>", [section_begin_id]),
+        (tool1, [tool_begin_id, 10, tool_end_id]),
+        (tool2, [tool_begin_id, 20, tool_end_id]),
+        ("<|tool_calls_section_end|>", [section_end_id]),
+        (" Here's the comparison.", [30]),
+    ]
+
+    results = run_streaming_sequence(kimi_k2_tool_parser, deltas)
+
+    for res in results:
+        if res and res.content:
+            all_content.append(res.content)
+
+    # Assertions
+    full_content = "".join(all_content)
+
+    # Check no markers leaked
+    forbidden = ["<|tool_call", "<|tool_calls_section"]
+    for marker in forbidden:
+        assert marker not in full_content, (
+            f"MARKER LEAKED: {marker} in {repr(full_content)}"
+        )
+
+    # Check no tool call content leaked (both tools)
+    assert "get_weather" not in full_content, f"TOOL NAME LEAKED: {repr(full_content)}"
+    assert "Tokyo" not in full_content, f"TOOL ARG LEAKED (Tokyo): {repr(full_content)}"
+    assert "New York" not in full_content, (
+        f"TOOL ARG LEAKED (NYC): {repr(full_content)}"
+    )
+
+    # Legitimate content preserved
+    assert "compare" in full_content.lower() or len(all_content) > 0
diff --git a/tests/tool_parsers/test_mistral_tool_parser.py b/tests/tool_parsers/test_mistral_tool_parser.py
index 9400a67267f4cab473874e896e531d2f8c54648c..d2502079d0de9a7bdb79b26d7ef4408c1bcf6e5e 100644
--- a/tests/tool_parsers/test_mistral_tool_parser.py
+++ b/tests/tool_parsers/test_mistral_tool_parser.py
@@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer(
         "single_tool_add",
         "single_tool_weather",
         "multiple_tool_calls",
+        "complex",
+        "wrong_json",
     ],
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
@@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer(
             ],
             None,
         ),
+        (
+            # Complex
+            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="bash",
+                        arguments=json.dumps(
+                            {"command": "print(\"hello world!\")\nre.compile(r'{}')"}
+                        )[:-2],
+                    )
+                )
+            ],
+            "hi{hi",
+        ),
+        (
+            # Wrong json
+            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="bash",
+                        arguments=json.dumps(
+                            {"command": "print(\"hello world!\")\nre.compile(r'{}')"}
+                        ),
+                    )
+                )
+            ],
+            "hi{hi",
+        ),
     ],
 )
 def test_extract_tool_calls(
@@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming(
         ),
         (
             # Complex
-            """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
+            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
             [
                 ToolCall(
                     function=FunctionCall(
@@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming(
                     )
                 )
             ],
-            "",
+            "hi{hi",
         ),
     ],
 )
diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py
index 425d3879985e791460a8567ce00386beccdff9ac..07b7933f65c06881582904140dace85d65d2ed22 100644
--- a/tests/tool_use/test_chat_completions.py
+++ b/tests/tool_use/test_chat_completions.py
@@ -151,3 +151,45 @@ async def test_chat_completion_with_tools(
     assert chunk.choices[0].finish_reason != "tool_calls"
     assert len(chunks)
     assert "".join(chunks) == output_text
+
+
+# Regression test for https://github.com/vllm-project/vllm/issues/32006
+# Engine crash when combining response_format: json_object with
+# tool_choice: required
+@pytest.mark.asyncio
+@pytest.mark.timeout(120)
+async def test_response_format_with_tool_choice_required(
+    client: openai.AsyncOpenAI, server_config: ServerConfig
+):
+    """
+    Test that combining response_format: json_object with tool_choice: required
+    doesn't crash the engine.
+
+    Before the fix, this would cause a validation error:
+    "You can only use one kind of structured outputs constraint but multiple
+    are specified" because both json_object and json (from tool schema) would
+    be set in StructuredOutputsParams.
+    """
+    models = await client.models.list()
+    model_name: str = models.data[0].id
+
+    # This combination previously crashed the engine
+    chat_completion = await client.chat.completions.create(
+        messages=ensure_system_prompt(
+            [{"role": "user", "content": "What is the weather in Dallas, Texas?"}],
+            server_config,
+        ),
+        temperature=0,
+        max_completion_tokens=150,
+        model=model_name,
+        tools=[WEATHER_TOOL],
+        tool_choice="required",
+        response_format={"type": "json_object"},
+    )
+
+    # The fix clears response_format when tool_choice forces tool calling,
+    # so the request should complete successfully with tool calls
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "tool_calls"
+    assert choice.message.tool_calls is not None
+    assert len(choice.message.tool_calls) > 0
diff --git a/tests/tool_use/test_minimax_m2_tool_parser.py b/tests/tool_use/test_minimax_m2_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf1835b1928b4f1b36652cd7111108d3afc7a60f
--- /dev/null
+++ b/tests/tool_use/test_minimax_m2_tool_parser.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+
+from vllm.tool_parsers.minimax_m2_tool_parser import (
+    MinimaxM2ToolParser,
+)
+
+pytestmark = pytest.mark.cpu_test
+
+
+class FakeTokenizer:
+    """Minimal fake tokenizer that exposes the attributes used by the
+    parser: a truthy model_tokenizer marker and a vocab mapping for the
+    special tokens.
+    """
+
+    def __init__(self):
+        self.model_tokenizer = True
+        # The parser will look up start/end tokens by their literal strings
+        self.vocab = {
+            "<minimax:tool_call>": 1,
+            "</minimax:tool_call>": 2,
+        }
+
+    def get_vocab(self):
+        return self.vocab
+
+
+@pytest.fixture
+def minimax_m2_tool_parser():
+    return MinimaxM2ToolParser(FakeTokenizer())
+
+
+def test_extract_tool_calls_streaming_incremental(minimax_m2_tool_parser):
+    parser = minimax_m2_tool_parser
+    parser._reset_streaming_state()
+    chunks = [
+        "<minimax:tool_call>",
+        '<invoke name="get_weather">',
+        '<parameter name="city">',
+        "Seattle</parameter>",
+        "</invoke></minimax:tool_call>",
+    ]
+    previous = ""
+    for chunk in chunks:
+        current = previous + chunk
+        delta = chunk
+        parser.extract_tool_calls_streaming(
+            previous_text=previous,
+            current_text=current,
+            delta_text=delta,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=None,
+        )
+        previous = current
+
+    assert len(parser.prev_tool_call_arr) == 1
+    entry = parser.prev_tool_call_arr[0]
+
+    assert entry["name"] == "get_weather"
+    args = entry["arguments"]
+    assert args["city"] == "Seattle"
+
+
+def test_streaming_minimax_m2_multiple_invokes(minimax_m2_tool_parser):
+    parser = minimax_m2_tool_parser
+    parser._reset_streaming_state()
+
+    chunks = [
+        "<minimax:tool_call>",
+        '<invoke name="search_web">',
+        '<parameter name="query_tag">',
+        '["technology", "events"]</parameter>',
+        '<parameter name="query_list">',
+        '["OpenAI", "latest", "release"]</parameter>',
+        "</invoke>",
+        '<invoke name="search_web">',
+        '<parameter name="query_tag">',
+        '["technology", "events"]</parameter>',
+        '<parameter name="query_list">',
+        '["Gemini", "latest", "release"]</parameter>',
+        "</invoke>",
+        "</minimax:tool_call>",
+    ]
+    previous = ""
+    for chunk in chunks:
+        current = previous + chunk
+        delta = chunk
+        parser.extract_tool_calls_streaming(
+            previous_text=previous,
+            current_text=current,
+            delta_text=delta,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=None,
+        )
+        previous = current
+
+    assert len(parser.prev_tool_call_arr) == 2
+
+    for entry, expect_model in zip(parser.prev_tool_call_arr, ["OpenAI", "Gemini"]):
+        assert entry["name"] == "search_web"
+        args = json.dumps(entry["arguments"])
+        assert "technology" in args and "events" in args
+        assert expect_model in args
+
+    # check streamed_args_for_tool for serving_chat.py
+    for index in range(2):
+        expected_call = parser.prev_tool_call_arr[index].get("arguments", {})
+        expected_call = json.dumps(expected_call)
+        actual_call = parser.streamed_args_for_tool[index]
+        assert expected_call == actual_call
diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py
index 35ed8d215f73a27b2323d54861f2106a8c68e237..6ff37255e48d47ace53e96e9cc35fadb5b805203 100644
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@@ -311,6 +311,7 @@ def test_streaming_output_valid(output, empty_params, delta_len):
         previous_text = current_text
 
     assert len(messages) > 0
+
     combined_messages = "["
     for message in messages:
         if message.tool_calls[0].function.name:
@@ -328,3 +329,35 @@ def test_streaming_output_valid(output, empty_params, delta_len):
     combined_messages += "}]"
     assert json.loads(combined_messages) == output
     assert json.dumps(json.loads(combined_messages)) == output_json
+
+
+def test_streaming_output_valid_with_trailing_extra_data():
+    self = MagicMock()
+
+    output = [{"name": "get_current_weather", "parameters": {"city": "Vienna"}}]
+    output_json = json.dumps(output) + "\nDONE"
+
+    previous_text = ""
+    function_name_returned = False
+    messages = []
+    delta_len = 3
+    for i in range(0, len(output_json), delta_len):
+        delta_text = output_json[i : i + delta_len]
+        current_text = previous_text + delta_text
+
+        delta_message, function_name_returned = (
+            OpenAIServingChat.extract_tool_call_required_streaming(
+                self,
+                previous_text=previous_text,
+                current_text=current_text,
+                delta_text=delta_text,
+                function_name_returned=function_name_returned,
+            )
+        )
+
+        if delta_message:
+            messages.append(delta_message)
+
+        previous_text = current_text
+
+    assert len(messages) > 0
diff --git a/tests/utils.py b/tests/utils.py
index c0db93698d2b768fdfb844a4015e3be5e8423cac..aaf371225535a424bd36b3daf3c53f30602e250c 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -112,6 +112,7 @@ class RemoteOpenAIServer:
             env.update(env_dict)
         serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
         print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
+        print(f"Environment variables: {env}")
         self.proc: subprocess.Popen = subprocess.Popen(
             serve_cmd,
             env=env,
@@ -726,13 +727,34 @@ def init_test_distributed_environment(
     distributed_init_port: str,
     local_rank: int = -1,
 ) -> None:
-    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
-    init_distributed_environment(
-        world_size=pp_size * tp_size,
-        rank=rank,
-        distributed_init_method=distributed_init_method,
-        local_rank=local_rank,
+    # Note: This function is often called from Ray worker processes, so we
+    # can't rely on pytest fixtures to set the config. We check if the config
+    # is already set and only create a default one if needed.
+    from vllm.config import (
+        VllmConfig,
+        get_current_vllm_config_or_none,
+        set_current_vllm_config,
     )
+
+    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+
+    if get_current_vllm_config_or_none() is not None:
+        # Config already set, use it directly
+        init_distributed_environment(
+            world_size=pp_size * tp_size,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            local_rank=local_rank,
+        )
+    else:
+        # No config set, create a default one for the test
+        with set_current_vllm_config(VllmConfig()):
+            init_distributed_environment(
+                world_size=pp_size * tp_size,
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+                local_rank=local_rank,
+            )
     ensure_model_parallel_initialized(tp_size, pp_size)
 
 
diff --git a/tests/utils_/test_torch_utils.py b/tests/utils_/test_torch_utils.py
index 0a30b9727f4de895eb32c2d84a9340bc025f2698..f6a9486a129607e43501c9583e691acbd747be26 100644
--- a/tests/utils_/test_torch_utils.py
+++ b/tests/utils_/test_torch_utils.py
@@ -99,30 +99,18 @@ def _test_stream_thread(main_expected_stream: torch.cuda.Stream):
 
 
 def test_current_stream_multithread():
-    from vllm.platforms import current_platform
-
     if not torch.cuda.is_available():
         pytest.skip("CUDA not available")
 
-    if current_platform.is_rocm():
-        main_dedicated_stream = current_stream()
-
-        assert main_dedicated_stream.cuda_stream != 0, (
-            "ROCm should create a dedicated stream, not use default stream (0x0)"
-        )
-
-        main_stream_again = current_stream()
-        assert main_stream_again == main_dedicated_stream, (
-            "Multiple calls to current_stream should return the same dedicated stream"
-        )
+    main_dedicated_stream = current_stream()
 
-        _test_stream_thread(main_dedicated_stream)
-    else:
-        main_default_stream = torch.cuda.default_stream()
-        main_initial_stream = current_stream()
+    assert main_dedicated_stream.cuda_stream != 0, (
+        "ROCm/CUDA should create a dedicated stream, not use default stream (0x0)"
+    )
 
-        assert main_initial_stream == main_default_stream, (
-            "First call to current_stream should return default stream on CUDA"
-        )
+    main_stream_again = current_stream()
+    assert main_stream_again == main_dedicated_stream, (
+        "Multiple calls to current_stream should return the same dedicated stream"
+    )
 
-        _test_stream_thread(main_default_stream)
+    _test_stream_thread(main_dedicated_stream)
diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py
index e7ec8380e0a84b41657aef7f4f8db9c9c8a7f85b..6e2bb44e09cd423e8a9a5d2b09ba37536446d85c 100644
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@@ -15,13 +15,17 @@ from tests.v1.attention.utils import (
     create_vllm_config,
     try_get_attention_backend,
 )
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import ModelConfig
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer
+from vllm.utils.torch_utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    is_torch_equal_or_newer,
+    set_random_seed,
+)
+from vllm.v1.attention.backend import AttentionType, CommonAttentionMetadata
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
     set_kv_cache_layout,
 )
 from vllm.v1.kv_cache_interface import FullAttentionSpec
@@ -79,6 +83,13 @@ BATCH_SPECS = {
     ),
     "single_decode": BatchSpec(seq_lens=[1024], query_lens=[1]),
     "single_prefill": BatchSpec(seq_lens=[1024], query_lens=[64]),
+    # encoder-only
+    "small_encoder_prefill": BatchSpec(
+        seq_lens=[32, 64, 128, 256], query_lens=[32, 64, 128, 256]
+    ),
+    "medium_encoder_prefill": BatchSpec(
+        seq_lens=[256, 512, 1024, 2048], query_lens=[256, 512, 1024, 2048]
+    ),
 }
 
 
@@ -114,17 +125,17 @@ def create_and_prepopulate_kv_cache(
         Tuple of (kv_cache, updated_block_table)
     """
     batch_size = len(k_contexts)
-    seq_lens = common_attn_metadata.seq_lens_cpu
+    seq_lens = common_attn_metadata.seq_lens.cpu()
     query_lens = (
         common_attn_metadata.query_start_loc_cpu[1:]
         - common_attn_metadata.query_start_loc_cpu[:-1]
     )
-    context_lens = common_attn_metadata.num_computed_tokens_cpu
+    context_lens = seq_lens - query_lens
     block_table = common_attn_metadata.block_table_tensor
     slot_mapping = common_attn_metadata.slot_mapping
 
     # Create KV cache
-    kv_cache = torch.empty(
+    kv_cache = torch.zeros(
         2, num_blocks, block_size, num_kv_heads, head_size, dtype=dtype, device=device
     )
     kv_cache_flat = kv_cache.view(2, -1, num_kv_heads, head_size)
@@ -205,6 +216,7 @@ def run_attention_backend(
     key: torch.Tensor,
     value: torch.Tensor,
     kv_cache: torch.Tensor,
+    attn_type: AttentionType = AttentionType.DECODER,
     sliding_window: int | None = None,
 ) -> torch.Tensor:
     """Run attention computation using the specified backend's AttentionImpl."""
@@ -272,6 +284,7 @@ def run_attention_backend(
         num_kv_heads=num_kv_heads,
         alibi_slopes=None,
         sliding_window=sliding_window,
+        attn_type=attn_type,
         kv_cache_dtype="auto",
     )
 
@@ -295,6 +308,7 @@ def _test_backend_correctness(
     backend_to_test: list[AttentionBackendEnum | str],
     mask_mod,
     *,
+    attn_type: AttentionType = AttentionType.DECODER,
     block_size: int = 16,
     atol: float = 1e-2,
     rtol: float = 1e-2,
@@ -320,7 +334,7 @@ def _test_backend_correctness(
     multiple GPUs. This tests that backends work correctly with different
     head counts.
     """
-    current_platform.seed_everything(42)
+    set_random_seed(42)
 
     hf_config_override = None
     if tensor_parallel_size > 1:
@@ -432,6 +446,9 @@ def _test_backend_correctness(
     common_attn_metadata = create_common_attn_metadata(
         batch_spec, vllm_config.cache_config.block_size, device
     )
+    if attn_type == AttentionType.ENCODER_ONLY:
+        # For encoder-only, all tokens are prefill tokens
+        common_attn_metadata.causal = False
 
     # 3. Simulate Paged KV Cache and a realistic slot_mapping
     kv_cache = create_and_prepopulate_kv_cache(
@@ -487,6 +504,7 @@ def _test_backend_correctness(
                 value_vllm,
                 kv_cache_for_backend,
                 sliding_window=sliding_window,
+                attn_type=attn_type,
             )
         finally:
             if reset_kv_cache_layout:
@@ -537,7 +555,7 @@ def _test_backend_correctness(
 @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
 @pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
 def test_causal_backend_correctness(
-    batch_spec_name: str, model: str, tensor_parallel_size: int
+    default_vllm_config, batch_spec_name: str, model: str, tensor_parallel_size: int
 ):
     """Test backend's correctness with causal attention."""
 
@@ -557,9 +575,21 @@ def test_causal_backend_correctness(
         if is_torch_equal_or_newer("2.9.0.dev0")
         else []
     )
-    SMALL_BLOCK_BACKENDS = [
-        x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
-    ]
+
+    if current_platform.is_rocm():
+        SMALL_BLOCK_BACKENDS = [
+            x
+            for x in BACKENDS_TO_TEST
+            if (
+                x not in LARGE_BLOCK_BACKENDS
+                and x is not AttentionBackendEnum.FLASH_ATTN
+            )
+        ]
+    else:
+        SMALL_BLOCK_BACKENDS = [
+            x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
+        ]
+
     _test_backend_correctness(
         batch_spec,
         model,
@@ -580,12 +610,20 @@ def test_causal_backend_correctness(
         )
 
 
-SLIDING_WINDOW_BACKENDS_TO_TEST = [
-    AttentionBackendEnum.FLASH_ATTN,
-    AttentionBackendEnum.FLEX_ATTENTION,
-    AttentionBackendEnum.TRITON_ATTN,
-    "FLEX_ATTENTION_SLOW",
-]
+if current_platform.is_rocm():
+    # FLASH_ATTN is not supported on ROCm
+    SLIDING_WINDOW_BACKENDS_TO_TEST = [
+        AttentionBackendEnum.FLEX_ATTENTION,
+        AttentionBackendEnum.TRITON_ATTN,
+        "FLEX_ATTENTION_SLOW",
+    ]
+else:
+    SLIDING_WINDOW_BACKENDS_TO_TEST = [
+        AttentionBackendEnum.FLASH_ATTN,
+        AttentionBackendEnum.FLEX_ATTENTION,
+        AttentionBackendEnum.TRITON_ATTN,
+        "FLEX_ATTENTION_SLOW",
+    ]
 
 
 @pytest.mark.parametrize(
@@ -652,3 +690,45 @@ def test_sliding_window_backend_correctness(
             block_size=128,
             tensor_parallel_size=tensor_parallel_size,
         )
+
+
+@pytest.mark.parametrize(
+    "batch_spec_name",
+    [
+        "small_encoder_prefill",
+        "medium_encoder_prefill",
+    ],
+)
+@pytest.mark.parametrize("model", ["google/embeddinggemma-300m"])
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2])
+def test_sliding_window_encoder_backend_correctness(
+    batch_spec_name: str, model: str, tensor_parallel_size: int
+):
+    """Test backend's correctness with sliding window attention."""
+
+    def bidi_sliding_window_mask_mod(
+        b: torch.Tensor,
+        h: torch.Tensor,
+        q_idx: torch.Tensor,
+        kv_idx: torch.Tensor,
+        *,
+        context_len: int,
+        sliding_window: int,
+    ):
+        return torch.abs(q_idx + context_len - kv_idx) < sliding_window
+
+    batch_spec = BATCH_SPECS[batch_spec_name]
+    model_config = ModelConfig(model=model, max_model_len=max(batch_spec.seq_lens))
+    sliding_window = model_config.get_sliding_window()
+    sliding_window_mask_mod_fn = partial(
+        bidi_sliding_window_mask_mod, sliding_window=sliding_window
+    )
+
+    _test_backend_correctness(
+        batch_spec,
+        model,
+        SLIDING_WINDOW_BACKENDS_TO_TEST,
+        sliding_window_mask_mod_fn,
+        attn_type=AttentionType.ENCODER_ONLY,
+        tensor_parallel_size=tensor_parallel_size,
+    )
diff --git a/tests/v1/attention/test_attention_backends_selection.py b/tests/v1/attention/test_attention_backends_selection.py
index 6464bb52a4eaa56e4e03e6da672253aea1bacd47..9d8d5d3ebb191e8627c4035a00563ac23f023b9a 100644
--- a/tests/v1/attention/test_attention_backends_selection.py
+++ b/tests/v1/attention/test_attention_backends_selection.py
@@ -79,7 +79,12 @@ from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend
     ],
 )
 def test_mamba_layers_get_attn_backend(
-    dist_init, layer_class, init_kwargs, expected_backend, expected_mamba_type
+    default_vllm_config,
+    dist_init,
+    layer_class,
+    init_kwargs,
+    expected_backend,
+    expected_mamba_type,
 ):
     """Test that Mamba-like layers return the correct attention backend."""
     layer = layer_class(**init_kwargs)
diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py
index f08e2f480e30f0497cdd764279d3b78a0dfaebb5..734819fcdca83b3165d825a01dc97bd92cff7522 100644
--- a/tests/v1/attention/test_attention_splitting.py
+++ b/tests/v1/attention/test_attention_splitting.py
@@ -323,6 +323,7 @@ def test_prefill_split_across_ubatches(
         num_tokens,
         batch_spec.batch_size,
         split_point=split_point,
+        num_ubatches=2,
     )
     assert ubatch_slices is not None and len(ubatch_slices) == 2
 
diff --git a/tests/v1/attention/test_batch_reordering.py b/tests/v1/attention/test_batch_reordering.py
index e37219454222bcf0e69ad1d9fb7b8b25b350ac6a..6265e12f9a7d17fcf35cfbc7f85312c243f7d99d 100644
--- a/tests/v1/attention/test_batch_reordering.py
+++ b/tests/v1/attention/test_batch_reordering.py
@@ -98,6 +98,27 @@ REORDER_TEST_CASES = {
         expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5],
         expected_modified=True,
     ),
+    "new_request_single_token_prefill": ReorderTestCase(
+        requests=[
+            (100, 0),
+            (1, 0),  # New request with only 1 token (STILL prefill)
+            (50, 100),
+            (1, 10),
+        ],
+        # Only index 3 is a true decode (has num_computed_tokens > 0)
+        expected_order=[3, 2, 0, 1],
+        expected_modified=True,
+    ),
+    "multiple_new_requests_single_token_prefill": ReorderTestCase(
+        requests=[
+            (1, 0),  # New prefill (1 token, no computed)
+            (1, 0),  # New prefill (1 token, no computed)
+            (1, 50),
+            (200, 0),
+        ],
+        expected_order=[2, 1, 0, 3],
+        expected_modified=True,
+    ),
 }
 
 
diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py
index faace3473a281f071c69a073eae9160e8c601923..4529c2cfc29b636b8b2152b895284ea7f406e665 100644
--- a/tests/v1/attention/test_chunked_local_attention.py
+++ b/tests/v1/attention/test_chunked_local_attention.py
@@ -172,7 +172,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
     )
 
     # Call the function
-    result = make_local_attention_virtual_batches(
+    result, _ = make_local_attention_virtual_batches(
         attn_chunk_size, common_attn_metadata, block_size
     )
 
diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py
index 783e02ce89bdb0e7a2b6c1ae5ea580d05be3a059..85efc5d8fe4ee9df88b2d4afc2319f0b7d1eceb6 100644
--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -18,15 +18,15 @@ from tests.v1.attention.utils import (
     try_get_attention_backend,
 )
 from vllm import _custom_ops as ops
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.ops.flashmla import is_flashmla_dense_supported
-from vllm.attention.utils.fa_utils import flash_attn_supports_mla
 from vllm.config.vllm import set_current_vllm_config
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backend import CommonAttentionMetadata
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
 from vllm.v1.attention.backends.mla.common import QueryLenSupport
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
 BACKENDS_TO_TEST = [
@@ -154,12 +154,12 @@ def create_and_prepopulate_kv_cache(
         MLA KV cache tensor
     """
     batch_size = len(kv_c_contexts)
-    seq_lens = common_attn_metadata.seq_lens_cpu
+    seq_lens = common_attn_metadata.seq_lens.cpu()
     query_lens = (
         common_attn_metadata.query_start_loc_cpu[1:]
         - common_attn_metadata.query_start_loc_cpu[:-1]
     )
-    context_lens = common_attn_metadata.num_computed_tokens_cpu
+    context_lens = seq_lens - query_lens
     block_table = common_attn_metadata.block_table_tensor
     slot_mapping = common_attn_metadata.slot_mapping
 
@@ -394,7 +394,11 @@ def run_attention_backend(
 @pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-R1"])
 @pytest.mark.parametrize("tensor_parallel_size", [1, 4, 8, 16])
 def test_backend_correctness(
-    dist_init, batch_spec_name: str, model: str, tensor_parallel_size: int
+    default_vllm_config,
+    dist_init,
+    batch_spec_name: str,
+    model: str,
+    tensor_parallel_size: int,
 ):
     """
     Test that all backends produce similar outputs to a reference implementation
diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py
index 77790be6f892b73cdce634ac3e7e22817aca0851..a31c053aed21d9a0ea9715842786b0d802189979 100644
--- a/tests/v1/attention/test_rocm_attention_backends_selection.py
+++ b/tests/v1/attention/test_rocm_attention_backends_selection.py
@@ -7,8 +7,9 @@ from unittest.mock import MagicMock, patch
 import pytest
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import AttentionSelectorConfig
 
 # ROCm-specific attention backend selection tests
 pytestmark = pytest.mark.skipif(
@@ -94,26 +95,20 @@ def mock_on_gfx9():
             None,
             AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
         ),
-        # Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
-        (
-            {"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
-            None,
-            AttentionBackendEnum.ROCM_ATTN.get_path(),
-        ),
-        # Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
+        # Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
         (
             {"VLLM_ROCM_USE_AITER": "1"},
             "TRITON_ATTN",
             AttentionBackendEnum.TRITON_ATTN.get_path(),
         ),
-        # Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
+        # Test Case 10: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
         # (explicitly disabled)
         (
             {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"},
             None,
             AttentionBackendEnum.TRITON_ATTN.get_path(),
         ),
-        # Test Case 12: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
+        # Test Case 11: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
         (
             {"VLLM_ROCM_USE_AITER": "1"},
             "ROCM_ATTN",
@@ -150,8 +145,7 @@ def test_standard_attention_backend_selection(
     # Get the backend class path
     from vllm.platforms.rocm import RocmPlatform
 
-    backend_path = RocmPlatform.get_attn_backend_cls(
-        selected_backend=backend_enum,
+    attn_selector_config = AttentionSelectorConfig(
         head_size=128,
         dtype=torch.float16,
         kv_cache_dtype="auto",
@@ -160,6 +154,11 @@ def test_standard_attention_backend_selection(
         has_sink=False,
         use_sparse=False,
     )
+
+    backend_path = RocmPlatform.get_attn_backend_cls(
+        selected_backend=backend_enum, attn_selector_config=attn_selector_config
+    )
+
     assert backend_path == expected_backend_path
 
 
@@ -273,8 +272,16 @@ def test_mla_backend_selection(
 
         if should_raise:
             with pytest.raises(ValueError):
-                RocmPlatform.get_attn_backend_cls(
-                    selected_backend=backend_enum,
+                attn_selector_config = AttentionSelectorConfig(
+                    head_size=128,
+                    dtype=torch.float16,
+                    kv_cache_dtype="auto",
+                    block_size=block_size,
+                    use_mla=True,
+                    has_sink=False,
+                    use_sparse=False,
+                )
+                attn_selector_config = AttentionSelectorConfig(
                     head_size=128,
                     dtype=torch.float16,
                     kv_cache_dtype="auto",
@@ -283,9 +290,13 @@ def test_mla_backend_selection(
                     has_sink=False,
                     use_sparse=False,
                 )
+                backend_path = RocmPlatform.get_attn_backend_cls(
+                    selected_backend=backend_enum,
+                    attn_selector_config=attn_selector_config,
+                )
+
         else:
-            backend_path = RocmPlatform.get_attn_backend_cls(
-                selected_backend=backend_enum,
+            attn_selector_config = AttentionSelectorConfig(
                 head_size=128,
                 dtype=torch.float16,
                 kv_cache_dtype="auto",
@@ -294,6 +305,11 @@ def test_mla_backend_selection(
                 has_sink=False,
                 use_sparse=False,
             )
+
+            backend_path = RocmPlatform.get_attn_backend_cls(
+                selected_backend=backend_enum, attn_selector_config=attn_selector_config
+            )
+
             assert backend_path == expected_backend_path
 
 
@@ -309,8 +325,7 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config):
             match="only supported on gfx9",
         ),
     ):
-        RocmPlatform.get_attn_backend_cls(
-            selected_backend=AttentionBackendEnum.ROCM_AITER_FA,
+        attn_selector_config = AttentionSelectorConfig(
             head_size=128,
             dtype=torch.float16,
             kv_cache_dtype="auto",
@@ -320,6 +335,11 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config):
             use_sparse=False,
         )
 
+        RocmPlatform.get_attn_backend_cls(
+            selected_backend=AttentionBackendEnum.ROCM_AITER_FA,
+            attn_selector_config=attn_selector_config,
+        )
+
 
 def test_sparse_not_supported(mock_vllm_config):
     """Test that sparse attention is not supported on ROCm."""
@@ -328,8 +348,7 @@ def test_sparse_not_supported(mock_vllm_config):
     with pytest.raises(
         AssertionError, match="Sparse MLA backend on ROCm only supports block size 1"
     ):
-        RocmPlatform.get_attn_backend_cls(
-            selected_backend=None,
+        attn_selector_config = AttentionSelectorConfig(
             head_size=128,
             dtype=torch.float16,
             kv_cache_dtype="auto",
@@ -338,3 +357,7 @@ def test_sparse_not_supported(mock_vllm_config):
             has_sink=False,
             use_sparse=True,
         )
+
+        RocmPlatform.get_attn_backend_cls(
+            selected_backend=None, attn_selector_config=attn_selector_config
+        )
diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py
index 6e8559304f27146b9bec2fbdfb12e504d964592f..27e4d269da1a6840419326c9ae5a632465d8ea3f 100644
--- a/tests/v1/attention/test_sparse_mla_backends.py
+++ b/tests/v1/attention/test_sparse_mla_backends.py
@@ -22,15 +22,16 @@ from tests.v1.attention.utils import (
     create_vllm_config,
 )
 from vllm import _custom_ops as ops
-from vllm.attention.ops import flashmla
 from vllm.config import set_current_vllm_config
 from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mla.flashmla_sparse import (
     FlashMLASparseBackend,
     triton_convert_req_index_to_global_index,
 )
 from vllm.v1.attention.backends.utils import split_prefill_chunks
+from vllm.v1.attention.ops import flashmla
 from ...utils import models_path_prefix
 
 SPARSE_BACKEND_BATCH_SPECS = {
@@ -125,8 +126,16 @@ def _quantize_dequantize_fp8_ds_mla(
     reason="FlashMLASparseBackend requires CUDA 9.0 or higher",
 )
 def test_sparse_backend_decode_correctness(
-    dist_init, batch_name, kv_cache_dtype, tensor_parallel_size, workspace_init
+    default_vllm_config,
+    dist_init,
+    batch_name,
+    kv_cache_dtype,
+    tensor_parallel_size,
+    workspace_init,
 ):
+    if current_platform.is_rocm():
+        pytest.skip("ROCm does not support fp8_ds_mla data type for kv cache.")
+
     if not torch.cuda.is_available():
         pytest.skip("CUDA is required for sparse MLA decode test")
 
@@ -295,7 +304,7 @@ def test_sparse_backend_decode_correctness(
     positions = np.arange(starts[-1], dtype=np.int32) - np.repeat(
         starts[:-1], seg_lengths
     )
-    seq_lengths = np.asarray(common_attn_metadata.seq_lens_cpu, dtype=np.int32)
+    seq_lengths = np.asarray(common_attn_metadata.seq_lens.cpu(), dtype=np.int32)
     prefix_lengths = seq_lengths - seg_lengths
     positions += np.repeat(prefix_lengths, seg_lengths)
 
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 4dcaf9d908690a994a39750c94c79d768342ecfe..da4cea8fca707bf57f3f084895e633d2ec9e3b9c 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -7,8 +7,6 @@ from dataclasses import dataclass
 import pytest
 import torch
 
-from vllm.attention.backends.abstract import AttentionImpl
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     CacheConfig,
     CompilationConfig,
@@ -20,10 +18,12 @@ from vllm.config import (
     VllmConfig,
 )
 from vllm.config.model import ModelDType
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionImpl,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
 )
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
 
@@ -249,8 +249,8 @@ def create_dummy_kv_cache(
 @dataclass
 class BackendConfig:
     name: str
-    env_vars: dict
-    comp_config: dict  # compilation config
+    attention_config: dict
+    comp_config: dict
     specific_gpu_arch: tuple | None = None
 
 
@@ -259,10 +259,10 @@ full_cg_backend_configs = {
     # FA3 on Hopper
     "FA3": BackendConfig(
         name="FA3",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "3",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 3,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL",
@@ -272,9 +272,7 @@ full_cg_backend_configs = {
     # FlashMLA on Hopper
     "FlashMLA": BackendConfig(
         name="FlashMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-        },
+        attention_config={"backend": "FLASHMLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -283,9 +281,7 @@ full_cg_backend_configs = {
     # Cutlass MLA on Blackwell
     "CutlassMLA": BackendConfig(
         name="CutlassMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
-        },
+        attention_config={"backend": "CUTLASS_MLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -294,9 +290,7 @@ full_cg_backend_configs = {
     # FlashInfer MLA on Blackwell
     "FlashInferMLA": BackendConfig(
         name="FlashInferMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
-        },
+        attention_config={"backend": "FLASHINFER_MLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -305,9 +299,9 @@ full_cg_backend_configs = {
     # FlashAttention MLA on Hopper
     "FlashAttentionMLA": BackendConfig(
         name="FlashAttentionMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN_MLA",
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL_DECODE_ONLY",
@@ -317,10 +311,10 @@ full_cg_backend_configs = {
     # FA2
     "FA2": BackendConfig(
         name="FA2",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "2",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 2,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
@@ -329,7 +323,7 @@ full_cg_backend_configs = {
     # Triton Attention
     "TritonAttn": BackendConfig(
         name="TritonAttn",
-        env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
+        attention_config={"backend": "TRITON_ATTN"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -337,14 +331,17 @@ full_cg_backend_configs = {
     # FlashInfer
     "FlashInfer": BackendConfig(
         name="FlashInfer",
-        env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+        attention_config={"backend": "FLASHINFER"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
     ),
     "RocmAttn": BackendConfig(
         name="RocmAttn",
-        env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
+        attention_config={
+            "backend": "ROCM_ATTN",
+            "use_prefill_decode_attention": True,
+        },
         comp_config={
             "cudagraph_mode": "FULL",
         },
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index af36b298f5558c4acd709c972aff9074a74192a4..cda53f02b69cd0598198994bb0766e704ad9c8c7 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1800,3 +1800,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
         )
     )
     assert block_hashes[1] == expected_hash2
+
+
+def test_auto_fit_max_model_len():
+    """Test that max_model_len=-1 auto-fits to available GPU memory."""
+    # Create config with original_max_model_len=-1 to trigger auto-fit
+    model_config = ModelConfig(max_model_len=1024)
+    # Simulate the user passing -1 by setting original_max_model_len
+    model_config.original_max_model_len = -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2  # 16KB per block per layer
+    kv_cache_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+
+    # With enough memory, max_model_len stays at the derived max
+    large_available_memory = mem_per_block_per_layer * 2 * 1024  # plenty of memory
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [large_available_memory]
+    )
+    assert vllm_config.model_config.max_model_len == 1024
+
+    # Reset for next test
+    model_config = ModelConfig(max_model_len=1024)
+    model_config.original_max_model_len = -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    # With limited memory, max_model_len should be reduced
+    # Need memory for at least max_model_len tokens
+    # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
+    limited_memory = mem_per_block_per_layer * 2 * 32
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [limited_memory]
+    )
+    # Should be reduced to fit in memory
+    assert vllm_config.model_config.max_model_len < 1024
+    assert vllm_config.model_config.max_model_len > 0
+
+
+def test_auto_fit_max_model_len_not_triggered():
+    """Test that auto-fit is not triggered when original_max_model_len is not -1."""
+    model_config = ModelConfig(max_model_len=16)
+    # original_max_model_len should be None by default, not -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
+    kv_cache_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+
+    # This should work normally without auto-fit
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
+    )
+    assert vllm_config.model_config.max_model_len == 16
diff --git a/tests/v1/core/test_kv_sharing.py b/tests/v1/core/test_kv_sharing.py
index e6d37b1d63c8c5637ea515304ea895dd8d29cfb1..db0e8dae3e788995369791da9a9b50bb47615963 100644
--- a/tests/v1/core/test_kv_sharing.py
+++ b/tests/v1/core/test_kv_sharing.py
@@ -11,7 +11,9 @@ pytestmark = pytest.mark.cpu_test
 
 
 def new_kv_cache_spec():
-    return FullAttentionSpec(16, 1, 1, torch.float32, False)
+    return FullAttentionSpec(
+        block_size=16, num_kv_heads=1, head_size=1, dtype=torch.float32
+    )
 
 
 def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 0880a17c78d40f5a32b786c4a0ee58d710d238ed..486e5f9cd4c8b0e773d129d2417e63a671404092 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -35,6 +35,7 @@ from vllm.v1.kv_cache_interface import (
     FullAttentionSpec,
     KVCacheConfig,
     KVCacheGroupSpec,
+    MambaSpec,
     SlidingWindowSpec,
 )
 
@@ -94,35 +95,105 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
         kv_cache_groups=[
             KVCacheGroupSpec(
                 ["layer"],
-                FullAttentionSpec(block_size, 1, 1, torch.float32),
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
             )
         ],
     )
 
 
 def make_kv_cache_config_hybrid_model(
-    block_size: int, num_blocks: int
+    block_size: int, num_blocks: int, second_spec_type: str = "sliding_window"
 ) -> KVCacheConfig:
+    if second_spec_type == "sliding_window":
+        second_spec = SlidingWindowSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+            sliding_window=2 * block_size,
+        )
+    elif second_spec_type == "mamba":
+        second_spec = MambaSpec(
+            block_size=block_size,
+            shapes=(1, 1),
+            dtypes=(torch.float32,),
+        )
+
     return KVCacheConfig(
         num_blocks=num_blocks,
         kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(
                 ["layer1"],
-                FullAttentionSpec(block_size, 1, 1, torch.float32),
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
             ),
             KVCacheGroupSpec(
                 ["layer2"],
-                SlidingWindowSpec(
-                    block_size, 1, 1, torch.float32, sliding_window=2 * block_size
-                ),
+                second_spec,
             ),
             KVCacheGroupSpec(
                 ["layer3"],
+                second_spec,
+            ),
+        ],
+    )
+
+
+def make_kv_cache_config_three_types(
+    block_size: int, num_blocks: int, third_spec_type: str = "mamba"
+) -> KVCacheConfig:
+    if third_spec_type == "mamba":
+        third_spec = MambaSpec(
+            block_size=block_size,
+            shapes=(1, 1),
+            dtypes=(torch.float32,),
+        )
+    elif third_spec_type == "sliding_window":
+        third_spec = SlidingWindowSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+            sliding_window=4 * block_size,
+        )
+
+    return KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec(
+                ["layer1"],
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
+            ),
+            KVCacheGroupSpec(
+                ["layer2"],
                 SlidingWindowSpec(
-                    block_size, 1, 1, torch.float32, sliding_window=2 * block_size
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                    sliding_window=2 * block_size,
                 ),
             ),
+            KVCacheGroupSpec(
+                ["layer3"],
+                third_spec,
+            ),
         ],
     )
 
@@ -406,6 +477,184 @@ def test_prefill_hybrid_model():
     )
 
 
+def _make_hybrid_kv_cache_config(
+    block_size: int, num_blocks: int, spec_types: list[str]
+) -> KVCacheConfig:
+    """
+    Create a KVCacheConfig with the specified spec types.
+
+    Args:
+        block_size: The block size for KV cache.
+        num_blocks: The number of blocks in the KV cache.
+        spec_types: List of spec type strings. Supported types:
+            - "full": FullAttentionSpec
+            - "sliding_window": SlidingWindowSpec with window=2*block_size
+            - "sliding_window_large": SlidingWindowSpec with window=4*block_size
+            - "mamba": MambaSpec
+    """
+    spec_map = {
+        "full": lambda: FullAttentionSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+        ),
+        "sliding_window": lambda: SlidingWindowSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+            sliding_window=2 * block_size,
+        ),
+        "sliding_window_large": lambda: SlidingWindowSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+            sliding_window=4 * block_size,
+        ),
+        "mamba": lambda: MambaSpec(
+            block_size=block_size,
+            shapes=(1, 1),
+            dtypes=(torch.float32,),
+        ),
+    }
+
+    kv_cache_groups = [
+        KVCacheGroupSpec([f"layer{i}"], spec_map[spec_type]())
+        for i, spec_type in enumerate(spec_types)
+    ]
+
+    return KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=[],
+        kv_cache_groups=kv_cache_groups,
+    )
+
+
+# Test cases covering various combinations of KV cache spec types:
+# - Varying number of groups (2, 3, or 4)
+# - 0, 1, or 2 full attention groups
+# - Sliding window with different window sizes
+# - Interleaved group IDs (full attn and other types mixed)
+# - Mamba spec combinations
+_HYBRID_MODEL_TEST_CASES = [
+    # 2 groups: 1 full + 1 other
+    pytest.param(["full", "sliding_window"], id="2g-full+sw"),
+    pytest.param(["full", "mamba"], id="2g-full+mamba"),
+    # 2 groups: 0 full (all other types)
+    pytest.param(["sliding_window", "mamba"], id="2g-sw+mamba"),
+    pytest.param(["sliding_window", "sliding_window_large"], id="2g-sw+sw_large"),
+    # 3 groups: 1 full + 2 others (same type)
+    pytest.param(["full", "sliding_window", "sliding_window"], id="3g-full+2sw"),
+    pytest.param(["full", "mamba", "mamba"], id="3g-full+2mamba"),
+    # 3 groups: 1 full + 2 others (different types)
+    pytest.param(["full", "sliding_window", "mamba"], id="3g-full+sw+mamba"),
+    pytest.param(
+        ["full", "sliding_window", "sliding_window_large"],
+        id="3g-full+sw+sw_large",
+    ),
+    # 3 groups: 2 full + 1 other
+    pytest.param(["full", "full", "sliding_window"], id="3g-2full+sw"),
+    pytest.param(["full", "full", "mamba"], id="3g-2full+mamba"),
+    # 4 groups: interleaved (full, other, full, other)
+    pytest.param(
+        ["full", "sliding_window", "full", "sliding_window_large"],
+        id="4g-interleaved-full+sw+sw_large",
+    ),
+    pytest.param(
+        ["full", "mamba", "full", "mamba"],
+        id="4g-interleaved-full+mamba",
+    ),
+    # 4 groups: interleaved with different sliding windows
+    pytest.param(
+        ["full", "sliding_window", "full", "sliding_window_large"],
+        id="4g-interleaved-full+sw_mixed",
+    ),
+    # 4 groups: 0 full (all other types)
+    pytest.param(
+        ["sliding_window", "mamba", "sliding_window_large", "mamba"],
+        id="4g-sw+mamba+sw_large+mamba",
+    ),
+    # 4 groups: 2 full + 2 others (grouped)
+    pytest.param(
+        ["full", "full", "sliding_window", "mamba"],
+        id="4g-2full+sw+mamba",
+    ),
+]
+
+
+@pytest.mark.parametrize("spec_types", _HYBRID_MODEL_TEST_CASES)
+def test_prefill_hybrid_model_combinations(spec_types: list[str]):
+    """
+    Test prefix caching with hybrid models containing various combinations of
+    KV cache spec types.
+
+    This unified test covers:
+    - Various combinations (full attn + other attn types)
+    - Varying number of groups (2, 3, or 4)
+    - 0, 1, or 2 full attention groups in the combination
+    - Two sliding_window attn groups with different window sizes
+    - Interleaved group IDs (full attn and other types alternating)
+    - Mamba spec with other attention types
+    """
+    block_size = 16
+    num_groups = len(spec_types)
+    # Allocate enough blocks for all groups
+    num_blocks = 10 * num_groups
+
+    kv_cache_config = _make_hybrid_kv_cache_config(block_size, num_blocks, spec_types)
+    manager = KVCacheManager(
+        kv_cache_config,
+        max_model_len=8192,
+        enable_caching=True,
+        hash_block_size=block_size,
+    )
+
+    hash_fn = sha256
+
+    # Complete 3 blocks (48 tokens)
+    common_token_ids = [i for i in range(3) for _ in range(block_size)]
+    unique_token_ids = [3] * 7
+    all_token_ids = common_token_ids + unique_token_ids
+
+    # First request: no cache hit initially
+    req0 = make_request("0", all_token_ids, block_size, hash_fn)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
+
+    assert len(req0.block_hashes) == 3
+    assert not computed_blocks.blocks[0]  # No cache hit initially
+    assert num_computed_tokens == 0
+
+    blocks = manager.allocate_slots(
+        req0, 55, len(computed_blocks.blocks[0]) * block_size, computed_blocks
+    )
+    assert blocks is not None
+    # Should have blocks for all groups
+    assert len(blocks.get_block_ids()) == num_groups
+
+    # Second request: should hit cached blocks for common prefix
+    req1 = make_request("1", common_token_ids + [4] * 5, block_size, hash_fn)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
+
+    # Should hit cached blocks for all groups
+    assert num_computed_tokens == 3 * block_size
+    assert len(computed_blocks.blocks) == num_groups
+
+    # Allocate and verify blocks for second request
+    blocks = manager.allocate_slots(
+        req1,
+        len(common_token_ids) + 5 - num_computed_tokens,
+        num_computed_tokens,
+        computed_blocks,
+    )
+    assert blocks is not None
+    assert len(blocks.get_block_ids()) == num_groups
+
+    manager.free(req0)
+    manager.free(req1)
+
+
 def test_prefill_plp():
     """Test prefill with APC and some prompt logprobs (plp) requests.
 
@@ -1356,6 +1605,69 @@ def test_kv_cache_events(blocks_to_cache: int):
     assert len(manager.block_pool.cached_block_hash_to_block) == 0
 
 
+def test_null_parent_block_hash():
+    block_size = 1
+    num_cached_blocks = 2
+    num_full_blocks = 4
+
+    pool = BlockPool(
+        num_gpu_blocks=8,
+        enable_caching=True,
+        hash_block_size=block_size,
+        enable_kv_cache_events=True,
+    )
+
+    req = make_request(
+        "req_null_parent",
+        prompt_token_ids=[10, 11, 12, 13],
+        block_size=block_size,
+        hash_fn=sha256,
+    )
+    assert len(req.block_hashes) == num_full_blocks
+
+    # Physical parent is `null_block` (no hash), while the logical parent hash
+    # still exists in `request.block_hashes[num_cached_blocks - 1]`.
+    assert pool.null_block.block_hash is None
+    new_blocks = pool.get_new_blocks(num_full_blocks - 1)
+    blocks = [
+        new_blocks[: num_cached_blocks - 1],
+        pool.null_block,  # physical parent
+        *new_blocks[num_cached_blocks - 1 :],
+    ]
+
+    pool.cache_full_blocks(
+        request=req,
+        blocks=blocks,
+        num_cached_blocks=num_cached_blocks,
+        num_full_blocks=num_full_blocks,
+        block_size=block_size,
+        kv_cache_group_id=0,
+    )
+
+    events = pool.take_events()
+    assert len(events) == 1
+    event = events[0]
+    assert isinstance(event, BlockStored)
+
+    expected_parent = kv_cache_utils.maybe_convert_block_hash(
+        req.block_hashes[num_cached_blocks - 1]
+    )
+    assert event.parent_block_hash == expected_parent
+    assert event.parent_block_hash is not None
+
+    expected_new_hashes = [
+        kv_cache_utils.maybe_convert_block_hash(h)
+        for h in req.block_hashes[num_cached_blocks:num_full_blocks]
+    ]
+    assert event.block_hashes == expected_new_hashes
+
+    # Ensure we didn't accidentally assign a hash to the null block.
+    assert pool.null_block.block_hash is None
+    # Sanity check: newly cached physical blocks should have hashes assigned.
+    assert blocks[num_cached_blocks].block_hash is not None
+    assert blocks[num_full_blocks - 1].block_hash is not None
+
+
 @pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
 def test_kv_cache_events_with_lora(blocks_to_cache: int):
     """Test BlockStored events contain correct lora_id when using LoRA requests."""
@@ -1553,15 +1865,20 @@ def test_different_block_size():
         kv_cache_groups=[
             KVCacheGroupSpec(
                 ["layer1"],
-                FullAttentionSpec(block_size * 2, 1, 1, torch.float16),
+                FullAttentionSpec(
+                    block_size=block_size * 2,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float16,
+                ),
             ),
             KVCacheGroupSpec(
                 ["layer2"],
                 SlidingWindowSpec(
-                    block_size,
-                    1,
-                    1,
-                    torch.float32,
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
                     sliding_window=2 * block_size,
                 ),
             ),
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 2694506fe8e2247f8703c5c0d80933d3ed0d6ee6..822e1676407761fe5ece10e4d76dd8e0fc29bbf3 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1264,10 +1264,11 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
     assert len(scheduler.waiting) == 0
 
 
+@pytest.mark.parametrize("is_async", [False, True])
 @pytest.mark.parametrize(
     "use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")]
 )
-def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
+def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role):
     """
     Test whether scheduler with KVConnector is able to handle
     unable to allocate (run out of blocks in allocate_slots().
@@ -1280,7 +1281,9 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
     scheduler = create_scheduler(
         enable_prefix_caching=True,
-        use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False),
+        use_kv_connector=mock_kv(
+            matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async
+        ),
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
         # encoder connector should not affect test results
@@ -1318,6 +1321,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
 
     # All can be scheduled - 1st token.
     output = scheduler.schedule()
+    if is_async:
+        assert len(scheduler.waiting) == 2
+        assert scheduler.running == []
+        _step_until_kv_transfer_finished(scheduler, req_ids)
+        output = scheduler.schedule()
+
     _assert_right_scheduler_output(
         output,
         # 2 remote kv cache hits.
@@ -1370,6 +1379,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
     # Restarts the preempted request - generate 3rd token.
     # This will have a local and remote cache hit.
     output = scheduler.schedule()
+    if is_async:
+        waiting_req_ids = [req.request_id for req in scheduler.waiting]
+        assert len(waiting_req_ids) == 1
+        _step_until_kv_transfer_finished(scheduler, waiting_req_ids)
+        output = scheduler.schedule()
+
     _assert_right_scheduler_output(
         output,
         # 1 remote kv_cache hit!
@@ -1380,6 +1395,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
     )
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 0
+    assert output.scheduled_cached_reqs.num_reqs == 1
+    assert output.scheduled_new_reqs == []
     _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 0
@@ -1392,6 +1409,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
         num_requests=0,
         expected_num_scheduled_tokens=1,
     )
+    assert output.scheduled_cached_reqs.num_reqs == 1
+    assert output.scheduled_new_reqs == []
     assert len(scheduler.running) == 1
     _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
     assert len(scheduler.running) == 0
@@ -1577,7 +1596,13 @@ def create_scheduler_with_priority(
         kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(
-                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
+                ["layer"],
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
             )
         ],
     )
@@ -2288,7 +2313,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
     # 4th Schedule - this should trigger the resumption
     output = scheduler.schedule()
     scheduled_cached_reqs = output.scheduled_cached_reqs
-    resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption
 
     assert len(output.scheduled_new_reqs) == 0
     assert scheduled_cached_reqs.num_reqs == 1
@@ -2296,14 +2320,14 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
     assert len(scheduler.running) == 1
 
     # Preempted request resumed in scheduled_cached_reqs
-    assert len(resumed_from_preemption) == 1
-    assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
-    assert resumed_from_preemption[0]
+    assert len(scheduled_cached_reqs.resumed_req_ids) == 1
+    assert len(scheduled_cached_reqs.all_token_ids) == 1
     assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
-    assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
+    assert request_low.request_id in scheduled_cached_reqs.resumed_req_ids
+    assert request_low.request_id in scheduled_cached_reqs.all_token_ids
     # Resumed tokens include 30 prompt tokens and 2 decoded tokens
-    assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 32
-    assert scheduled_cached_reqs.resumed_req_token_ids[0][31] == 100
+    assert len(scheduled_cached_reqs.all_token_ids[request_low.request_id]) == 32
+    assert scheduled_cached_reqs.all_token_ids[request_low.request_id][31] == 100
 
 
 @pytest.mark.parametrize(
@@ -3126,7 +3150,6 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
     # 4th Schedule - this should trigger req_low resumption from waiting
     output = scheduler.schedule()
     scheduled_cached_reqs = output.scheduled_cached_reqs
-    resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption
 
     assert len(output.scheduled_new_reqs) == 0
     assert scheduled_cached_reqs.num_reqs == 1
@@ -3134,14 +3157,14 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
     assert len(scheduler.running) == 1
 
     # Preempted request resumed in scheduled_cached_reqs
-    assert len(resumed_from_preemption) == 1
-    assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
-    assert resumed_from_preemption[0]
+    assert len(scheduled_cached_reqs.resumed_req_ids) == 1
+    assert len(scheduled_cached_reqs.all_token_ids) == 1
     assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
-    assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
+    assert request_low.request_id in scheduled_cached_reqs.resumed_req_ids
+    assert request_low.request_id in scheduled_cached_reqs.all_token_ids
     ## Resumed tokens include 94 prompt tokens and 2 decoded tokens
-    assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 96
-    assert scheduled_cached_reqs.resumed_req_token_ids[0][95] == 100
+    assert len(scheduled_cached_reqs.all_token_ids[request_low.request_id]) == 96
+    assert scheduled_cached_reqs.all_token_ids[request_low.request_id][95] == 100
     assert scheduler.running[0].request_id == request_low.request_id
     assert request_high.request_id in output.finished_req_ids
 
@@ -3330,3 +3353,28 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
 # ==============================================================================
 # EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests end
 # ==============================================================================
+
+
+def test_prepend_skipped_requests_order():
+    scheduler = create_scheduler(max_num_seqs=1, use_kv_connector=True)
+    requests = create_requests(num_requests=4)
+    for request in requests:
+        scheduler.add_request(request)
+
+    # 4 requests waiting, capture their order
+    expected_waiting_reqs = list(scheduler.waiting)
+
+    # simulate first 2 waiting requests are waiting for remote KVs
+    for req in expected_waiting_reqs[:2]:
+        req.status = RequestStatus.WAITING_FOR_REMOTE_KVS
+
+    # schedule step
+    # expect the first 2 waiting to be skipped, the third running,
+    # and the fourth waiting
+    scheduler.schedule()
+
+    # pop the third request which is expected to be running
+    expected_waiting_reqs.pop(2)
+
+    # verify waiting order is preserved
+    assert list(scheduler.waiting) == expected_waiting_reqs
diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py
index e6a69dc8a949af89a4444c80858ab0780c92361d..23097bf2a086de596f24c339f5485ffdee6274e9 100644
--- a/tests/v1/core/test_single_type_kv_cache_manager.py
+++ b/tests/v1/core/test_single_type_kv_cache_manager.py
@@ -21,13 +21,23 @@ from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowS
 pytestmark = pytest.mark.cpu_test
 
 
-def get_sliding_window_manager(sliding_window_spec, block_pool):
-    return SlidingWindowManager(sliding_window_spec, block_pool, kv_cache_group_id=0)
+def get_sliding_window_manager(sliding_window_spec, block_pool, enable_caching=True):
+    return SlidingWindowManager(
+        sliding_window_spec,
+        block_pool,
+        enable_caching=enable_caching,
+        kv_cache_group_id=0,
+    )
 
 
-def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool):
+def get_chunked_local_attention_manager(
+    chunked_local_attention_spec, block_pool, enable_caching=True
+):
     return ChunkedLocalAttentionManager(
-        chunked_local_attention_spec, block_pool, kv_cache_group_id=0
+        chunked_local_attention_spec,
+        block_pool,
+        enable_caching=enable_caching,
+        kv_cache_group_id=0,
     )
 
 
@@ -332,11 +342,53 @@ def test_get_num_blocks_to_allocate():
     ]
 
     assert (
-        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
+        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1, 0)
+        == 20
     )
     assert (
-        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
+        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2, 0)
+        == 15
+    )
+
+
+def test_evictable_cached_blocks_not_double_allocated():
+    block_size = 2
+    sliding_window_length = 2 * block_size
+    sliding_window_spec = SlidingWindowSpec(
+        block_size=block_size,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=sliding_window_length,
+    )
+
+    block_pool = BlockPool(
+        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
+    )
+    manager = get_sliding_window_manager(sliding_window_spec, block_pool)
+
+    request_id = "req"
+    evictable_block = block_pool.blocks[1]  # ref_cnt == 0, eviction candidate
+
+    num_blocks_to_allocate = manager.get_num_blocks_to_allocate(
+        request_id=request_id,
+        num_tokens=2 * block_size,
+        new_computed_blocks=[evictable_block],
+        total_computed_tokens=block_size,
+    )
+    # Free capacity check should count evictable cached blocks, but allocation
+    # should only allocate the truly new block.
+    assert num_blocks_to_allocate == 2
+
+    manager.allocate_new_computed_blocks(
+        request_id,
+        [evictable_block],
+        num_local_computed_tokens=block_size,
+        num_external_computed_tokens=0,
     )
+    new_blocks = manager.allocate_new_blocks(request_id, num_tokens=4)
+    assert len(new_blocks) == 1
+    assert len(manager.req_to_blocks[request_id]) == 2
 
 
 def test_chunked_local_attention_get_num_blocks_to_allocate():
@@ -359,8 +411,10 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
     ]
 
     assert (
-        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
+        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1, 0)
+        == 20
     )
     assert (
-        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
+        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2, 0)
+        == 15
     )
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index 531b9c595b04d13c316b3f7abd454d3be73c8232..bcc68907060f74f6b347ad04737cdaadce3220a7 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -142,7 +142,13 @@ def create_scheduler(
         kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(
-                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
+                ["layer"],
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
             )
         ],
     )
diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
index 0e71d6c63ce683a05d71da77d76fd42173268a89..f9d3e8d0532b5143e6e204f082ff9e63217877d1 100644
--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -49,7 +49,10 @@ def _create_vllm_config(
         mock_config.lora_config = None
     # Mimic the behavior of VllmConfig.__post_init__()
     if compilation_config.mode == CompilationMode.VLLM_COMPILE:
-        compilation_config.set_splitting_ops_for_v1()
+        compilation_config.set_splitting_ops_for_v1(
+            all2all_backend=mock_config.parallel_config.all2all_backend,
+            data_parallel_size=mock_config.parallel_config.data_parallel_size,
+        )
 
     # mimic VllmConfig.__post_init__
     if compilation_config.cudagraph_capture_sizes:
diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py
index b1895e83b8b375ef3556396c96d5adffb3100627..f4f74d16c70193a38b9b54b99c501fa61426d659 100644
--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import contextlib
-import os
 import weakref
 from contextlib import ExitStack
 
@@ -13,26 +11,6 @@ from vllm import LLM
 from vllm.config import CompilationConfig, CompilationMode
 from vllm.platforms import current_platform
 
-
-@contextlib.contextmanager
-def temporary_environ(env_vars):
-    """
-    Temporarily set environment variables and restore them afterward.
-    We have to do this vs monkeypatch because monkeypatch doesn't work
-    with "module" scoped fixtures.
-    """
-    original_env = {k: os.environ.get(k) for k in env_vars}
-    try:
-        os.environ.update(env_vars)
-        yield
-    finally:
-        for k, v in original_env.items():
-            if v is None:
-                os.environ.pop(k, None)
-            else:
-                os.environ[k] = v
-
-
 # test attention backend and cudagraph_mode combo
 # (backend_name, cudagraph_mode, supported)
 if current_platform.is_rocm():
@@ -68,9 +46,9 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
     ):
         pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
 
-    env_vars = backend_configs[backend_name].env_vars
+    attention_config = backend_config.attention_config
 
-    with temporary_environ(env_vars), ExitStack() as stack:
+    with ExitStack() as stack:
         if not supported:
             stack.enter_context(pytest.raises(Exception))
 
@@ -80,6 +58,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
             trust_remote_code=True,
             gpu_memory_utilization=0.45,
             max_model_len=1024,
+            attention_config=attention_config,
             compilation_config=CompilationConfig(
                 mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
             ),
@@ -122,9 +101,10 @@ combo_cases_2 = [
 def test_cudagraph_compilation_combo(
     backend_name, cudagraph_mode, compilation_mode, supported
 ):
-    env_vars = backend_configs[backend_name].env_vars
+    backend_config = backend_configs[backend_name]
+    attention_config = backend_config.attention_config
 
-    with temporary_environ(env_vars), ExitStack() as stack:
+    with ExitStack() as stack:
         if not supported:
             stack.enter_context(pytest.raises(Exception))
 
@@ -134,6 +114,7 @@ def test_cudagraph_compilation_combo(
             trust_remote_code=True,
             gpu_memory_utilization=0.45,
             max_model_len=1024,
+            attention_config=attention_config,
             compilation_config=CompilationConfig(
                 mode=compilation_mode, cudagraph_mode=cudagraph_mode
             ),
diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py
index 1c45e7fe366ffb44b62329da342d92ec7e002624..61fb5f07303b4c4267ef9f1a50bec823ee36cedd 100644
--- a/tests/v1/determinism/test_batch_invariance.py
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
     BACKENDS,
 )
 def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
     """
     Ensures that the same request (the 'needle' prompt) yields identical output
@@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
     seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
     random.seed(seed)
 
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
+    attention_config = {"backend": backend}
     # Allow overrides from environment (useful for CI tuning)
     # "facebook/opt-125m" is too small, doesn't reliably test determinism
     model = resolve_model_name(backend)
@@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
             max_num_seqs=max_batch_size,
             gpu_memory_utilization=gpu_mem_util,
             max_model_len=max_model_len,
+            attention_config=attention_config,
         )
 
         # Baseline generation for the needle prompt alone.
@@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
             max_num_seqs=max_batch_size,
             gpu_memory_utilization=gpu_mem_util,
             max_model_len=max_model_len,
+            attention_config=attention_config,
         )
 
         mismatches = 0
@@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
     BACKENDS,
 )
 def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
     seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
     random.seed(seed)
     model_name = resolve_model_name(backend)
@@ -188,12 +188,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
     llm = LLM(
         model=model_name,
         tensor_parallel_size=tp_size,
-        # enable_prefix_caching=False,
         max_num_seqs=32,
         max_model_len=8192,
         dtype="bfloat16",  # not everything is supported
         gpu_memory_utilization=0.9,
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
     )
 
     # Use more realistic prompts for better token generation
@@ -382,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
     "backend",
     BACKENDS,
 )
-def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
+def test_simple_generation(backend):
     """
     Simple test that runs the model with a basic prompt and prints the output.
     Useful for quick smoke testing and debugging.
     """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
     model = resolve_model_name(backend)
 
     llm = LLM(
@@ -399,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
         dtype="bfloat16",
         enable_prefix_caching=False,
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
     )
 
     prompt = "the capital of france is"
@@ -445,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail(
     The test will PASS if we detect differences (proving batch invariance matters).
     The test will FAIL if everything matches (suggesting batch invariance isn't needed).
     """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
     # CRITICAL: Disable batch invariance for this test
     monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0")
     monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False)
@@ -466,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail(
         max_model_len=8192,
         dtype="bfloat16",
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
     )
 
     # build ragged prompts to change shapes significantly across BS=1 vs BS=N
@@ -650,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail(
 @skip_unsupported
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 def test_decode_logprobs_match_prefill_logprobs(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
     """
     Test that verifies decode logprobs match prefill logprobs.
@@ -665,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs(
     This ensures that the logprobs from decode are consistent with what
     we would get if we ran prefill on each prefix.
     """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
     seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
     random.seed(seed)
     model_name = resolve_model_name(backend)
@@ -690,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs(
         max_model_len=8192,
         dtype="bfloat16",
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
     )
 
     # Use a few test prompts
@@ -921,6 +919,7 @@ def LLM_with_max_seqs(
     max_num_seqs: int,
     gpu_memory_utilization: float,
     max_model_len: int,
+    attention_config: dict | None = None,
 ) -> LLM:
     """
     Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
@@ -935,6 +934,7 @@ def LLM_with_max_seqs(
         tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
         enable_prefix_caching=False,
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config=attention_config,
         # Enable for MOE models
         # enable_expert_parallel=True,
     )
diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py
index 5e3b997364949babdefaa286be3a1e003662bf23..52c8103b2f1cee3f369b1e721c3695133f0fb6ed 100644
--- a/tests/v1/determinism/test_online_batch_invariance.py
+++ b/tests/v1/determinism/test_online_batch_invariance.py
@@ -136,11 +136,9 @@ def _compare_bs1_vs_bsn_single_process(
 @skip_unsupported
 @pytest.mark.parametrize("backend", BACKENDS)
 def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
-    backend: str, monkeypatch: pytest.MonkeyPatch
+    backend: str,
 ) -> None:
     random.seed(int(os.getenv("VLLM_TEST_SEED", "12345")))
-    # Override backend for this test (and the RemoteOpenAIServer child process).
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
     model_name = resolve_model_name(backend)
     prompts_all = [_random_prompt(10, 50) for _ in range(32)]
 
@@ -156,6 +154,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
     server_args: list[str] = [
         "--max-model-len=8192",
         "--max-num-seqs=32",
+        f"--attention-backend={backend}",
     ]
     if tp_size:
         server_args += ["-tp", tp_size]
diff --git a/tests/v1/determinism/test_rms_norm_batch_invariant.py b/tests/v1/determinism/test_rms_norm_batch_invariant.py
index 390872519528cc4ed9a1dcbc6f7f3cf2568d2a77..5e5b40d09c237fe2dbbbe4f76e31718018e4ba28 100644
--- a/tests/v1/determinism/test_rms_norm_batch_invariant.py
+++ b/tests/v1/determinism/test_rms_norm_batch_invariant.py
@@ -21,7 +21,11 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("eps", [1e-6, 1e-5])
 def test_rms_norm_batch_invariant_vs_standard(
-    batch_size: int, hidden_size: int, dtype: torch.dtype, eps: float
+    default_vllm_config,
+    batch_size: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    eps: float,
 ):
     """
     Compare batch-invariant Triton RMS norm against standard CUDA implementation.
@@ -68,7 +72,9 @@ def test_rms_norm_batch_invariant_vs_standard(
 @pytest.mark.parametrize("batch_size", [1, 16, 128])
 @pytest.mark.parametrize("seq_len", [1, 32, 512])
 @pytest.mark.parametrize("hidden_size", [2048, 4096])
-def test_rms_norm_3d_input(batch_size: int, seq_len: int, hidden_size: int):
+def test_rms_norm_3d_input(
+    default_vllm_config, batch_size: int, seq_len: int, hidden_size: int
+):
     """
     Test RMS norm with 3D input tensors (batch, seq_len, hidden_size).
 
@@ -107,7 +113,7 @@ def test_rms_norm_3d_input(batch_size: int, seq_len: int, hidden_size: int):
 
 
 @skip_unsupported
-def test_rms_norm_numerical_stability():
+def test_rms_norm_numerical_stability(default_vllm_config):
     """
     Test RMS norm numerical stability with extreme values.
 
@@ -167,7 +173,7 @@ def test_rms_norm_numerical_stability():
 
 
 @skip_unsupported
-def test_rms_norm_formula():
+def test_rms_norm_formula(default_vllm_config):
     """
     Test that RMS norm follows the correct mathematical formula.
 
@@ -201,7 +207,7 @@ def test_rms_norm_formula():
 
 @skip_unsupported
 @pytest.mark.parametrize("hidden_size", [128, 1024, 4096, 16384])
-def test_rms_norm_different_hidden_sizes(hidden_size: int):
+def test_rms_norm_different_hidden_sizes(default_vllm_config, hidden_size: int):
     """
     Test RMS norm with various hidden sizes to ensure block size handling.
 
@@ -238,7 +244,7 @@ def test_rms_norm_different_hidden_sizes(hidden_size: int):
 
 
 @skip_unsupported
-def test_rms_norm_determinism():
+def test_rms_norm_determinism(default_vllm_config):
     """
     Test that batch-invariant RMS norm produces deterministic results.
 
diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py
index a8013ed229cfc81ab0d57d6dac9c419283864d75..485eb26c7b9bc64649a1adc673771a2ca53a9922 100644
--- a/tests/v1/determinism/utils.py
+++ b/tests/v1/determinism/utils.py
@@ -6,9 +6,9 @@ import random
 import pytest
 import torch
 
-from vllm.attention.utils.fa_utils import flash_attn_supports_mla
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
 
 skip_unsupported = pytest.mark.skipif(
     not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
diff --git a/tests/v1/distributed/test_eagle_dp.py b/tests/v1/distributed/test_eagle_dp.py
index 9f6a6614fc1fd5901a8491c8feb320e8348b60ed..1b7c2d8ea70d83797063ed3dfe3798ef92bef6cc 100644
--- a/tests/v1/distributed/test_eagle_dp.py
+++ b/tests/v1/distributed/test_eagle_dp.py
@@ -16,7 +16,12 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))
 
 
 @pytest.mark.asyncio
-async def test_run_eagle_dp():
+async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch):
+    # This test checks that running a model with and without eagle
+    # leads to identical tokens. This is only true in batch invariant mode
+    # (because the target model verifies all draft tokens in one big forward pass)
+    monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1")
+
     target_model = "meta-llama/Llama-3.1-8B-Instruct"
     draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
 
@@ -29,6 +34,7 @@ async def test_run_eagle_dp():
         data_parallel_backend="mp",  # ray takes more time
         trust_remote_code=True,
         max_model_len=16384,
+        attention_config={"backend": "FLASH_ATTN"},
     )
 
     eagle_engine_args = replace(
@@ -41,9 +47,10 @@ async def test_run_eagle_dp():
     )
 
     prompt = "This is a test of data parallel with eagle"
-    num_expected_tokens = 100
+    # This test might be flaky, see
+    # https://github.com/vllm-project/vllm/issues/31913
+    num_expected_tokens = 20
     sampling_params = SamplingParams(
-        min_tokens=num_expected_tokens,
         max_tokens=num_expected_tokens,
         ignore_eos=True,
         output_kind=RequestOutputKind.FINAL_ONLY,
diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
index 5cef9b33c9984abac30d19e050e2e29735298c4d..b85f8880cf8efdfd03ad3e90a56b0a9b0e61b987 100644
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -30,8 +30,9 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [
 
 default_params = dict(
     temperature=0.0,  # greedy
-    max_tokens=23,
-    min_tokens=18,
+    max_tokens=30,
+    # spec decoding currently doesn't support min_tokens
+    # min_tokens=28,
 )
 
 
@@ -50,6 +51,14 @@ def test_without_spec_decoding(
         dict(logprobs=2),
         dict(logprobs=2, presence_penalty=-1.0),
         dict(structured_outputs=struct_outputs),
+        dict(
+            structured_outputs=struct_outputs,
+            logprobs=2,
+        ),
+        dict(
+            structured_outputs=struct_outputs,
+            presence_penalty=-1.0,
+        ),
         dict(
             structured_outputs=struct_outputs,
             logprobs=2,
@@ -86,7 +95,7 @@ def test_without_spec_decoding(
     run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
 
 
-def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
     """Test consistency and acceptance rates with some different combos of
     preemption, executor, async scheduling, prefill chunking,
     spec decoding model length.
@@ -100,9 +109,20 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
     # Set small draft model len to force doesn't-fit-in-drafter case.
     spec_config_short = spec_config | {"max_model_len": 50}
 
+    struct_outputs = StructuredOutputsParams(json=sample_json_schema)
+
     test_sampling_params = [
         dict(),
+        dict(presence_penalty=-1.0),
+        dict(bad_words=["the", " the"]),
         dict(logprobs=2),
+        dict(logprobs=2, presence_penalty=-1.0),
+        dict(structured_outputs=struct_outputs),
+        dict(
+            structured_outputs=struct_outputs,
+            logprobs=2,
+            presence_penalty=-1.0,
+        ),
     ]
 
     # test_preemption, executor, async_scheduling,
@@ -142,18 +162,12 @@ def run_tests(
     """Test consistency of combos of async scheduling, preemption,
     uni/multiproc executor with spec decoding."""
 
+    # Determine attention config based on platform
+    attention_config = {"backend": "FLEX_ATTENTION"}
+
     with monkeypatch.context() as m:
-        # avoid precision errors
-        if current_platform.is_rocm():
-            if is_testing_with_spec_decoding:
-                # Use TRITON_ATTN for spec decoding test for consistency
-                m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
-            else:
-                m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
-        else:
-            m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
         # lock matmul precision to full FP32 (IEEE)
-        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
+        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
         # m.setenv("VLLM_BATCH_INVARIANT", "1")
         outputs: list[tuple[str, list, list]] = []
         for n, (
@@ -174,6 +188,7 @@ def run_tests(
                 spec_config,
                 test_prefill_chunking=test_prefill_chunking,
                 is_testing_with_spec_decoding=is_testing_with_spec_decoding,
+                attention_config=attention_config,
             )
             outputs.append(test_results)
 
@@ -204,15 +219,7 @@ def run_tests(
                     name_1=f"config=[{test_config}], params={params}",
                 )
 
-                # On ROCm with TRITON_ATTN (spec decoding test), skip strict
-                # logprobs comparison when logprobs are requested
-                skip_logprobs_check = (
-                    current_platform.is_rocm()
-                    and params.get("logprobs")
-                    and is_testing_with_spec_decoding
-                )
-                if not skip_logprobs_check:
-                    assert _all_logprobs_match(base_logprobs, test_logprobs)
+                assert _all_logprobs_match(base_logprobs, test_logprobs)
 
                 if (
                     base_acceptance_rate is not None
@@ -262,6 +269,7 @@ def run_test(
     spec_config: dict[str, Any] | None,
     test_prefill_chunking: bool,
     is_testing_with_spec_decoding: bool = False,
+    attention_config: dict[str, Any] | None = None,
 ):
     spec_decoding = spec_config is not None
     cache_arg: dict[str, Any] = (
@@ -281,14 +289,6 @@ def run_test(
     print(f"---- TESTING {test_str}: {test_config}")
     print("-" * 80)
 
-    # On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
-    # spec decoding test (TRITON_ATTN) for better precision.
-    # On others: always use float32.
-    if current_platform.is_rocm() and not is_testing_with_spec_decoding:
-        dtype = "float16"
-    else:
-        dtype = "float32"
-
     with VllmRunner(
         model,
         max_model_len=512,
@@ -298,9 +298,10 @@ def run_test(
         # enforce_eager=True,
         async_scheduling=async_scheduling,
         distributed_executor_backend=executor,
-        dtype=dtype,
+        dtype="float32",
         speculative_config=spec_config,
         disable_log_stats=False,
+        attention_config=attention_config,
         **cache_arg,
     ) as vllm_model:
         results = []
@@ -358,12 +359,7 @@ def _all_logprobs_match(req_a, req_b) -> bool:
 
 
 def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
-    if current_platform.is_rocm():
-        # ROCm has higher numerical variance
-        # due to use of float16.
-        rel_tol, abs_tol = 5e-2, 1e-5
-    else:
-        rel_tol, abs_tol = 1e-3, 1e-6
+    rel_tol, abs_tol = 1e-3, 1e-6
     return (
         len(lps_a) == len(lps_b)
         and lps_a.keys() == lps_b.keys()
diff --git a/tests/v1/e2e/test_async_spec_decode.py b/tests/v1/e2e/test_async_spec_decode.py
index 561f37a52d5735186da75e4504eead685ff17bd2..4bf76da452f3112b1f008658d2a60308d002eebf 100644
--- a/tests/v1/e2e/test_async_spec_decode.py
+++ b/tests/v1/e2e/test_async_spec_decode.py
@@ -19,7 +19,7 @@ def sync_tracker():
     Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
     lazy init syncs. Prints stack traces immediately when syncs occur.
     """
-    from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+    from vllm.v1.attention.backend import CommonAttentionMetadata
 
     # Shared counter for cross-process communication (inherited by fork)
     sync_count = multiprocessing.Value("i", 0)
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 6eee5ed50b95b414618a5a4b496b8fc23b9955ff..6c8bbe214c785ef134ab39ea78cbf07aa5e9ba29 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -445,25 +445,26 @@ def test_eagle_correctness(
     should be the same when using eagle speculative decoding.
     model_setup: (method, model_name, eagle_model_name, tp_size)
     """
-    with monkeypatch.context() as m:
-        if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
-            # Scout requires default backend selection
-            # because vision encoder has head_dim 88 being incompatible
-            #  with FLASH_ATTN and needs to fall back to Flex Attn
-
-            # pass if not ROCm
-            if current_platform.is_rocm():
-                # TODO: Enable Flex Attn for spec_decode on ROCm
-                pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
-        else:
-            m.setenv("VLLM_MLA_DISABLE", "1")
-            m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+    # Determine attention config
+    # Scout requires default backend selection because vision encoder has
+    # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
+    # to Flex Attn
+    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+        if current_platform.is_rocm():
+            # TODO: Enable Flex Attn for spec_decode on ROCm
+            pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
+        attention_config = None  # Let it fall back to default
+    else:
+        attention_config = {"backend": attn_backend}
+
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
+        )
 
-        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
-            pytest.skip(
-                "TRITON_ATTN does not support "
-                "multi-token eagle spec decode on current platform"
-            )
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_MLA_DISABLE", "1")
 
         if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
             if "deepseek" in model_setup[1].lower():
@@ -478,7 +479,10 @@ def test_eagle_correctness(
         max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
 
         ref_llm = LLM(
-            model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
+            model=model_name,
+            max_model_len=max_model_len,
+            tensor_parallel_size=tp_size,
+            attention_config=attention_config,
         )
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
@@ -499,6 +503,7 @@ def test_eagle_correctness(
             max_num_batched_tokens=max_num_batched_tokens,
             enable_chunked_prefill=enable_chunked_prefill,
             model_impl=model_impl,
+            attention_config=attention_config,
         )
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0
diff --git a/tests/v1/e2e/untest_cascade_attention.py b/tests/v1/e2e/untest_cascade_attention.py
index 97744faee0c1a09ac845a8d0ee56496486b61767..7b63b1ebee18b38ff716316b658b637fca9967cb 100644
--- a/tests/v1/e2e/untest_cascade_attention.py
+++ b/tests/v1/e2e/untest_cascade_attention.py
@@ -1,16 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
 import pytest
 
 from vllm import LLM, SamplingParams
 
 from ...utils import create_new_process_for_each_test
+from ...utils import models_path_prefix
 
 
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
-def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
+def test_cascade_attention(example_system_message, attn_backend):
     prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
 
     if attn_backend == "FLASHINFER":
@@ -19,19 +21,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
             "needs investigation. See issue #25679."
         )
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
-        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
-
-        # No cascade attention.
-        single_prompt = [example_system_message + prompt]
-        responses = llm.generate(single_prompt, sampling_params)
-        ref_output = responses[0].outputs[0].text
-
-        # (Probably) Use cascade attention.
-        prompts = [example_system_message + prompt] * 64
-        responses = llm.generate(prompts, sampling_params)
-        for response in responses:
-            assert response.outputs[0].text == ref_output
\ No newline at end of file
+    llm = LLM(
+        model=os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"), attention_config={"backend": attn_backend}
+    )
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+
+    # No cascade attention.
+    single_prompt = [example_system_message + prompt]
+    responses = llm.generate(single_prompt, sampling_params)
+    ref_output = responses[0].outputs[0].text
+
+    # (Probably) Use cascade attention.
+    prompts = [example_system_message + prompt] * 64
+    responses = llm.generate(prompts, sampling_params)
+    for response in responses:
+        assert response.outputs[0].text == ref_output
diff --git a/tests/v1/ec_connector/integration/test_epd_correctness.py b/tests/v1/ec_connector/integration/test_epd_correctness.py
index 616d34441ab8e061644a180da3be82ffa2bf0b82..eae4b7427240fd34a07b683941858d44dca3b691 100644
--- a/tests/v1/ec_connector/integration/test_epd_correctness.py
+++ b/tests/v1/ec_connector/integration/test_epd_correctness.py
@@ -31,7 +31,7 @@ import openai
 import requests
 
 from vllm.assets.image import ImageAsset
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 
 MAX_OUTPUT_LEN = 256
 
@@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
                 "content": [
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image;base64,{encode_image_base64(image_1)}"
-                        },
+                        "image_url": {"url": encode_image_url(image_1)},
                     },
                     {"type": "text", "text": "What's in this image?"},
                 ],
@@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
                 "content": [
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image;base64,{encode_image_base64(image_2)}"
-                        },
+                        "image_url": {"url": encode_image_url(image_2)},
                     },
                     {
                         "type": "image_url",
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index bf7d79d5d06c8aeeeeca3ceb8d41194408faed60..d944e046ae73ff9749d578f02a9e24ae18f0e51a 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -12,6 +12,13 @@ from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ErrorResponse,
+)
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.inputs import PromptType
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
@@ -255,7 +262,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
 
         # Use multi-abort to abort multiple requests at once
         abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
-        await engine.abort(abort_request_ids)
+        await engine.abort(abort_request_ids, internal=False)
 
         # Wait for all tasks to complete
         results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -486,6 +493,60 @@ async def test_dp_rank_argument():
                 pass
 
 
+@pytest.mark.asyncio(scope="module")
+async def test_header_dp_rank_argument():
+    with ExitStack() as after:
+        with set_default_torch_num_threads(1):
+            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
+        after.callback(engine.shutdown)
+
+        MODEL_NAME = "test-model"
+        BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
+
+        # Create models first
+        models = OpenAIServingModels(
+            engine_client=engine,
+            base_model_paths=BASE_MODEL_PATHS,
+        )
+
+        # Create serving chat instance
+        serving_chat = OpenAIServingChat(
+            engine_client=engine,
+            models=models,
+            response_role="assistant",
+            chat_template=None,
+            chat_template_content_format="auto",
+            request_logger=None,
+        )
+        # Create a chat completion request
+        req = ChatCompletionRequest(
+            model=MODEL_NAME,
+            messages=[{"role": "user", "content": TEXT_PROMPT}],
+            max_tokens=100,
+            temperature=1.0,
+            seed=33,
+        )
+        # Test 1: Valid DP rank (0)
+        mock_raw_request = MagicMock()
+        mock_raw_request.headers = {"X-data-parallel-rank": "0"}
+        mock_raw_request.state = MagicMock()
+
+        # Should succeed with valid rank
+        response = await serving_chat.create_chat_completion(req, mock_raw_request)
+        assert isinstance(response, ChatCompletionResponse), (
+            "Expected a ChatCompletionResponse for valid DP rank"
+        )
+
+        # Test 2: Out-of-range DP rank (1)
+        mock_raw_request.headers = {"X-data-parallel-rank": "1"}
+
+        # should return ErrorResponse for out-of-range rank
+        response2 = await serving_chat.create_chat_completion(req, mock_raw_request)
+        assert isinstance(response2, ErrorResponse), (
+            "Expected an ErrorResponse for out-of-range DP rank"
+        )
+
+
 @pytest.mark.asyncio
 async def test_check_health():
     """Test that check_health returns normally for healthy engine
@@ -550,7 +611,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
         await asyncio.sleep(0.5)
 
         # Abort the request
-        await engine.abort(request_id)
+        await engine.abort(request_id, internal=False)
 
         # Wait for generation to complete and return final output
         final_output = await generated
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index c55fbf5669c6bb6e8a40fb134919d51053f08892..3b5fb41eb92d6265b2c49fc278a3dd8b4dfaaae4 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -42,10 +42,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
+_REQUEST_COUNTER = 0
+
 
 def make_request() -> EngineCoreRequest:
+    global _REQUEST_COUNTER
+    _REQUEST_COUNTER += 1
+    request_id = f"request-{_REQUEST_COUNTER}"
     return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
+        external_req_id=f"{request_id}-{uuid.uuid4()}",
         prompt_token_ids=PROMPT_TOKENS,
         mm_features=None,
         sampling_params=SamplingParams(),
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index c9cf2050c654c4e057ddd1d4a612d42d6a587c06..cd86acd7a84e159c28ae3b2e43bc90475bcf16d2 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+import importlib
 import os
 import signal
 import time
 import uuid
 from dataclasses import dataclass
 from threading import Thread
+from types import SimpleNamespace
 from typing import Any
 from unittest.mock import MagicMock
 
@@ -25,7 +27,11 @@ from vllm.usage.usage_lib import UsageContext
 from vllm.utils.torch_utils import set_default_torch_num_threads
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core import EngineCore
-from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient
+from vllm.v1.engine.core_client import (
+    AsyncMPClient,
+    EngineCoreClient,
+    SyncMPClient,
+)
 from vllm.v1.engine.utils import CoreEngineProcManager
 from vllm.v1.executor.abstract import Executor
 
@@ -33,14 +39,19 @@ from ...distributed.conftest import MockSubscriber
 from ...utils import create_new_process_for_each_test
 from ...utils import models_path_prefix
 
-if not current_platform.is_cuda():
-    pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
+if not current_platform.is_cuda_alike():
+    pytest.skip(
+        reason="V1 currently only supported on CUDA-alike platforms.",
+        allow_module_level=True,
+    )
 
 MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "Hello my name is Robert and I love quantization kernels"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
+_REQUEST_COUNTER = 0
+
 
 def make_request(
     params: SamplingParams, prompt_tokens_ids: list[int] | None = None
@@ -48,8 +59,12 @@ def make_request(
     if not prompt_tokens_ids:
         prompt_tokens_ids = PROMPT_TOKENS
 
+    global _REQUEST_COUNTER
+    _REQUEST_COUNTER += 1
+    request_id = f"request-{_REQUEST_COUNTER}"
     return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
+        external_req_id=f"{request_id}-{uuid.uuid4()}",
         prompt_token_ids=prompt_tokens_ids,
         mm_features=None,
         sampling_params=params,
@@ -62,6 +77,92 @@ def make_request(
     )
 
 
+def _reload_envs_module():
+    import vllm.envs as envs_mod
+
+    cache_clear = getattr(getattr(envs_mod, "__getattr__", None), "cache_clear", None)
+    if cache_clear is not None:
+        cache_clear()
+    return importlib.reload(envs_mod)
+
+
+def _reload_core_client_module():
+    module = importlib.import_module("vllm.v1.engine.core_client")
+    return importlib.reload(module)
+
+
+def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
+    timeout_value = 654
+    monkeypatch.setenv("VLLM_ENGINE_READY_TIMEOUT_S", str(timeout_value))
+
+    # Ensure that the environment variable is loaded if caching is enabled
+    _reload_envs_module()
+    core_client_mod = _reload_core_client_module()
+
+    poll_timeouts: list[int] = []
+
+    class ShadowSocket:
+        def poll(self, timeout: int) -> int:
+            # Capture the timeout value for each poll call
+            poll_timeouts.append(timeout)
+            return 1
+
+        def recv_multipart(self):
+            return (b"\x00\x00", b"ready")
+
+    class DummySocket:
+        def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):
+            if track:
+                return SimpleNamespace(done=True)
+
+        def recv_multipart(self, *, copy: bool = False):
+            return (b"", b"")
+
+        def close(self, *, linger: int = 0):
+            pass
+
+        def bind(self, _address):
+            pass
+
+        def connect(self, _address):
+            pass
+
+        def setsockopt(self, *_args, **_kwargs):
+            pass
+
+    monkeypatch.setattr(core_client_mod.zmq.Socket, "shadow", lambda *_: ShadowSocket())
+    monkeypatch.setattr(
+        core_client_mod, "make_zmq_socket", lambda *_, **__: DummySocket()
+    )
+
+    parallel_config = SimpleNamespace(
+        data_parallel_size=1,
+        data_parallel_rank=0,
+        data_parallel_index=0,
+        data_parallel_size_local=1,
+        data_parallel_rank_local=None,
+        data_parallel_hybrid_lb=False,
+        data_parallel_external_lb=False,
+    )
+    vllm_config = SimpleNamespace(parallel_config=parallel_config)
+
+    client = core_client_mod.MPClient(
+        asyncio_mode=False,
+        vllm_config=vllm_config,
+        executor_class=object,
+        log_stats=False,
+        client_addresses={
+            "input_address": "inproc://input",
+            "output_address": "inproc://output",
+        },
+    )
+    try:
+        # timeout_value is in seconds, but poll receives milliseconds
+        assert poll_timeouts == [timeout_value * 1000]
+    finally:
+        client.shutdown()
+
+
 def loop_until_done(client: EngineCoreClient, outputs: dict):
     while True:
         engine_core_outputs = client.get_output().outputs
@@ -638,6 +739,7 @@ def test_kv_cache_events(
         )
         assert event.parent_block_hash is None, "Parent block hash should be None"
         assert event.lora_id is None, "Lora id should be None"
+        assert event.lora_name is None, "Lora name should be None"
         assert len(event.token_ids) == num_blocks * block_size, (
             "Token ids should be the same as the custom tokens"
         )
diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py
index a7bf46324bc40bfc15aa25260084c4e34067cc3d..952f6ae0d8aa1fbac906c73c387a2d8a758fe88b 100644
--- a/tests/v1/engine/test_fast_incdec_prefix_err.py
+++ b/tests/v1/engine/test_fast_incdec_prefix_err.py
@@ -29,6 +29,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
     params = SamplingParams(skip_special_tokens=True)
     request = EngineCoreRequest(
         request_id="test",
+        external_req_id="test-ext",
         prompt_token_ids=prompt_token_ids,
         mm_features=None,
         sampling_params=params,
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index ab3f9fdaac3ba05d59a3c0506de2498d90ea7b5f..2791eeb17741195504ade6990b996433dccedc60 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -59,12 +59,12 @@ def test_incremental_detokenization(
     output_processor = OutputProcessor(
         dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
     )
-    engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
 
     # Make N requests.
     requests = [
         EngineCoreRequest(
-            request_id=f"request-{idx}",
+            request_id=f"request-{idx}-int",
+            external_req_id=f"request-{idx}",
             prompt_token_ids=prompt_tokens,
             mm_features=None,
             eos_token_id=None,
@@ -84,6 +84,11 @@ def test_incremental_detokenization(
         for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
+    engine_core = MockEngineCore(
+        tokens_list=dummy_test_vectors.generation_tokens,
+        request_ids=[req.request_id for req in requests],
+    )
+
     # Add requests to the detokenizer.
     for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
         output_processor.add_request(request, prompt)
@@ -270,12 +275,28 @@ def _validate_logprobs(
                     # the logprob token id at this sequence position
                     decoded_token = pos_logprob_dict[lp_tok].decoded_token
                     ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, lp_tok)
-                    assert decoded_token == ref_decoded_token, (
-                        f"Sampled logprob token id {lp_tok} decodes to"
-                        f" {ref_decoded_token} but Logprob decoded"
-                        f" token is {decoded_token} instead"
-                        f" (at position {idx})"
-                    )
+
+                    # With UTF-8 correction logic, tokens ending with "�"
+                    # (incomplete byte sequences) are corrected to either
+                    # empty string or proper UTF-8 characters
+                    if ref_decoded_token.endswith("�"):
+                        # Token needs UTF-8 correction
+                        assert not decoded_token.endswith("�"), (
+                            f"Sampled logprob token id {lp_tok} decodes to"
+                            f" '{ref_decoded_token}' (ends with replacement char)"
+                            f" but corrected decoded token '{decoded_token}'"
+                            f" still ends with replacement char"
+                            f" (at position {idx}). UTF-8 correction should"
+                            f" have removed it."
+                        )
+                    else:
+                        # No correction needed, should match exactly
+                        assert decoded_token == ref_decoded_token, (
+                            f"Sampled logprob token id {lp_tok} decodes to"
+                            f" {ref_decoded_token} but Logprob decoded"
+                            f" token is {decoded_token} instead"
+                            f" (at position {idx})"
+                        )
 
                 ref_cumulative_logprob += pos_logprob_dict[sampled_token].logprob
             # Assert that cumulative logprobs are correct
@@ -416,12 +437,28 @@ def _validate_logprobs(
                     # the logprob token id at this sequence position
                     decoded_token = pos_logprob_dict[plp_tok].decoded_token
                     ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, plp_tok)
-                    assert decoded_token == ref_decoded_token, (
-                        f"Prompt logprob token id {plp_tok} decodes to"
-                        f" {ref_decoded_token} but Logprob decoded"
-                        f" token is {decoded_token} instead"
-                        f" (at position {idx})"
-                    )
+
+                    # With UTF-8 correction logic, tokens ending with "�"
+                    # (incomplete byte sequences) are corrected to either
+                    # empty string or proper UTF-8 characters
+                    if ref_decoded_token.endswith("�"):
+                        # Token needs UTF-8 correction
+                        assert not decoded_token.endswith("�"), (
+                            f"Prompt logprob token id {plp_tok} decodes to"
+                            f" '{ref_decoded_token}' (ends with replacement char)"
+                            f" but corrected decoded token '{decoded_token}'"
+                            f" still ends with replacement char"
+                            f" (at position {idx}). UTF-8 correction should"
+                            f" have removed it."
+                        )
+                    else:
+                        # No correction needed, should match exactly
+                        assert decoded_token == ref_decoded_token, (
+                            f"Prompt logprob token id {plp_tok} decodes to"
+                            f" {ref_decoded_token} but Logprob decoded"
+                            f" token is {decoded_token} instead"
+                            f" (at position {idx})"
+                        )
         else:
             # Prompt logprobs disabled for this request
             assert prompt_logprobs is None
@@ -439,15 +476,6 @@ def test_logprobs_processor(
     dummy_test_vectors,
 ):
     output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
-    engine_core = MockEngineCore(
-        tokens_list=dummy_test_vectors.generation_tokens,
-        generated_logprobs_raw=None
-        if num_sample_logprobs is None
-        else dummy_test_vectors.generation_logprobs,
-        prompt_logprobs_raw=None
-        if num_prompt_logprobs is None
-        else dummy_test_vectors.prompt_logprobs,
-    )
 
     # Make N requests.
     request_id_list = [
@@ -455,7 +483,8 @@ def test_logprobs_processor(
     ]
     requests = [
         EngineCoreRequest(
-            request_id=request_id_list[idx],
+            request_id=request_id_list[idx] + "-int",
+            external_req_id=request_id_list[idx],
             prompt_token_ids=prompt_tokens,
             mm_features=None,
             eos_token_id=None,
@@ -477,6 +506,17 @@ def test_logprobs_processor(
         for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
+    engine_core = MockEngineCore(
+        tokens_list=dummy_test_vectors.generation_tokens,
+        generated_logprobs_raw=None
+        if num_sample_logprobs is None
+        else dummy_test_vectors.generation_logprobs,
+        prompt_logprobs_raw=None
+        if num_prompt_logprobs is None
+        else dummy_test_vectors.prompt_logprobs,
+        request_ids=[req.request_id for req in requests],
+    )
+
     # Add requests to the detokenizer.
     for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
         output_processor.add_request(request, prompt)
@@ -622,19 +662,12 @@ def test_stop_token(
         ]
     prompt_string = dummy_test_vectors.prompt_strings[0]
     prompt_tokens = dummy_test_vectors.prompt_tokens[0]
-    engine_core = MockEngineCore(
-        tokens_list=[generation_tokens],
-        generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
-        prompt_logprobs_raw=None,
-        eos_token_id=eos_token_id,
-        stop_token_ids=stop_token_ids,
-        ignore_eos=ignore_eos,
-    )
 
     # Make request.
     request_id = "request-0"
     request = EngineCoreRequest(
         request_id=request_id,
+        external_req_id=request_id + "-ext",
         prompt_token_ids=prompt_tokens,
         mm_features=None,
         eos_token_id=eos_token_id,
@@ -656,6 +689,16 @@ def test_stop_token(
         pooling_params=None,
     )
 
+    engine_core = MockEngineCore(
+        tokens_list=[generation_tokens],
+        generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
+        prompt_logprobs_raw=None,
+        eos_token_id=eos_token_id,
+        stop_token_ids=stop_token_ids,
+        ignore_eos=ignore_eos,
+        request_ids=[request.request_id],
+    )
+
     # Add request to the detokenizer.
     output_processor.add_request(request, prompt_string)
 
@@ -721,13 +764,6 @@ def test_stop_string(
     dummy_test_vectors,
 ):
     output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
-    engine_core = MockEngineCore(
-        tokens_list=dummy_test_vectors.generation_tokens,
-        generated_logprobs_raw=dummy_test_vectors.generation_logprobs
-        if num_sample_logprobs
-        else None,
-        prompt_logprobs_raw=None,
-    )
 
     # Make N requests.
     request_id_list = [
@@ -735,7 +771,8 @@ def test_stop_string(
     ]
     requests = [
         EngineCoreRequest(
-            request_id=request_id_list[idx],
+            request_id=request_id_list[idx] + "-int",
+            external_req_id=request_id_list[idx],
             prompt_token_ids=prompt_tokens,
             mm_features=None,
             eos_token_id=None,
@@ -757,6 +794,15 @@ def test_stop_string(
         for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
+    engine_core = MockEngineCore(
+        tokens_list=dummy_test_vectors.generation_tokens,
+        generated_logprobs_raw=dummy_test_vectors.generation_logprobs
+        if num_sample_logprobs
+        else None,
+        prompt_logprobs_raw=None,
+        request_ids=[req.request_id for req in requests],
+    )
+
     # Add requests to the detokenizer.
     for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
         output_processor.add_request(request, prompt)
@@ -814,9 +860,12 @@ def test_stop_string(
     for idx, (ref_gen_str, stop_str) in enumerate(
         zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
     ):
-        # Request should be aborted.
+        # Request should be aborted (check internal ID in abort list).
+        internal_request_id = f"request-{idx}-int"
+        assert internal_request_id in aborted
+
+        # Use external ID for collecting outputs
         request_id = f"request-{idx}"
-        assert request_id in aborted
 
         # Collected values that were generated.
         gen_str = gen_strings[request_id]
@@ -849,13 +898,13 @@ def test_stop_string(
 
 def test_iteration_stats(dummy_test_vectors):
     output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
-    engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
     engine_core_timestamp = time.monotonic()
 
     # Make N requests.
     requests = [
         EngineCoreRequest(
             request_id=f"request-{idx}",
+            external_req_id=f"request-{idx}-ext",
             prompt_token_ids=prompt_tokens,
             mm_features=None,
             eos_token_id=None,
@@ -869,6 +918,11 @@ def test_iteration_stats(dummy_test_vectors):
         for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
+    engine_core = MockEngineCore(
+        dummy_test_vectors.generation_tokens,
+        request_ids=[req.request_id for req in requests],
+    )
+
     # Add all requests except one to the OutputProcessor.
     num_active = len(dummy_test_vectors.generation_tokens) - 1
     for request in requests[:num_active]:
@@ -923,7 +977,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
     output_processor = OutputProcessor(
         dummy_test_vectors.tokenizer, log_stats=log_stats
     )
-    engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
     engine_core_timestamp = time.monotonic()
 
     # Create LoRA requests
@@ -937,7 +990,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
     lora_assignments = [lora1, lora2, None]
     requests = [
         EngineCoreRequest(
-            request_id=f"request-{idx}",
+            request_id=f"request-{idx}-int",
+            external_req_id=f"request-{idx}",
             prompt_token_ids=prompt_tokens,
             mm_features=None,
             eos_token_id=None,
@@ -951,6 +1005,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
         for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
+    engine_core = MockEngineCore(
+        dummy_test_vectors.generation_tokens,
+        request_ids=[req.request_id for req in requests],
+    )
+
     # Add all requests to the OutputProcessor
     for request in requests:
         output_processor.add_request(request, None)
@@ -1016,9 +1075,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
     outputs = EngineCoreOutputs(
         outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
     )
-    # Find and mark request-0 as finished (it uses lora-1)
+    # Find and mark request-0-int as finished (it uses lora-1)
     for output in outputs.outputs:
-        if output.request_id == "request-0":
+        if output.request_id == "request-0-int":
             output.finish_reason = FinishReason.LENGTH
             break
 
@@ -1041,9 +1100,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
     outputs = EngineCoreOutputs(
         outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
     )
-    # Find and mark request-1 as finished (it uses lora-2)
+    # Find and mark request-1-int as finished (it uses lora-2)
     for output in outputs.outputs:
-        if output.request_id == "request-1":
+        if output.request_id == "request-1-int":
             output.finish_reason = FinishReason.LENGTH
             break
 
@@ -1065,9 +1124,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
     outputs = EngineCoreOutputs(
         outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
     )
-    # Find and mark request-2 as finished (it has no LoRA)
+    # Find and mark request-2-int as finished (it has no LoRA)
     for output in outputs.outputs:
-        if output.request_id == "request-2":
+        if output.request_id == "request-2-int":
             output.finish_reason = FinishReason.LENGTH
             break
 
@@ -1108,7 +1167,9 @@ async def test_request_output_collector():
             for idx in range(NUM_REQS)
         ]
 
-    collector = RequestOutputCollector(RequestOutputKind.DELTA)
+    collector = RequestOutputCollector(
+        RequestOutputKind.DELTA, request_id="my-request-id-int"
+    )
 
     # CASE 1: Put then get.
     outputs = make_outputs()
@@ -1164,7 +1225,9 @@ async def test_request_output_collector():
 @pytest.mark.asyncio
 async def test_cumulative_output_collector_n():
     """Test collector correctly handles multiple outputs by index."""
-    collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE)
+    collector = RequestOutputCollector(
+        RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
+    )
     outputs = [
         RequestOutput(
             request_id="my-request-id",
@@ -1243,11 +1306,13 @@ async def test_cumulative_output_collector_n():
 
 
 @pytest.mark.parametrize("runner", ["generate", "pooling"])
-def test_abort_requests(runner: str, dummy_test_vectors):
+@pytest.mark.parametrize("abort_by", ["internal", "external"])
+def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
     output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
     requests = [
         EngineCoreRequest(
             request_id=f"request-{idx}",
+            external_req_id=f"external-{idx}",
             prompt_token_ids=prompt_tokens,
             mm_features=None,
             eos_token_id=None,
@@ -1266,8 +1331,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
             output_kind = request.sampling_params.output_kind
         else:
             output_kind = request.pooling_params.output_kind
-        queue = RequestOutputCollector(output_kind=output_kind)
+        queue = RequestOutputCollector(
+            output_kind=output_kind, request_id=request.request_id
+        )
         output_processor.add_request(request, None, queue=queue)
 
     for request in requests:
-        output_processor.abort_requests([request.request_id])
+        if abort_by == "internal":
+            output_processor.abort_requests([request.request_id], internal=True)
+        else:
+            output_processor.abort_requests([request.external_req_id], internal=False)
diff --git a/tests/v1/engine/test_parallel_sampling.py b/tests/v1/engine/test_parallel_sampling.py
index 736c0e54837feaf947142193b362786a5f09570e..fe6f15df209821b03d1bc364270d73cfc081a722 100644
--- a/tests/v1/engine/test_parallel_sampling.py
+++ b/tests/v1/engine/test_parallel_sampling.py
@@ -4,11 +4,12 @@
 from vllm import SamplingParams
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import RequestOutputKind
+from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.parallel_sampling import ParentRequest
 
 
 def test_parent_request_to_output_stream() -> None:
-    parent_request = ParentRequest("parent_id", SamplingParams(n=2))
+    parent_request = ParentRequest(make_request(SamplingParams(n=2)))
     parent_request.child_requests = {"child_id_0", "child_id_1"}
     output_0 = CompletionOutput(
         index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
@@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
         index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
     )
     # Request not finished
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
-        "child_id_1", output_1
-    )
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
-        "child_id_1", output_1
-    )
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
 
     # output_1 finished
     output_1.finish_reason = "ended"
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
-        "child_id_1", output_1
-    )
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
     # Finished output_1 had already returned, DO NOT returned again
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert parent_request.get_outputs("child_id_1", output_1) == (
-        "parent_id",
-        [],
-        False,
-    )
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
 
     # output_0 finished
     output_0.finish_reason = "ended"
-    assert ("parent_id", [output_0], True) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
+    assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
     # Finished output_0 had already returned, DO NOT returned again
-    assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
-    assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
 
 
 def test_parent_request_to_output_final_only() -> None:
     parent_request = ParentRequest(
-        "parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
+        make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
     )
     parent_request.child_requests = {"child_id_0", "child_id_1"}
     output_0 = CompletionOutput(
@@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
         index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
     )
     # Request not finished, return nothing
-    assert parent_request.get_outputs("child_id_0", output_0) == (
-        "parent_id",
-        [],
-        False,
-    )
-    assert parent_request.get_outputs("child_id_1", output_1) == (
-        "parent_id",
-        [],
-        False,
-    )
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
     # output_1 finished, but outputs won't be returned until all child requests finished
     output_1.finish_reason = "ended"
-    assert parent_request.get_outputs("child_id_0", output_0) == (
-        "parent_id",
-        [],
-        False,
-    )
-    assert parent_request.get_outputs("child_id_1", output_1) == (
-        "parent_id",
-        [],
-        False,
-    )
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
     # output_0 finished, as all child requests finished, the output would be returned
     output_0.finish_reason = "ended"
-    assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
+    assert ([output_0, output_1], True) == parent_request.get_outputs(
         "child_id_0", output_0
     )
-    assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
+    assert ([output_0, output_1], True) == parent_request.get_outputs(
         "child_id_1", output_1
     )
+
+
+def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
+    return EngineCoreRequest(
+        request_id="parent_id",
+        external_req_id="ext_parent_id",
+        prompt_token_ids=None,
+        mm_features=None,
+        sampling_params=sampling_params,
+        pooling_params=None,
+        eos_token_id=None,
+        arrival_time=0.0,
+        lora_request=None,
+        cache_salt=None,
+        data_parallel_rank=None,
+    )
diff --git a/tests/v1/engine/test_preprocess_error_handling.py b/tests/v1/engine/test_preprocess_error_handling.py
new file mode 100644
index 0000000000000000000000000000000000000000..13649a52be9adad7aa6efc1e476939b833d78082
--- /dev/null
+++ b/tests/v1/engine/test_preprocess_error_handling.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch.cuda
+
+from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine.core import EngineCore
+
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
+
+
+def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
+    """Test that preprocessing errors are handled gracefully."""
+
+    if current_platform.is_rocm() or current_platform.is_xpu():
+        pytest.skip(
+            "Skipped on ROCm/XPU: this test only works with 'fork', "
+            "but ROCm/XPU uses 'spawn'."
+        )
+
+    assert not torch.cuda.is_initialized(), (
+        "fork needs to be used for the engine "
+        "core process and this isn't possible if cuda is already initialized"
+    )
+
+    # Store original method to call for non-failing requests
+    original_preprocess = EngineCore.preprocess_add_request
+
+    # Monkeypatch to make preprocess_add_request raise an exception
+    # only for requests with "FAIL" in the first token
+    def conditional_failing_preprocess(self, request: EngineCoreRequest):
+        # Fail if the first token id is 333
+        if request.prompt_token_ids and request.prompt_token_ids[0] == 333:
+            raise ValueError("Simulated preprocessing error!")
+        return original_preprocess(self, request)
+
+    monkeypatch.setattr(
+        EngineCore, "preprocess_add_request", conditional_failing_preprocess
+    )
+
+    llm = LLM(model=MODEL_NAME)
+
+    # Create a failing request by crafting a request with an invalid token
+    # We need to use a direct approach since LLM.generate tokenizes for us
+    from vllm.inputs import TokensPrompt
+
+    # This should raise an exception due to the preprocessing failure
+    # Special token id to trigger the failure
+    failing_prompt = TokensPrompt(prompt_token_ids=[333])
+    outputs = llm.generate(failing_prompt, SamplingParams(max_tokens=10))  # type: ignore
+    assert len(outputs) == 1
+    assert len(outputs[0].outputs[0].token_ids) == 0
+    assert outputs[0].finished
+    assert outputs[0].outputs[0].finish_reason == "error"
+
+    # Verify the engine is still functional with a normal request
+    outputs = llm.generate("Hello, my name is", SamplingParams(max_tokens=10))
+    assert len(outputs) == 1
+    assert len(outputs[0].outputs[0].token_ids) > 0
+    assert outputs[0].outputs[0].finish_reason in ("stop", "length")
diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py
index 1b11b8af49d17bf360426d07d7557812440dd61d..1a16e391316f1958aabd1509ce9232289159a381 100644
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@@ -6,6 +6,7 @@ import pytest
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.multimodal import MultiModalUUIDDict
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import input_processor as input_processor_mod
 from vllm.v1.engine.input_processor import InputProcessor
@@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
         monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
     )
 
-    captured: dict[str, object] = {}
+    captured: dict[str, MultiModalUUIDDict] = {}
 
     def fake_preprocess(
         prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
@@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
     )
 
     # Expect request-id-based overrides are passed through
-    assert captured["mm_uuids"] == {
-        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
-        "video": [f"{request_id}-video-0"],
-    }
+    mm_uuids = captured["mm_uuids"]
+    assert set(mm_uuids.keys()) == {"image", "video"}
+    assert len(mm_uuids["image"]) == 2
+    assert len(mm_uuids["video"]) == 1
+    assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
+        "image"
+    ][0].endswith("-0")
+    assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
+        "image"
+    ][1].endswith("-1")
+    assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
+        "video"
+    ][0].endswith("-0")
diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py
index 3541ef89bfc14c2018a88582ca80f49f1517188c..d14775668147e6784fe45b5c19e59ceec0e72231 100644
--- a/tests/v1/engine/utils.py
+++ b/tests/v1/engine/utils.py
@@ -343,6 +343,7 @@ class MockEngineCore:
         eos_token_id: int | None = None,
         stop_token_ids: list[int] | None = None,
         ignore_eos: bool = False,
+        request_ids: list[str] | None = None,
     ) -> None:
         self.num_requests = len(tokens_list)
         self.tokens_list = tokens_list
@@ -355,6 +356,11 @@ class MockEngineCore:
         self.eos_token_id = eos_token_id
         self.stop_token_ids = stop_token_ids
         self.ignore_eos = ignore_eos
+        self.request_ids = (
+            request_ids
+            if request_ids is not None
+            else [f"request-{i}" for i in range(self.num_requests)]
+        )
 
     def get_outputs(self) -> list[EngineCoreOutput]:
         do_logprobs = self.do_logprobs
@@ -386,7 +392,7 @@ class MockEngineCore:
                     prompt_logprobs = None
                 new_token_id = token_ids[token_idx]
                 output = EngineCoreOutput(
-                    request_id=f"request-{req_idx}",
+                    request_id=self.request_ids[req_idx],
                     new_token_ids=[new_token_id],
                     new_logprobs=logprobs,
                     new_prompt_logprobs_tensors=prompt_logprobs,
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index 8ba9d275bc9ffbb2bc880c0e673b05836c409668..ec4b7f18d84c8dfe60d549d8216b3f78e3eaba8c 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -610,7 +610,7 @@ Make the response as short as possible.
 
 
 @pytest.mark.parametrize(
-    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
+    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config, async_scheduling",  # noqa: E501
     [
         (
             "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
@@ -618,8 +618,10 @@ Make the response as short as possible.
             "auto",
             "deepseek_r1",
             NGRAM_SPEC_CONFIG,
+            False,
         ),
-        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None),
+        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, False),
+        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, True),
     ],
 )
 def test_structured_output_with_reasoning_matrices(
@@ -628,6 +630,7 @@ def test_structured_output_with_reasoning_matrices(
     reasoning_parser: str,
     model_name: str,
     speculative_config: dict[str, Any] | None,
+    async_scheduling: bool,
 ):
     if current_platform.is_tpu() and speculative_config:
         pytest.skip("TPU does not support speculative decoding")
@@ -648,6 +651,7 @@ def test_structured_output_with_reasoning_matrices(
         ),
         tokenizer_mode=tokenizer_mode,
         speculative_config=speculative_config,
+        async_scheduling=async_scheduling,
     )
     tokenizer = llm.get_tokenizer()
     reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(
diff --git a/tests/v1/entrypoints/openai/serving_responses/test_image.py b/tests/v1/entrypoints/openai/serving_responses/test_image.py
index be5693bbf27360efb0004599a9b41c317afe79ce..644d8ce00686e0d1a36121ac25a132a51420e2ac 100644
--- a/tests/v1/entrypoints/openai/serving_responses/test_image.py
+++ b/tests/v1/entrypoints/openai/serving_responses/test_image.py
@@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio
 
 from tests.utils import RemoteOpenAIServer
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 
 # Use a small vision model for testing
 MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
@@ -52,9 +52,9 @@ async def client(image_server):
 
 
 @pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
+def url_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
+        image_url: encode_image_url(local_asset_server.get_image_asset(image_url))
         for image_url in TEST_IMAGE_ASSETS
     }
 
@@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
     client: openai.AsyncOpenAI,
     model_name: str,
     raw_image_url: str,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
     content_text = "What's in this image?"
     messages = [
@@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
             "content": [
                 {
                     "type": "input_image",
-                    "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",  # noqa: E501
+                    "image_url": url_encoded_image[raw_image_url],
                     "detail": "auto",
                 },
                 {"type": "input_text", "text": content_text},
diff --git a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a9c2c47eba47f4acde977247f099f335b413bd18
--- /dev/null
+++ b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Utility to run integration tests sequentially with varying TP configurations.
+SCRIPT="v1/kv_connector/nixl_integration/run_accuracy_test.sh"
+
+# Define test configurations
+tp_configs=(
+  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2"
+  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2"
+  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1"
+  "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case
+  "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
+  "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
+)
+dp_ep_configs=(
+"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) 
+"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP2, D-DPEP=2 (TP=1) 
+)
+
+# Select config array based on DP_EP env var
+if [[ -n "${DP_EP:-}" ]]; then
+  configs=("${dp_ep_configs[@]}")
+  echo "DP_EP is set, using dp_ep_configs"
+else
+  configs=("${tp_configs[@]}")
+fi
+
+run_tests() {
+  local label=$1
+  local extra_args=$2
+
+  echo "=== Running tests (${label}) ==="
+  for cfg in "${configs[@]}"; do
+    echo "-> Running with ${cfg} ${extra_args:+and ${extra_args}}"
+    # Use 'env' to safely set variables without eval
+    if ! env ${cfg} bash "${SCRIPT}" ${extra_args}; then
+      echo "❌ Test failed for config: ${cfg} ${extra_args:+(${extra_args})}"
+      exit 1
+    fi
+  done
+  echo "✅ All ${label} tests passed!"
+}
+
+# Run tests
+run_tests "default backend" ""
+
+# Check if FLASHINFER is set (non-empty)
+if [[ -n "${FLASHINFER:-}" ]]; then
+  echo "FLASHINFER is set, rerunning with --attention-backend FLASHINFER"
+  run_tests "FLASHINFER backend" "--attention-backend FLASHINFER"
+else
+  echo "FLASHINFER not set, skipping FLASHINFER runs."
+fi
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index 453ccc81eb14a4bb2a6578652903e6cd7dcdd085..c2c38f51c50031ff37979a73ca5244c37ba85165 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -3,21 +3,29 @@ set -xe
 
 # Parse command line arguments
 KV_BUFFER_DEVICE="cuda"  # Default to cuda
+ATTENTION_BACKEND=""  # Default to empty (use vllm default)
 while [[ $# -gt 0 ]]; do
   case $1 in
     --kv_buffer_device)
       KV_BUFFER_DEVICE="$2"
       shift 2
       ;;
+    --attention-backend)
+      ATTENTION_BACKEND="$2"
+      shift 2
+      ;;
     *)
       echo "Unknown option $1"
-      echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
+      echo "Usage: $0 [--kv_buffer_device <cuda|cpu>] [--attention-backend <backend>]"
       exit 1
       ;;
   esac
 done
 
 echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
+if [[ -n "$ATTENTION_BACKEND" ]]; then
+  echo "Using attention backend: $ATTENTION_BACKEND"
+fi
 
 DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
 if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
@@ -148,6 +156,11 @@ run_tests_for_model() {
     --tensor-parallel-size $PREFILLER_TP_SIZE \
     --kv-transfer-config '$KV_CONFIG'"
 
+    # Add attention backend config if specified
+    if [[ -n "$ATTENTION_BACKEND" ]]; then
+      BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
+    fi
+
     if [ -n "$model_args" ]; then
     FULL_CMD="$BASE_CMD $model_args"
     else
@@ -188,7 +201,12 @@ run_tests_for_model() {
     --block-size ${DECODE_BLOCK_SIZE} \
     --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
     --kv-transfer-config '$KV_CONFIG'"
-  
+
+    # Add attention backend config if specified
+    if [[ -n "$ATTENTION_BACKEND" ]]; then
+      BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
+    fi
+
   # DP-EP attention mode
   if [[ -z "$DP_EP" ]]; then
     BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE"
diff --git a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
deleted file mode 100755
index 9308c81da06353a299c8633b1e0b01eb62d4c873..0000000000000000000000000000000000000000
--- a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Utility to run integration tests sequentially with varying TP configurations.
-SCRIPT="v1/kv_connector/nixl_integration/run_accuracy_test.sh"
-
-# Define test configurations
-configs=(
-  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2"
-  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2"
-  "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case
-  "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
-  "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) 
-)
-
-run_tests() {
-  local label=$1
-  local extra_env=$2
-
-  echo "=== Running tests (${label}) ==="
-  for cfg in "${configs[@]}"; do
-    echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}"
-    # Use 'env' to safely set variables without eval
-    if ! env ${extra_env} ${cfg} bash "${SCRIPT}"; then
-      echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}"
-      exit 1
-    fi
-  done
-  echo "✅ All ${label} tests passed!"
-}
-
-# Run tests
-run_tests "default backend" ""
-
-# Check if FLASHINFER is set (non-empty)
-if [[ -n "${FLASHINFER:-}" ]]; then
-  echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER"
-  run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER"
-else
-  echo "FLASHINFER not set, skipping FLASHINFER runs."
-fi
diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
index 0d29ca5fca5e52ac87092a3664372f0af1284f8f..da6a5aadbc6d569bbe79b47cecdf7b3b8ee24669 100644
--- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py
+++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
@@ -14,12 +14,12 @@ from unittest.mock import patch
 
 import pytest
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
     KVConnectorBase_V1,
     KVConnectorRole,
 )
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 
 from .utils import create_scheduler, create_vllm_config
diff --git a/tests/v1/kv_connector/unit/test_config.py b/tests/v1/kv_connector/unit/test_config.py
index 6cf86f3d5c4ac09f8072efa1ac50c0e8ecc92534..8a547c3f03f9565f6301ce6813fec639ac6ef8c2 100644
--- a/tests/v1/kv_connector/unit/test_config.py
+++ b/tests/v1/kv_connector/unit/test_config.py
@@ -15,11 +15,12 @@ pytestmark = pytest.mark.cpu_test
     [
         ("native", 4.0, 1, 1, "OffloadingConnector", 4.0 * (1 << 30)),
         # bytes per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
-        ("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30) / 4),
+        ("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30)),
         ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
         # size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
         ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
-        (None, None, 1, 1, None, None),
+        # When kv_offloading_size is None, offloading is disabled (backend is ignored)
+        ("native", None, 1, 1, None, None),
     ],
 )
 def test_kv_connector(
@@ -54,8 +55,7 @@ def test_kv_connector(
     assert kv_transfer_config.kv_role == "kv_both"
 
     if kv_offloading_backend == "native":
-        assert kv_connector_extra_config["kv_bytes_per_rank"] == expected_bytes
-        assert kv_connector_extra_config["num_cpu_blocks"] == 0
+        assert kv_connector_extra_config["cpu_bytes_to_use"] == expected_bytes
         # Existing config should be preserved
         assert kv_connector_extra_config["existing_key"] == "existing_value"
     elif kv_offloading_backend == "lmcache":
@@ -63,3 +63,19 @@ def test_kv_connector(
         assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
         # Existing config should be replaced
         assert "existing_key" not in kv_connector_extra_config
+
+
+def test_kv_offloading_size_only_uses_native_default():
+    """Test that setting only kv_offloading_size enables native offloading."""
+    vllm_config = VllmConfig(
+        cache_config=CacheConfig(
+            kv_offloading_size=4.0,
+            # kv_offloading_backend not set, should default to "native"
+        ),
+    )
+
+    kv_transfer_config = vllm_config.kv_transfer_config
+    kv_connector_extra_config = kv_transfer_config.kv_connector_extra_config
+    assert kv_transfer_config.kv_connector == "OffloadingConnector"
+    assert kv_transfer_config.kv_role == "kv_both"
+    assert kv_connector_extra_config["cpu_bytes_to_use"] == 4.0 * (1 << 30)
diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py
index 75edb79fb4af45f53cff34c0a0acd086d6a22006..d415608c95faa4bae9ea1ffe2ee059f2685903fb 100644
--- a/tests/v1/kv_connector/unit/test_example_connector.py
+++ b/tests/v1/kv_connector/unit/test_example_connector.py
@@ -9,7 +9,7 @@ from PIL import Image
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import KVTransferConfig
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 from vllm.platforms import current_platform
 
 MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
@@ -74,7 +74,7 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
     placeholders = [
         {
             "type": "image_url",
-            "image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"},
+            "image_url": {"url": encode_image_url(image_pil)},
         }
         for image_pil in image_urls
     ]
@@ -145,7 +145,7 @@ def test_shared_storage_connector_hashes(tmp_path):
 
     # don't put this import at the top level
     # it will call torch.cuda.device_count()
-    from transformers import AutoProcessor  # noqa: F401
+    from transformers import AutoProcessor
 
     # Create processor to handle the chat prompt
     processor = AutoProcessor.from_pretrained(MODEL_NAME)
diff --git a/tests/v1/kv_connector/unit/test_lmcache_connector.py b/tests/v1/kv_connector/unit/test_lmcache_connector.py
index 6a8cfc71a67a68c956b3256e46940e647e37c094..c3df2b68b1ff12481b3f977ec4dab6ea49b20f53 100644
--- a/tests/v1/kv_connector/unit/test_lmcache_connector.py
+++ b/tests/v1/kv_connector/unit/test_lmcache_connector.py
@@ -25,6 +25,7 @@ def mock_lmcache_engine_event():
             lora_id,
             block_size,
             medium,
+            lora_name,
         ):
             self.block_hashes = block_hashes
             self.parent_block_hash = parent_block_hash
@@ -32,6 +33,7 @@ def mock_lmcache_engine_event():
             self.lora_id = lora_id
             self.block_size = block_size
             self.medium = medium
+            self.lora_name = lora_name
 
     return MockEvent(
         block_hashes=["hash1", "hash2"],
@@ -40,6 +42,7 @@ def mock_lmcache_engine_event():
         lora_id=None,
         block_size=16,
         medium="GPU",
+        lora_name=None,
     )
 
 
@@ -109,6 +112,7 @@ class TestGetKVConnectorKVCacheEvents:
         assert events[0].lora_id is None
         assert events[0].block_size == 16
         assert events[0].medium == "GPU"
+        assert events[0].lora_name is None
 
     def test_converts_multiple_events(self, mock_connector):
         """Test conversion of multiple events from lmcache engine format."""
@@ -121,6 +125,7 @@ class TestGetKVConnectorKVCacheEvents:
                 self.lora_id = None
                 self.block_size = 16
                 self.medium = "GPU"
+                self.lora_name = None
 
         events = [MockEvent(i) for i in range(5)]
         mock_connector._lmcache_engine.get_kv_events.return_value = events
@@ -150,6 +155,7 @@ class TestGetKVConnectorKVCacheEvents:
                 self.lora_id = 42
                 self.block_size = 32
                 self.medium = "DISK"
+                self.lora_name = "lora_example"
 
         mock_connector._lmcache_engine.get_kv_events.return_value = [
             MockEventWithLora()
@@ -166,6 +172,7 @@ class TestGetKVConnectorKVCacheEvents:
         assert event.lora_id == 42
         assert event.block_size == 32
         assert event.medium == "DISK"
+        assert event.lora_name == "lora_example"
 
     def test_handles_none_parent_block_hash(self, mock_connector):
         """Test handling of events with None parent_block_hash."""
@@ -178,6 +185,7 @@ class TestGetKVConnectorKVCacheEvents:
                 self.lora_id = None
                 self.block_size = 16
                 self.medium = "GPU"
+                self.lora_name = None
 
         mock_connector._lmcache_engine.get_kv_events.return_value = [
             MockEventNoParent()
@@ -223,6 +231,7 @@ class TestUpdateConnectorOutput:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         kv_events.add_events([event])
 
@@ -243,6 +252,7 @@ class TestUpdateConnectorOutput:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         existing_events.add_events([event1])
         existing_events.add_events([event1])  # Simulate 2 workers reporting
@@ -258,6 +268,7 @@ class TestUpdateConnectorOutput:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         new_events.add_events([event2])
 
@@ -288,6 +299,7 @@ class TestUpdateConnectorOutput:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         new_events.add_events([event])
 
@@ -309,6 +321,7 @@ class TestUpdateConnectorOutput:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         events1.add_events([event1])
         output1 = KVConnectorOutput(kv_cache_events=events1)
@@ -323,6 +336,7 @@ class TestUpdateConnectorOutput:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         events2.add_events([event2])
         output2 = KVConnectorOutput(kv_cache_events=events2)
@@ -337,6 +351,7 @@ class TestUpdateConnectorOutput:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         events3.add_events([event3])
         output3 = KVConnectorOutput(kv_cache_events=events3)
@@ -358,6 +373,7 @@ class TestUpdateConnectorOutput:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         events1.add_events([event1])
         output1 = KVConnectorOutput(kv_cache_events=events1)
@@ -397,6 +413,7 @@ class TestTakeEvents:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         event2 = BlockStored(
             block_hashes=["hash2"],
@@ -405,6 +422,7 @@ class TestTakeEvents:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         kv_events.add_events([event1, event2])
         mock_connector._kv_cache_events = kv_events
@@ -431,6 +449,7 @@ class TestTakeEvents:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         uncommon_event = BlockStored(
             block_hashes=["hash_uncommon"],
@@ -439,6 +458,7 @@ class TestTakeEvents:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
 
         # All 3 workers report common_event
@@ -469,6 +489,7 @@ class TestTakeEvents:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         kv_events1.add_events([event1])
         mock_connector._kv_cache_events = kv_events1
@@ -491,6 +512,7 @@ class TestTakeEvents:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         kv_events2.add_events([event2])
         mock_connector._kv_cache_events = kv_events2
@@ -510,6 +532,7 @@ class TestTakeEvents:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
         event2 = BlockStored(
             block_hashes=["hash2"],
@@ -518,6 +541,7 @@ class TestTakeEvents:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
 
         # Worker 1 reports event1
@@ -572,6 +596,7 @@ class TestIntegrationScenarios:
                 self.lora_id = None
                 self.block_size = 16
                 self.medium = "GPU"
+                self.lora_name = None
 
         # Worker 1
         mock_connector._lmcache_engine.get_kv_events.return_value = [
@@ -628,6 +653,7 @@ class TestIntegrationScenarios:
                 self.lora_id = None
                 self.block_size = 16
                 self.medium = "GPU"
+                self.lora_name = None
 
         for cycle in range(3):
             # Get events
@@ -667,6 +693,7 @@ class TestIntegrationScenarios:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
 
         worker1_unique_event = BlockStored(
@@ -676,6 +703,7 @@ class TestIntegrationScenarios:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
 
         worker2_unique_event = BlockStored(
@@ -685,6 +713,7 @@ class TestIntegrationScenarios:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
 
         worker3_unique_event = BlockStored(
@@ -694,6 +723,7 @@ class TestIntegrationScenarios:
             block_size=16,
             lora_id=None,
             medium="GPU",
+            lora_name=None,
         )
 
         # Create events for each worker
diff --git a/tests/v1/kv_connector/unit/test_moriio_connector.py b/tests/v1/kv_connector/unit/test_moriio_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cc6988635d8dda177a3220915e39cbcf199fa7a
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_moriio_connector.py
@@ -0,0 +1,545 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib.util
+import os
+from unittest.mock import MagicMock, patch
+
+import msgspec
+import pytest
+import torch
+import zmq
+
+from tests.conftest import _find_free_port
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    KVTransferConfig,
+    ModelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_common import (
+    MoRIIOAgentMetadata,
+    MoRIIOConnectorMetadata,
+    MoRIIOConstants,
+    zmq_ctx,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector import (
+    KVConnectorRole,
+    MoRIIOConnector,
+    MoRIIOConnectorWorker,
+)
+from vllm.platforms import current_platform
+from vllm.utils.network_utils import (
+    get_ip,
+    make_zmq_path,
+)
+
+from .utils import create_request, create_scheduler
+
+aiter_available = importlib.util.find_spec("aiter") is not None
+mori_available = importlib.util.find_spec("mori") is not None
+pytestmark = pytest.mark.skipif(
+    not (current_platform.is_rocm() and mori_available),
+    reason="MoRIIOs are only available on ROCm with aiter package installed",
+)
+
+
+@pytest.fixture
+def mock_parallel_groups():
+    """Mock tensor/data parallel group functions for single-rank tests."""
+    mock_group = MagicMock()
+    mock_group.rank = 0
+    mock_group.local_rank = 0
+    mock_group.world_size = 1
+
+    with (
+        patch.multiple(
+            "vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_common",
+            get_tensor_model_parallel_rank=MagicMock(return_value=0),
+            get_tensor_model_parallel_world_size=MagicMock(return_value=0),
+        ),
+        patch.multiple(
+            "vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector",
+            get_tensor_model_parallel_world_size=MagicMock(return_value=0),
+            get_world_group=MagicMock(return_value=mock_group),
+            get_tp_group=MagicMock(return_value=mock_group),
+        ),
+    ):
+        yield mock_group
+
+
+def _setup_kv_transfer_request(request, remote_host="127.0.0.1", fake_port=4789):
+    """Setup KV transfer parameters for a request."""
+    request.kv_transfer_params.update(
+        {
+            "remote_notify_port": fake_port,
+            "remote_block_ids": None,
+            "remote_host": remote_host,
+            "remote_port": fake_port,
+            "remote_handshake_port": fake_port,
+            "remote_engine_id": "test_engine",
+        }
+    )
+    return request
+
+
+class FakeMorIIOWrapper:
+    # A fake MoRIIOWrapper for testing purposes
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def set_moriio_engine(self, moriio_engine):
+        pass
+
+    def set_backend_type(self, backend_type):
+        pass
+
+    def get_agent_metadata(self):
+        pass
+
+    def register_remote_engine(self, remote_packed_engine_metadata):
+        pass
+
+    def register_local_tensor(self, tensor: torch.Tensor):
+        pass
+
+    def get_unpack_memory_metadata(self, packed_memory_metadata):
+        pass
+
+    def build_session(self, local_memory_metadata, remote_memory_metadata):
+        pass
+
+    def read_remote_data(
+        self, transfer_size_byte, local_offset=0, remote_offset=0, session=None
+    ):
+        pass
+
+    def write_remote_data(
+        self, transfer_size_byte, local_offset=0, remote_offset=0, session=None
+    ):
+        pass
+
+    def write_remote_data_single(
+        self, transfer_size_byte, local_offset=0, remote_offset=0, sess_idx=0
+    ):
+        pass
+
+    def waiting_for_transfer_complete(self):
+        pass
+
+    def async_wait_reqid(self):
+        pass
+
+    def _handle_message(self, msg: bytes):
+        pass
+
+    def _handle_structured_message(self, data: dict):
+        pass
+
+    def _handle_completion_message(self, msg: str):
+        pass
+
+    def send_notify(self, req_ids, remote_ip, remote_port):
+        pass
+
+    def pop_finished_req_ids(self):
+        pass
+
+    def pop_finished_write_req_ids(self):
+        pass
+
+    def shutdown(self):
+        pass
+
+
+class FakeMorIIOConnectorWorker(MoRIIOConnectorWorker):
+    # Define a fake remote engine id for testing
+    REMOTE_ENGINE_ID = "remote_engine"
+
+    def __init__(
+        self, *args, hand_shake_latency: float = 1.8, kv_cache_layout="HND", **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+
+
+def create_vllm_config(
+    model: str = "facebook/opt-125m",
+    max_num_seqs: int = 16,
+    max_num_batched_tokens: int = 64,
+    block_size: int = 16,
+    max_model_len: int = 10000,
+    enable_chunked_prefill: bool = True,
+    enable_permute_local_kv: bool = False,
+    role="kv_consumer",
+) -> VllmConfig:
+    """Initialize VllmConfig for testing."""
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_model_len=max_model_len,
+        enable_chunked_prefill=enable_chunked_prefill,
+        is_encoder_decoder=False,
+    )
+    model_config = ModelConfig(
+        model=model,
+        trust_remote_code=True,
+        dtype="bfloat16",
+        seed=42,
+    )
+    # Cache config, optionally force APC
+    cache_config = CacheConfig(
+        block_size=block_size,
+        gpu_memory_utilization=0.9,
+        swap_space=0,
+        cache_dtype="auto",
+        enable_prefix_caching=True,
+    )
+    kv_transfer_config = KVTransferConfig(
+        kv_connector="MoRIIOConnector",
+        kv_role=role,
+        enable_permute_local_kv=enable_permute_local_kv,
+    )
+    return VllmConfig(
+        scheduler_config=scheduler_config,
+        model_config=model_config,
+        cache_config=cache_config,
+        kv_transfer_config=kv_transfer_config,
+        device_config=DeviceConfig("cpu"),
+    )
+
+
+@pytest.fixture
+def moriio_read_mode():
+    """Force the connector into read mode via env for tests."""
+    os.environ["VLLM_MORIIO_CONNECTOR_READ_MODE"] = "True"
+    yield
+    # Cleanup after test
+    os.environ.pop("VLLM_MORIIO_CONNECTOR_READ_MODE", None)
+
+
+def test_write_mode_saves_local_block_ids():
+    """Write mode records local block ids in MoRIIOConnectorMetadata.reqs_to_save."""
+
+    # Setup Scheduler and Request
+    vllm_config = create_vllm_config(role="kv_producer")
+    scheduler = create_scheduler(vllm_config)
+
+    # 2 Full Blocks and 1 Half Block.
+    BLOCK_SIZE = vllm_config.cache_config.block_size
+    NUM_EXTERNAL_FULL_BLOCKS = 2
+    NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
+
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_decode=True,
+        do_remote_prefill=False,
+    )
+    request_id = request.request_id
+
+    scheduler.add_request(request)
+
+    # Fake Config
+    request = _setup_kv_transfer_request(request)
+
+    # Remote Prefill, triggers MoRIIOConnectorMetadata.
+    scheduler_output = scheduler.schedule()
+    kv_connector_metadata = scheduler_output.kv_connector_metadata
+    assert kv_connector_metadata is not None, "kv_connector_metadata is None"
+    assert isinstance(kv_connector_metadata, MoRIIOConnectorMetadata)
+
+    assert len(kv_connector_metadata.reqs_to_save) == 1, (
+        "Unexpected number of reqs_to_save"
+    )
+    assert len(kv_connector_metadata.reqs_to_recv) == 0, (
+        "Unexpected number of reqs_to_recv"
+    )
+    assert len(kv_connector_metadata.reqs_to_send) == 0, (
+        "Unexpected number of reqs_to_send"
+    )
+    assert request_id in kv_connector_metadata.reqs_to_save, (
+        "Request ID not in reqs_to_save"
+    )
+    req_meta = kv_connector_metadata.reqs_to_save[request_id]
+
+    for block_id, block in zip(
+        req_meta.local_block_ids,
+        scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks[
+            request_id
+        ],
+    ):
+        assert block_id == block.block_id, f"{block_id} != {block.block_id}"
+
+
+def test_write_mode_with_chunked_prefill_saves_local_block_ids():
+    """Write mode with chunked prefill still records correct local block ids."""
+    # Setup Scheduler and Request
+    MAX_NUM_BATCHED_TOKENS = 64
+    NUM_TOKENS = MAX_NUM_BATCHED_TOKENS * 2 + MAX_NUM_BATCHED_TOKENS // 2
+
+    vllm_config = create_vllm_config(
+        max_num_batched_tokens=MAX_NUM_BATCHED_TOKENS, role="kv_producer"
+    )
+    BLOCK_SIZE = vllm_config.cache_config.block_size
+
+    scheduler = create_scheduler(vllm_config)
+
+    # 2 Full Blocks and 1 Half Block.
+
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_decode=True,
+        do_remote_prefill=False,
+    )
+    request_id = request.request_id
+
+    scheduler.add_request(request)
+
+    # Fake Config
+    request = _setup_kv_transfer_request(request)
+
+    # Remote Prefill with chunked prefill, triggers multiple schedules.
+    expected_counts = [(0, 0, 0), (0, 0, 0), (1, 0, 0)]
+    kv_connector_metadata = None
+    for _, (expected_save, expected_recv, expected_send) in enumerate(expected_counts):
+        scheduler_output = scheduler.schedule()
+        kv_connector_metadata = scheduler_output.kv_connector_metadata
+
+        assert len(kv_connector_metadata.reqs_to_save) == expected_save
+        assert len(kv_connector_metadata.reqs_to_recv) == expected_recv
+        assert len(kv_connector_metadata.reqs_to_send) == expected_send
+    assert kv_connector_metadata is not None, "kv_connector_metadata is None"
+    assert request_id in kv_connector_metadata.reqs_to_save, (
+        "Request ID not in reqs_to_save"
+    )
+    req_meta = kv_connector_metadata.reqs_to_save[request_id]
+
+    for block_id, block in zip(
+        req_meta.local_block_ids,
+        scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks[
+            request_id
+        ],
+    ):
+        assert block_id == block.block_id, f"{block_id} != {block.block_id}"
+
+
+def test_read_mode_loads_remote_block_ids(moriio_read_mode):
+    """Read mode loads remote block ids into local cache mapping."""
+
+    # Setup Scheduler and Request
+    vllm_config = create_vllm_config(role="kv_consumer")
+    scheduler = create_scheduler(vllm_config)
+
+    # 2 Full Blocks and 1 Half Block.
+    BLOCK_SIZE = vllm_config.cache_config.block_size
+    NUM_EXTERNAL_FULL_BLOCKS = 2
+    NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
+
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_decode=False,
+        do_remote_prefill=True,
+    )
+    request_id = request.request_id
+
+    scheduler.add_request(request)
+    block_list = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0
+    ].req_to_blocks[request_id]
+
+    request = _setup_kv_transfer_request(request)
+
+    # Set remote block ids to be fetched.
+    request.kv_transfer_params["remote_block_ids"] = block_list
+
+    # Remote Prefill, triggers MorIIOConnectorMetadata.
+
+    scheduler_output = scheduler.schedule()
+    kv_connector_metadata = scheduler_output.kv_connector_metadata
+    assert kv_connector_metadata is not None, "kv_connector_metadata is None"
+    assert isinstance(kv_connector_metadata, MoRIIOConnectorMetadata), (
+        "kv_connector_metadata is not MoRIIOConnectorMetadata"
+    )
+    assert len(kv_connector_metadata.reqs_to_save) == 0, (
+        "Unexpected number of reqs_to_save"
+    )
+    assert len(kv_connector_metadata.reqs_to_recv) == 1, (
+        "Unexpected number of reqs_to_recv"
+    )
+    assert len(kv_connector_metadata.reqs_to_send) == 0, (
+        "Unexpected number of reqs_to_send"
+    )
+    assert request_id in kv_connector_metadata.reqs_to_recv, (
+        "Request ID not in reqs_to_recv"
+    )
+    req_meta = kv_connector_metadata.reqs_to_recv[request_id]
+
+    for block_id, block in zip(
+        req_meta.local_block_ids,
+        scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks[
+            request_id
+        ],
+    ):
+        assert block_id == block.block_id, f"{block_id} != {block.block_id}"
+
+
+@pytest.mark.skipif(
+    not aiter_available, reason="Requires aiter package for ROCm FlashAttention backend"
+)
+def test_register_kv_caches(mock_parallel_groups):
+    """Test that MoRIIOConnector.register_kv_caches correctly registers kv caches."""
+    ROLE = "kv_consumer"
+    IP = get_ip()
+    vllm_config = create_vllm_config(role=ROLE)
+    DEFAULT_PORT = 6301
+    TP_RANK = 0
+    DP_RANK = 0
+    from vllm.v1.attention.backends.rocm_aiter_fa import AiterFlashAttentionBackend
+
+    backend_cls = AiterFlashAttentionBackend
+
+    # Create test kv cache tensors using proper backend shape
+    kv_cache_shape = backend_cls.get_kv_cache_shape(
+        num_blocks=2, block_size=16, num_kv_heads=4, head_size=64
+    )
+    shared_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
+    unique_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
+    kv_caches = {
+        "layer0": shared_tensor,
+        "layer1": unique_tensor,
+        "layer2": shared_tensor,
+    }
+
+    with (
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector.threading.Event"
+        ),
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector.threading.Thread"
+        ),
+    ):
+        # Create connector
+        vllm_config.kv_transfer_config.kv_connector_extra_config.update(
+            {
+                "proxy_ip": "127.0.0.1",
+                "proxy_ping_port": 12345,
+                "http_port": 12346,
+            }
+        )
+
+        connector = MoRIIOConnector(vllm_config, KVConnectorRole.WORKER)
+        connector.connector_worker = FakeMorIIOConnectorWorker(
+            vllm_config, connector.engine_id, hand_shake_latency=0
+        )
+
+        from mori.io import (
+            MemoryDesc,
+        )
+
+        # Execute register_kv_caches
+        connector.register_kv_caches(kv_caches)
+
+        # Verify that the MemoryDesc stored in layer_name_to_local_kv_cache_metadata
+        assert (
+            shared_tensor.data_ptr()
+            == MemoryDesc.unpack(
+                connector.connector_worker.layer_name_to_local_kv_cache_metadata[
+                    "layer0"
+                ][0]
+            ).data
+        )
+        assert (
+            unique_tensor.data_ptr()
+            == MemoryDesc.unpack(
+                connector.connector_worker.layer_name_to_local_kv_cache_metadata[
+                    "layer1"
+                ][0]
+            ).data
+        )
+        assert (
+            shared_tensor.data_ptr()
+            == MemoryDesc.unpack(
+                connector.connector_worker.layer_name_to_local_kv_cache_metadata[
+                    "layer2"
+                ][0]
+            ).data
+        )
+
+        # Verify engine keys
+        expected_engine_key = f"{ROLE[3:]}:{IP}:{DEFAULT_PORT}:tp{TP_RANK}:dp{DP_RANK}"
+        assert (
+            MemoryDesc.unpack(
+                connector.connector_worker.layer_name_to_local_kv_cache_metadata[
+                    "layer0"
+                ][0]
+            ).engine_key
+            == expected_engine_key
+        )
+
+
+@pytest.mark.skipif(
+    not aiter_available, reason="Requires aiter package for ROCm FlashAttention backend"
+)
+def test_moriio_handshake_returns_metadata(mock_parallel_groups):
+    """MoRIIO handshake socket returns valid agent metadata over ZMQ."""
+
+    ROLE = "kv_consumer"
+    vllm_config = create_vllm_config(role=ROLE)
+    from vllm.v1.attention.backends.rocm_aiter_fa import AiterFlashAttentionBackend
+
+    backend_cls = AiterFlashAttentionBackend
+
+    # Create test kv cache tensors using proper backend shape
+    kv_cache_shape = backend_cls.get_kv_cache_shape(
+        num_blocks=2, block_size=16, num_kv_heads=4, head_size=64
+    )
+    shared_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
+    unique_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
+    kv_caches = {
+        "layer0": shared_tensor,
+        "layer1": unique_tensor,
+        "layer2": shared_tensor,
+    }
+
+    with (
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine.MoRIIOWrapper",
+            FakeMorIIOWrapper,
+        ),
+    ):
+        handshake_port = _find_free_port()
+        # Create connector
+        vllm_config.kv_transfer_config.kv_connector_extra_config.update(
+            {
+                "proxy_ip": "127.0.0.1",
+                "proxy_ping_port": 12345,
+                "http_port": 12346,
+                "handshake_port": handshake_port,
+            }
+        )
+        connector = MoRIIOConnector(vllm_config, KVConnectorRole.WORKER)
+
+        # Execute register_kv_caches
+        connector.register_kv_caches(kv_caches)
+
+        # Connect to handshake socket and request metadata
+        path = make_zmq_path("tcp", "127.0.0.1", handshake_port)
+        with zmq_ctx(zmq.DEALER, path) as sock:
+            sock.send(MoRIIOConstants.GET_META_MSG)
+            received_frame = sock.recv_multipart()
+
+            if len(received_frame) != 2 or received_frame[0] != b"":
+                raise ValueError(f"Unexpected frame! {received_frame = }")
+
+            metadata_bytes = received_frame[1]
+            decoder = msgspec.msgpack.Decoder(MoRIIOAgentMetadata)
+            metadata = decoder.decode(metadata_bytes)
+            assert isinstance(metadata, MoRIIOAgentMetadata), (
+                "Decoded metadata is not MoRIIOAgentMetadata"
+            )
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index ea6e6029011c742d3d52237630fdf2adb9b64a5e..7ebe55b152f62d3564d357089dd62476ddc9d115 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -51,6 +51,33 @@ class MockConnector(KVConnectorBase_V1):
     ) -> KVConnectorStats | None:
         return MockConnectorStats(data=data) if data is not None else None
 
+    def start_load_kv(self, forward_context, **kwargs):
+        pass
+
+    def wait_for_layer_load(self, layer_name):
+        pass
+
+    def save_kv_layer(self, layer_name, kv_layer, attn_metadata, **kwargs):
+        pass
+
+    def wait_for_save(self):
+        pass
+
+    def build_connector_meta(self, scheduler_output):
+        return None
+
+    def get_num_new_matched_tokens(self, request, num_computed_tokens):
+        return (0, False)
+
+    def update_state_after_alloc(self, request, blocks, num_tokens) -> None:
+        pass
+
+
+class MockCrossLayerConnector(MockConnector):
+    @property
+    def prefer_cross_layer_blocks(self) -> bool:
+        return True
+
 
 # Register the mock connector
 KVConnectorFactory.register_connector("MockConnector", __name__, MockConnector.__name__)
@@ -603,3 +630,21 @@ class TestMultiConnectorStats:
         # One non-empty
         stats.data["NixlConnector"].data["transfer_duration"].append(1.0)
         assert not stats.is_empty()
+
+
+class TestMultiConnectorPreferCrossLayerBlocks:
+    def test_all_connectors_prefer_cross_layer_blocks(self):
+        mc = MultiConnector.__new__(MultiConnector)
+        mc._connectors = [
+            MockCrossLayerConnector.__new__(MockCrossLayerConnector),
+            MockCrossLayerConnector.__new__(MockCrossLayerConnector),
+        ]
+        assert mc.prefer_cross_layer_blocks is True
+
+    def test_mixed_connectors_do_not_prefer_cross_layer_blocks(self):
+        mc = MultiConnector.__new__(MultiConnector)
+        mc._connectors = [
+            MockCrossLayerConnector.__new__(MockCrossLayerConnector),
+            MockConnector.__new__(MockConnector),  # default False
+        ]
+        assert mc.prefer_cross_layer_blocks is False
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 66804fa671c7cb1696e60206c1bb881779596721..6d25ee6f61c47e20501a300e4fde4cb5a2d07fe7 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
     has_kv_transfer_group,
 )
 from vllm.forward_context import ForwardContext
+from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.platforms.interface import Platform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
+from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.request import RequestStatus
 
@@ -182,18 +185,21 @@ class FakeNixlWrapper:
 def _make_fake_nixl_pkg():
     """Context manager that creates a temporary package making
        `from nixl._api import nixl_agent` resolve to our FakeNixlWrapper.
+       Also creates rixl package for ROCm compatibility.
 
     Automatically cleans up the temporary directory when done.
     """
     with tempfile.TemporaryDirectory() as td:
-        pkg_root = os.path.join(td, "nixl", "_api")
-        os.makedirs(pkg_root, exist_ok=True)
+        # Create both nixl and rixl packages for cross-platform compatibility
+        for pkg_name in ["nixl", "rixl"]:
+            pkg_root = os.path.join(td, pkg_name, "_api")
+            os.makedirs(pkg_root, exist_ok=True)
 
-        # Get the source code of FakeNixlWrapper class and dedent it
-        fake_nixl_source = inspect.getsource(FakeNixlWrapper)
-        fake_nixl_source = textwrap.dedent(fake_nixl_source)
+            # Get the source code of FakeNixlWrapper class and dedent it
+            fake_nixl_source = inspect.getsource(FakeNixlWrapper)
+            fake_nixl_source = textwrap.dedent(fake_nixl_source)
 
-        stub = f"""\
+            stub = f"""\
 # Copy of FakeNixlWrapper implementation for Ray workers
 import uuid
 from collections import defaultdict
@@ -203,16 +209,17 @@ from collections import defaultdict
 # Export as nixl_agent
 nixl_agent = FakeNixlWrapper
 """
-        with open(os.path.join(pkg_root, "__init__.py"), "w") as f:
-            f.write(stub)
-
-        # Mock nixlXferTelemetry class
-        pkg_root2 = os.path.join(td, "nixl", "_bindings")
-        os.makedirs(pkg_root2, exist_ok=True)
-        with open(os.path.join(pkg_root2, "__init__.py"), "w") as f:
-            f.write("class nixlXferTelemetry: pass")
-        # touch parent package
-        open(os.path.join(td, "nixl", "__init__.py"), "w").close()
+            with open(os.path.join(pkg_root, "__init__.py"), "w") as f:
+                f.write(stub)
+
+            # Mock nixlXferTelemetry class
+            pkg_root2 = os.path.join(td, pkg_name, "_bindings")
+            os.makedirs(pkg_root2, exist_ok=True)
+            with open(os.path.join(pkg_root2, "__init__.py"), "w") as f:
+                f.write("class nixlXferTelemetry: pass")
+            # touch parent package
+            open(os.path.join(td, pkg_name, "__init__.py"), "w").close()
+
         yield td
 
 
@@ -296,6 +303,7 @@ def test_prompt_less_than_block_size():
 )
 def test_kv_transfer_handshake(dist_init):
     """Unit test for basic NixlConnector interface functionality."""
+    from vllm.config import set_current_vllm_config
 
     # Test setup, we creates a scheduler that contains a NixlConnector
     # of role SCHEDULER, and expect it to be serving NixlAgentMetadata from
@@ -305,81 +313,82 @@ def test_kv_transfer_handshake(dist_init):
     vllm_config.kv_transfer_config.kv_buffer_device = "cpu"
     scheduler = create_scheduler(vllm_config)
 
-    # Create two NixlConnector of role WORKER, one is the worker of
-    # the scheduler (prefill), the other is a worker of decode instance.
-
-    # Prefill connector will register KV cache to populate proper handshake
-    # metadata.
-    prefill_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
-    kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(
-        num_blocks=2, block_size=16, num_kv_heads=4, head_size=64
-    )
-    shared_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
-    unique_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
-    kv_caches = {
-        "layer0": shared_tensor,
-        "layer1": unique_tensor,
-        "layer2": shared_tensor,
-    }
-    prefill_connector.register_kv_caches(kv_caches)
-
-    # Simulate EngineCore initialization that would gather connector
-    # metadata from all workers
-    metadata = prefill_connector.get_handshake_metadata()
-
-    # metadata is a NixlHandshakePayload, decode it to get NixlAgentMetadata
-    decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
-    expected_agent_metadata = decoder.decode(metadata.agent_metadata_bytes)
-
-    # The scheduler connector expects metadata to be in
-    # dict[int, KVConnectorHandshakeMetadata], where the first key is
-    # the dp_rank, the second key is the tp_rank.
-    scheduler_connector = scheduler.get_kv_connector()
-    scheduler_connector.set_xfer_handshake_metadata({0: metadata})
+    with set_current_vllm_config(vllm_config):
+        # Create two NixlConnector of role WORKER, one is the worker of
+        # the scheduler (prefill), the other is a worker of decode instance.
 
-    # Simulate a request that finishes prefill, which returns
-    # corresponding NixlConnectorMetadata for decode instance.
-    BLOCK_SIZE = vllm_config.cache_config.block_size
-    NUM_EXTERNAL_FULL_BLOCKS = 2
-    NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
-
-    request = create_request(
-        request_id=1,
-        block_size=BLOCK_SIZE,
-        num_tokens=NUM_TOKENS,
-        do_remote_decode=True,
-    )
-    request.status = RequestStatus.FINISHED_LENGTH_CAPPED
-    delay, kv_connector_metadata = scheduler.get_kv_connector().request_finished(
-        request, [0, 1, 2]
-    )
-    assert delay
-
-    # Decode connector will be able to create handshake with the prefill connector.
-    decode_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
-
-    # Here we are testing the retrieval of NIXLAgentMetadata.
-    # Knowing the implementation detail, we override the add_remote_agent
-    # to validate the metadata received is the same as the one in prefill_connector.
-    with patch.object(
-        decode_connector.connector_worker, "add_remote_agent"
-    ) as mock_add_remote_agent:
-        mock_add_remote_agent.return_type = "remote_agent"
-
-        decode_connector.connector_worker._nixl_handshake(
-            kv_connector_metadata["remote_host"],
-            kv_connector_metadata["remote_port"],
-            kv_connector_metadata["tp_size"],
-            kv_connector_metadata["remote_engine_id"],
+        # Prefill connector will register KV cache to populate proper handshake
+        # metadata.
+        prefill_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+        kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(
+            num_blocks=2, block_size=16, num_kv_heads=4, head_size=64
         )
+        shared_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
+        unique_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
+        kv_caches = {
+            "layer0": shared_tensor,
+            "layer1": unique_tensor,
+            "layer2": shared_tensor,
+        }
+        prefill_connector.register_kv_caches(kv_caches)
+
+        # Simulate EngineCore initialization that would gather connector
+        # metadata from all workers
+        metadata = prefill_connector.get_handshake_metadata()
+
+        # metadata is a NixlHandshakePayload, decode it to get NixlAgentMetadata
+        decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
+        expected_agent_metadata = decoder.decode(metadata.agent_metadata_bytes)
+
+        # The scheduler connector expects metadata to be in
+        # dict[int, KVConnectorHandshakeMetadata], where the first key is
+        # the dp_rank, the second key is the tp_rank.
+        scheduler_connector = scheduler.get_kv_connector()
+        scheduler_connector.set_xfer_handshake_metadata({0: metadata})
+
+        # Simulate a request that finishes prefill, which returns
+        # corresponding NixlConnectorMetadata for decode instance.
+        BLOCK_SIZE = vllm_config.cache_config.block_size
+        NUM_EXTERNAL_FULL_BLOCKS = 2
+        NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
+
+        request = create_request(
+            request_id=1,
+            block_size=BLOCK_SIZE,
+            num_tokens=NUM_TOKENS,
+            do_remote_decode=True,
+        )
+        request.status = RequestStatus.FINISHED_LENGTH_CAPPED
+        delay, kv_connector_metadata = scheduler.get_kv_connector().request_finished(
+            request, [0, 1, 2]
+        )
+        assert delay
+
+        # Decode connector will be able to create handshake with the prefill connector.
+        decode_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+
+        # Here we are testing the retrieval of NIXLAgentMetadata.
+        # Knowing the implementation detail, we override the add_remote_agent
+        # to validate the metadata received is the same as the one in prefill_connector.
+        with patch.object(
+            decode_connector.connector_worker, "add_remote_agent"
+        ) as mock_add_remote_agent:
+            mock_add_remote_agent.return_type = "remote_agent"
+
+            decode_connector.connector_worker._nixl_handshake(
+                kv_connector_metadata["remote_host"],
+                kv_connector_metadata["remote_port"],
+                kv_connector_metadata["tp_size"],
+                kv_connector_metadata["remote_engine_id"],
+            )
 
-        received_metadata = mock_add_remote_agent.call_args.args
-        assert received_metadata[0] == expected_agent_metadata
-        assert received_metadata[1] == 0  # remote_tp_rank
-        assert received_metadata[2] == 1  # remote_tp_size
+            received_metadata = mock_add_remote_agent.call_args.args
+            assert received_metadata[0] == expected_agent_metadata
+            assert received_metadata[1] == 0  # remote_tp_rank
+            assert received_metadata[2] == 1  # remote_tp_size
 
-    # Need to shutdown the background thread to release NIXL side channel port
-    scheduler_connector.shutdown()
+        # Need to shutdown the background thread to release NIXL side channel port
+        scheduler_connector.shutdown()
 
 
 class FakeNixlConnectorWorker(NixlConnectorWorker):
@@ -391,6 +400,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
         super().__init__(*args, **kwargs)
         self._hand_shake_latency = hand_shake_latency
         self.kv_cache_layout = kv_cache_layout
+        # Mock register_kv_caches attribute needed for tests that do not call it.
+        self.src_xfer_handles_by_block_size = {self.block_size: 1}
 
     def _nixl_handshake(
         self, host: str, port: int, remote_tp_size: int, expected_engine_id: str
@@ -407,22 +418,43 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
 
         assert expected_engine_id == self.REMOTE_ENGINE_ID
 
-        remote_agent_name = self.add_remote_agent(
-            NixlAgentMetadata(
-                engine_id=self.REMOTE_ENGINE_ID,
-                agent_metadata=FakeNixlWrapper.AGENT_METADATA,
-                kv_caches_base_addr=[0],
-                device_id=0,
-                num_blocks=1,
-                block_lens=self.block_len_per_layer,
-                # `self.kv_cache_layout` is only forced to HND when vllm engine
-                # is started. We mock HND here.
-                kv_cache_layout="HND",
-                block_size=self.block_size,
-            ),
-            remote_tp_size=remote_tp_size,
-        )
-        return {0: remote_agent_name}
+        # Adjust remote block length metadata to satisfy heterogeneous TP
+        # invariants enforced during handshake validation.
+        remote_block_lens = list(self.block_len_per_layer)
+        tp_ratio = self.kv_topo.tp_ratio(remote_tp_size)
+        if remote_tp_size > self.world_size:
+            # P TP > D TP case, block_len of remote is smaller
+            remote_block_lens = [
+                block_len // (-tp_ratio) for block_len in remote_block_lens
+            ]
+        elif remote_tp_size < self.world_size:
+            remote_block_lens = [
+                block_len * tp_ratio for block_len in remote_block_lens
+            ]
+
+        # When remote tp_size > local tp_size, handshake with multiple
+        # remote ranks.
+        num_hanshakes = 1 if tp_ratio > 0 else -tp_ratio
+        remote_agents: dict[int, str] = {}
+        for remote_tp_rank in range(num_hanshakes):
+            remote_agent_name = self.add_remote_agent(
+                NixlAgentMetadata(
+                    engine_id=self.REMOTE_ENGINE_ID,
+                    agent_metadata=FakeNixlWrapper.AGENT_METADATA,
+                    kv_caches_base_addr=[0],
+                    device_id=remote_tp_rank,
+                    num_blocks=1,
+                    block_lens=remote_block_lens,
+                    # `self.kv_cache_layout` is only forced to HND when vllm engine
+                    # is started. We mock HND here.
+                    kv_cache_layout="HND",
+                    block_size=self.block_size,
+                ),
+                remote_tp_rank=remote_tp_rank,
+                remote_tp_size=remote_tp_size,
+            )
+            remote_agents[remote_tp_rank] = remote_agent_name
+        return remote_agents
 
 
 class TestNixlHandshake:
@@ -432,6 +464,7 @@ class TestNixlHandshake:
     )
     def test_multi_xfer_one_engine(
         self,
+        default_vllm_config,
         # dist_init is a fixture that initializes the distributed environment.
         dist_init,
     ):
@@ -453,7 +486,13 @@ class TestNixlHandshake:
             vllm_config, connector.engine_id, hand_shake_latency=0
         )
         assert isinstance(connector.connector_worker.nixl_wrapper, FakeNixlWrapper)
-        connector.connector_worker.nixl_wrapper.set_cycles_before_xfer_done(3)
+        worker = connector.connector_worker
+        worker.nixl_wrapper.set_cycles_before_xfer_done(3)
+        # simulate handshake
+        worker.dst_xfer_side_handles = {
+            FakeNixlConnectorWorker.REMOTE_ENGINE_ID: {0: 1}
+        }
+        worker.kv_cache_layout = "HND"
         num_xfers = 4
         while True:
             # For the same request_id, initiate multiple xfers across different
@@ -515,6 +554,7 @@ class TestNixlHandshake:
     )
     def test_async_load_kv(
         self,
+        default_vllm_config,
         # Fixture that initializes the distributed environment.
         dist_init,
         # Simulate consumer-producer TP sizes.
@@ -567,12 +607,178 @@ class TestNixlHandshake:
                 return
         raise TimeoutError("Took too long to complete async handshake.")
 
+    @patch(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+        FakeNixlWrapper,
+    )
+    @pytest.mark.parametrize("local_tp_size", [1, 2])
+    def test_prefill_tp_size_greater_than_decode_tp_size(
+        self, local_tp_size: int, default_vllm_config, dist_init
+    ):
+        """
+        Verify remote TP > local TP handshake succeeds with different
+        remote configurations.
+        """
+
+        vllm_config = create_vllm_config()
+        local_tp_size = 1
+        vllm_config.parallel_config.tensor_parallel_size = local_tp_size
+
+        connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+        connector.connector_worker = FakeNixlConnectorWorker(
+            vllm_config, connector.engine_id, hand_shake_latency=0
+        )
+        worker = connector.connector_worker
+
+        # Minimal local registration params used by add_remote_agent
+        worker.slot_size_per_layer = [4096]
+        worker.block_len_per_layer = [4096 * worker.block_size]
+        worker.num_blocks = 1
+        worker.dst_num_blocks[worker.engine_id] = worker.num_blocks
+        worker.src_blocks_data = [(0, worker.block_len_per_layer[0], worker.tp_rank)]
+
+        def check_handshake(remote_tp_size: int):
+            tp_ratio = remote_tp_size // local_tp_size
+            assert set(remote_agents.keys()) == set(range(tp_ratio))
+
+            remote_engine_id = worker.REMOTE_ENGINE_ID
+            assert worker._tp_size[remote_engine_id] == remote_tp_size
+            assert -tp_ratio == worker.kv_topo.tp_ratio_from_engine_id(remote_engine_id)
+            # ensure src_xfer_handles_by_tp_ratio is populated with tpratio chunks
+            assert -tp_ratio in worker.src_xfer_handles_by_tp_ratio
+            assert len(worker.src_xfer_handles_by_tp_ratio[-tp_ratio]) == tp_ratio
+            assert remote_engine_id in worker.dst_xfer_side_handles
+            assert set(worker.dst_xfer_side_handles[remote_engine_id].keys()) == set(
+                range(tp_ratio)
+            )
+
+        remote_agents = worker._nixl_handshake(
+            host="localhost",
+            port=1234,
+            remote_tp_size=2,
+            expected_engine_id=worker.REMOTE_ENGINE_ID,
+        )
+        check_handshake(2)
+
+        # NOTE flexiblity: a second remote with higher number of ranks is
+        # discovered. This is not a scenario we actively support right now, but
+        # the connector allows it.
+        worker.REMOTE_ENGINE_ID = "remote_engine_2"
+        remote_agents = worker._nixl_handshake(
+            host="localhost",
+            port=1234,
+            remote_tp_size=6,
+            expected_engine_id=worker.REMOTE_ENGINE_ID,
+        )
+        check_handshake(6)
+
+    @patch(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+        FakeNixlWrapper,
+    )
+    @pytest.mark.parametrize("local_tp_size", [1, 2])
+    def test_prefill_tp_size_greater_than_decode_tp_size_mla(
+        self, local_tp_size: int, default_vllm_config, dist_init
+    ):
+        """
+        Verify remote TP > local TP handshake succeeds with different
+        remote configurations for an MLA model.
+        """
+        vllm_config = create_vllm_config()
+        d_tp_size = 1
+        p_tp_size = 2
+
+        # Build two separate connectors/workers to emulate P TP=2 ranks.
+        conn_p0 = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+        conn_p1 = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+        conn_p0.connector_worker = FakeNixlConnectorWorker(
+            vllm_config, conn_p0.engine_id, hand_shake_latency=0
+        )
+        conn_p1.connector_worker = FakeNixlConnectorWorker(
+            vllm_config, conn_p1.engine_id, hand_shake_latency=0
+        )
+
+        # Force P world size to 2 for both workers and emulate distinct tp_ranks.
+        # Also enable MLA path so that expected_finished_count is updated.
+        for rank, worker in enumerate(
+            (conn_p0.connector_worker, conn_p1.connector_worker)
+        ):
+            worker.world_size = p_tp_size
+            worker.kv_topo.remote_tp_size = {worker.engine_id: p_tp_size}
+            worker.tp_rank = rank
+            worker.use_mla = True
+
+        req_id = "req-ep-dp2-p0"
+        now = time.perf_counter()
+        # Register a request on P that is waiting for consumers to read
+        # (both workers track it).
+        conn_p0.connector_worker._reqs_to_send[req_id] = now + 10.0
+        conn_p0.connector_worker._reqs_to_process.add(req_id)
+        conn_p1.connector_worker._reqs_to_send[req_id] = now + 10.0
+        conn_p1.connector_worker._reqs_to_process.add(req_id)
+
+        # Simulate a read notification coming from D with (tp=1, dp=2).
+        notif = f"{req_id}:{d_tp_size}".encode()
+        # D0-0->P0 notif
+        conn_p0.connector_worker.nixl_wrapper.get_new_notifs = lambda: {
+            "agent": [notif]
+        }  # type: ignore[method-assign]
+        conn_p1.connector_worker.nixl_wrapper.get_new_notifs = lambda: {
+            "agent": [notif]
+        }  # type: ignore[method-assign]
+
+        # Trigger notification processing via get_finished().
+        done_sending0, _ = conn_p0.get_finished(finished_req_ids=set())
+        done_sending1, _ = conn_p1.get_finished(finished_req_ids=set())
+        assert req_id in done_sending0 and req_id in done_sending1
+
+        # E2E aggregation: ensure the aggregated output marks the request
+        # as finished using the connector's expected_finished_count.
+        from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
+
+        aggregator = KVOutputAggregator.from_connector(conn_p0, world_size=2)
+
+        out0 = ModelRunnerOutput(
+            req_ids=[req_id],
+            req_id_to_index={req_id: 0},
+            sampled_token_ids=[[0]],
+            logprobs=None,
+            prompt_logprobs_dict={},
+            pooler_output=[None],
+            kv_connector_output=KVConnectorOutput(
+                finished_sending=done_sending0,
+                finished_recving=None,
+            ),
+        )
+        out1 = ModelRunnerOutput(
+            req_ids=[req_id],
+            req_id_to_index={req_id: 0},
+            sampled_token_ids=[[0]],
+            logprobs=None,
+            prompt_logprobs_dict={},
+            pooler_output=[None],
+            kv_connector_output=KVConnectorOutput(
+                finished_sending=done_sending1,
+                finished_recving=None,
+            ),
+        )
+        aggregated = aggregator.aggregate([out0, out1], output_rank=0)
+        assert aggregated.kv_connector_output is not None
+        assert aggregated.kv_connector_output.finished_sending == {req_id}
+
+        # Producers cleaned up state for the finished request.
+        assert req_id not in conn_p0.connector_worker._reqs_to_send
+        assert req_id not in conn_p0.connector_worker._reqs_to_process
+        assert req_id not in conn_p1.connector_worker._reqs_to_send
+        assert req_id not in conn_p1.connector_worker._reqs_to_process
+
     @patch(
         "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
         FakeNixlWrapper,
     )
     def test_concurrent_load_kv(
         self,
+        default_vllm_config,
         # dist_init is a fixture that initializes the distributed environment.
         dist_init,
     ):
@@ -585,6 +791,9 @@ class TestNixlHandshake:
         connector.connector_worker = FakeNixlConnectorWorker(
             vllm_config, connector.engine_id
         )
+        # Register (mocked) local xfer handler
+        # worker = connector.connector_worker
+        # worker.src_xfer_handles_by_block_size = {worker.block_size: 1}
         metadata = NixlConnectorMetadata()
         total_reqs = 5
         for i in range(total_reqs):
@@ -630,7 +839,9 @@ class TestNixlHandshake:
         "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
         FakeNixlWrapper,
     )
-    def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init):
+    def test_handshake_fails_on_kv_cache_layout_mismatch(
+        self, default_vllm_config, dist_init
+    ):
         """
         Verify that adding a remote agent fails if kv_cache_layout differs.
         This test is only relevant for heterogeneous TP.
@@ -672,7 +883,6 @@ class TestNixlHandshake:
             with pytest.raises(RuntimeError):
                 # mismatched layout is expected to fail
                 worker.add_remote_agent(meta, remote_tp_size=2)
-            with pytest.raises(AssertionError):
                 worker.add_remote_agent(meta, remote_tp_size=1)
 
     @patch(
@@ -680,7 +890,7 @@ class TestNixlHandshake:
         FakeNixlWrapper,
     )
     def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental(
-        self, dist_init
+        self, default_vllm_config, dist_init
     ):
         """
         Verify that adding a remote agent fails if kv_cache_layout differs.
@@ -735,7 +945,7 @@ class TestNixlHandshake:
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
     FakeNixlWrapper,
 )
-def test_kv_connector_stats(dist_init):
+def test_kv_connector_stats(default_vllm_config, dist_init):
     """Test that KV transfer stats are properly recorded and retrieved."""
     vllm_config = create_vllm_config()
 
@@ -1069,6 +1279,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
         run_test_and_cleanup()
 
 
+class RequestIdMapper:
+    """Helper class to map external request IDs to internal request IDs."""
+
+    def __init__(self, output_processor: OutputProcessor):
+        self.req_id_mapping: dict[str, str] = {}
+        self.original_add_request = output_processor.add_request
+        output_processor.add_request = self._add_request
+
+    def _add_request(self, request: EngineCoreRequest, *args, **kwargs):
+        self.req_id_mapping[request.external_req_id] = request.request_id
+        return self.original_add_request(request, *args, **kwargs)
+
+    def __call__(self, external_req_id: str) -> str:
+        return self.req_id_mapping[external_req_id]
+
+
 def _run_abort_timeout_test(llm: LLM, timeout: int):
     """Helper function to run the abort timeout test logic."""
     remote_prefill_opts = {
@@ -1090,24 +1316,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
         0
     ].req_to_blocks
 
+    id_mapper = RequestIdMapper(llm.llm_engine.output_processor)
+
+    def req_id(outputs: list[RequestOutput]) -> str:
+        assert len(outputs) == 1
+        return id_mapper(outputs[0].request_id)
+
     padding = "Just making this request a little longer so that we're sure "
     "we're not hitting the small-request lower bound beneath which we don't "
     "actually trigger the whole kv transfer, but rather just recompute the "
     "blocks on D."
-    _ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
+    req0_id = req_id(
+        llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
+    )
 
     # Request finished but not freed
-    assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks
+    assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks
     # Some other request, 0 still not freed
-    _ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
-    assert "0" in req_to_blocks
-    assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks
+    req1_id = req_id(
+        llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
+    )
+    assert req0_id in req_to_blocks
+    assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks
 
     # Wait for timeout and trigger another scheduler loop
     time.sleep(timeout)
     _ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
     # Request-0 times out and is cleared!
-    assert "0" not in req_to_blocks
+    assert req0_id not in req_to_blocks
     # Need to shutdown the background thread to release NIXL side channel port
     llm.llm_engine.engine_core.shutdown()
 
@@ -1132,7 +1368,7 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
         "TRITON_ATTN",
     ],
 )
-def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
+def test_register_kv_caches(default_vllm_config, dist_init, attn_backend):
     """
     Test that register_kv_caches() properly calls nixl_wrapper methods with
     correct data.
@@ -1144,9 +1380,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
        block layout info
     """
 
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
-    vllm_config = create_vllm_config()
+    vllm_config = create_vllm_config(attention_backend=attn_backend)
 
     # Import the appropriate backend based on the parameter
     if attn_backend == "FLASH_ATTN":
@@ -1205,7 +1439,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
         patch(f"{nixl_module}.NixlWrapper") as mock_nixl_wrapper,
         patch(f"{nixl_module}.threading.Event"),
         patch(f"{nixl_module}.threading.Thread") as mock_thread,
-        patch(f"{nixl_module}.get_attn_backend") as mock_get_attn_backend,
+        patch(f"{nixl_module}.get_current_attn_backend") as mock_get_attn_backend,
     ):
         # Ensure get_attn_backend returns the correct value due to
         # _cached_get_attn_backend returning the backend from previous
@@ -1295,7 +1529,9 @@ class FakePlatform(Platform):
         ("oot", "VRAM"),
     ],
 )
-def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device, nixl_memory_type):
+def test_kv_buffer_to_nixl_memory_types(
+    default_vllm_config, dist_init, kv_buffer_device, nixl_memory_type
+):
     """
     Test that register_kv_caches() passes the correct memory types from the
     config to the nixl_wrapper.
@@ -1340,7 +1576,7 @@ def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device, nixl_memory
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
     FakeNixlWrapper,
 )
-def test_shutdown_cleans_up_resources(dist_init):
+def test_shutdown_cleans_up_resources(default_vllm_config, dist_init):
     """Test that shutdown() properly cleans up all resources."""
     vllm_config = create_vllm_config()
 
@@ -1359,8 +1595,11 @@ def test_shutdown_cleans_up_resources(dist_init):
         patch.object(nixl_wrapper, "deregister_memory") as mock_dereg,
     ):
         worker._recving_transfers = {"req1": [123]}
-        worker.src_xfer_side_handle = 456
-        worker.dst_xfer_side_handles = {"engine1": 789}
+        # Mock register_kv_cache which registers local handle
+        worker.src_xfer_handles_by_block_size = {worker.block_size: 455}
+        # P TP = 2 * D TP case, we should register 2 local handles
+        worker.src_xfer_handles_by_tp_ratio = {-2: [456, 457]}
+        worker.dst_xfer_side_handles = {"engine1": {0: 789}}
         worker._remote_agents = {"engine1": {0: "agent1"}}
         worker._registered_descs = ["desc1", "desc2"]
 
@@ -1381,8 +1620,10 @@ def test_shutdown_cleans_up_resources(dist_init):
         mock_listener.join.assert_called_once()
 
         mock_rel_xfer.assert_called_once_with(123)
-        assert mock_rel_dlist.call_count == 2
-        mock_rel_dlist.assert_any_call(456)  # src handle
+        assert mock_rel_dlist.call_count == 4
+        mock_rel_dlist.assert_any_call(455)  # src handle (whole region)
+        mock_rel_dlist.assert_any_call(456)  # src handle (1st chunk)
+        mock_rel_dlist.assert_any_call(457)  # src handle (2nd chunk)
         mock_rel_dlist.assert_any_call(789)  # dst handle
         mock_rem_agent.assert_called_once_with("agent1")
         assert mock_dereg.call_count == 2
@@ -1394,7 +1635,7 @@ def test_shutdown_cleans_up_resources(dist_init):
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
     FakeNixlWrapper,
 )
-def test_aborted_request_removed_from_worker_in_batch(dist_init):
+def test_aborted_request_removed_from_worker_in_batch(default_vllm_config, dist_init):
     """
     Create and schedule a request so that P adds it to in-batch tracking via
     the real scheduler, then simulate an abort (request not in next scheduler
@@ -1464,6 +1705,8 @@ class FailingNixlWrapper(FakeNixlWrapper):
         self.fail_handshake = False
         self.fail_transfer_setup = False
         self.fail_send_notif = False
+        self.fail_transfer_state = False  # Returns "ERR" state
+        self.fail_transfer_exception = False  # Raises exception in check_xfer_state
 
     def add_remote_agent(self, agent_metadata: bytes) -> str:
         if self.fail_handshake:
@@ -1498,12 +1741,156 @@ class FailingNixlWrapper(FakeNixlWrapper):
             raise RuntimeError("Simulated send_notif failure")
         return super().send_notif(agent_name, notif_msg)
 
+    def check_xfer_state(self, handle: int) -> str:
+        if self.fail_transfer_exception:
+            raise RuntimeError("Simulated check_xfer_state exception")
+        if self.fail_transfer_state:
+            return "ERR"  # Bad transfer state
+        return super().check_xfer_state(handle)
+
+
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FailingNixlWrapper,
+)
+@pytest.mark.parametrize(
+    "failure_type,wrapper_config,needs_get_finished",
+    [
+        ("transfer_setup_failed", {"fail_transfer_setup": True}, False),
+        ("handshake_failed", {"fail_handshake": True}, False),
+        ("notification_failed", {"fail_send_notif": True}, False),
+        ("transfer_failed", {"fail_transfer_state": True}, True),
+        ("transfer_exception", {"fail_transfer_exception": True}, True),
+    ],
+)
+def test_transfer_failure_logging(
+    default_vllm_config,
+    dist_init,
+    failure_type,
+    wrapper_config,
+    needs_get_finished,
+):
+    """Test that transfer failures are logged with structured context.
+
+    Run with `pytest -sv` to see the log output.
+
+    Covers failure types:
+    - transfer_setup_failed: make_prepped_xfer fails
+    - handshake_failed: add_remote_agent fails during request handshake
+    - notification_failed: send_notif fails
+    - transfer_failed: check_xfer_state returns bad state (e.g., "ERR")
+    - transfer_exception: check_xfer_state raises exception
+    """
+    import logging
+
+    vllm_config = create_vllm_config()
+
+    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+    connector.connector_worker = FakeNixlConnectorWorker(
+        vllm_config, connector.engine_id, hand_shake_latency=0.0
+    )
+
+    # Configure FailingNixlWrapper to fail in the specified way
+    for key, value in wrapper_config.items():
+        setattr(connector.connector_worker.nixl_wrapper, key, value)
+
+    request_id = f"test_{failure_type}_req"
+
+    # For notification_failed, we need empty local blocks
+    # (full cache hit path to trigger send_notif)
+    local_blocks = [] if failure_type == "notification_failed" else [10, 11, 12]
+    remote_blocks = [20, 21, 22]
+
+    metadata = NixlConnectorMetadata()
+    metadata.add_new_req_to_recv(
+        request_id=request_id,
+        local_block_ids=local_blocks,
+        kv_transfer_params={
+            "remote_block_ids": remote_blocks,
+            "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_request_id": f"prefill-{request_id}",
+            "remote_host": "localhost",
+            "remote_port": 1234,
+            "remote_tp_size": 1,
+        },
+    )
+    connector.bind_connector_metadata(metadata)
+
+    dummy_ctx = ForwardContext(
+        no_compile_layers={},
+        attn_metadata={},
+        virtual_engine=0,
+    )
+
+    # Capture logs from the nixl_connector logger specifically
+    # vLLM loggers have propagate=False, so we need to capture directly
+    nixl_logger = logging.getLogger(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector"
+    )
+    captured_logs: list[logging.LogRecord] = []
+
+    class LogCapture(logging.Handler):
+        def emit(self, record):
+            captured_logs.append(record)
+
+    handler = LogCapture()
+    handler.setLevel(logging.ERROR)
+    nixl_logger.addHandler(handler)
+
+    try:
+        connector.start_load_kv(dummy_ctx)
+        # Process the ready_requests queue (for async handshake)
+        connector.bind_connector_metadata(NixlConnectorMetadata())
+        # Wait for async handshake to complete
+        time.sleep(0.2)
+        connector.start_load_kv(dummy_ctx)
+
+        # For transfer_failed/transfer_exception, the error happens in
+        # get_finished() when checking transfer state
+        if needs_get_finished:
+            connector.get_finished(finished_req_ids=set())
+    finally:
+        nixl_logger.removeHandler(handler)
+
+    # Print logs for manual comparison between commits
+    error_logs = [r for r in captured_logs if r.levelno >= logging.ERROR]
+    print("\n" + "=" * 60)
+    print(f"CAPTURED ERROR LOGS for {failure_type}:")
+    print("=" * 60)
+    for i, record in enumerate(error_logs):
+        print(f"\n--- Log {i + 1} ---")
+        print(f"Message: {record.message}")
+    print("=" * 60 + "\n")
+
+    assert len(error_logs) >= 1, f"Expected at least one error log for {failure_type}"
+
+    # Verify structured logging output (new format)
+    # Check that at least one log matches the expected format
+    all_messages = [r.message for r in error_logs]
+    combined_logs = "\n".join(all_messages)
+
+    assert any("NIXL transfer failure" in msg for msg in all_messages), (
+        f"Expected structured log format with 'NIXL transfer failure' prefix "
+        f"for {failure_type}. Got: {all_messages}"
+    )
+    assert any("failure_type" in msg for msg in all_messages), (
+        f"Expected 'failure_type' in logs. Got: {all_messages}"
+    )
+    assert any("Context:" in msg for msg in all_messages), (
+        f"Expected 'Context:' in logs. Got: {all_messages}"
+    )
+    # Check that the expected failure_type appears in at least one log
+    # Note: handshake_failed also triggers handshake_setup_failed
+    assert failure_type in combined_logs or (
+        failure_type == "handshake_failed" and "handshake_setup_failed" in combined_logs
+    ), f"Expected '{failure_type}' in logs. Got: {all_messages}"
+
 
 @patch(
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
     FailingNixlWrapper,
 )
-def test_handshake_failure_returns_finished(dist_init):
+def test_handshake_failure_returns_finished(default_vllm_config, dist_init):
     """Test that handshake failures mark blocks invalid and return via get_finished."""
     vllm_config = create_vllm_config()
 
@@ -1552,7 +1939,7 @@ def test_handshake_failure_returns_finished(dist_init):
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
     FailingNixlWrapper,
 )
-def test_transfer_setup_failure_returns_finished(dist_init):
+def test_transfer_setup_failure_returns_finished(default_vllm_config, dist_init):
     """Test that transfer setup failures mark blocks invalid
     and return via get_finished."""
     vllm_config = create_vllm_config()
@@ -1627,6 +2014,7 @@ def test_transfer_setup_failure_returns_finished(dist_init):
     FakeNixlWrapper,
 )
 def test_compatibility_hash_validation(
+    default_vllm_config,
     dist_init,
     mismatch_type,
     config_overrides,
@@ -1739,7 +2127,7 @@ def test_compatibility_hash_validation(
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
     FakeNixlWrapper,
 )
-def test_handshake_decode_errors(dist_init, error_scenario):
+def test_handshake_decode_errors(default_vllm_config, dist_init, error_scenario):
     """
     Test that msgspec decode errors are properly handled during handshake.
 
diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py
index 69565f584ab897969086720db317f986448ecd8d..5c049301c732b608bdac84ca9e8542653a2fe1fa 100644
--- a/tests/v1/kv_connector/unit/test_offloading_connector.py
+++ b/tests/v1/kv_connector/unit/test_offloading_connector.py
@@ -26,6 +26,7 @@ from vllm.v1.core.kv_cache_utils import (
     init_none_hash,
 )
 from vllm.v1.core.sched.scheduler import Scheduler
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.kv_offload.abstract import (
     LoadStoreSpec,
     OffloadingEvent,
@@ -64,8 +65,11 @@ class MockLoadStoreSpec(LoadStoreSpec):
 
 class MockOffloadingHandler(OffloadingHandler):
     def __init__(self):
+        self.transfer_specs: dict[int, TransferSpec] = {}
         self.completed_transfers: list[TransferResult] = []
-        self.completed_specs: list[TransferSpec] = []
+        self.waiting_jobs: set[int] = set()
+        self.completed_jobs: list[int] = []
+        self.flushed_jobs: set[int] = set()
 
     def get_finished(self) -> list[TransferResult]:
         finished = self.completed_transfers
@@ -73,14 +77,25 @@ class MockOffloadingHandler(OffloadingHandler):
         return finished
 
     def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
-        self.completed_specs.append(spec)
-        self.completed_transfers.append((job_id, True))
+        self.transfer_specs[job_id] = spec
+        self.waiting_jobs.add(job_id)
         return True
 
+    def complete_jobs(self, job_ids: set[int]) -> None:
+        for job_id in job_ids:
+            if job_id in self.waiting_jobs:
+                self.waiting_jobs.remove(job_id)
+                self.completed_jobs.append(job_id)
+                self.completed_transfers.append((job_id, True))
+
+    def wait(self, job_ids: set[int]) -> None:
+        self.flushed_jobs |= job_ids
+        self.complete_jobs(job_ids)
+
 
 class MockOffloadingSpec(OffloadingSpec):
-    def __init__(self, vllm_config: VllmConfig):
-        super().__init__(vllm_config)
+    def __init__(self, vllm_config: VllmConfig, kv_cache_config: KVCacheConfig):
+        super().__init__(vllm_config, kv_cache_config)
 
         self.manager = MagicMock(spec=OffloadingManager)
         self.manager.lookup.return_value = 0
@@ -98,9 +113,22 @@ class MockOffloadingSpec(OffloadingSpec):
         yield GPULoadStoreSpec, MockLoadStoreSpec, self.handler
         yield MockLoadStoreSpec, GPULoadStoreSpec, self.handler
 
+    def complete_transfers(self):
+        self.handler.complete_jobs(self.handler.waiting_jobs.copy())
+
     def get_completed_transfers(self) -> list[TransferSpec]:
-        specs = self.handler.completed_specs
-        self.handler.completed_specs = []
+        specs = [
+            self.handler.transfer_specs[job_id]
+            for job_id in self.handler.completed_jobs
+        ]
+        self.handler.completed_jobs.clear()
+        return specs
+
+    def get_flushed_transfers(self):
+        specs = [
+            self.handler.transfer_specs[job_id] for job_id in self.handler.flushed_jobs
+        ]
+        self.handler.flushed_jobs.clear()
         return specs
 
 
@@ -170,11 +198,9 @@ class RequestRunner:
         # mapping (offloading address) -> gpu_block_index
         self.offloaded: dict[Any, int] = {}
 
-        self.pending_loads_count: int = 0
-        self.pending_stores_count: int = 0
-
         self.completed_loads: list[TransferSummary] = []
         self.completed_stores: list[TransferSummary] = []
+        self.flushed_gpu_block_indexes: set[int] = set()
 
         # maps {block_id: block_offset}
         self.gpu_block_index: dict[int, int] = {}
@@ -201,54 +227,60 @@ class RequestRunner:
 
         self.scheduler.add_request(req)
 
-    def _wait_for_transfers(self):
+    def _parse_transfers(self):
+        for transfer_spec in self.offloading_spec.get_flushed_transfers():
+            src_spec, dst_spec = transfer_spec
+            assert isinstance(src_spec, GPULoadStoreSpec)
+
+            for block_id in src_spec.block_ids:
+                self.flushed_gpu_block_indexes.add(
+                    self.gpu_block_index[block_id.item()]
+                )
+
         block_size_factor = self.offloaded_block_size // self.gpu_block_size
 
-        while self.pending_loads_count or self.pending_stores_count:
-            for transfer_spec in self.offloading_spec.get_completed_transfers():
-                src_spec, dst_spec = transfer_spec
-
-                if isinstance(src_spec, GPULoadStoreSpec):
-                    store = True
-                    gpu_spec = src_spec
-                    offload_spec = dst_spec
-                else:
-                    store = False
-                    gpu_spec = dst_spec
-                    offload_spec = src_spec
-
-                assert isinstance(offload_spec, MockLoadStoreSpec)
-                assert isinstance(gpu_spec, GPULoadStoreSpec)
-
-                gpu_block_indices: list[int] = []
-                for block_id in gpu_spec.block_ids:
-                    gpu_block_indices.append(self.gpu_block_index[block_id.item()])
-
-                # list of (block_hash, sub_block_offset)
-                offload_addresses: list[Any] = []
-                for block_hash in offload_spec.block_hashes:
-                    for sub_block_idx in range(block_size_factor):
-                        offload_addresses.append((block_hash, sub_block_idx))
-
-                if store:
-                    assert len(gpu_block_indices) == len(offload_addresses)
-
-                    self.completed_stores.append(
-                        TransferSummary(gpu_block_indices, offload_addresses)
-                    )
-                    self.pending_stores_count -= 1
-                else:
-                    remainder_sub_block_count = len(offload_addresses) - len(
-                        gpu_block_indices
-                    )
-                    assert remainder_sub_block_count >= 0
-                    assert remainder_sub_block_count < block_size_factor
-                    offload_addresses = offload_addresses[remainder_sub_block_count:]
-
-                    self.completed_loads.append(
-                        TransferSummary(gpu_block_indices, offload_addresses)
-                    )
-                    self.pending_loads_count -= 1
+        for transfer_spec in self.offloading_spec.get_completed_transfers():
+            src_spec, dst_spec = transfer_spec
+
+            if isinstance(src_spec, GPULoadStoreSpec):
+                store = True
+                gpu_spec = src_spec
+                offload_spec = dst_spec
+            else:
+                store = False
+                gpu_spec = dst_spec
+                offload_spec = src_spec
+
+            assert isinstance(offload_spec, MockLoadStoreSpec)
+            assert isinstance(gpu_spec, GPULoadStoreSpec)
+
+            gpu_block_indices: list[int] = []
+            for block_id in gpu_spec.block_ids:
+                gpu_block_indices.append(self.gpu_block_index[block_id.item()])
+
+            # list of (block_hash, sub_block_offset)
+            offload_addresses: list[Any] = []
+            for block_hash in offload_spec.block_hashes:
+                for sub_block_idx in range(block_size_factor):
+                    offload_addresses.append((block_hash, sub_block_idx))
+
+            if store:
+                assert len(gpu_block_indices) == len(offload_addresses)
+
+                self.completed_stores.append(
+                    TransferSummary(gpu_block_indices, offload_addresses)
+                )
+            else:
+                remainder_sub_block_count = len(offload_addresses) - len(
+                    gpu_block_indices
+                )
+                assert remainder_sub_block_count >= 0
+                assert remainder_sub_block_count < block_size_factor
+                offload_addresses = offload_addresses[remainder_sub_block_count:]
+
+                self.completed_loads.append(
+                    TransferSummary(gpu_block_indices, offload_addresses)
+                )
 
     def _update_gpu_block_idx(self):
         for blocks in self.scheduler.kv_cache_manager.coordinator.single_type_managers[
@@ -257,18 +289,19 @@ class RequestRunner:
             for block_idx, block in enumerate(blocks):
                 self.gpu_block_index[block.block_id] = block_idx
 
-    def _run(self, decoded_tokens: list[int]):
+    def _run(self, decoded_tokens: list[int], complete_transfers: bool):
         """
         Runs multiple engine (scheduler + worker) steps.
         Assumes a single request is running.
 
         Args:
             decoded_tokens: the tokens to yield at each step.
+            complete_transfers: complete transfers immediately
         """
 
         tokens_iter = iter(decoded_tokens)
         token_id = next(tokens_iter, None)
-        while token_id is not None:
+        while True:
             assert self.scheduler.requests
 
             scheduler_output = self.scheduler.schedule()
@@ -278,8 +311,10 @@ class RequestRunner:
             assert kv_connector_metadata is not None
             assert isinstance(kv_connector_metadata, OffloadingConnectorMetadata)
 
-            self.pending_loads_count += len(kv_connector_metadata.reqs_to_load)
-            self.pending_stores_count += len(kv_connector_metadata.reqs_to_store)
+            if scheduler_output.preempted_req_ids:
+                self.worker_connector.handle_preemptions(
+                    scheduler_output.preempted_req_ids
+                )
 
             self.worker_connector.bind_connector_metadata(kv_connector_metadata)
             self.worker_connector.start_load_kv(self._dummy_ctx)
@@ -287,6 +322,9 @@ class RequestRunner:
             if scheduler_output.total_num_scheduled_tokens > 0:
                 self.worker_connector.wait_for_save()
 
+            if complete_transfers:
+                self.offloading_spec.complete_transfers()
+
             finished_sending, finished_recving = self.worker_connector.get_finished(
                 scheduler_output.finished_req_ids
             )
@@ -297,7 +335,7 @@ class RequestRunner:
                 reqs=self.scheduler.running,
                 finished_sending=finished_sending,
                 finished_recving=finished_recving,
-                token_id=token_id,
+                token_id=token_id or 0,
             )
 
             if self.scheduler.running:
@@ -305,7 +343,10 @@ class RequestRunner:
 
             self.scheduler.update_from_output(scheduler_output, model_runner_output)
 
-        self._wait_for_transfers()
+            if token_id is None:
+                break
+
+        self._parse_transfers()
 
         # run one more step to update finished stored
         if EOS_TOKEN_ID in decoded_tokens:
@@ -330,8 +371,10 @@ class RequestRunner:
     def run(
         self,
         decoded_tokens: list[int],
+        complete_transfers: bool = True,
         expected_stored_gpu_block_indexes: tuple[int, ...] = (),
         expected_loaded_gpu_block_indexes: tuple[int, ...] = (),
+        expected_flushed_gpu_block_indexes: tuple[int, ...] = (),
     ):
         """
         Runs multiple engine (scheduler + worker) steps.
@@ -339,14 +382,17 @@ class RequestRunner:
 
         Args:
             decoded_tokens: the tokens to yield at each step.
+            complete_transfers: complete transfers immediately
             expected_stored_gpu_block_indexes: GPU block indexes
                 that are expected to be written during the run.
             expected_loaded_gpu_block_indexes: GPU block indexes
                 that are expected to be loaded during the run.
+            expected_flushed_gpu_block_indexes: GPU block indexes
+                that are expected to be flushed during the run.
         """
 
         self.manager.reset_mock()
-        self._run(decoded_tokens)
+        self._run(decoded_tokens, complete_transfers)
 
         loaded_gpu_block_indexes: set[int] = set()
         for transfer in self.completed_loads:
@@ -370,6 +416,9 @@ class RequestRunner:
         assert set(expected_stored_gpu_block_indexes) == stored_gpu_block_indexes
         self.completed_stores.clear()
 
+        assert set(expected_flushed_gpu_block_indexes) == self.flushed_gpu_block_indexes
+        self.flushed_gpu_block_indexes.clear()
+
 
 @pytest.fixture
 def request_runner():
@@ -414,10 +463,13 @@ def test_offloading_connector(request_runner):
     runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output(list(block_hashes)[1:2])
     )
-    runner.run(decoded_tokens=[0], expected_stored_gpu_block_indexes=(3, 4, 5))
+    runner.run(decoded_tokens=[0])
 
     # add block missing 1 token -> no offload
-    runner.run(decoded_tokens=[0] * (offloaded_block_size - 1))
+    runner.run(
+        decoded_tokens=[0] * (offloaded_block_size - 1),
+        expected_stored_gpu_block_indexes=(3, 4, 5),
+    )
     runner.manager.prepare_store.assert_not_called()
 
     # +1 token -> single block, fail prepare_store
@@ -435,23 +487,20 @@ def test_offloading_connector(request_runner):
     runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output(block_hashes)
     )
-    runner.run(
-        decoded_tokens=[0] * offloaded_block_size,
-        expected_stored_gpu_block_indexes=(15, 16, 17),
-    )
+    runner.run(decoded_tokens=[0] * offloaded_block_size)
     runner.manager.touch.assert_called()
     block_hashes1 = list(runner.manager.touch.call_args.args[0])
     assert len(block_hashes1) == 6
 
     # terminate request
-    runner.run(decoded_tokens=[EOS_TOKEN_ID])
+    runner.run(
+        decoded_tokens=[EOS_TOKEN_ID],
+        expected_stored_gpu_block_indexes=(15, 16, 17),
+    )
 
     # create a new request differing only on the last token
     runner.new_request(token_ids=[0] * (offloaded_block_size * 6 - 1) + [1])
-    runner.run(
-        decoded_tokens=[0],
-        expected_stored_gpu_block_indexes=tuple(range(6 * block_size_factor)),
-    )
+    runner.run(decoded_tokens=[0])
     runner.manager.touch.assert_called()
     block_hashes2 = list(runner.manager.touch.call_args.args[0])
     assert len(block_hashes2) == 6
@@ -461,7 +510,10 @@ def test_offloading_connector(request_runner):
     assert block_hashes1[5] != block_hashes2[5]
 
     # terminate request
-    runner.run(decoded_tokens=[EOS_TOKEN_ID])
+    runner.run(
+        decoded_tokens=[EOS_TOKEN_ID],
+        expected_stored_gpu_block_indexes=tuple(range(6 * block_size_factor)),
+    )
 
     # full_block_tokens - num_computed_tokens < offloaded_block_size
     runner.new_request(
@@ -528,7 +580,74 @@ def test_offloading_connector(request_runner):
     assert event.token_ids == []
     assert event.parent_block_hash is None
     assert event.lora_id is None
+    assert event.lora_name is None
     event = events[1]
     assert isinstance(event, BlockRemoved)
     assert event.block_hashes == to_hashes([4, 5, 6])
     assert event.medium == "B"
+
+
+def test_request_preemption(request_runner):
+    offloaded_block_size = 12
+    gpu_block_size = 4
+    num_gpu_blocks = 100
+
+    runner = request_runner(
+        offloaded_block_size=offloaded_block_size,
+        gpu_block_size=gpu_block_size,
+        num_gpu_blocks=num_gpu_blocks,
+    )
+
+    free_block_queue = runner.scheduler.kv_cache_manager.block_pool.free_block_queue
+    num_free_blocks_empty = free_block_queue.num_free_blocks
+
+    # 2 blocks, store all, without flushing
+    # blocks = [0, 1, 2], [3, 4, 5]
+    runner.new_request(token_ids=[0] * offloaded_block_size * 2)
+    runner.manager.prepare_store.side_effect = (
+        lambda block_hashes: generate_store_output(block_hashes)
+    )
+    runner.run(
+        decoded_tokens=[0],
+        complete_transfers=False,
+    )
+
+    # decode 2 more blocks - 1 gpu block, storing [6, 7, 8] (no flush)
+    runner.manager.prepare_store.side_effect = (
+        lambda block_hashes: generate_store_output(block_hashes)
+    )
+    runner.run(
+        decoded_tokens=[0] * (2 * offloaded_block_size - gpu_block_size),
+        complete_transfers=False,
+    )
+
+    # simulate KV cache running out of space
+    free_block_queue.num_free_blocks = 0
+
+    # request should be preempted now
+    runner.run(
+        decoded_tokens=[],
+        complete_transfers=False,
+        expected_flushed_gpu_block_indexes=(0, 1, 2, 3, 4, 5, 6, 7, 8),
+        expected_stored_gpu_block_indexes=(0, 1, 2, 3, 4, 5, 6, 7, 8),
+    )
+
+    # restore KV cache space and reset GPU prefix cache
+    free_block_queue.num_free_blocks = num_free_blocks_empty
+    runner.scheduler.reset_prefix_cache()
+
+    # request should now return from preemption
+    # re-load [0, ..., 8] from the CPU and store [9, 10, 11]
+    runner.manager.lookup.return_value = 3
+    runner.manager.prepare_store.side_effect = (
+        lambda block_hashes: generate_store_output(block_hashes)
+    )
+    runner.run(
+        decoded_tokens=[0] * gpu_block_size,
+        expected_loaded_gpu_block_indexes=(0, 1, 2, 3, 4, 5, 6, 7, 8),
+    )
+
+    runner.run(
+        decoded_tokens=[EOS_TOKEN_ID],
+        expected_stored_gpu_block_indexes=(9, 10, 11),
+    )
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 13ce3537b7dbe14afc5a793ea8f8c7e3f8ffda74..a51ea915cecee6d48beaad03b4efac279cbfad51 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -11,6 +11,7 @@ import torch
 
 from vllm import SamplingParams
 from vllm.config import (
+    AttentionConfig,
     CacheConfig,
     DeviceConfig,
     KVTransferConfig,
@@ -94,6 +95,7 @@ def create_vllm_config(
     dtype: str = "float16",
     cache_dtype: str = "auto",
     hf_overrides: dict[str, Any] | None = None,
+    attention_backend: str | None = None,
 ) -> VllmConfig:
     """Initialize VllmConfig For Testing."""
     model_config = ModelConfig(
@@ -131,12 +133,14 @@ def create_vllm_config(
         enable_permute_local_kv=enable_permute_local_kv,
         kv_connector_extra_config=kv_connector_extra_config or {},
     )
+    attention_config = AttentionConfig(backend=attention_backend)
     return VllmConfig(
         scheduler_config=scheduler_config,
         model_config=model_config,
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         device_config=DeviceConfig("cpu"),
+        attention_config=attention_config,
     )
 
 
@@ -151,7 +155,13 @@ def create_scheduler(
         kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(
-                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
+                ["layer"],
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
             )
         ],
     )
diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py
index 3516c0013879d98943f938e08eb3d0e97d64a907..b3696f1ccdf0062accdb2ba7d081fcefe9bcd31b 100644
--- a/tests/v1/kv_offload/test_cpu_gpu.py
+++ b/tests/v1/kv_offload/test_cpu_gpu.py
@@ -7,6 +7,7 @@ import pytest
 import torch
 
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
@@ -49,6 +50,7 @@ NUM_MAPPINGS = [3]
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_transfer(
+    default_vllm_config,
     gpu_to_cpu: bool,
     num_mappings: int,
     head_size: int,
@@ -62,7 +64,7 @@ def test_transfer(
     seed: int,
     device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     # create per-layer GPU KV caches based on available attn_backends
     attn_backends_list = BACKENDS_TO_TEST
diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py
index 57474a3dc01e73bdf4ca30cbd6accb2be3143aec..103675608c69d6ab786af2ab8c02f8551a1b0009 100644
--- a/tests/v1/kv_offload/test_cpu_offloading.py
+++ b/tests/v1/kv_offload/test_cpu_offloading.py
@@ -13,13 +13,12 @@ from vllm import LLM, SamplingParams, TokensPrompt
 from vllm.config import KVEventsConfig, KVTransferConfig
 from vllm.distributed.kv_events import BlockStored, KVEventBatch
 from vllm.platforms import current_platform
-from vllm.utils.system_utils import set_env_var
 
 CPU_BLOCK_SIZES = [48]
-ATTN_BACKENDS = ["FLASH_ATTN"]
+ATTN_BACKENDS = []
 
 if current_platform.is_cuda():
-    ATTN_BACKENDS.append("FLASHINFER")
+    ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER", "TRITON_ATTN"]
 elif current_platform.is_rocm():
     ATTN_BACKENDS = ["TRITON_ATTN"]
 
@@ -162,7 +161,7 @@ def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
         kv_connector="OffloadingConnector",
         kv_role="kv_both",
         kv_connector_extra_config={
-            "num_cpu_blocks": 1000,
+            "cpu_bytes_to_use": 500 << 20,
             "block_size": cpu_block_size,
         },
     )
@@ -180,13 +179,13 @@ def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
         topic="test",
     )
 
-    with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend):
-        llm = LLM(
-            model="meta-llama/Llama-3.2-1B-Instruct",
-            gpu_memory_utilization=0.5,
-            kv_events_config=kv_events_config,
-            kv_transfer_config=kv_transfer_config,
-        )
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        gpu_memory_utilization=0.5,
+        kv_events_config=kv_events_config,
+        kv_transfer_config=kv_transfer_config,
+        attention_config={"backend": attn_backend},
+    )
 
     events_endpoint = events_endpoint.replace("*", "127.0.0.1")
     subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)
diff --git a/tests/v1/kv_offload/test_worker.py b/tests/v1/kv_offload/test_worker.py
index 6fcd408f3c593ea225cb1b985430a17577ee99b7..fbdac5f9dc7c73d805897c939e17b9174e12e1b8 100644
--- a/tests/v1/kv_offload/test_worker.py
+++ b/tests/v1/kv_offload/test_worker.py
@@ -63,6 +63,12 @@ class OffloadingHandler1To2(OffloadingHandler):
                 del self.transfers[job_id]
         return finished
 
+    def wait(self, job_ids: set[int]) -> None:
+        for job_id in job_ids:
+            spec = self.transfers.get(job_id)
+            if spec:
+                assert spec.finished
+
 
 class OffloadingHandler2To1(OffloadingHandler):
     def __init__(self):
@@ -84,6 +90,12 @@ class OffloadingHandler2To1(OffloadingHandler):
                 del self.transfers[job_id]
         return finished
 
+    def wait(self, job_ids: set[int]) -> None:
+        for job_id in job_ids:
+            spec = self.transfers.get(job_id)
+            if spec:
+                assert spec.finished
+
 
 def test_offloading_worker():
     """
diff --git a/tests/v1/metrics/test_perf_metrics.py b/tests/v1/metrics/test_perf_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3846a7a3ef160100a2af13188822167eaaee620
--- /dev/null
+++ b/tests/v1/metrics/test_perf_metrics.py
@@ -0,0 +1,907 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for the analytic estimators in metrics/flops.py.
+"""
+
+import types
+from types import SimpleNamespace
+
+from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config
+from transformers.models.llama4.configuration_llama4 import (
+    Llama4Config,
+    Llama4TextConfig,
+)
+from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig
+
+from vllm.config.model import ModelConfig, get_hf_text_config
+from vllm.transformers_utils.model_arch_config_convertor import (
+    MODEL_ARCH_CONFIG_CONVERTORS,
+    ModelArchConfigConvertorBase,
+)
+from vllm.v1.metrics.perf import (
+    AttentionMetrics,
+    BaseConfigParser,
+    ExecutionContext,
+    FfnMetrics,
+    ModelMetrics,
+    ParsedArgs,
+    UnembedMetrics,
+)
+
+
+class MockModelConfig:
+    """Mock ModelConfig that implements the getter methods used by parsers."""
+
+    def __init__(self, hf_config, dtype):
+        self.hf_config = hf_config
+        self.hf_text_config = get_hf_text_config(hf_config)
+        convertor_cls = MODEL_ARCH_CONFIG_CONVERTORS.get(
+            self.hf_config.model_type, ModelArchConfigConvertorBase
+        )
+        self.model_arch_config = convertor_cls(
+            self.hf_config, self.hf_text_config
+        ).convert()
+        self.dtype = dtype
+        self.is_attention_free = False
+
+    def __getattr__(self, name):
+        # 1. Check if ModelConfig actually has this attribute
+        if not hasattr(ModelConfig, name):
+            raise AttributeError(
+                f"'{type(self).__name__}' object has no attribute '{name}' "
+                f"and neither does 'ModelConfig'."
+            )
+
+        # 2. Fetch the attribute from the ModelConfig CLASS
+        attr = getattr(ModelConfig, name)
+
+        # 3. Case A: It is a @property
+        if isinstance(attr, property):
+            # Manually invoke the property's getter, passing 'self' (this mock instance)
+            return attr.__get__(self, self.__class__)
+
+        # 4. Case B: It is a standard method (function)
+        if isinstance(attr, types.FunctionType):
+            # Bind the function to 'self' so it acts like a method of
+            # this instance. This creates a bound method where 'self' is
+            # automatically passed as the first arg.
+            return types.MethodType(attr, self)
+
+        # 5. Case C: It is a class attribute / static variable
+        return attr
+
+
+def create_mock_vllm_config(
+    hf_config,
+    model_dtype="bfloat16",
+    cache_dtype="auto",
+    quant_config=None,
+    data_parallel_size=1,
+    tensor_parallel_size=1,
+    pipeline_parallel_size=1,
+    enable_expert_parallel=False,
+) -> SimpleNamespace:
+    vllm_config = SimpleNamespace()
+    vllm_config.model_config = MockModelConfig(hf_config, model_dtype)
+
+    vllm_config.cache_config = SimpleNamespace()
+    vllm_config.cache_config.cache_dtype = cache_dtype
+
+    vllm_config.quant_config = quant_config
+
+    vllm_config.parallel_config = SimpleNamespace()
+    vllm_config.parallel_config.data_parallel_size = data_parallel_size
+    vllm_config.parallel_config.tensor_parallel_size = tensor_parallel_size
+    vllm_config.parallel_config.pipeline_parallel_size = pipeline_parallel_size
+    vllm_config.parallel_config.enable_expert_parallel = enable_expert_parallel
+
+    return vllm_config
+
+
+#### Parser Tests ####
+
+
+def test_base_config_parser():
+    """Test BaseConfigParser extracts base model attributes correctly."""
+    hf_config = Qwen3Config(
+        vocab_size=50000,
+        hidden_size=2048,
+        num_attention_heads=16,
+        num_hidden_layers=24,
+    )
+    vllm_config = create_mock_vllm_config(hf_config, model_dtype="float16")
+
+    parser = BaseConfigParser()
+    args = ParsedArgs()
+    result = parser.parse(args, vllm_config)
+
+    assert result.vocab_size == 50000
+    assert result.hidden_size == 2048
+    assert result.num_attention_heads == 16
+    assert result.num_hidden_layers == 24
+    assert result.weight_byte_size == 2  # float16 is 2 bytes
+    assert result.activation_byte_size == 2  # default activation size
+
+
+def test_base_attention_config_parser_with_gqa():
+    """Test BaseAttentionConfigParser with grouped query attention."""
+    hf_config = Qwen3Config(
+        hidden_size=4096,
+        num_attention_heads=32,
+        num_key_value_heads=8,  # GQA with 4:1 ratio
+        head_dim=128,
+    )
+    vllm_config = create_mock_vllm_config(hf_config)
+
+    parser_chain = AttentionMetrics.get_parser()
+    result = parser_chain.parse(vllm_config)
+
+    assert result.num_key_value_heads == 8
+    assert result.head_dim == 128
+
+
+def test_base_attention_config_parser_without_gqa():
+    """
+    Test BaseAttentionConfigParser defaults to MHA when num_key_value_heads not
+    specified.
+    """
+    hf_config = Qwen3Config(
+        hidden_size=4096,
+        num_attention_heads=32,
+        # No num_key_value_heads specified
+    )
+    vllm_config = create_mock_vllm_config(hf_config)
+
+    parser_chain = AttentionMetrics.get_parser()
+    result = parser_chain.parse(vllm_config)
+
+    # Should default to MHA (num_key_value_heads = num_attention_heads)
+    assert result.num_key_value_heads == 32
+
+
+def test_base_ffn_config_parser_dense():
+    """Test BaseFfnConfigParser for dense FFN."""
+    hf_config = Qwen3Config(
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+    )
+    vllm_config = create_mock_vllm_config(hf_config)
+
+    parser_chain = FfnMetrics.get_parser()
+    result = parser_chain.parse(vllm_config)
+
+    assert result.intermediate_size == 11008
+    assert result.num_experts == 0
+    assert result.num_experts_per_tok == 0
+    assert result.num_moe_layers == 0  # No MoE
+
+
+def test_base_ffn_config_parser_moe():
+    """Test BaseFfnConfigParser for MoE FFN."""
+    hf_config = Qwen3MoeConfig(
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_experts=64,
+        num_experts_per_tok=8,
+        moe_intermediate_size=14336,
+        n_shared_experts=2,
+    )
+    vllm_config = create_mock_vllm_config(hf_config)
+
+    parser_chain = FfnMetrics.get_parser()
+    result = parser_chain.parse(vllm_config)
+
+    assert result.num_experts == 64
+    assert result.num_experts_per_tok == 8
+    assert result.moe_intermediate_size == 14336
+    assert result.num_shared_experts == 2
+    assert result.num_moe_layers == 32  # All layers are MoE by default
+
+
+def test_interleave_moe_layer_step_parser():
+    """Test InterleaveMoeLayerStepParser correctly computes MoE layer count."""
+    hf_config = Llama4Config(
+        text_config=Llama4TextConfig(
+            num_hidden_layers=32,
+            num_local_experts=64,
+            interleave_moe_layer_step=4,  # Every 4th layer is MoE
+        ),
+    )
+
+    vllm_config = create_mock_vllm_config(hf_config)
+
+    parser_chain = FfnMetrics.get_parser()
+    result = parser_chain.parse(vllm_config)
+
+    assert result.num_moe_layers == 8
+
+
+def test_moe_layer_freq_parser():
+    """Test MoeLayerFreqParser correctly computes MoE layer count."""
+    hf_config = DeepseekV3Config(
+        num_hidden_layers=30,
+        n_routed_experts=64,
+        moe_layer_freq=3,  # Every 3rd layer after first_k_dense_replace
+        first_k_dense_replace=6,  # First 6 layers are dense
+    )
+    vllm_config = create_mock_vllm_config(hf_config)
+
+    parser_chain = FfnMetrics.get_parser()
+    result = parser_chain.parse(vllm_config)
+
+    # Layers >= 6 and divisible by 3: 6, 9, 12, 15, 18, 21, 24, 27
+    expected_moe_layers = len(
+        [layer for layer in range(30) if layer >= 6 and layer % 3 == 0]
+    )
+    assert expected_moe_layers == 8
+    assert result.num_moe_layers == expected_moe_layers
+
+
+#### ComponentMetrics Tests ####
+
+
+def test_attention_metrics_scaling():
+    """Test that attention metrics scale proportionally with model dimensions."""
+    base_hf_config = Qwen3Config(
+        hidden_size=2048,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        num_hidden_layers=12,
+        head_dim=128,
+    )
+
+    base_vllm_config = create_mock_vllm_config(base_hf_config)
+    base_metrics = AttentionMetrics.from_vllm_config(base_vllm_config)
+
+    # Test scaling with number of layers
+    double_layers_hf_config = Qwen3Config(
+        hidden_size=2048,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        num_hidden_layers=24,  # Double the layers
+        head_dim=128,
+    )
+    double_layers_vllm_config = create_mock_vllm_config(double_layers_hf_config)
+    double_layers_metrics = AttentionMetrics.from_vllm_config(double_layers_vllm_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # FLOPS should double when layers double
+    base_flops = base_metrics.get_num_flops(ctx)
+    double_flops = double_layers_metrics.get_num_flops(ctx)
+    assert double_flops == 2 * base_flops
+
+    # Read/write bytes should also scale proportionally
+    base_read = base_metrics.get_read_bytes(ctx)
+    double_read = double_layers_metrics.get_read_bytes(ctx)
+    assert double_read == 2 * base_read
+
+    base_write = base_metrics.get_write_bytes(ctx)
+    double_write = double_layers_metrics.get_write_bytes(ctx)
+    assert double_write == 2 * base_write
+
+
+def test_attention_metrics_grouped_query():
+    """Test attention metrics handle grouped query attention correctly."""
+    mha_hf_config = Qwen3Config(
+        hidden_size=4096,
+        num_attention_heads=32,
+        num_key_value_heads=32,  # MHA
+        num_hidden_layers=1,
+    )
+    mha_config = create_mock_vllm_config(mha_hf_config)
+
+    gqa_hf_config = Qwen3Config(
+        hidden_size=4096,
+        num_attention_heads=32,
+        num_key_value_heads=8,  # GQA with 4:1 ratio
+        num_hidden_layers=1,
+    )
+    gqa_config = create_mock_vllm_config(gqa_hf_config)
+
+    mha_metrics = AttentionMetrics.from_vllm_config(mha_config)
+    gqa_metrics = AttentionMetrics.from_vllm_config(gqa_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=1, context_len=1024, is_prefill=False
+    )
+
+    # GQA should have less KV cache reads since fewer KV heads
+    mha_read = mha_metrics.get_read_bytes(ctx)
+    gqa_read = gqa_metrics.get_read_bytes(ctx)
+    assert gqa_read < mha_read
+
+
+def test_ffn_metrics_scaling():
+    """Test FFN metrics scale proportionally with model dimensions."""
+    base_hf_config = Qwen3Config(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=12,
+    )
+    base_vllm_config = create_mock_vllm_config(base_hf_config)
+    base_metrics = FfnMetrics.from_vllm_config(base_vllm_config)
+
+    # Test scaling with intermediate size
+    larger_ffn_hf_config = Qwen3Config(
+        hidden_size=2048,
+        intermediate_size=16384,  # Double intermediate size
+        num_hidden_layers=12,
+    )
+    larger_ffn_vllm_config = create_mock_vllm_config(larger_ffn_hf_config)
+    larger_ffn_metrics = FfnMetrics.from_vllm_config(larger_ffn_vllm_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # FLOPS should double when intermediate size doubles
+    base_flops = base_metrics.get_num_flops(ctx)
+    larger_flops = larger_ffn_metrics.get_num_flops(ctx)
+    assert larger_flops == base_flops * 2
+
+
+def test_moe_metrics_vs_dense():
+    """Test MoE metrics versus dense metrics."""
+    dense_hf_config = Qwen3Config(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=12,
+    )
+    dense_config = create_mock_vllm_config(dense_hf_config)
+
+    moe_hf_config = Qwen3MoeConfig(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=12,
+        num_experts=64,
+        num_experts_per_tok=2,  # 2 routed expert
+        moe_intermediate_size=8192,
+        n_shared_experts=0,
+    )
+    moe_config = create_mock_vllm_config(moe_hf_config)
+
+    dense_metrics = FfnMetrics.from_vllm_config(dense_config)
+    moe_metrics = FfnMetrics.from_vllm_config(moe_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # MoE should have different compute/memory characteristics
+    dense_flops = dense_metrics.get_num_flops(ctx)
+    moe_flops = moe_metrics.get_num_flops(ctx)
+
+    # 2 routed experts vs 1 dense.
+    assert moe_flops == dense_flops * 2
+
+
+def test_unembed_metrics_scaling():
+    """Test unembedding metrics scale with vocab size."""
+    small_vocab_hf_config = Qwen3Config(
+        hidden_size=2048,
+        vocab_size=32000,
+    )
+    small_vocab_config = create_mock_vllm_config(small_vocab_hf_config)
+
+    large_vocab_hf_config = Qwen3Config(
+        hidden_size=2048,
+        vocab_size=64000,  # Double vocab size
+    )
+    large_vocab_config = create_mock_vllm_config(large_vocab_hf_config)
+
+    small_vocab_metrics = UnembedMetrics.from_vllm_config(small_vocab_config)
+    large_vocab_metrics = UnembedMetrics.from_vllm_config(large_vocab_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # FLOPS should double when vocab size doubles
+    small_flops = small_vocab_metrics.get_num_flops(ctx)
+    large_flops = large_vocab_metrics.get_num_flops(ctx)
+    assert large_flops == 2 * small_flops
+
+
+def test_prefill_vs_decode_differences():
+    """Test that prefill and decode have different memory access patterns."""
+    hf_config = Qwen3Config(
+        hidden_size=2048,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        num_hidden_layers=1,
+    )
+    config = create_mock_vllm_config(hf_config)
+
+    metrics = AttentionMetrics.from_vllm_config(config)
+
+    prefill_ctx = ExecutionContext.from_single_request(
+        num_tokens=512, context_len=512, is_prefill=True
+    )
+    decode_ctx = ExecutionContext.from_single_request(
+        num_tokens=1, context_len=512, is_prefill=False
+    )
+
+    prefill_read = metrics.get_read_bytes(prefill_ctx)
+    decode_read = metrics.get_read_bytes(decode_ctx)
+
+    assert prefill_read != decode_read
+
+
+def test_model_metrics_aggregation():
+    """Test ModelMetrics correctly aggregates across components."""
+    hf_config = Qwen3Config(
+        hidden_size=2048,
+        num_attention_heads=16,
+        num_hidden_layers=12,
+        vocab_size=32000,
+        intermediate_size=8192,
+    )
+    config = create_mock_vllm_config(hf_config)
+
+    model_metrics = ModelMetrics(config)
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # Should have metrics for attention, ffn, and unembed
+    total_flops = model_metrics.get_num_flops(ctx)
+    breakdown = model_metrics.get_num_flops_breakdown(ctx)
+
+    # Breakdown should sum to total
+    assert total_flops == sum(breakdown.values())
+
+
+def test_moe_expert_activation_proportional_scaling():
+    """Test that routed expert metrics scale proportionally with num_experts_per_tok."""
+    base_moe_config = Qwen3MoeConfig(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=12,
+        num_experts=64,
+        num_experts_per_tok=1,  # 1 expert per token
+        moe_intermediate_size=8192,
+        n_shared_experts=2,
+    )
+
+    double_experts_config = Qwen3MoeConfig(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=12,
+        num_experts=64,
+        num_experts_per_tok=2,  # 2 experts per token (double)
+        moe_intermediate_size=8192,
+        n_shared_experts=2,  # Same shared experts
+    )
+
+    triple_experts_config = Qwen3MoeConfig(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=12,
+        num_experts=64,
+        num_experts_per_tok=3,  # 3 experts per token (triple)
+        moe_intermediate_size=8192,
+        n_shared_experts=2,  # Same shared experts
+    )
+
+    base_vllm_config = create_mock_vllm_config(base_moe_config)
+    double_vllm_config = create_mock_vllm_config(double_experts_config)
+    triple_vllm_config = create_mock_vllm_config(triple_experts_config)
+
+    base_metrics = FfnMetrics.from_vllm_config(base_vllm_config)
+    double_metrics = FfnMetrics.from_vllm_config(double_vllm_config)
+    triple_metrics = FfnMetrics.from_vllm_config(triple_vllm_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # Get total metrics - the key insight is that differences should be proportional
+    base_flops = base_metrics.get_num_flops(ctx)
+    double_flops = double_metrics.get_num_flops(ctx)
+    triple_flops = triple_metrics.get_num_flops(ctx)
+
+    # The difference between double and base should equal one additional expert
+    one_expert_diff = double_flops - base_flops
+
+    # The difference between triple and base should equal two additional experts
+    two_expert_diff = triple_flops - base_flops
+
+    # Proportional scaling: 2 * (1 expert diff) should equal (2 expert diff)
+    assert two_expert_diff == 2 * one_expert_diff
+
+    # Same logic applies to memory operations
+    base_read = base_metrics.get_read_bytes(ctx)
+    double_read = double_metrics.get_read_bytes(ctx)
+    triple_read = triple_metrics.get_read_bytes(ctx)
+
+    one_expert_read_diff = double_read - base_read
+    two_expert_read_diff = triple_read - base_read
+
+    assert two_expert_read_diff == 2 * one_expert_read_diff
+
+    # Same for write bytes
+    base_write = base_metrics.get_write_bytes(ctx)
+    double_write = double_metrics.get_write_bytes(ctx)
+    triple_write = triple_metrics.get_write_bytes(ctx)
+
+    one_expert_write_diff = double_write - base_write
+    two_expert_write_diff = triple_write - base_write
+
+    assert two_expert_write_diff == 2 * one_expert_write_diff
+
+
+def test_quantization_config_parser_fp8():
+    """Test quantization parsers with fp8."""
+
+    class MockQuantConfig:
+        def get_name(self):
+            return "fp8"
+
+    hf_config = Qwen3Config(
+        hidden_size=2048, num_attention_heads=16, num_hidden_layers=1
+    )
+    vllm_config = create_mock_vllm_config(hf_config, quant_config=MockQuantConfig())
+
+    attn_result = AttentionMetrics.get_parser().parse(vllm_config)
+    assert attn_result.weight_byte_size == 1  # fp8
+
+    ffn_result = FfnMetrics.get_parser().parse(vllm_config)
+    assert ffn_result.weight_byte_size == 1  # fp8
+
+
+def test_quantization_config_parser_mxfp4():
+    """Test quantization parsers with mxfp4."""
+
+    class MockQuantConfig:
+        def get_name(self):
+            return "mxfp4"
+
+    hf_config = Qwen3Config(
+        hidden_size=2048, intermediate_size=8192, num_hidden_layers=1
+    )
+    vllm_config = create_mock_vllm_config(hf_config, quant_config=MockQuantConfig())
+
+    ffn_result = FfnMetrics.get_parser().parse(vllm_config)
+    assert ffn_result.weight_byte_size == 0.5  # mxfp4
+
+
+#### Per-GPU Tests ####
+
+
+def test_attention_per_gpu_with_tensor_parallelism():
+    """Test attention metrics with tensor parallelism - per_gpu vs global."""
+    hf_config = Qwen3Config(
+        hidden_size=4096,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        num_hidden_layers=24,
+    )
+
+    # Test with TP=4
+    vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=4)
+    metrics = AttentionMetrics.from_vllm_config(vllm_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=128, context_len=1024, is_prefill=True
+    )
+
+    # Get global and per-gpu metrics
+    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
+    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)
+
+    # With TP=4, global flops should be 4x per-gpu flops (heads divided by 4)
+    assert global_flops == 4 * per_gpu_flops
+
+    # Same for read/write bytes
+    global_read = metrics.get_read_bytes(ctx, per_gpu=False)
+    per_gpu_read = metrics.get_read_bytes(ctx, per_gpu=True)
+    # Reads should scale similarly (weight reads are divided by TP)
+    assert global_read > per_gpu_read
+
+    global_write = metrics.get_write_bytes(ctx, per_gpu=False)
+    per_gpu_write = metrics.get_write_bytes(ctx, per_gpu=True)
+    assert global_write > per_gpu_write
+
+
+def test_attention_per_gpu_with_pipeline_parallelism():
+    """Test attention metrics with pipeline parallelism - per_gpu vs global."""
+    hf_config = Qwen3Config(
+        hidden_size=2048,
+        num_attention_heads=16,
+        num_hidden_layers=32,
+    )
+
+    # Test with PP=4
+    vllm_config = create_mock_vllm_config(hf_config, pipeline_parallel_size=4)
+    metrics = AttentionMetrics.from_vllm_config(vllm_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=False
+    )
+
+    # Get global and per-gpu metrics
+    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
+    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)
+
+    # With PP=4, global flops should be 4x per-gpu flops (layers divided by 4)
+    assert global_flops == 4 * per_gpu_flops
+
+    global_read = metrics.get_read_bytes(ctx, per_gpu=False)
+    per_gpu_read = metrics.get_read_bytes(ctx, per_gpu=True)
+    assert global_read == 4 * per_gpu_read
+
+
+def test_ffn_per_gpu_with_tensor_parallelism():
+    """Test FFN metrics with tensor parallelism - per_gpu vs global."""
+    hf_config = Qwen3Config(
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+    )
+
+    # Test with DP=2, TP=4 (ffn_tp_size will be 8)
+    vllm_config = create_mock_vllm_config(
+        hf_config,
+        data_parallel_size=2,
+        tensor_parallel_size=4,
+    )
+    metrics = FfnMetrics.from_vllm_config(vllm_config)
+
+    # ffn_tp_size should be dp_size * tp_size = 8 (when EP not enabled)
+    assert metrics.ffn_tp_size == 8
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=128, context_len=2048, is_prefill=True
+    )
+
+    # Get global and per-gpu metrics
+    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
+    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)
+
+    # With ffn_tp_size=8, global should be 8x per-gpu
+    assert global_flops == 8 * per_gpu_flops
+
+
+def test_ffn_per_gpu_with_pipeline_parallelism():
+    """Test FFN metrics with pipeline parallelism - per_gpu vs global."""
+    hf_config = Qwen3Config(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=24,
+    )
+
+    # Test with PP=6
+    vllm_config = create_mock_vllm_config(hf_config, pipeline_parallel_size=6)
+    metrics = FfnMetrics.from_vllm_config(vllm_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # Get global and per-gpu metrics
+    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
+    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)
+
+    # With PP=6, global should be 6x per-gpu (layers divided by 6)
+    assert global_flops == 6 * per_gpu_flops
+
+
+def test_moe_per_gpu_with_expert_parallelism():
+    """
+    Test MoE metrics with expert parallelism - verifies num_activated_experts bug fix.
+    """
+    hf_config = Qwen3MoeConfig(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=24,
+        num_experts=64,
+        num_experts_per_tok=8,
+        moe_intermediate_size=14336,
+        n_shared_experts=2,
+    )
+
+    # Test with DP=2, TP=4, EP enabled (ffn_ep_size will be 8)
+    vllm_config = create_mock_vllm_config(
+        hf_config,
+        data_parallel_size=2,
+        tensor_parallel_size=4,
+        enable_expert_parallel=True,
+    )
+    metrics = FfnMetrics.from_vllm_config(vllm_config)
+
+    # When EP enabled, ffn_ep_size = dp_size * tp_size = 8
+    assert metrics.ffn_ep_size == 8
+    assert metrics.ffn_tp_size == 1
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # Get per-gpu metrics
+    per_gpu_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=True)
+    global_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=False)
+
+    # Verify that routed expert weight reads are reasonable
+    # With per_gpu=True, each GPU has 64/8 = 8 experts
+    # T=100, E_per_gpu=8/8=1, so T*E=100 expert activations
+    # num_activated_experts should be min(100, 8) = 8
+
+    # Check that weight reads scale appropriately
+    # Global has all 64 experts, per-gpu has 8 experts
+    # So weight reads should reflect this difference
+    if "routed_up_gate_weights" in per_gpu_read_breakdown:
+        per_gpu_weight_reads = per_gpu_read_breakdown["routed_up_gate_weights"]
+        global_weight_reads = global_read_breakdown["routed_up_gate_weights"]
+
+        # The ratio should reflect the expert count difference
+        # This verifies the bug fix works correctly
+        assert per_gpu_weight_reads < global_weight_reads
+
+        # Global should read more experts than per-gpu
+        # Exact ratio depends on num_activated_experts calculation
+        ratio = global_weight_reads / per_gpu_weight_reads
+        # Should be > 1 since global has more experts to read
+        assert ratio > 1
+
+
+def test_moe_per_gpu_expert_activation_accounting():
+    """
+    Test that MoE correctly accounts for expert activations with small batch sizes.
+    """
+    hf_config = Qwen3MoeConfig(
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=12,
+        num_experts=64,
+        num_experts_per_tok=8,
+        moe_intermediate_size=14336,
+        n_shared_experts=0,  # No shared experts for this test
+    )
+
+    # Test with EP=8
+    vllm_config = create_mock_vllm_config(
+        hf_config,
+        data_parallel_size=8,
+        enable_expert_parallel=True,
+    )
+    metrics = FfnMetrics.from_vllm_config(vllm_config)
+
+    # Small batch: T=10, E_per_gpu=8/8=1
+    # Each GPU: T*E = 10*1 = 10 activations
+    # Experts per GPU: 64/8 = 8
+    # So num_activated_experts should be min(10, 8) = 8
+    small_ctx = ExecutionContext.from_single_request(
+        num_tokens=10, context_len=512, is_prefill=True
+    )
+    small_read = metrics.get_read_bytes_breakdown(small_ctx, per_gpu=True)
+
+    # Large batch: T=1000, E_per_gpu=1
+    # Each GPU: T*E = 1000*1 = 1000 activations
+    # Experts per GPU: 8
+    # So num_activated_experts should be min(1000, 8) = 8 (all experts activated)
+    large_ctx = ExecutionContext.from_single_request(
+        num_tokens=1000, context_len=512, is_prefill=True
+    )
+    large_read = metrics.get_read_bytes_breakdown(large_ctx, per_gpu=True)
+
+    # Weight reads should be similar (both activate all 8 experts per GPU)
+    # But activation reads should differ (proportional to T*E)
+    if "routed_up_gate_weights" in small_read:
+        small_weight = small_read["routed_up_gate_weights"]
+        large_weight = large_read["routed_up_gate_weights"]
+
+        # Weight reads should be the same (both read all 8 experts)
+        assert small_weight == large_weight
+
+        # But input activation reads should scale with T*E
+        small_input = small_read["routed_up_gate_input"]
+        large_input = large_read["routed_up_gate_input"]
+        assert large_input == 100 * small_input  # 1000/10 = 100x
+
+
+def test_unembed_per_gpu_with_tensor_parallelism():
+    """Test unembed metrics with tensor parallelism - per_gpu vs global."""
+    hf_config = Qwen3Config(
+        hidden_size=4096,
+        vocab_size=128000,
+    )
+
+    # Test with TP=8
+    vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=8)
+    metrics = UnembedMetrics.from_vllm_config(vllm_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # Get global and per-gpu metrics
+    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
+    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)
+
+    # With TP=8, vocab is divided by 8, so global should be 8x per-gpu
+    assert global_flops == 8 * per_gpu_flops
+
+    # For read bytes, weight reads scale with TP but input reads don't (replicated)
+    global_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=False)
+    per_gpu_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=True)
+
+    # Input reads should be the same (replicated across TP ranks)
+    assert global_read_breakdown["input"] == per_gpu_read_breakdown["input"]
+
+    # Weight reads should scale 8x (divided by TP)
+    assert global_read_breakdown["weight"] == 8 * per_gpu_read_breakdown["weight"]
+
+
+def test_model_metrics_per_gpu_aggregation():
+    """Test ModelMetrics correctly aggregates per_gpu metrics across components."""
+    hf_config = Qwen3Config(
+        hidden_size=2048,
+        num_attention_heads=16,
+        num_hidden_layers=12,
+        vocab_size=32000,
+        intermediate_size=8192,
+    )
+
+    # Test with mixed parallelism: TP=2, PP=2
+    vllm_config = create_mock_vllm_config(
+        hf_config,
+        tensor_parallel_size=2,
+        pipeline_parallel_size=2,
+    )
+
+    model_metrics = ModelMetrics(vllm_config)
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=100, context_len=512, is_prefill=True
+    )
+
+    # Get breakdowns for both modes
+    per_gpu_breakdown = model_metrics.get_num_flops_breakdown(ctx, per_gpu=True)
+    global_breakdown = model_metrics.get_num_flops_breakdown(ctx, per_gpu=False)
+
+    # Verify breakdown sums match totals
+    per_gpu_total = model_metrics.get_num_flops(ctx, per_gpu=True)
+    global_total = model_metrics.get_num_flops(ctx, per_gpu=False)
+
+    assert per_gpu_total == sum(per_gpu_breakdown.values())
+    assert global_total == sum(global_breakdown.values())
+
+    # Global should be larger than per-gpu due to parallelism
+    assert global_total > per_gpu_total
+
+    # With TP=2 and PP=2, the ratio depends on which parallelism applies to
+    # which component but we can verify that global is reasonably larger
+    ratio = global_total / per_gpu_total
+    assert ratio > 1  # Should be between PP and TP*PP depending on component mix
+
+
+def test_attention_per_gpu_heads_not_evenly_divisible():
+    """Test attention with heads not evenly divisible by TP."""
+    hf_config = Qwen3Config(
+        hidden_size=2048,
+        num_attention_heads=17,  # Not divisible by 4
+        num_key_value_heads=5,  # Not divisible by 4
+        num_hidden_layers=8,
+    )
+
+    vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=4)
+    metrics = AttentionMetrics.from_vllm_config(vllm_config)
+
+    ctx = ExecutionContext.from_single_request(
+        num_tokens=64, context_len=256, is_prefill=True
+    )
+
+    # Should not crash and should handle max(1, ...) correctly
+    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)
+    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
+
+    # Both should be positive
+    assert per_gpu_flops > 0
+    assert global_flops > 0
+    assert global_flops > per_gpu_flops
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index 75b1eff9eb8e4048cfe4764320d8e6ab6818cdcf..de8a1a6071fdcf0c9ac5346eb2ba2b3b5acc8b28 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -516,6 +516,424 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
     del llm
 
 
+class TestCorrectDecodedToken:
+    """Unit tests for _correct_decoded_token method in LogprobsProcessor.
+
+    This method handles UTF-8 decoding issues where incomplete byte sequences
+    result in the Unicode replacement character "�" (U+FFFD). This commonly
+    happens with byte-fallback tokenization when multi-byte UTF-8 characters
+    are split across tokens.
+    """
+
+    @pytest.fixture
+    def mock_tokenizer(self):
+        """Create a mock tokenizer for testing."""
+        from unittest.mock import Mock
+
+        tokenizer = Mock()
+        return tokenizer
+
+    @pytest.fixture
+    def processor_with_empty_logprobs(self, mock_tokenizer):
+        """Create a LogprobsProcessor with empty logprobs."""
+        from vllm.v1.engine.logprobs import LogprobsProcessor
+
+        processor = LogprobsProcessor(
+            tokenizer=mock_tokenizer,
+            logprobs=[],
+            prompt_logprobs=None,
+            cumulative_logprob=0.0,
+            num_logprobs=1,
+            num_prompt_logprobs=None,
+        )
+        return processor
+
+    @pytest.fixture
+    def processor_with_previous_logprobs(self, mock_tokenizer):
+        """Create a LogprobsProcessor with previous logprobs."""
+        from vllm.v1.engine.logprobs import LogprobsProcessor
+
+        processor = LogprobsProcessor(
+            tokenizer=mock_tokenizer,
+            logprobs=[{123: None}],  # Previous token ID is 123
+            prompt_logprobs=None,
+            cumulative_logprob=0.0,
+            num_logprobs=1,
+            num_prompt_logprobs=None,
+        )
+        return processor
+
+    def test_correction_with_previous_token_in_list(
+        self, processor_with_empty_logprobs
+    ):
+        """Test correction using previous token in the same list.
+
+        Scenario: Token at idx=1 ends with "�", but when decoded with
+        the previous token (idx=0), it forms a valid UTF-8 sequence.
+        Example: token[0]="�", token[1]="�" -> together form "polarized"
+        """
+        processor = processor_with_empty_logprobs
+        tokens = [100, 101, 102]  # token IDs
+
+        # Mock tokenizer behavior:
+        # - decode([102]) returns "�" (ends with replacement char)
+        # - decode([101, 102]) returns "valid" (no replacement char)
+        processor.tokenizer.decode.side_effect = lambda ids: (
+            "valid" if ids == [101, 102] else "�"
+        )
+
+        result = processor._correct_decoded_token(2, tokens)
+        assert result == "valid"
+        processor.tokenizer.decode.assert_called_with([101, 102])
+
+    def test_correction_with_previous_logprob_token(
+        self, processor_with_previous_logprobs
+    ):
+        """Test correction using previous logprob token.
+
+        Scenario: Cannot correct with previous token in list (idx=0),
+        but can correct with previous logprob token.
+        """
+        processor = processor_with_previous_logprobs
+        tokens = [100]  # single token
+
+        # Mock tokenizer behavior:
+        # - decode([100]) returns "�" (ends with replacement char)
+        # - decode([123, 100]) returns " "polarized" (no replacement char)
+        # Token 123 is from previous logprobs
+        def mock_decode(ids):
+            if ids == [123, 100]:
+                return ' "polarized"'
+            return "�"
+
+        processor.tokenizer.decode.side_effect = mock_decode
+
+        result = processor._correct_decoded_token(0, tokens)
+        assert result == ' "polarized"'
+
+    def test_correction_at_idx_zero_no_previous_logprobs(
+        self, processor_with_empty_logprobs
+    ):
+        """Test correction at idx=0 with no previous logprobs.
+
+        Scenario: First token in list, no previous logprobs available.
+        Should return empty string as fallback.
+        """
+        processor = processor_with_empty_logprobs
+        tokens = [100]
+
+        # Mock tokenizer always returns "�"
+        processor.tokenizer.decode.return_value = "�"
+
+        result = processor._correct_decoded_token(0, tokens)
+        assert result == ""
+
+    def test_correction_at_idx_zero_with_previous_logprobs(
+        self, processor_with_previous_logprobs
+    ):
+        """Test correction at idx=0 with previous logprobs available.
+
+        Scenario: First token in list, but previous logprobs exist.
+        Should try correction with previous logprob token.
+        """
+        processor = processor_with_previous_logprobs
+        tokens = [200]
+
+        # Mock tokenizer behavior
+        def mock_decode(ids):
+            if ids == [123, 200]:
+                return "corrected"
+            return "�"
+
+        processor.tokenizer.decode.side_effect = mock_decode
+
+        result = processor._correct_decoded_token(0, tokens)
+        assert result == "corrected"
+
+    def test_no_correction_needed_returns_fallback(
+        self, processor_with_previous_logprobs
+    ):
+        """Test fallback to empty string when no correction works.
+
+        Scenario: All correction attempts still end with "�".
+        Should return empty string as final fallback.
+        """
+        processor = processor_with_previous_logprobs
+        tokens = [100, 101, 102]
+
+        # Mock tokenizer always returns text ending with "�"
+        processor.tokenizer.decode.return_value = "still�"
+
+        result = processor._correct_decoded_token(2, tokens)
+        assert result == ""
+
+    def test_middle_token_correction(self, processor_with_previous_logprobs):
+        """Test correction for a token in the middle of the list.
+
+        Scenario: Token at idx=5 in a longer list needs correction.
+        """
+        processor = processor_with_previous_logprobs
+        tokens = [10, 20, 30, 40, 50, 60, 70, 80]
+
+        # Mock tokenizer behavior for middle token
+        def mock_decode(ids):
+            if ids == [50, 60]:
+                return "olar"
+            return "�"
+
+        processor.tokenizer.decode.side_effect = mock_decode
+
+        result = processor._correct_decoded_token(5, tokens)
+        assert result == "olar"
+
+    def test_multiple_consecutive_replacement_chars(
+        self, processor_with_previous_logprobs
+    ):
+        """Test handling of multiple consecutive replacement characters.
+
+        Scenario: Sequence like ["�", "�", "p"] where first two should
+        become empty strings.
+        """
+        processor = processor_with_previous_logprobs
+
+        # Test first replacement char
+        tokens = [100, 101, 102]
+        processor.tokenizer.decode.return_value = "still�"
+        result1 = processor._correct_decoded_token(0, tokens)
+        assert result1 == ""
+
+        # Test second replacement char
+        result2 = processor._correct_decoded_token(1, tokens)
+        assert result2 == ""
+
+    def test_correction_with_multibyte_utf8(self, processor_with_previous_logprobs):
+        """Test correction involving multi-byte UTF-8 characters.
+
+        Scenario: Byte-fallback tokenization splits multi-byte UTF-8
+        characters (e.g., curly quotes, Chinese characters, emojis).
+        Example from user: "�", "�" -> "", "\""
+        """
+        processor = processor_with_previous_logprobs
+        tokens = [200, 201]
+
+        # Mock tokenizer behavior for multi-byte UTF-8 correction
+        def mock_decode(ids):
+            # When decoding first token (idx=0) with previous logprob token
+            if ids == [123, 200]:
+                return ' "'  # Space + left curly quote
+            # When decoding second token (idx=1) with previous token in list
+            elif ids == [200, 201]:
+                return '"'  # Right curly quote
+            # When decoding second token (idx=1) with previous logprob + prev token
+            elif ids == [123, 200, 201]:
+                return ' ""'  # Full sequence
+            return "�"
+
+        processor.tokenizer.decode.side_effect = mock_decode
+
+        # First token correction (idx=0)
+        # Will call decode([123, 200]) since idx=0 uses previous logprob token
+        result1 = processor._correct_decoded_token(0, tokens)
+        assert result1 == ' "'
+
+        # Second token correction (idx=1)
+        # Will call decode([200, 201]) since idx>0 uses previous token in list
+        result2 = processor._correct_decoded_token(1, tokens)
+        assert result2 == '"'
+
+    def test_real_world_opt125m_scenario(self, mock_tokenizer):
+        """Test the real-world scenario from user's example.
+
+        User's example with facebook/opt-125m:
+        Before: [" the", " term", " �", "�", "p", "olar", "ized", "�", "�", ...]
+        After: [" the", " term", "", " "", "p", "olar", "ized", "", "\"", ...]
+        """
+        from vllm.v1.engine.logprobs import LogprobsProcessor
+
+        # Simulate the sequence of tokens
+        processor = LogprobsProcessor(
+            tokenizer=mock_tokenizer,
+            logprobs=[],
+            prompt_logprobs=None,
+            cumulative_logprob=0.0,
+            num_logprobs=1,
+            num_prompt_logprobs=None,
+        )
+
+        # Token IDs representing the problematic sequence
+        tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # placeholder IDs
+
+        # Mock decode behavior simulating the real scenario
+        def mock_decode(ids):
+            # Simulate cases where individual tokens decode to "�"
+            # but combinations decode correctly
+            if len(ids) == 1:
+                if ids[0] == 3 or ids[0] == 4 or ids[0] == 8 or ids[0] == 9:
+                    return "�"
+            elif len(ids) == 2:
+                if ids == [2, 3]:
+                    return " term�"  # Still ends with �, need more context
+                elif ids == [3, 4]:
+                    return ' "'  # Corrected to space + left curly quote
+                elif ids == [7, 8]:
+                    return "ized�"  # Still ends with �
+                elif ids == [8, 9]:
+                    return '"'  # Corrected to right curly quote
+            elif len(ids) == 3:
+                if ids == [1, 2, 3]:
+                    return " the term�"  # Still ends with issue
+                elif ids == [2, 3, 4]:
+                    return ' term "'  # With all context
+            return "normal_text"
+
+        mock_tokenizer.decode.side_effect = mock_decode
+
+        # Test token at index 2 (should fail to correct, return "")
+        # Token 3 individually is "�"
+        # decode([2, 3]) = " term�" (still ends with �)
+        # No previous logprobs, so fallback to ""
+        result = processor._correct_decoded_token(2, tokens)
+        assert result == ""
+
+        # Test token at index 3 (should correct to " "")
+        # Token 4 individually is "�"
+        # decode([3, 4]) = " "" (corrected!)
+        processor.logprobs = [{2: None}]  # Add previous logprob
+        result = processor._correct_decoded_token(3, tokens)
+        assert result == ' "'
+
+
+def test_verify_tokens_integration():
+    """Integration test for _verify_tokens with real model.
+
+    This test validates that _verify_tokens correctly identifies and
+    corrects tokens ending with the replacement character "�".
+    Uses facebook/opt-125m which is known to produce these issues.
+    """
+    runner = VllmRunner(
+        "facebook/opt-125m",
+        max_logprobs=0,
+        enable_prefix_caching=False,
+        gpu_memory_utilization=0.15,
+        max_model_len=256,
+    )
+
+    # Use a prompt that triggers multi-byte UTF-8 issues
+    # Based on user's example: "In this example,"
+    test_prompts = ["In this example,"]
+
+    sampling_params = SamplingParams(
+        max_tokens=16,
+        temperature=0,
+        logprobs=0,
+    )
+
+    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
+
+    # Verify that decoded tokens don't contain replacement characters
+    for result in results:
+        assert result.outputs[0].logprobs is not None
+        for logprob_dict in result.outputs[0].logprobs:
+            for token_id, logprob_info in logprob_dict.items():
+                decoded_token = logprob_info.decoded_token
+                # Decoded tokens should not end with replacement character
+                # They should either be corrected or empty string
+                assert not decoded_token.endswith("�"), (
+                    f"Token {token_id} decoded to '{decoded_token}' which "
+                    f"ends with replacement character"
+                )
+                # Decoded tokens should not contain lone replacement characters
+                assert decoded_token != "�", (
+                    f"Token {token_id} is a lone replacement character"
+                )
+
+
+def test_utf8_edge_cases_with_real_model():
+    """Test various UTF-8 edge cases with a real model.
+
+    Tests prompts that are likely to trigger byte-fallback tokenization
+    and multi-byte UTF-8 splitting.
+    """
+    runner = VllmRunner(
+        "facebook/opt-125m",
+        max_logprobs=1,
+        enable_prefix_caching=False,
+        gpu_memory_utilization=0.15,
+        max_model_len=256,
+    )
+
+    # Prompts with various multi-byte UTF-8 characters
+    test_prompts = [
+        'Smart quotes: "Hello"',  # Curly quotes
+        "Em dash — test",  # Em dash
+        "Ellipsis… continues",  # Ellipsis
+        "Chinese: 你好",  # Chinese characters
+        "Emoji: 😀 🎉",  # Emojis
+        'Mixed: "quoted" — with symbols',  # Mixed
+    ]
+
+    sampling_params = SamplingParams(
+        max_tokens=10,
+        temperature=0,
+        logprobs=1,
+    )
+
+    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
+
+    for i, result in enumerate(results):
+        prompt = test_prompts[i]
+        assert result.outputs[0].logprobs is not None
+
+        # Check that no decoded tokens end with replacement character
+        for logprob_dict in result.outputs[0].logprobs:
+            for token_id, logprob_info in logprob_dict.items():
+                decoded_token = logprob_info.decoded_token
+                assert not decoded_token.endswith("�"), (
+                    f"Prompt: '{prompt}'\n"
+                    f"Token {token_id} decoded to '{decoded_token}' which "
+                    f"ends with replacement character"
+                )
+
+
+def test_correct_decoded_token_preserves_valid_tokens():
+    """Test that valid tokens (not ending with �) are not modified.
+
+    The _correct_decoded_token method should only be called for tokens
+    ending with "�", but this test verifies the broader _verify_tokens
+    logic doesn't affect valid tokens.
+    """
+    runner = VllmRunner(
+        "facebook/opt-125m",
+        max_logprobs=2,
+        enable_prefix_caching=False,
+        gpu_memory_utilization=0.15,
+        max_model_len=256,
+    )
+
+    # Simple prompt with standard ASCII characters
+    test_prompts = ["Hello world, this is a test."]
+
+    sampling_params = SamplingParams(
+        max_tokens=10,
+        temperature=0,
+        logprobs=2,
+    )
+
+    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
+
+    for result in results:
+        assert result.outputs[0].logprobs is not None
+
+        # All decoded tokens should be valid strings
+        for logprob_dict in result.outputs[0].logprobs:
+            for token_id, logprob_info in logprob_dict.items():
+                decoded_token = logprob_info.decoded_token
+                # Valid tokens should be non-empty strings (or empty if corrected)
+                assert isinstance(decoded_token, str)
+                # Should not contain replacement character
+                assert "�" not in decoded_token
+
+
 @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
 @pytest.mark.parametrize(
     "model_setup",
@@ -524,32 +942,74 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
             (
                 "eagle",
                 "meta-llama/Llama-3.2-1B-Instruct",
-                "nm-testing/Llama3_2_1B_speculator.eagle3",
+                {
+                    "method": "eagle",
+                    "model": "nm-testing/Llama3_2_1B_speculator.eagle3",
+                    "num_speculative_tokens": 3,
+                },
+                0,
+            ),
+            marks=large_gpu_mark(min_gb=32),
+            id="eagle0",
+        ),
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-3.2-1B-Instruct",
+                {
+                    "method": "eagle",
+                    "model": "nm-testing/Llama3_2_1B_speculator.eagle3",
+                    "num_speculative_tokens": 3,
+                },
+                3,
+            ),
+            marks=large_gpu_mark(min_gb=32),
+            id="eagle3",
+        ),
+        pytest.param(
+            (
+                "ngram",
+                "meta-llama/Llama-3.2-1B-Instruct",
+                {
+                    "method": "ngram",
+                    "prompt_lookup_max": 5,
+                    "prompt_lookup_min": 3,
+                    "num_speculative_tokens": 3,
+                },
+                3,
             ),
             marks=large_gpu_mark(min_gb=32),
+            id="ngram",
         ),
     ],
 )
-@pytest.mark.parametrize("top_logprobs", [0, 3])
 def test_spec_decode_logprobs(
     logprobs_mode: LogprobsMode,
-    model_setup: tuple[str, str, str],
-    top_logprobs: int,
+    model_setup: tuple[str, str, dict, int],
 ):
     """Spec decode logprobs should match those of the base model.
 
     Args:
         logprobs_mode: logprobs mode.
-        model_setup: Spec decode method, base model name, and
-        draft model name.
+        model_setup: Tuple of (method, base model name,
+            speculative_config dict, top_logprobs).
     """
     from vllm import LLM
 
+    method, model_name, spec_config, top_logprobs = model_setup
+
     prompt = "Hello world " * 50
     sampling_params = SamplingParams(
         temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False
     )
-    method, model_name, spec_model_name = model_setup
+    penalty_sampling_params = SamplingParams(
+        temperature=0,
+        logprobs=top_logprobs,
+        max_tokens=10,
+        ignore_eos=False,
+        presence_penalty=-1.0,
+    )
+
     max_model_len = 256
 
     # Run base LLM.
@@ -560,27 +1020,27 @@ def test_spec_decode_logprobs(
         seed=42,
         logprobs_mode=logprobs_mode,
         gpu_memory_utilization=0.4,
+        enable_prefix_caching=False,
+    )
+    ref_results = ref_llm.generate(
+        [prompt, prompt], [sampling_params, penalty_sampling_params]
     )
-    ref_results = ref_llm.generate([prompt], sampling_params)
     # Collect logprobs outputs from reference LLM.
     ref_logprobs = []
-    for output in ref_results[0].outputs:
-        for logprobs in output.logprobs:
-            for token_id in logprobs:
-                ref_logprobs.append(logprobs[token_id])
+    for results in ref_results:
+        for output in results.outputs:
+            for logprobs in output.logprobs:
+                ref_logprobs.extend(logprobs.values())
     del ref_llm
     torch.cuda.empty_cache()
     cleanup_dist_env_and_memory()
 
     # Run spec decode LLM.
+    # Add max_model_len to spec_config if not present
+    spec_config_with_len = {**spec_config, "max_model_len": max_model_len}
     spec_llm = LLM(
         model_name,
-        speculative_config={
-            "method": method,
-            "model": spec_model_name,
-            "num_speculative_tokens": 3,
-            "max_model_len": max_model_len,
-        },
+        speculative_config=spec_config_with_len,
         max_logprobs=5,
         max_model_len=max_model_len,
         seed=42,
@@ -589,14 +1049,17 @@ def test_spec_decode_logprobs(
         # Force prefill chunking
         enable_chunked_prefill=True,
         max_num_batched_tokens=32,
+        enable_prefix_caching=False,
+    )
+    spec_results = spec_llm.generate(
+        [prompt, prompt], [sampling_params, penalty_sampling_params]
     )
-    spec_results = spec_llm.generate([prompt], sampling_params)
     # Collect logprobs outputs from spec decode LLM.
     spec_logprobs = []
-    for output in spec_results[0].outputs:
-        for logprobs in output.logprobs:
-            for token_id in logprobs:
-                spec_logprobs.append(logprobs[token_id])
+    for results in spec_results:
+        for output in results.outputs:
+            for logprobs in output.logprobs:
+                spec_logprobs.extend(logprobs.values())
     del spec_llm
     torch.cuda.empty_cache()
     cleanup_dist_env_and_memory()
diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py
index 61caffee45daf78ba38ce553fc28209bbfde9fc5..d8ae57984fed2143be42c997cb49ee466974b2c3 100644
--- a/tests/v1/sample/test_rejection_sampler.py
+++ b/tests/v1/sample/test_rejection_sampler.py
@@ -691,9 +691,13 @@ def test_frequency_penalties(rejection_sampler):
 
 
 def test_bad_words(rejection_sampler):
-    """Test rejection sampling with bad words constraints"""
+    """Test rejection sampling with bad words constraints.
+
+    This test applies bad words to non-consecutive requests (0 and 2, but not 1)
+    to verify correct logit indexing when iterating over requests with bad words.
+    """
     spec_tokens = [[1, 2, 3], [1, 15, 3], [1, 2, 3]]
-    output_tokens = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
+    output_tokens = [[1, 2, 3, 4], [1, 15, 3, 4], [1, 2, 3, 4]]
 
     logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
     metadata = create_sampling_metadata(
@@ -701,17 +705,9 @@ def test_bad_words(rejection_sampler):
         output_token_ids=[[2], [3], [4]],
         spec_token_ids=spec_tokens,
         bad_words_token_ids={
-            0: [
-                [
-                    2,
-                ]
-            ],
-            1: [
-                [
-                    2,
-                ]
-            ],
-            # Do not apply bad words to the last request
+            0: [[2]],
+            # Request 1 has no bad words (to test non-consecutive request handling)
+            2: [[2]],
         },
     )
     bonus_token_tensor = torch.tensor(
@@ -726,8 +722,11 @@ def test_bad_words(rejection_sampler):
         sampling_metadata=metadata,
     )
 
+    # Request 0: bad word [2] matches prefix, so token 2 is rejected -> 15
+    # Request 1: no bad words, all tokens match -> [1, 15, 3, 4]
+    # Request 2: bad word [2] matches prefix, so token 2 is rejected -> 15
     expected = torch.tensor(
-        [[1, 15, -1, -1], [1, 15, 3, 4], [1, 2, 3, 4]],
+        [[1, 15, -1, -1], [1, 15, 3, 4], [1, 15, -1, -1]],
         dtype=torch.int,
         device=logits.device,
     )
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 6a4c44ab8f5666087c9f11097ac8df2290d7d521..09549dddebf13f4994eeaca1e5a6b57454a99abb 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -14,8 +14,8 @@ from tests.v1.attention.utils import (
     create_standard_kv_cache_spec,
     try_get_attention_backend,
 )
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
+    AttentionConfig,
     CacheConfig,
     DeviceConfig,
     ModelConfig,
@@ -27,6 +27,7 @@ from vllm.config import (
 from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.spec_decode.eagle import EagleProposer
 
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -41,6 +42,7 @@ eagle3_dir = os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-
 def _create_proposer(
     method: str,
     num_speculative_tokens: int,
+    attention_backend: str | None = None,
     speculative_token_tree: list[tuple[int, ...]] | None = None,
 ) -> EagleProposer:
     model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100)
@@ -73,6 +75,7 @@ def _create_proposer(
             max_model_len=model_config.max_model_len,
             is_encoder_decoder=model_config.is_encoder_decoder,
         ),
+        attention_config=AttentionConfig(backend=attention_backend),
     )
 
     return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
@@ -306,10 +309,16 @@ def test_prepare_inputs_padded():
 
     proposer = _create_proposer("eagle", num_speculative_tokens)
 
-    output_metadata, token_indices_to_sample = proposer.prepare_inputs_padded(
-        common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count
+    output_metadata, token_indices_to_sample, num_rejected_tokens_gpu = (
+        proposer.prepare_inputs_padded(
+            common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count
+        )
     )
 
+    # Verify num_rejected_tokens_gpu is calculated correctly
+    expected_num_rejected = torch.tensor([1, 0, 2], dtype=torch.int32, device=device)
+    assert torch.equal(num_rejected_tokens_gpu, expected_num_rejected)
+
     assert output_metadata.max_query_len == 3
     assert torch.equal(output_metadata.query_start_loc, expected_query_start_loc)
     assert torch.equal(token_indices_to_sample, expected_token_indices_to_sample)
@@ -334,8 +343,6 @@ def test_load_model(
     use_distinct_lm_head,
     monkeypatch,
 ):
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
     if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
         pytest.skip(
             "TRITON_ATTN does not support "
@@ -399,7 +406,9 @@ def test_load_model(
     assert not isinstance(target_model, SupportsMultiModal)
 
     # Create proposer using the helper function
-    proposer = _create_proposer(method, num_speculative_tokens=8)
+    proposer = _create_proposer(
+        method, num_speculative_tokens=8, attention_backend=attn_backend
+    )
 
     # Call the method under test
     proposer.load_model(target_model)
@@ -425,8 +434,6 @@ def test_load_model(
 @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
 @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
 def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
     if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
         pytest.skip(
             "TRITON_ATTN does not support "
@@ -454,7 +461,9 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
     seq_lens = [seq_len_1, seq_len_2]
 
     # Create proposer first so we can use its actual hidden_size
-    proposer = _create_proposer("eagle", num_speculative_tokens)
+    proposer = _create_proposer(
+        "eagle", num_speculative_tokens, attention_backend=attn_backend
+    )
     # Get the hidden_size from the proposer to ensure consistency
     hidden_size = proposer.hidden_size
 
@@ -627,7 +636,9 @@ def test_propose_tree(spec_token_tree):
 
     # Create proposer first so we can use its actual hidden_size.
     proposer = _create_proposer(
-        "eagle", num_speculative_tokens, speculative_token_tree=spec_token_tree
+        "eagle",
+        num_speculative_tokens,
+        speculative_token_tree=spec_token_tree,
     )
     # Get the hidden_size from the proposer to ensure consistency.
     hidden_size = proposer.hidden_size
diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py
index 477ab52d159cc46a01c8663bff76ff3c6cd5171c..d2cb864f82862917161d64cf4585aef64bc09af1 100644
--- a/tests/v1/spec_decode/test_mtp.py
+++ b/tests/v1/spec_decode/test_mtp.py
@@ -13,7 +13,6 @@ from tests.v1.attention.utils import (
     create_standard_kv_cache_spec,
     try_get_attention_backend,
 )
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     CacheConfig,
     DeviceConfig,
@@ -26,6 +25,7 @@ from vllm.config import (
 from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.spec_decode.eagle import EagleProposer
 from ...utils import models_path_prefix
 
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index 480e17620a430ed8da8e39c56eb2b86d03833b45..b3d0058774d14b73dd35869a364dfc8516993588 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -85,10 +85,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([[1, 2, 3, 4, 5]])
     result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
         sampled_token_ids=[[0]],
-        req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert len(result[0]) == 0
 
@@ -96,10 +94,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]])
     result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose(
         sampled_token_ids=[[0]],
-        req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert len(result[0]) == 0
 
@@ -107,10 +103,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]])
     result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose(
         sampled_token_ids=[[0]],
-        req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert np.array_equal(result, np.array([[4, 1]]))
 
@@ -119,10 +113,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]])
     result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose(
         sampled_token_ids=[[0]],
-        req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert np.array_equal(result, np.array([[1, 2]]))  # Not [5, 1]]
 
@@ -130,10 +122,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]])
     result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose(
         sampled_token_ids=[[0]],
-        req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert np.array_equal(result, np.array([[1, 2]]))  # Not [5, 2]]
 
@@ -141,10 +131,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]])
     result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose(
         sampled_token_ids=[[0]],
-        req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert np.array_equal(result, np.array([[100, 1]]))
 
@@ -152,10 +140,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([[]])
     result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
         sampled_token_ids=[[0]],
-        req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert len(result[0]) == 0
 
@@ -165,10 +151,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]])
     result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
         sampled_token_ids=[[0], [1]],
-        req_ids=["0", "1"],
         num_tokens_no_spec=np.array([5, 3]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert len(result[0]) == 2
     assert np.array_equal(result[0], np.array([3, 1]))
@@ -186,10 +170,8 @@ def test_ngram_proposer():
     sampled_token_ids = [[2], [], [8]]  # Empty list for request 1 simulates prefill
     result = proposer.propose(
         sampled_token_ids=sampled_token_ids,
-        req_ids=["0", "1", "2"],
         num_tokens_no_spec=num_tokens_no_spec,
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert len(result) == 3
     assert np.array_equal(result[0], [3, 1])
@@ -217,10 +199,8 @@ def test_ngram_proposer():
     token_ids_cpu = np.array([input_1, input_2])
     result = ngram_proposer.propose(
         sampled_token_ids=[[0], [1]],
-        req_ids=["0", "1"],
         num_tokens_no_spec=np.array([len(input_1), 3]),
         token_ids_cpu=token_ids_cpu,
-        spec_decode_unsupported_reqs=(),
     )
     assert len(result[0]) == 2
     assert np.array_equal(result[0], np.array([middle_integer + 2, middle_integer + 3]))
diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py
index 0afeeb8914b87d974c8ddd919614d505fcdc3b14..b5ce37ea4780b83f7b7caf7d498186f84fd7629a 100644
--- a/tests/v1/spec_decode/test_tree_attention.py
+++ b/tests/v1/spec_decode/test_tree_attention.py
@@ -11,10 +11,10 @@ from tests.v1.attention.utils import (
     create_vllm_config,
     try_get_attention_backend,
 )
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
 from vllm.config import ParallelConfig, SpeculativeConfig
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backend import CommonAttentionMetadata
+from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 if not is_flash_attn_varlen_func_available():
     pytest.skip(
diff --git a/tests/v1/spec_decode/untest_max_len.py b/tests/v1/spec_decode/untest_max_len.py
index 15a6bd2659ea9ab5dd250f7b9f7e5649079be8f2..42991f9f1ae03c530f16823c709af6654f99023a 100644
--- a/tests/v1/spec_decode/untest_max_len.py
+++ b/tests/v1/spec_decode/untest_max_len.py
@@ -38,53 +38,48 @@ def test_ngram_max_len(num_speculative_tokens: int):
 def test_eagle_max_len(
     monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
 ):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
-        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
-            pytest.skip(
-                "TRITON_ATTN does not support "
-                "multi-token eagle spec decode on current platform"
-            )
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
+        )
 
-        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
-            m.setenv("VLLM_ROCM_USE_AITER", "1")
+    if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-        llm = LLM(
-            model="meta-llama/Meta-Llama-3-8B-Instruct",
-            enforce_eager=True,  # For faster initialization.
-            speculative_config={
-                "method": "eagle",
-                "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
-                "num_speculative_tokens": num_speculative_tokens,
-                "max_model_len": 80,
-            },
-            max_model_len=200,
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3-8B-Instruct",
+        enforce_eager=True,  # For faster initialization.
+        speculative_config={
+            "method": "eagle",
+            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+            "num_speculative_tokens": num_speculative_tokens,
+            "max_model_len": 80,
+        },
+        max_model_len=200,
+        attention_config={"backend": attn_backend},
+    )
+    sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
+    outputs = llm.generate(_PROMPTS, sampling_params)
+    for o in outputs:
+        assert o.outputs[0].finish_reason == "length", (
+            "This test is only meaningful if the output is truncated due to max length"
         )
-        sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
-        outputs = llm.generate(_PROMPTS, sampling_params)
-        for o in outputs:
-            assert o.outputs[0].finish_reason == "length", (
-                "This test is only meaningful if the output "
-                "is truncated due to max length"
-            )
 
-        sampling_params = SamplingParams(
-            max_tokens=200,
-            structured_outputs=StructuredOutputsParams(
-                regex="^" + "a b c d e " * 15 + "$"
-            ),
+    sampling_params = SamplingParams(
+        max_tokens=200,
+        structured_outputs=StructuredOutputsParams(regex="^" + "a b c d e " * 15 + "$"),
+    )
+    output = llm.generate(_PROMPTS, sampling_params)
+    for o in output:
+        assert o.prompt_token_ids is not None
+        assert (
+            len(o.prompt_token_ids)
+            < 80
+            < len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
+            <= 200
+        ), (
+            "This test is only meaningful if the output "
+            "is longer than the eagle max length"
         )
-        output = llm.generate(_PROMPTS, sampling_params)
-        for o in output:
-            assert o.prompt_token_ids is not None
-            assert (
-                len(o.prompt_token_ids)
-                < 80
-                < len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
-                <= 200
-            ), (
-                "This test is only meaningful if the output "
-                "is longer than the eagle max length"
-            )
-            assert o.outputs[0].text == "a b c d e " * 15
+        assert o.outputs[0].text == "a b c d e " * 15
diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py
index ba52af3ad604d5491056fa33c77023e4114dc6d0..98a25e41dfe096b8736297f304be4d38b558beb6 100644
--- a/tests/v1/structured_output/test_reasoning_structured_output.py
+++ b/tests/v1/structured_output/test_reasoning_structured_output.py
@@ -71,6 +71,7 @@ class TestReasoningStructuredOutput:
         request.prompt_token_ids = [1, 2, 3, 4, 5]
         request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
         request.num_computed_tokens = 5
+        request.num_output_placeholders = 0
         return request
 
     def test_should_fill_bitmask_with_enable_in_reasoning(
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index dabedc65ed8bff9e05cc6c52132d89891c6e7acd..f05add3a7bc683ca9cfda74e875e61938351af93 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -6,8 +6,6 @@ import numpy as np
 import pytest
 import torch
 
-from vllm.attention.backends.abstract import MultipleOf
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import Attention
 from vllm.config import (
     AttentionConfig,
@@ -27,6 +25,9 @@ from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backend import MultipleOf
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs
 from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
 from vllm.v1.kv_cache_interface import (
@@ -113,15 +114,16 @@ def get_vllm_config():
 @pytest.fixture
 def model_runner():
     vllm_config = get_vllm_config()
-    model_config = vllm_config.model_config
-    num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config)
-    head_size = model_config.get_head_size()
-    vllm_config.compilation_config.static_forward_context["layer.0"] = Attention(
-        num_heads, head_size, 0.1
-    )
-    runner = GPUModelRunner(vllm_config, DEVICE)
-    initialize_kv_cache(runner)
-    return runner
+    with set_current_vllm_config(vllm_config):
+        model_config = vllm_config.model_config
+        num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config)
+        head_size = model_config.get_head_size()
+        vllm_config.compilation_config.static_forward_context["layer.0"] = Attention(
+            num_heads, head_size, 0.1
+        )
+        runner = GPUModelRunner(vllm_config, DEVICE)
+        initialize_kv_cache(runner)
+        yield runner
 
 
 model_runner_2 = model_runner
@@ -547,7 +549,7 @@ def test_reload_weights_before_load_model(model_runner):
         model_runner.reload_weights()
 
 
-def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(default_vllm_config):
     torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -574,7 +576,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
         assert fwd_context is not None
 
 
-def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
+def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(default_vllm_config):
     torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -601,7 +603,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
         assert fwd_context is not None
 
 
-def test_init_kv_cache_with_kv_sharing_target_same_as_current():
+def test_init_kv_cache_with_kv_sharing_target_same_as_current(default_vllm_config):
     torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -628,7 +630,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
         assert fwd_context is not None
 
 
-def test_init_kv_cache_without_kv_sharing():
+def test_init_kv_cache_without_kv_sharing(default_vllm_config):
     torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -695,7 +697,7 @@ def test_init_kv_cache_without_kv_sharing():
     assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
 
 
-def test_init_kv_cache_with_kv_sharing_valid():
+def test_init_kv_cache_with_kv_sharing_valid(default_vllm_config):
     torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
@@ -778,7 +780,7 @@ def test_hybrid_attention_mamba_tensor_shapes():
     will not corrupt an attention block and vice versa
     """
 
-    current_platform.seed_everything(42)
+    set_random_seed(42)
 
     update_environment_variables(
         {
@@ -1048,7 +1050,7 @@ def test_input_batch_with_kernel_block_sizes():
             assert block_table.block_size == kernel_size
 
 
-def test_hybrid_cache_integration(model_runner, dist_init):
+def test_hybrid_cache_integration(default_vllm_config, dist_init):
     """Test hybrid cache architecture integration with GPUModelRunner."""
     # Create a new model runner with hybrid cache configuration
     vllm_config = get_vllm_config()
@@ -1112,3 +1114,87 @@ def test_hybrid_cache_integration(model_runner, dist_init):
     runner._update_states(scheduler_output)
     assert _is_req_scheduled(runner, req_id)
     assert _is_req_state_block_table_match(runner, req_id)
+
+
+def test_is_uniform_decode() -> None:
+    # Normal
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=2,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=15,
+    )
+    # Spec decoding
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=5,
+        uniform_decode_query_len=5,
+        num_tokens=30,
+        num_reqs=6,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=5,
+        uniform_decode_query_len=4,
+        num_tokens=30,
+        num_reqs=6,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=5,
+        uniform_decode_query_len=5,
+        num_tokens=30,
+        num_reqs=7,
+    )
+    # Force uniform decode
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+        force_uniform_decode=True,
+    )
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=2,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+        force_uniform_decode=True,
+    )
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=15,
+        force_uniform_decode=True,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+        force_uniform_decode=False,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=2,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+        force_uniform_decode=False,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=15,
+        force_uniform_decode=False,
+    )
diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py
index bcf5611e3522819739883412d88640f960e4d635..a13e11d7178efdd4866b4c80e737540c075827cc 100644
--- a/tests/v1/worker/test_utils.py
+++ b/tests/v1/worker/test_utils.py
@@ -6,14 +6,14 @@ import torch
 from vllm.v1.worker.utils import bind_kv_cache
 
 
-def test_bind_kv_cache():
+def test_bind_kv_cache(default_vllm_config):
     from vllm.attention.layer import Attention
 
     ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
+        "layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"),
+        "layers.1.self_attn": Attention(32, 128, 0.1, prefix="layers.1.self_attn"),
+        "layers.2.self_attn": Attention(32, 128, 0.1, prefix="layers.2.self_attn"),
+        "layers.3.self_attn": Attention(32, 128, 0.1, prefix="layers.3.self_attn"),
     }
     kv_cache = {
         "layers.0.self_attn": torch.zeros((1,)),
@@ -34,13 +34,13 @@ def test_bind_kv_cache():
     assert runner_kv_caches[3] is kv_cache["layers.3.self_attn"]
 
 
-def test_bind_kv_cache_non_attention():
+def test_bind_kv_cache_non_attention(default_vllm_config):
     from vllm.attention.layer import Attention
 
     # example from Jamba PP=2
     ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
+        "model.layers.20.attn": Attention(32, 128, 0.1, prefix="model.layers.20.attn"),
+        "model.layers.28.attn": Attention(32, 128, 0.1, prefix="model.layers.28.attn"),
     }
     kv_cache = {
         "model.layers.20.attn": torch.zeros((1,)),
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index 1bb7fd834523879fca665d7d5082655f91414960..89da24f95dac77c110e350f4f3182025452982df 100755
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -6,11 +6,12 @@ set -ex
 #   --mode <mode>        "install" (default) or "wheel"
 #   --pplx-ref <commit>  pplx-kernels commit hash
 #   --deepep-ref <commit> DeepEP commit hash
+#   --nvshmem-ver <ver>  NVSHMEM version 
 
 CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
 PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"}
 DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"}
-NVSHMEM_VER=3.3.24  # Suppports both CUDA 12 and 13
+NVSHMEM_VER=${NVSHMEM_VER:-"3.3.24"}  # Default supports both CUDA 12 and 13
 WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace}
 MODE=${MODE:-install}
 CUDA_VERSION_MAJOR=$(${CUDA_HOME}/bin/nvcc --version | egrep -o "release [0-9]+" | cut -d ' ' -f 2)
@@ -50,6 +51,18 @@ while [[ $# -gt 0 ]]; do
             DEEPEP_COMMIT_HASH="$2"
             shift 2
             ;;
+        --nvshmem-ver)
+            if [[ -z "$2" || "$2" =~ ^- ]]; then
+                echo "Error: --nvshmem-ver requires an argument." >&2
+                exit 1
+            fi
+            if [[ "$2" =~ / ]]; then
+                echo "Error: NVSHMEM version should not contain slashes." >&2
+                exit 1
+            fi
+            NVSHMEM_VER="$2"
+            shift 2
+            ;;
         *)
             echo "Error: Unknown argument '$1'" >&2
             exit 1
@@ -57,6 +70,13 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+# Validate NVSHMEM_VER to prevent path traversal attacks
+# Only allow alphanumeric characters, dots, and hyphens (typical version string chars)
+if [[ ! "$NVSHMEM_VER" =~ ^[a-zA-Z0-9.-]+$ ]]; then
+    echo "Error: NVSHMEM_VER contains invalid characters. Only alphanumeric, dots, and hyphens are allowed." >&2
+    exit 1
+fi
+
 mkdir -p "$WORKSPACE"
 
 WHEEL_DIR="$WORKSPACE/dist"
diff --git a/tools/flashinfer-build.sh b/tools/flashinfer-build.sh
index 6c14d87348c3ad3b363703dba3b3eca31f38e86a..b3cc6c3087102eee45f6f1339517df3b25024c8e 100755
--- a/tools/flashinfer-build.sh
+++ b/tools/flashinfer-build.sh
@@ -32,9 +32,12 @@ if [[ "${CUDA_VERSION}" == 11.* ]]; then
     FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
 elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
     FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+elif [[ "${CUDA_VERSION}" == 12.[8-9]* ]]; then
+    # CUDA 12.8–12.9
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0"
 else
-    # CUDA 12.8+ supports 10.0a and 12.0
-    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+    # CUDA 13.0+
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0f 12.0"
 fi
 
 echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
diff --git a/tools/install_torchcodec_rocm.sh b/tools/install_torchcodec_rocm.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f4a2554733c521102e7b1fcc96ac3bb8dc231328
--- /dev/null
+++ b/tools/install_torchcodec_rocm.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Script to install TorchCodec from source (required for ROCm compatibility)
+
+set -e
+
+TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
+TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-main}"
+
+echo "=== TorchCodec Installation Script ==="
+
+# Check if torchcodec is already installed and working
+if python3 -c "from torchcodec.decoders import VideoDecoder" 2>/dev/null; then
+    echo "TorchCodec is already installed and working. Skipping."
+    exit 0
+fi
+
+echo "TorchCodec not found. Installing from source..."
+
+# Install system dependencies (FFmpeg + pkg-config)
+install_system_deps() {
+    if command -v apt-get &> /dev/null; then
+        echo "Installing system dependencies..."
+        apt-get update && apt-get install -y --no-install-recommends \
+            pkg-config \
+            ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
+            libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev
+    else
+        echo "Warning: apt-get did not work. Please install dependencies manually."
+        return 1
+    fi
+}
+
+# Check for pkg-config
+if ! command -v pkg-config &> /dev/null; then
+    echo "pkg-config not found. Installing system dependencies..."
+    install_system_deps
+fi
+
+# Check for required FFmpeg libraries
+echo "Checking for FFmpeg libraries..."
+if ! pkg-config --exists libavcodec libavformat libavutil libswscale libavdevice libavfilter libswresample 2>/dev/null; then
+    echo "FFmpeg development libraries not found. Installing..."
+    install_system_deps
+fi
+
+# Install Python build dependencies
+echo "Installing Python build dependencies..."
+pip install pybind11 setuptools wheel
+
+# Set pybind11 cmake path so CMake can find it
+export pybind11_DIR=$(python3 -c "import pybind11; print(pybind11.get_cmake_dir())")
+export CMAKE_PREFIX_PATH="${pybind11_DIR}:${CMAKE_PREFIX_PATH}"
+echo "pybind11_DIR set to: $pybind11_DIR"
+
+# Create temp directory for build
+BUILD_DIR=$(mktemp -d -t torchcodec-XXXXXX)
+echo "Building in temporary directory: $BUILD_DIR"
+
+cleanup() {
+    echo "Cleaning up $BUILD_DIR"
+    rm -rf "$BUILD_DIR"
+}
+trap cleanup EXIT
+
+# Clone and build
+cd "$BUILD_DIR"
+echo "Cloning TorchCodec from $TORCHCODEC_REPO (branch: $TORCHCODEC_BRANCH)..."
+git clone --depth 1 --branch "$TORCHCODEC_BRANCH" "$TORCHCODEC_REPO" torchcodec
+
+cd torchcodec
+
+# Set build environment for ROCm compatibility
+export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build"
+export TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR=1
+export I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1
+
+echo "Building TorchCodec..."
+pip install . --no-build-isolation
+
+# Verify installation
+echo "Verifying installation..."
+if python3 -c "from torchcodec.decoders import VideoDecoder; print('TorchCodec installed successfully!')"; then
+    echo "=== TorchCodec installation complete ==="
+else
+    echo "Error: TorchCodec installation failed verification"
+    exit 1
+fi
\ No newline at end of file
diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py
index 13e5a0eda751a7afe90cb6be4514408fcc0b2ce0..85fbb4d5fd6be80496345f10130c742dad27129b 100644
--- a/tools/pre_commit/check_pickle_imports.py
+++ b/tools/pre_commit/check_pickle_imports.py
@@ -27,6 +27,7 @@ ALLOWED_FILES = {
     "vllm/distributed/device_communicators/shm_broadcast.py",
     "vllm/distributed/device_communicators/shm_object_storage.py",
     "vllm/utils/hashing.py",
+    "tests/multimodal/test_image.py",
     "tests/tokenizers_/test_hf.py",
     "tests/utils_/test_hashing.py",
     "benchmarks/kernels/graph_machete_bench.py",
diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 3f7e0a069f869d38eeb55cddae487352b6ce7e15..48803930d7b59bf92f3699d65c5a58f59e112888 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -41,6 +41,7 @@ FILES = [
     "vllm/usage",
     "vllm/utils",
     "vllm/worker",
+    "vllm/v1/attention",
     "vllm/v1/core",
     "vllm/v1/engine",
     "vllm/v1/executor",
@@ -60,7 +61,6 @@ SEPARATE_GROUPS = [
     "vllm/lora",
     "vllm/model_executor",
     # v1 related
-    "vllm/v1/attention",
     "vllm/v1/kv_offload",
     "vllm/v1/spec_decode",
     "vllm/v1/structured_output",
@@ -73,7 +73,7 @@ EXCLUDE = [
     "vllm/model_executor/models",
     "vllm/model_executor/layers/fla/ops",
     # Ignore triton kernels in ops.
-    "vllm/attention/ops",
+    "vllm/v1/attention/ops",
 ]
 
 
diff --git a/tools/vllm-rocm/pin_rocm_dependencies.py b/tools/vllm-rocm/pin_rocm_dependencies.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba11fd9343d2acb6be1becfdf077c64629c678cf
--- /dev/null
+++ b/tools/vllm-rocm/pin_rocm_dependencies.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Pin vLLM dependencies to exact versions of custom ROCm wheels.
+
+This script modifies vLLM's requirements files to replace version constraints
+with exact versions of custom-built ROCm wheels (torch, triton, torchvision, amdsmi).
+
+This ensures that 'pip install vllm' automatically installs the correct custom wheels
+instead of allowing pip to download different versions from PyPI.
+"""
+
+import re
+import sys
+from pathlib import Path
+
+
+def extract_version_from_wheel(wheel_name: str) -> str:
+    """
+    Extract version from wheel filename.
+
+    Example:
+        torch-2.9.0a0+git1c57644-cp312-cp312-linux_x86_64.whl -> 2.9.0a0+git1c57644
+        triton-3.4.0-cp312-cp312-linux_x86_64.whl -> 3.4.0
+    """
+    # Wheel format:
+    #    {distribution}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl
+    parts = wheel_name.replace(".whl", "").split("-")
+
+    if len(parts) < 5:
+        raise ValueError(f"Invalid wheel filename format: {wheel_name}")
+
+    # Version is the second part
+    version = parts[1]
+    return version
+
+
+def get_custom_wheel_versions(install_dir: str) -> dict[str, str]:
+    """
+    Read /install directory and extract versions of custom wheels.
+
+    Returns:
+        Dict mapping package names to exact versions
+    """
+    install_path = Path(install_dir)
+    if not install_path.exists():
+        print(f"ERROR: Install directory not found: {install_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    versions = {}
+
+    # Map wheel prefixes to package names
+    # IMPORTANT: Use dashes to avoid matching substrings
+    #            (e.g., 'torch' would match 'torchvision')
+    # ORDER MATTERS: This order is preserved when pinning dependencies
+    #               in requirements files
+    package_mapping = [
+        ("torch-", "torch"),  # Match torch- (not torchvision)
+        ("triton-", "triton"),  # Match triton- (not triton_kernels)
+        ("triton_kernels-", "triton-kernels"),  # Match triton_kernels-
+        ("torchvision-", "torchvision"),  # Match torchvision-
+        ("torchaudio-", "torchaudio"),  # Match torchaudio-
+        ("amdsmi-", "amdsmi"),  # Match amdsmi-
+        ("flash_attn-", "flash-attn"),  # Match flash_attn-
+        ("aiter-", "aiter"),  # Match aiter-
+    ]
+
+    for wheel_file in install_path.glob("*.whl"):
+        wheel_name = wheel_file.name
+
+        for prefix, package_name in package_mapping:
+            if wheel_name.startswith(prefix):
+                try:
+                    version = extract_version_from_wheel(wheel_name)
+                    versions[package_name] = version
+                    print(f"Found {package_name}=={version}", file=sys.stderr)
+                except Exception as e:
+                    print(
+                        f"WARNING: Could not extract version from {wheel_name}: {e}",
+                        file=sys.stderr,
+                    )
+                break
+
+    # Return versions in the order defined by package_mapping
+    ordered_versions = {}
+    for _, package_name in package_mapping:
+        if package_name in versions:
+            ordered_versions[package_name] = versions[package_name]
+    return ordered_versions
+
+
+def pin_dependencies_in_requirements(requirements_path: str, versions: dict[str, str]):
+    """
+    Insert custom wheel pins at the TOP of requirements file.
+
+    This ensures that when setup.py processes the file line-by-line,
+    custom wheels (torch, triton, etc.) are encountered FIRST, before
+    any `-r common.txt` includes that might pull in other dependencies.
+
+    Creates:
+        # Custom ROCm wheel pins (auto-generated)
+        torch==2.9.0a0+git1c57644
+        triton==3.4.0
+        torchvision==0.23.0a0+824e8c8
+        amdsmi==26.1.0+5df6c765
+
+        -r common.txt
+        ... rest of file ...
+    """
+    requirements_file = Path(requirements_path)
+
+    if not requirements_file.exists():
+        print(
+            f"ERROR: Requirements file not found: {requirements_path}", file=sys.stderr
+        )
+        sys.exit(1)
+
+    # Backup original file
+    backup_file = requirements_file.with_suffix(requirements_file.suffix + ".bak")
+    with open(requirements_file) as f:
+        original_lines = f.readlines()
+
+    # Write backup
+    with open(backup_file, "w") as f:
+        f.writelines(original_lines)
+
+    # Build header with pinned custom wheels
+    header_lines = [
+        "# Custom ROCm wheel pins (auto-generated by pin_rocm_dependencies.py)\n",
+        "# These must come FIRST to ensure correct dependency resolution\n",
+    ]
+
+    for package_name, exact_version in versions.items():
+        header_lines.append(f"{package_name}=={exact_version}\n")
+
+    header_lines.append("\n")  # Blank line separator
+
+    # Filter out any existing entries for custom packages from original file
+    filtered_lines = []
+    removed_packages = []
+
+    for line in original_lines:
+        stripped = line.strip()
+        should_keep = True
+
+        # Check if this line is for one of our custom packages
+        if stripped and not stripped.startswith("#") and not stripped.startswith("-"):
+            for package_name in versions:
+                # Handle both hyphen and underscore variations
+                pattern_name = package_name.replace("-", "[-_]")
+                pattern = rf"^{pattern_name}\s*[=<>]=?\s*[\d.a-zA-Z+]+"
+
+                if re.match(pattern, stripped, re.IGNORECASE):
+                    removed_packages.append(f"{package_name}: {stripped}")
+                    should_keep = False
+                    break
+
+        if should_keep:
+            filtered_lines.append(line)
+
+    # Combine: header + filtered original content
+    final_lines = header_lines + filtered_lines
+
+    # Write modified content
+    with open(requirements_file, "w") as f:
+        f.writelines(final_lines)
+
+    # Print summary
+    print("\n✓ Inserted custom wheel pins at TOP of requirements:", file=sys.stderr)
+    for package_name, exact_version in versions.items():
+        print(f"  - {package_name}=={exact_version}", file=sys.stderr)
+
+    if removed_packages:
+        print("\n✓ Removed old package entries:", file=sys.stderr)
+        for pkg in removed_packages:
+            print(f"  - {pkg}", file=sys.stderr)
+
+    print(f"\n✓ Patched requirements file: {requirements_path}", file=sys.stderr)
+    print(f"  Backup saved: {backup_file}", file=sys.stderr)
+
+
+def main():
+    if len(sys.argv) != 3:
+        print(
+            f"Usage: {sys.argv[0]} <install_dir> <requirements_file>", file=sys.stderr
+        )
+        print(
+            f"Example: {sys.argv[0]} /install /app/vllm/requirements/rocm.txt",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    install_dir = sys.argv[1]
+    requirements_path = sys.argv[2]
+
+    print("=" * 70, file=sys.stderr)
+    print("Pinning vLLM dependencies to custom ROCm wheel versions", file=sys.stderr)
+    print("=" * 70, file=sys.stderr)
+
+    # Get versions from custom wheels
+    print(f"\nScanning {install_dir} for custom wheels...", file=sys.stderr)
+    versions = get_custom_wheel_versions(install_dir)
+
+    if not versions:
+        print("\nERROR: No custom wheels found in /install!", file=sys.stderr)
+        sys.exit(1)
+
+    # Pin dependencies in requirements file
+    print(f"\nPatching {requirements_path}...", file=sys.stderr)
+    pin_dependencies_in_requirements(requirements_path, versions)
+
+    print("\n" + "=" * 70, file=sys.stderr)
+    print("✓ Dependency pinning complete!", file=sys.stderr)
+    print("=" * 70, file=sys.stderr)
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
index 010817e79a9367e442edcad6a4c86fdf309d448e..b443f773525ac68e87102f5b2929fc97bf7ab1bd 100644
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -4,6 +4,7 @@ import functools
 from collections.abc import Callable
 
 import torch
+from torch._ops import OpOverload
 
 import vllm.envs as envs
 from vllm.platforms import current_platform
@@ -24,14 +25,13 @@ def is_aiter_found() -> bool:
 # we keep this global outside to not cause torch compile breaks.
 IS_AITER_FOUND = is_aiter_found()
 
-# Can't use dtypes.fp8 directly inside an op
-# because it returns wrong result on gfx942.
-# This is a workaround to get the correct FP8 dtype.
-# This might because that the get_gfx() is wrapped as a custom op.
-if IS_AITER_FOUND:
-    from aiter import dtypes
 
-    AITER_FP8_DTYPE = dtypes.fp8
+def is_aiter_found_and_supported() -> bool:
+    if current_platform.is_rocm() and IS_AITER_FOUND:
+        from vllm.platforms.rocm import on_gfx9
+
+        return on_gfx9()
+    return False
 
 
 def if_aiter_supported(func: Callable) -> Callable:
@@ -43,17 +43,24 @@ def if_aiter_supported(func: Callable) -> Callable:
     def wrapper(*args, **kwargs):
         # checks the platform, device arch and aiter library existence.
 
-        if current_platform.is_rocm() and IS_AITER_FOUND:
-            from vllm.platforms.rocm import on_gfx9
-
-            if on_gfx9():
-                return func(*args, **kwargs)
+        if is_aiter_found_and_supported():
+            return func(*args, **kwargs)
 
         return None
 
     return wrapper
 
 
+# Can't use dtypes.fp8 directly inside an op
+# because it returns wrong result on gfx942.
+# This is a workaround to get the correct FP8 dtype.
+# This might because that the get_gfx() is wrapped as a custom op.
+if is_aiter_found_and_supported():
+    from aiter import dtypes
+
+    AITER_FP8_DTYPE = dtypes.fp8
+
+
 def _rocm_aiter_fused_moe_impl(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
@@ -281,7 +288,17 @@ def _check_aiter_mla_fp8_support() -> bool:
             _AITER_MLA_SUPPORTS_FP8 = (
                 "q_scale" in sig.parameters and "kv_scale" in sig.parameters
             )
-        except Exception:
+        except (
+            ImportError,
+            ModuleNotFoundError,
+            AttributeError,
+            ValueError,
+            TypeError,
+        ):
+            # ImportError/ModuleNotFoundError: aiter.mla module not available
+            # AttributeError: mla_decode_fwd doesn't exist
+            # ValueError: mla_decode_fwd has no signature (e.g., built-in)
+            # TypeError: mla_decode_fwd is not a callable
             _AITER_MLA_SUPPORTS_FP8 = False
     return _AITER_MLA_SUPPORTS_FP8
 
@@ -373,6 +390,31 @@ def _rocm_aiter_gemm_a8w8_fake(
     return Y
 
 
+def _rocm_aiter_triton_gemm_a8w8_blockscale_impl(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
+
+    return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
+
+
+def _rocm_aiter_triton_gemm_a8w8_blockscale_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    m = A.shape[0]
+    n = B.shape[0]
+    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
+    return Y
+
+
 def _rocm_aiter_gemm_a8w8_blockscale_impl(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -427,16 +469,16 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_impl(
     from aiter import rmsnorm2d_fwd_with_add
 
     residual_out = torch.empty_like(residual)
-    output = torch.empty_like(x)
+    out = torch.empty_like(x)
     rmsnorm2d_fwd_with_add(
-        output,  # output
+        out,  # output
         x,  # input
         residual,  # residual input
         residual_out,  # residual output
         weight,
         variance_epsilon,
     )
-    return output, residual_out
+    return out, residual_out
 
 
 def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
@@ -445,7 +487,84 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
     weight: torch.Tensor,
     variance_epsilon: float,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    return torch.empty_like(x), torch.empty_like(residual)
+    residual_out = torch.empty_like(residual)
+    out = torch.empty_like(x)
+    return out, residual_out
+
+
+def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+    quant_dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    import aiter as rocm_aiter
+
+    assert quant_dtype in [torch.int8, _FP8_DTYPE]
+
+    y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
+    out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
+    residual_out = torch.empty_like(x)
+
+    rocm_aiter.rmsnorm2d_fwd_with_add_dynamicquant(
+        out,
+        x,
+        residual,
+        residual_out,
+        y_scale,
+        weight,
+        epsilon,
+        use_model_sensitive_rmsnorm=0,
+    )
+
+    return out, residual_out, y_scale
+
+
+def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+    quant_dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
+    out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
+    residual_out = torch.empty_like(x)
+
+    return out, residual_out, y_scale
+
+
+def _rocm_aiter_rmsnorm_fused_dynamic_quant_impl(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+    quant_dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    import aiter as rocm_aiter
+
+    assert quant_dtype in [torch.int8, _FP8_DTYPE]
+
+    y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
+    out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
+
+    rocm_aiter.rmsnorm2d_fwd_with_dynamicquant(
+        out, x, y_scale, weight, epsilon, use_model_sensitive_rmsnorm=0
+    )
+
+    return out, y_scale
+
+
+def _rocm_aiter_rmsnorm_fused_dynamic_quant_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+    quant_dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
+    out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
+
+    return out, y_scale
 
 
 def _rocm_aiter_per_tensor_quant_impl(
@@ -521,7 +640,11 @@ def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl(
         dtype_quant=AITER_FP8_DTYPE,
         res1=residual,
     )
-    return (x_quant, x_quant_scales, res)
+    return (
+        x_quant,
+        res,
+        x_quant_scales,
+    )
 
 
 def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake(
@@ -535,8 +658,8 @@ def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake(
     scale_shape = (M, (N + group_size - 1) // group_size)
     return (
         torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device),
-        torch.empty(scale_shape, dtype=torch.float32, device=x.device),
         torch.empty_like(residual, device=residual.device),
+        torch.empty(scale_shape, dtype=torch.float32, device=x.device),
     )
 
 
@@ -642,48 +765,130 @@ _OPS_REGISTERED = False
 
 
 class rocm_aiter_ops:
+    """ROCm AITER operations wrapper for AMD GPU acceleration in vLLM.
+
+    This class centralizes the import and registration of AITER ops,
+    and provides a unified interface for checking if AITER is enabled.
+    Operations are only available on supported gfx9
+    architectures when aiter is installed.
+
+    The class uses environment variables to control which features are enabled,
+    allowing fine-grained control over which AITER optimizations are used.
+
+    Environment Variables:
+        VLLM_ROCM_USE_AITER: Main toggle for all AITER operations.
+        VLLM_ROCM_USE_AITER_LINEAR: Controls GEMM and quantization ops.
+        VLLM_ROCM_USE_AITER_RMSNORM: Controls RMSNorm operations.
+        VLLM_ROCM_USE_AITER_MOE: Controls MoE (Mixture of Experts) ops.
+        VLLM_ROCM_USE_AITER_MLA: Controls MLA (Multi-head Latent Attention) ops.
+        VLLM_ROCM_USE_AITER_MHA: Controls MHA ops including flash_attn_varlen.
+        VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: Controls Triton unified attention.
+        VLLM_ROCM_USE_AITER_FP8BMM: Controls FP8 batched matrix multiply.
+        VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: Controls FP4 assembly GEMM.
+        VLLM_ROCM_USE_AITER_TRITON_ROPE: Controls Triton rotary embeddings.
+        VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: Controls shared expert fusion.
+        VLLM_ROCM_USE_AITER_TRITON_GEMM: Controls Triton unquantized GEMM.
+
+    Note:
+        The environment variables are assigned when the module is imported,
+        so you can't change the environment variables after the module is imported.
+        This is done out of performance consideration. Accessing environment variables
+        is expensive as described in issue https://github.com/vllm-project/vllm/issues/17067
+        so we don't want to do it repeatedly, especially in the hot path (the forward pass).
+        You can call the refresh_env_variables() function to reload the env variables
+        after monkey patching the env variables in the unit test.
+
+    Check Functions:
+        All check functions (is_*_enabled) are decorated with @if_aiter_supported,
+        which verifies: (1) platform is ROCm, (2) device arch is gfx9, and
+        (3) aiter library is installed. The check function then also verifies
+        the corresponding environment variable is enabled.
+        i.e.                                             ___
+        is_enabled() == current_platform.is_rocm() and      |     checked by
+                        current_platform.is_on_gfx9() and   | @if_aiter_supported
+                        IS_AITER_FOUND and   _______________|
+                        cls._AITER_ENABLED   -----> Check by the logic in `is_enabled()`
+
+    Example:
+        from vllm._aiter_ops import rocm_aiter_ops
+
+        # Check if aiter is enabled before using operations
+        if rocm_aiter_ops.is_enabled():
+            result = rocm_aiter_ops.rms_norm(x, weight, epsilon)
+
+    Operations:
+        - RMS normalization: rms_norm, rms_norm2d_with_add
+        - GEMM operations: gemm_a8w8, gemm_a8w8_blockscale
+        - Fused MoE: fused_moe, asm_moe_tkw1
+        - Routing: topk_softmax, biased_grouped_topk, grouped_topk
+        - MLA decode: mla_decode_fwd
+        - Quantization: per_tensor_quant, per_token_quant, group_fp8_quant
+        - Triton ops: triton_rotary_embed, triton_fp8_bmm, triton_gemm_a8w8_blockscale
+    """
+
+    # Check if the env variable is set
     _AITER_ENABLED = envs.VLLM_ROCM_USE_AITER
     _LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR
     _RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM
     _FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE
     _MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA
-    _PG_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
     _MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA
     _TRITON_UNIFIED_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION
+    # TODO: Consolidate under _LINEAR_ENABLED
     _FP8BMM_ENABLED = envs.VLLM_ROCM_USE_AITER_FP8BMM
+    # TODO: Consolidate under _LINEAR_ENABLED
     _FP4_GEMM_DYNAMIC_QUANT_ASM = envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM
+    # TODO: Consolidate under VLLM_ROCM_USE_AITER_ROPE
     _TRITON_ROTARY_EMBED = envs.VLLM_ROCM_USE_AITER_TRITON_ROPE
     _MOE_SHARED_EXPERTS_ENABLED = envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS
+    # TODO: Consolidate under _LINEAR_ENABLED
     _TRITON_UNQUANT_GEMM = envs.VLLM_ROCM_USE_AITER_TRITON_GEMM
 
+    @classmethod
+    def refresh_env_variables(cls):
+        """
+        Since the environment variables are assigned when the module is imported,
+        This is a helper function to reload all the env variables from
+        the environment variables.
+        for example, after monkey patching the env variables in the unit test,
+        you can call this function to reload the env variables.
+        """
+        cls._AITER_ENABLED = envs.VLLM_ROCM_USE_AITER
+        cls._LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR
+        cls._RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM
+        cls._FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE
+        cls._MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA
+        cls._MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA
+        cls._TRITON_UNIFIED_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION
+        cls._FP8BMM_ENABLED = envs.VLLM_ROCM_USE_AITER_FP8BMM
+        cls._FP4_GEMM_DYNAMIC_QUANT_ASM = envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM
+        cls._TRITON_ROTARY_EMBED = envs.VLLM_ROCM_USE_AITER_TRITON_ROPE
+        cls._MOE_SHARED_EXPERTS_ENABLED = envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS
+        cls._TRITON_UNQUANT_GEMM = envs.VLLM_ROCM_USE_AITER_TRITON_GEMM
+
     @classmethod
     @if_aiter_supported
     def is_enabled(cls) -> bool:
-        """Verifies device specs and availability of aiter main env variable."""
         return cls._AITER_ENABLED
 
     @classmethod
     @if_aiter_supported
     def is_linear_enabled(cls) -> bool:
-        """ "Verifies device specs and availability of env variable."""
         return cls._AITER_ENABLED and cls._LINEAR_ENABLED
 
     @classmethod
     @if_aiter_supported
-    def is_linear_fp8_enaled(cls) -> bool:
-        """ "Verifies device specs and availability of env variable."""
+    def is_linear_fp8_enabled(cls) -> bool:
         return cls.is_linear_enabled()
 
     @classmethod
     @if_aiter_supported
     def is_rmsnorm_enabled(cls) -> bool:
-        """ "Verifies device specs and availability of env variable."""
         return cls._AITER_ENABLED and cls._RMSNORM_ENABLED
 
     @classmethod
     @if_aiter_supported
     def is_fused_moe_enabled(cls) -> bool:
-        """ "Verifies device specs and availability of env variable."""
         return cls._AITER_ENABLED and cls._FMOE_ENABLED
 
     @classmethod
@@ -694,25 +899,16 @@ class rocm_aiter_ops:
     @classmethod
     @if_aiter_supported
     def is_mla_enabled(cls) -> bool:
-        """ "Verifies device specs and availability of env variable."""
         return cls._AITER_ENABLED and cls._MLA_ENABLED
 
     @classmethod
     @if_aiter_supported
     def is_mha_enabled(cls) -> bool:
-        """ "Verifies device specs and availability of env variable."""
         return cls._AITER_ENABLED and cls._MHA_ENABLED
 
-    @classmethod
-    @if_aiter_supported
-    def is_pa_attn_enabled(cls) -> bool:
-        """ "Verifies device specs and availability of env variable."""
-        return cls._AITER_ENABLED and cls._PG_ATTN_ENABLED
-
     @classmethod
     @if_aiter_supported
     def is_triton_unified_attn_enabled(cls) -> bool:
-        """ "Verifies device specs and availability of env variable."""
         return cls._AITER_ENABLED and cls._TRITON_UNIFIED_ATTN_ENABLED
 
     @classmethod
@@ -803,6 +999,12 @@ class rocm_aiter_ops:
                 dispatch_key=current_platform.dispatch_key,
             )
 
+            direct_register_custom_op(
+                op_name="rocm_aiter_triton_gemm_a8w8_blockscale",
+                op_func=_rocm_aiter_triton_gemm_a8w8_blockscale_impl,
+                fake_impl=_rocm_aiter_triton_gemm_a8w8_blockscale_fake,
+            )
+
             direct_register_custom_op(
                 op_name="rocm_aiter_gemm_a8w8_blockscale",
                 op_func=_rocm_aiter_gemm_a8w8_blockscale_impl,
@@ -822,6 +1024,20 @@ class rocm_aiter_ops:
                 dispatch_key=current_platform.dispatch_key,
             )
 
+            direct_register_custom_op(
+                op_name="rocm_aiter_rmsnorm_fused_dynamic_quant",
+                op_func=_rocm_aiter_rmsnorm_fused_dynamic_quant_impl,
+                fake_impl=_rocm_aiter_rmsnorm_fused_dynamic_quant_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_rmsnorm_fused_add_dynamic_quant",
+                op_func=_rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl,
+                fake_impl=_rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
             direct_register_custom_op(
                 op_name="rocm_aiter_rmsnorm_fp8_group_quant",
                 op_func=_rocm_aiter_rmsnorm_fp8_group_quant_impl,
@@ -857,13 +1073,54 @@ class rocm_aiter_ops:
             direct_register_custom_op(
                 op_name="rocm_aiter_per_token_quant",
                 op_func=_rocm_aiter_per_token_quant_impl,
-                mutates_args=["scale"],
                 fake_impl=_rocm_aiter_per_token_quant_fake,
                 dispatch_key=current_platform.dispatch_key,
             )
 
             _OPS_REGISTERED = True
 
+    @staticmethod
+    def get_rmsnorm_fused_add_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default
+
+    @staticmethod
+    def get_rmsnorm_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_rms_norm.default
+
+    @staticmethod
+    def get_rmsnorm_fused_add_dynamic_quant_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_rmsnorm_fused_add_dynamic_quant.default
+
+    @staticmethod
+    def get_rmsnorm_fused_dynamic_quant_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_rmsnorm_fused_dynamic_quant.default
+
+    @staticmethod
+    def get_rmsnorm_group_fused_quant_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default
+
+    @staticmethod
+    def get_rmsnorm_group_add_fused_quant_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default
+
+    @staticmethod
+    def get_per_token_quant_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_per_token_quant.default
+
+    @staticmethod
+    def get_group_quant_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_group_fp8_quant.default
+
+    @staticmethod
+    def get_act_mul_fused_fp8_group_quant_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default
+
+    @staticmethod
+    def rms_norm(
+        x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
+
     @staticmethod
     def rms_norm2d_with_add(
         x: torch.Tensor,
@@ -875,12 +1132,6 @@ class rocm_aiter_ops:
             x, residual, weight, variance_epsilon
         )
 
-    @staticmethod
-    def rms_norm(
-        x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
-    ) -> torch.Tensor:
-        return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
-
     @staticmethod
     def gemm_a8w8(
         A: torch.Tensor,
@@ -892,6 +1143,19 @@ class rocm_aiter_ops:
     ) -> torch.Tensor:
         return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype)
 
+    @staticmethod
+    def triton_gemm_a8w8_blockscale(
+        A: torch.Tensor,
+        B: torch.Tensor,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        block_size: list[int],
+        output_dtype: torch.dtype = torch.float16,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_triton_gemm_a8w8_blockscale(
+            A, B, As, Bs, output_dtype
+        )
+
     @staticmethod
     def gemm_a8w8_blockscale(
         A: torch.Tensor,
@@ -1120,14 +1384,14 @@ class rocm_aiter_ops:
         key_ = key[..., :rotary_dim]
         positions = positions.view(*query.shape[:1])
         rope_cached_thd_positions_2c_fwd_inplace(
-            positions,
-            sin,
-            cos,
             query_,
             key_,
+            cos,
+            sin,
+            positions,
             rotate_style,
             reuse_freqs_front_part=True,
-            is_nope_first=False,
+            nope_first=False,
         )
         query = query.view(query_shape)
         key = key.view(key_shape)
@@ -1163,19 +1427,6 @@ class rocm_aiter_ops:
             config=config,
         )
 
-    @staticmethod
-    def triton_gemm_a8w8_blockscale(
-        A: torch.Tensor,
-        B: torch.Tensor,
-        As: torch.Tensor,
-        Bs: torch.Tensor,
-        block_size: list[int],
-        output_dtype: torch.dtype = torch.float16,
-    ) -> torch.Tensor:
-        from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
-
-        return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
-
     @staticmethod
     def group_fp8_quant(
         input_2d: torch.Tensor,
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 93c063974dacadcad12dafffc1570f6d36965d1e..0b3bd2a5a3658b25f46ef3598f92a32d8f5fb77b 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -920,20 +920,6 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
     return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
 
 
-def cutlass_blockwise_scaled_grouped_mm(
-    output: torch.Tensor,
-    a: torch.Tensor,
-    b: torch.Tensor,
-    scales_a: torch.Tensor,
-    scales_b: torch.Tensor,
-    problem_sizes: torch.Tensor,
-    expert_offsets: torch.Tensor,
-):
-    torch.ops._C.cutlass_blockwise_scaled_grouped_mm(
-        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets
-    )
-
-
 def cutlass_scaled_fp4_mm(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -1289,6 +1275,25 @@ def get_cutlass_moe_mm_problem_sizes(
     )
 
 
+def get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
+    expert_first_token_offset: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    n: int,
+    k: int,
+    swap_ab: bool,
+):
+    """Compute per-expert (M, N, K) problem sizes from expert_first_token_offset"""
+    return torch.ops._C.get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
+        expert_first_token_offset,
+        problem_sizes1,
+        problem_sizes2,
+        n,
+        k,
+        swap_ab,
+    )
+
+
 def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
     """
     Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor.
@@ -1820,15 +1825,15 @@ def scaled_fp4_experts_quant(
     topk: int,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
-    Quantize input tensor to FP4 and return quantized tensor and scale, for
+    Quantize input tensor to NVFP4 and return quantized tensor and scale, for
     packed MoE Inputs.
     Args:
-        input_tensor: The input tensor to be quantized to FP4
+        input_tensor: The input tensor to be quantized to NVFP4
         input_global_scale: A scalar scaling factor for the entire tensor.
         expert_offsets: The expert offsets tensor
         blockscale_offsets: The blockscale offsets tensor
     Outputs:
-        output: The quantized tensor in FP4
+        output: The quantized tensor in NVFP4
         output_scales: The blockscale tensor in FP8-E4M3
     """
     assert not current_platform.is_rocm()
@@ -1874,6 +1879,71 @@ def scaled_fp4_experts_quant(
     return output, output_scales
 
 
+def silu_and_mul_scaled_fp4_experts_quant(
+    input_tensor: torch.Tensor,
+    input_global_scale: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    blockscale_offsets: torch.Tensor,
+    topk: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Fused SiLU+Mul+NVFP4 quantization for MoE intermediate activations.
+
+    Args:
+        input_tensor: The input tensor with gate || up layout [m_topk, k*2]
+        input_global_scale: A per-expert scaling factor [n_experts]
+        expert_offsets: The expert offsets tensor [n_experts+1]
+        blockscale_offsets: The blockscale offsets tensor [n_experts+1]
+        topk: Number of top-k experts selected
+    Outputs:
+        output: The quantized tensor in NVFP4 [m_topk, k/2]
+        output_scales: The blockscale tensor in FP8-E4M3
+    """
+    assert not current_platform.is_rocm()
+    assert input_tensor.ndim == 2, (
+        f"input.ndim needs to be == 2, but got {input_tensor.ndim}."
+    )
+
+    # Control the maximum number of tokens per expert supported by the
+    # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
+    # from running out of memory. This value can also be increased to support
+    # larger models.
+    MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
+    m_numtopk, k_times_2 = input_tensor.shape
+    assert k_times_2 % 2 == 0, "input width must be even (gate || up layout)"
+    k = k_times_2 // 2
+
+    assert m_numtopk <= MAX_TOKENS_PER_EXPERT * topk, (
+        f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
+        f"{MAX_TOKENS_PER_EXPERT})"
+        f" for cutlass_moe_fp4, observed m_numtopk = {m_numtopk}. Use"
+        f" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value."
+    )
+    scales_k = k // 16
+    padded_k = (scales_k + (4 - 1)) // 4
+
+    # output is uint8 and packed fp4 values
+    output = torch.empty(
+        m_numtopk, k // 2, device=input_tensor.device, dtype=torch.uint8
+    )
+    output_scales = torch.empty(
+        MAX_TOKENS_PER_EXPERT * topk,
+        padded_k,
+        dtype=torch.int32,
+        device=input_tensor.device,
+    )
+    torch.ops._C.silu_and_mul_scaled_fp4_experts_quant(
+        output,
+        output_scales,
+        input_tensor,
+        input_global_scale,
+        expert_offsets,
+        blockscale_offsets,
+    )
+    output_scales = output_scales.view(torch.float8_e4m3fn)
+    return output, output_scales
+
+
 # fp8
 # def scaled_fp8_quant(
 #     input: torch.Tensor,
@@ -1882,6 +1952,7 @@ def scaled_fp4_experts_quant(
 #     scale_ub: torch.Tensor | None = None,
 #     use_per_token_if_dynamic: bool = False,
 #     output: torch.Tensor | None = None,
+#     group_shape: tuple[int, int] | None = None,
 # ) -> tuple[torch.Tensor, torch.Tensor]:
 #     """
 #     Quantize input tensor to FP8 and return quantized tensor and scale.
@@ -1893,14 +1964,23 @@ def scaled_fp4_experts_quant(
 #     will benefit from padding.
 
 #     Args:
-#         input: The input tensor to be quantized to FP8
-#         scale: Optional scaling factor for the FP8 quantization
+#         input: The input tensor to be quantized to FP8 (must be 2D: [M, N])
+#         scale: Optional scaling factor for the FP8 quantization. Supports:
+#             - 0D or [1]: per-tensor scaling
+#             - 1D: requires explicit group_shape to disambiguate per-channel
+#               vs per-token (use (-1, 1) for per-channel, (1, -1) for per-token)
+#             - 2D [M/group_m, N/group_n]: group scaling (e.g. [M, N/128] for
+#               DeepSeek-style (1,128) groups, or [M/128, N/128] for (128,128))
 #         scale_ub: Optional upper bound for scaling factor in dynamic
 #             per token case
 #         num_token_padding: If specified, pad the first dimension
 #             of the output to at least this value.
 #         use_per_token_if_dynamic: Whether to do per_tensor or per_token
 #             in the dynamic quantization case.
+#         group_shape: Optional tuple (group_m, group_n) specifying the group
+#             shape for static quantization. Use -1 for "full extent" (e.g.,
+#             (-1, -1) for per-tensor, (-1, 1) for per-channel, etc.)
+#             Required for 1D scales; optional for 2D scales.
 
 #     Returns:
 #         tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
@@ -1929,8 +2009,7 @@ def scaled_fp4_experts_quant(
 #             scale = torch.empty(1, device=input.device, dtype=torch.float32)
 #             torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
 #     else:
-#         assert scale.numel() == 1, f"{scale.shape}"
-#         torch.ops._C.static_scaled_fp8_quant(output, input, scale)
+#         torch.ops._C.static_scaled_fp8_quant(output, input, scale, group_shape)
 
 #     return output, scale
 
@@ -2379,7 +2458,6 @@ def moe_wna16_marlin_gemm(
     moe_block_size: int,
     top_k: int,
     mul_topk_weights: bool,
-    is_ep: bool,
     b_q_type: ScalarType,
     size_m: int,
     size_n: int,
@@ -2411,7 +2489,6 @@ def moe_wna16_marlin_gemm(
         moe_block_size,
         top_k,
         mul_topk_weights,
-        is_ep,
         b_q_type.id,
         size_m,
         size_n,
@@ -2473,7 +2550,6 @@ if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "marlin_gemm_moe")
         moe_block_size: int,
         top_k: int,
         mul_topk_weights: bool,
-        is_ep: bool,
         b_q_type: ScalarType,
         size_m: int,
         size_n: int,
@@ -2560,16 +2636,30 @@ def concat_and_cache_mla(
     )
 
 
-def copy_blocks(
-    key_caches: list[torch.Tensor],
-    value_caches: list[torch.Tensor],
-    block_mapping: torch.Tensor,
+def concat_and_cache_mla_rope_fused(
+    positions: torch.Tensor,
+    q_pe: torch.Tensor,
+    k_pe: torch.Tensor,
+    kv_c: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool,
+    slot_mapping: torch.Tensor,
+    kv_cache: torch.Tensor,
+    kv_cache_dtype: str,
+    kv_cache_scale: torch.Tensor,
 ) -> None:
-    torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
-
-
-def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None:
-    torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
+    torch.ops._C_cache_ops.concat_and_cache_mla_rope_fused(
+        positions,
+        q_pe,
+        k_pe,
+        kv_c,
+        cos_sin_cache,
+        is_neox,
+        slot_mapping,
+        kv_cache,
+        kv_cache_dtype,
+        kv_cache_scale,
+    )
 
 
 def swap_blocks(
@@ -3224,6 +3314,42 @@ def cpu_gemm_wna16(
     return output
 
 
+def cpu_prepack_moe_weight(
+    weight: torch.Tensor,
+    isa: str,
+) -> torch.Tensor:
+    output = torch.empty_like(weight)
+    torch.ops._C.prepack_moe_weight(weight, output, isa)
+    return output
+
+
+def cpu_fused_moe(
+    input: torch.Tensor,
+    w13: torch.Tensor,
+    w2: torch.Tensor,
+    w13_bias: torch.Tensor | None,
+    w2_bias: torch.Tensor | None,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    act: str,
+    isa: str,
+) -> torch.Tensor:
+    output = torch.empty_like(input)
+    torch.ops._C.cpu_fused_moe(
+        output,
+        input,
+        w13,
+        w2,
+        w13_bias,
+        w2_bias,
+        topk_weights,
+        topk_ids,
+        act,
+        isa,
+    )
+    return output
+
+
 if hasattr(torch.ops._qutlass_C, "matmul_mxf4_bf16_tn"):
 
     @register_fake("_qutlass_C::matmul_mxf4_bf16_tn")
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 95c17cb331f67a0f70319138bf42d23e39bb71b0..239f5376eb4626ec9ffaffc18cb4e785315945b8 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -383,18 +383,6 @@ class ipex_ops:
         )
         return None
 
-    @staticmethod
-    def copy_blocks(
-        key_caches: list[torch.Tensor],
-        value_caches: list[torch.Tensor],
-        block_mapping: torch.Tensor,
-    ) -> None:
-        torch.xpu.copy_blocks(  # type: ignore
-            key_caches,
-            value_caches,
-            block_mapping,
-        )
-
     @staticmethod
     def swap_blocks(
         src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index c1a0f2b9cc29419ddcba018389fe68aea996cd19..a91eb7d4b67d64a2e3e7d1578fb4d10213bc7e18 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -42,8 +42,11 @@ class ImageAsset:
         )
 
     @property
-    def pil_image(self, ext="jpg") -> Image.Image:
-        image_path = self.get_path(ext)
+    def pil_image(self) -> Image.Image:
+        return self.pil_image_ext(ext="jpg")
+
+    def pil_image_ext(self, ext: str) -> Image.Image:
+        image_path = self.get_path(ext=ext)
         return Image.open(image_path)
 
     @property
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
deleted file mode 100644
index 4c7fa477b52ba58fcb1d3484de8157f1709f1841..0000000000000000000000000000000000000000
--- a/vllm/attention/backends/utils.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Attention backend utils"""
-
-from dataclasses import dataclass
-
-from vllm.config import ModelConfig
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-PAD_SLOT_ID = -1
-
-
-@dataclass
-class MLADims:
-    q_lora_rank: int | None
-    kv_lora_rank: int
-    qk_nope_head_dim: int
-    qk_rope_head_dim: int
-    v_head_dim: int
-
-
-def get_mla_dims(model_config: ModelConfig) -> MLADims:
-    hf_text_config = model_config.hf_text_config
-
-    return MLADims(
-        q_lora_rank=getattr(hf_text_config, "q_lora_rank", None),
-        kv_lora_rank=hf_text_config.kv_lora_rank,
-        qk_nope_head_dim=hf_text_config.qk_nope_head_dim,
-        qk_rope_head_dim=hf_text_config.qk_rope_head_dim,
-        v_head_dim=hf_text_config.v_head_dim,
-    )
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 5254119c3eb5aa45319d9ca28b4370b44b12b817..7e1fc41449427a8d8d453e3be6b9695e59296b7a 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -2,27 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer."""
 
-import functools
 from typing import cast
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionType,
-    MLAAttentionImpl,
-)
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import maybe_get_vit_flash_attn_backend
-from vllm.attention.selector import get_attn_backend
-from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
 from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer
 from vllm.config import CacheConfig, get_current_vllm_config
-from vllm.config.multimodal import MultiModalConfig
 from vllm.config.vllm import VllmConfig
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
@@ -33,15 +21,22 @@ from vllm.model_executor.layers.linear import (
     UnquantizedLinearMethod,
 )
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
-from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import (
     direct_register_custom_op,
     kv_cache_dtype_str_to_dtype,
 )
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionType,
+    MLAAttentionImpl,
+)
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import (
     FullAttentionSpec,
     KVCacheSpec,
@@ -52,6 +47,35 @@ from vllm.v1.kv_cache_interface import (
 logger = init_logger(__name__)
 
 
+def should_load_quant_weights(quant_method: QuantizeMethodBase | None) -> bool:
+    """Returns whether the quantization method should load quantized weights."""
+    return quant_method is not None and not isinstance(
+        quant_method, UnquantizedLinearMethod
+    )
+
+
+def set_default_quant_scales(layer: nn.Module, register_buffer: bool = False) -> None:
+    """Sets default quantization scales for the layer."""
+    if register_buffer:
+        layer.register_buffer("_k_scale", torch.tensor(1.0, dtype=torch.float32))
+        layer.register_buffer("_v_scale", torch.tensor(1.0, dtype=torch.float32))
+        layer.register_buffer("_q_scale", torch.tensor(1.0, dtype=torch.float32))
+        layer.register_buffer("_prob_scale", torch.tensor(1.0, dtype=torch.float32))
+    else:
+        layer._k_scale.fill_(1.0)
+        layer._v_scale.fill_(1.0)
+        layer._q_scale.fill_(1.0)
+        layer._prob_scale.fill_(1.0)
+
+    # We also keep q/k/v_scale on host (cpu) memory for attention
+    # backends that require the scales to be on host instead of on device.
+    # e.g. Flashinfer
+    layer._q_scale_float = 1.0
+    layer._k_scale_float = 1.0
+    layer._v_scale_float = 1.0
+    layer._prob_scale_float = 1.0
+
+
 def _init_kv_cache_quant(
     layer: nn.Module,
     quant_config: QuantizationConfig | None,
@@ -80,17 +104,21 @@ def _init_kv_cache_quant(
     # with the model weights.
     layer.kv_cache_dtype = kv_cache_dtype
     layer.calculate_kv_scales = calculate_kv_scales
-    layer._k_scale = torch.tensor(1.0, dtype=torch.float32)
-    layer._v_scale = torch.tensor(1.0, dtype=torch.float32)
-    layer._q_scale = torch.tensor(1.0, dtype=torch.float32)
-    layer._prob_scale = torch.tensor(1.0, dtype=torch.float32)
 
-    # We also keep q/k/v_scale on host (cpu) memory for attention
-    # backends that require the scales to be on host instead of on device.
-    # e.g. Flashinfer
-    layer._q_scale_float = 1.0
-    layer._k_scale_float = 1.0
-    layer._v_scale_float = 1.0
+    # Note [Register q/k/v/prob scales in state dict]
+    # When calling model.to(device), only parameters/buffers in state dict are
+    # moved. If not registering q/k/v/prob scales in state dict, there would
+    # be an IMA error when a cuda kernel (e.g., quant_fp8) accesses the tensor
+    # on cpu.
+    # Registering in state dict means it interacts with weight loading. One edge
+    # case is when quant_method is None, or quant_method is UnquantizedLinearMethod
+    # (i.e., should_load_quant_weights(quant_method) == False).
+    # In this case, the checkpoint does not have the scales. We need to
+    # initialize the scales to 1.0 and update the scales after weight loading.
+    # This is espectially important when we load dummy weights first (providing
+    # wrong scales) and then load real weights (which misses scales and keeps the
+    # wrong scales from dummy load).
+    set_default_quant_scales(layer, register_buffer=True)
 
     # The output scale on host memory. This should be the input scale of
     # the quant op after this attention layer.
@@ -99,9 +127,9 @@ def _init_kv_cache_quant(
     quant_method = (
         quant_config.get_quant_method(layer, prefix=prefix) if quant_config else None
     )
-    if quant_method is not None and not isinstance(
-        quant_method, UnquantizedLinearMethod
-    ):
+
+    # See [Note: Register q/k/v/prob scales in state dict]
+    if should_load_quant_weights(quant_method):
         assert isinstance(quant_method, BaseKVCacheMethod)
         # TODO (mgoin): kv cache dtype should be specified in the FP8
         # checkpoint config and become the "auto" behavior
@@ -142,6 +170,7 @@ class Attention(nn.Module, AttentionLayerBase):
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: str | None = None,
         attn_backend: type[AttentionBackend] | None = None,
+        head_size_v: int | None = None,
         **extra_impl_args,
     ) -> None:
         """
@@ -175,14 +204,21 @@ class Attention(nn.Module, AttentionLayerBase):
         assert num_heads % num_kv_heads == 0, (
             f"num_heads ({num_heads}) is not divisible by num_kv_heads ({num_kv_heads})"
         )
+        self.quant_config = quant_config
+        self.layer_name = prefix
 
         # Initialize KV cache quantization attributes
         _init_kv_cache_quant(
-            self, quant_config, prefix, kv_cache_dtype, calculate_kv_scales
+            self,
+            self.quant_config,
+            self.layer_name,
+            kv_cache_dtype,
+            calculate_kv_scales,
         )
 
         self.num_heads = num_heads
         self.head_size = head_size
+        self.head_size_v = self.head_size if head_size_v is None else head_size_v
         self.num_kv_heads = num_kv_heads
         self.sliding_window = sliding_window
         self.has_sink = extra_impl_args.get("sinks") is not None
@@ -240,8 +276,7 @@ class Attention(nn.Module, AttentionLayerBase):
             kv_sharing_target_layer_name,
             **extra_impl_args,
         )
-        backend_name = self.attn_backend.get_name()
-        self.backend = AttentionBackendEnum.__members__.get(backend_name)
+        self.backend = AttentionBackendEnum[self.attn_backend.get_name()]
         self.dtype = dtype
 
         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
@@ -255,7 +290,6 @@ class Attention(nn.Module, AttentionLayerBase):
         if prefix in compilation_config.static_forward_context:
             raise ValueError(f"Duplicate layer name: {prefix}")
         compilation_config.static_forward_context[prefix] = self
-        self.layer_name = prefix
         self.attn_type = attn_type
 
         if kv_sharing_target_layer_name is not None:
@@ -324,6 +358,13 @@ class Attention(nn.Module, AttentionLayerBase):
                 query, _ = self.query_quant(query, self._q_scale)
 
         if self.use_output:
+            if output_shape is None:
+                # Handle both 2D [num_tokens, hidden] and
+                # 3D [num_tokens, heads, head_dim] query
+                num_tokens = query.shape[0]
+                output_shape = torch.Size(
+                    (num_tokens, self.num_heads * self.head_size_v)
+                )
             output_shape = output_shape if output_shape is not None else query.shape
             output = torch.empty(output_shape, dtype=output_dtype, device=query.device)
             hidden_size = output_shape[-1]
@@ -331,11 +372,11 @@ class Attention(nn.Module, AttentionLayerBase):
             # NOTE(woosuk): We do this outside the custom op to minimize the
             # CPU overheads from the non-CUDA-graph regions.
             query = query.view(-1, self.num_heads, self.head_size)
-            output = output.view(-1, self.num_heads, self.head_size)
+            output = output.view(-1, self.num_heads, self.head_size_v)
             if key is not None:
                 key = key.view(-1, self.num_kv_heads, self.head_size)
             if value is not None:
-                value = value.view(-1, self.num_kv_heads, self.head_size)
+                value = value.view(-1, self.num_kv_heads, self.head_size_v)
             if self.use_direct_call:
                 forward_context: ForwardContext = get_forward_context()
                 attn_metadata = forward_context.attn_metadata
@@ -386,6 +427,17 @@ class Attention(nn.Module, AttentionLayerBase):
     def process_weights_after_loading(self, act_dtype: torch.dtype):
         self.impl.process_weights_after_loading(act_dtype)
 
+        # If we should not load quant weights, we initialize the scales to 1.0
+        # as the default value. See [Note: Register q/k/v/prob scales in state dict]
+        # for more details.
+        quant_method = (
+            self.quant_config.get_quant_method(self, prefix=self.layer_name)
+            if self.quant_config
+            else None
+        )
+        if not should_load_quant_weights(quant_method):
+            set_default_quant_scales(self, register_buffer=False)
+
     def get_attn_backend(self) -> type[AttentionBackend]:
         return self.attn_backend
 
@@ -410,136 +462,11 @@ class Attention(nn.Module, AttentionLayerBase):
                 block_size=block_size,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
+                head_size_v=self.head_size_v,
                 dtype=self.kv_cache_torch_dtype,
             )
 
 
-class MultiHeadAttention(nn.Module):
-    """Multi-headed attention without any cache, used for ViT."""
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int | None = None,
-        # This has no effect, it is only here to make it easier to swap
-        # between Attention and MultiHeadAttention
-        prefix: str = "",
-        multimodal_config: MultiModalConfig | None = None,
-    ) -> None:
-        super().__init__()
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = scale
-        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
-        self.layer_name = prefix
-
-        assert self.num_heads % self.num_kv_heads == 0, (
-            f"num_heads ({self.num_heads}) is not "
-            f"divisible by num_kv_heads ({self.num_kv_heads})"
-        )
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-
-        # During model initialization, the default dtype is set as the model
-        # weight and activation dtype.
-        dtype = torch.get_default_dtype()
-
-        # Determine the attention backend
-        attn_backend_override = None
-        if multimodal_config is not None:
-            attn_backend_override = multimodal_config.mm_encoder_attn_backend
-
-        self.attn_backend = get_vit_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            attn_backend_override=attn_backend_override,
-        )
-
-        self._flash_attn_varlen_func = maybe_get_vit_flash_attn_backend(
-            self.attn_backend,
-        )
-
-        self.is_flash_attn_backend = self.attn_backend in {
-            AttentionBackendEnum.FLASH_ATTN,
-            AttentionBackendEnum.ROCM_AITER_FA,
-        }
-
-        self.fa_version = None
-        if (
-            self.attn_backend == AttentionBackendEnum.FLASH_ATTN
-            and current_platform.is_cuda()
-        ):
-            self.fa_version = get_flash_attn_version()
-            assert self._flash_attn_varlen_func is not None
-            self._flash_attn_varlen_func = functools.partial(
-                self._flash_attn_varlen_func, fa_version=self.fa_version
-            )
-
-        logger.info_once(
-            f"Using {self.attn_backend} for MultiHeadAttention in multimodal encoder."
-        )
-
-    def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-    ) -> torch.Tensor:
-        """Input shape:
-        (batch_size x seq_len x hidden_size) or
-        (batch_size x seq_len x num_heads x head_size)
-        """
-        bsz, q_len = query.size()[:2]
-        kv_len = key.size(1)
-
-        query = query.view(bsz, q_len, self.num_heads, self.head_size)
-        key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
-        value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)
-
-        if (num_repeat := self.num_queries_per_kv) > 1:
-            # Handle MQA and GQA
-            key = torch.repeat_interleave(key, num_repeat, dim=2)
-            value = torch.repeat_interleave(value, num_repeat, dim=2)
-
-        if self.is_flash_attn_backend:
-            assert self._flash_attn_varlen_func is not None
-            cu_seqlens_q = torch.arange(
-                0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=query.device
-            )
-            cu_seqlens_k = torch.arange(
-                0, (bsz + 1) * kv_len, step=kv_len, dtype=torch.int32, device=key.device
-            )
-
-            out = self._flash_attn_varlen_func(
-                query.flatten(0, 1),
-                key.flatten(0, 1),
-                value.flatten(0, 1),
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_k=cu_seqlens_k,
-                max_seqlen_q=q_len,
-                max_seqlen_k=kv_len,
-                softmax_scale=self.scale,
-            )
-        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
-            query, key, value = (x.transpose(1, 2) for x in (query, key, value))
-            out = F.scaled_dot_product_attention(query, key, value, scale=self.scale)
-            out = out.transpose(1, 2)
-        elif self.attn_backend == AttentionBackendEnum.PALLAS:
-            query, key, value = (x.transpose(1, 2) for x in (query, key, value))
-            from torch_xla.experimental.custom_kernel import flash_attention
-
-            out = flash_attention(query, key, value, sm_scale=self.scale)
-            out = out.transpose(1, 2)
-        else:
-            # ViT attention hasn't supported this backend yet
-            raise NotImplementedError(
-                f"ViT attention hasn't supported {self.attn_backend} backend yet."
-            )
-
-        return out.reshape(bsz, q_len, -1)
-
-
 class MLAAttention(nn.Module, AttentionLayerBase):
     """Multi-Head Latent Attention layer.
 
@@ -587,10 +514,15 @@ class MLAAttention(nn.Module, AttentionLayerBase):
             kv_cache_dtype = "auto"
             block_size = 16
             calculate_kv_scales = False
+        self.quant_config = quant_config
 
         # Initialize KV cache quantization attributes
         _init_kv_cache_quant(
-            self, quant_config, prefix, kv_cache_dtype, calculate_kv_scales
+            self,
+            self.quant_config,
+            self.layer_name,
+            kv_cache_dtype,
+            calculate_kv_scales,
         )
 
         dtype = torch.get_default_dtype()
@@ -720,6 +652,17 @@ class MLAAttention(nn.Module, AttentionLayerBase):
         if hasattr(self.impl, "process_weights_after_loading"):
             self.impl.process_weights_after_loading(act_dtype)
 
+        # If we should not load quant weights, we initialize the scales to 1.0
+        # as the default value. See [Note: Register q/k/v/prob scales in state dict]
+        # for more details.
+        quant_method = (
+            self.quant_config.get_quant_method(self, prefix=self.layer_name)
+            if self.quant_config
+            else None
+        )
+        if not should_load_quant_weights(quant_method):
+            set_default_quant_scales(self, register_buffer=False)
+
     def calc_kv_scales(
         self, q: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor
     ) -> None:
@@ -862,6 +805,7 @@ def unified_attention_with_output(
     output_block_scale: torch.Tensor | None = None,
 ) -> None:
     attn_metadata, self, kv_cache = get_attention_context(layer_name)
+
     self.impl.forward(
         self,
         query,
diff --git a/vllm/attention/ops/triton_reshape_and_cache_flash.py b/vllm/attention/ops/triton_reshape_and_cache_flash.py
deleted file mode 100644
index 5d2ba154ae018c5d81e29109931eed7382e8d3d1..0000000000000000000000000000000000000000
--- a/vllm/attention/ops/triton_reshape_and_cache_flash.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-
-from vllm.platforms import current_platform
-from vllm.triton_utils import tl, triton
-
-
-@triton.jit
-def reshape_and_cache_kernel_flash(
-    key_ptr,  # [num_tokens, num_heads, head_size]
-    value_ptr,  # [num_tokens, num_heads, head_size]
-    key_cache_ptr,  # [num_blocks, block_size, num_heads, head_size]
-    value_cache_ptr,  # [num_blocks, block_size, num_heads, head_size]
-    slot_mapping_ptr,  # [num_tokens]
-    k_scale,  # float32
-    v_scale,  # float32
-    # strides
-    key_stride: tl.int64,
-    value_stride: tl.int64,
-    block_stride: tl.int64,
-    page_stride: tl.int64,
-    num_heads: tl.constexpr,
-    head_size: tl.constexpr,
-    block_size: tl.constexpr,
-    # FP8 flags
-    FP8_KV_CACHE: tl.constexpr,
-    # tune parameters
-    TILE_SIZE: tl.constexpr,
-):
-    token_idx = tl.program_id(axis=0)
-    slot_idx = tl.load(slot_mapping_ptr + token_idx).to(tl.int64)
-    if slot_idx < 0:
-        # Padding token that should be ignored.
-        return
-
-    tile_i = tl.program_id(axis=1)
-    tile_offs = tl.arange(0, TILE_SIZE)
-    tile_pos = tile_i * TILE_SIZE + tile_offs
-
-    block_idx = slot_idx // block_size
-    block_offset = slot_idx % block_size
-
-    src_key_idx = token_idx * key_stride
-    src_value_idx = token_idx * value_stride
-
-    tgt_idx = block_idx * block_stride + block_offset * page_stride
-
-    # [TILE_SIZE]
-    key_load = tl.load(
-        key_ptr + src_key_idx + tile_pos, mask=tile_pos < (num_heads * head_size)
-    )
-    if FP8_KV_CACHE:
-        # tl.store will do the correct implicit cast to fp8,
-        # based on the key_cache_ptr.dtype.element_ty
-        key_tile = key_load if key_load.dtype.is_fp8() else key_load / tl.load(k_scale)
-    else:
-        key_tile = key_load
-
-    # [TILE_SIZE]
-    value_load = tl.load(
-        value_ptr + src_value_idx + tile_pos, mask=tile_pos < (num_heads * head_size)
-    )
-    if FP8_KV_CACHE:
-        if value_load.dtype.is_fp8():
-            value_tile = value_load
-        else:
-            # tl.store will do the correct implicit cast to fp8,
-            #  based on the value_cache_ptr.dtype.element_ty
-            value_tile = value_load / tl.load(v_scale)
-    else:
-        value_tile = value_load
-
-    tl.store(
-        key_cache_ptr + tgt_idx + tile_pos,
-        key_tile,
-        mask=tile_pos < (num_heads * head_size),
-    )
-    tl.store(
-        value_cache_ptr + tgt_idx + tile_pos,
-        value_tile,
-        mask=tile_pos < (num_heads * head_size),
-    )
-    return
-
-
-def triton_reshape_and_cache_flash(
-    key: torch.Tensor,  # [num_tokens, num_heads, head_size]
-    value: torch.Tensor,  # [num_tokens, num_heads, head_size]
-    # [num_blocks, block_size, num_heads, head_size]
-    key_cache: torch.Tensor,
-    # [num_blocks, block_size, num_heads, head_size]
-    value_cache: torch.Tensor,
-    slot_mapping: torch.Tensor,  # [num_tokens]
-    kv_cache_dtype: str,  # "auto", "fp8"
-    k_scale: torch.Tensor,  # float32
-    v_scale: torch.Tensor,  # float32
-):
-    num_heads = key.shape[1]
-    head_size = key.shape[2]
-    block_size = key_cache.shape[1]
-    n = num_heads * head_size
-
-    key_stride = key.stride()[0]
-    value_stride = value.stride()[0]
-    block_stride = key_cache.stride()[0]
-    page_stride = key_cache.stride()[1]
-
-    head_stride = key_cache.stride()[2]
-    assert head_stride == head_size, "only continous heads are supported"
-
-    assert kv_cache_dtype == "auto" or kv_cache_dtype.startswith("fp8"), (
-        f"unsupported kv_cache_dtype (str), got {kv_cache_dtype}."
-    )
-    kv_cache_torch_dtype = (
-        current_platform.fp8_dtype()
-        if kv_cache_dtype.startswith("fp8")
-        else key_cache.dtype
-    )
-
-    if key_cache.dtype != kv_cache_torch_dtype and kv_cache_dtype.startswith("fp8"):
-        # to avoid erounous implicit cast in triton kernel (tl.store to uint8)
-        # (e.g. explicit cast to fp8e4m3fnuz is not supported in triton 3.4)
-        key_cache = key_cache.view(kv_cache_torch_dtype)
-        value_cache = value_cache.view(kv_cache_torch_dtype)
-    assert kv_cache_dtype != torch.uint8, (
-        "explicit fp8 cast and store to "
-        "uint8 is not supported by triton reshape_and_cache_flash"
-    )
-
-    FP8_KV_CACHE = kv_cache_dtype.startswith("fp8")
-    assert (not FP8_KV_CACHE) or kv_cache_torch_dtype in [
-        torch.float8_e4m3fn,
-        torch.float8_e5m2,
-        torch.uint8,
-        torch.float8_e4m3fnuz,
-    ], (
-        "unsupported dtype of KV cache tensor, got "
-        "{kv_cache_torch_dtype}. Supported kv cache dtypes: fp8e4m3fn, "
-        "fp8e5m2, uint8, bfloat16, float16, float32, fp8e4m3fnuz."
-    )
-
-    # heuristics instead of autotuning
-    TILE_SIZE = min(2048, triton.next_power_of_2(n))
-    if current_platform.is_rocm() or current_platform.is_xpu():
-        num_stages = 4
-        num_warps = 8
-    else:  # cuda
-        num_stages = 10
-        num_warps = 16
-        if torch.cuda.get_device_capability(key.device)[0] < 9:
-            TILE_SIZE = min(512, TILE_SIZE)
-
-    # TODO(ngl): maybe replace with static launch grid to avoid overhead if
-    #   using cudagraphs
-    grid = lambda meta: (
-        slot_mapping.shape[0],
-        triton.cdiv(n, meta["TILE_SIZE"]),
-    )
-
-    reshape_and_cache_kernel_flash[grid](
-        key_ptr=key,
-        value_ptr=value,
-        key_cache_ptr=key_cache,
-        value_cache_ptr=value_cache,
-        slot_mapping_ptr=slot_mapping,
-        k_scale=k_scale,
-        v_scale=v_scale,
-        # strides
-        key_stride=key_stride,
-        value_stride=value_stride,
-        block_stride=block_stride,
-        page_stride=page_stride,
-        num_heads=num_heads,
-        head_size=head_size,
-        block_size=block_size,
-        # FP8 flags
-        FP8_KV_CACHE=FP8_KV_CACHE,
-        # autotune parameters
-        TILE_SIZE=TILE_SIZE,
-        num_warps=num_warps,
-        num_stages=num_stages,
-    )
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 49ee0faf049d1be5fdc3f6f2d72da8c89ec0efd0..20d13b167c0ba470f7ec798bcb8a1e1e4d5558a2 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -867,7 +867,7 @@ class RandomMultiModalDataset(RandomDataset):
         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
         fps = 30  # frames per second
 
-        with NamedTemporaryFile(suffix=".mp4", delete_on_close=False) as temp_file:
+        with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
             temp_path = temp_file.name
 
             # Create video writer
@@ -1376,7 +1376,9 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "--custom-output-len",
         type=int,
         default=256,
-        help="Number of output tokens per request, used only for custom dataset.",
+        help="Number of output tokens per request. Unless it is set to -1, the "
+        "value overrides potential output length loaded from the dataset. It is "
+        "used only for custom dataset.",
     )
 
     spec_bench_group = parser.add_argument_group("spec bench dataset options")
@@ -1437,19 +1439,97 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
     )
 
     random_group = parser.add_argument_group("random dataset options")
-    random_group.add_argument(
+    add_random_dataset_base_args(random_group)
+
+    random_mm_group = parser.add_argument_group(
+        "random multimodal dataset options extended from random dataset"
+    )
+    add_random_multimodal_dataset_args(random_mm_group)
+
+    hf_group = parser.add_argument_group("hf dataset options")
+    hf_group.add_argument(
+        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
+    )
+    hf_group.add_argument(
+        "--hf-split", type=str, default=None, help="Split of the HF dataset."
+    )
+    hf_group.add_argument(
+        "--hf-name",
+        type=str,
+        default=None,
+        help=(
+            "Name of the dataset on HuggingFace "
+            "(e.g., 'lmarena-ai/VisionArena-Chat'). "
+            "Specify this if your dataset-path is a local path."
+        ),
+    )
+    hf_group.add_argument(
+        "--hf-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output lengths "
+        "from the sampled HF dataset.",
+    )
+
+    prefix_repetition_group = parser.add_argument_group(
+        "prefix repetition dataset options"
+    )
+    prefix_repetition_group.add_argument(
+        "--prefix-repetition-prefix-len",
+        type=int,
+        default=256,
+        help="Number of prefix tokens per request, used only for prefix "
+        "repetition dataset.",
+    )
+    prefix_repetition_group.add_argument(
+        "--prefix-repetition-suffix-len",
+        type=int,
+        default=256,
+        help="Number of suffix tokens per request, used only for prefix "
+        "repetition dataset. Total input length is prefix_len + suffix_len.",
+    )
+    prefix_repetition_group.add_argument(
+        "--prefix-repetition-num-prefixes",
+        type=int,
+        default=10,
+        help="Number of prefixes to generate, used only for prefix repetition "
+        "dataset. Prompts per prefix is num_requests // num_prefixes.",
+    )
+    prefix_repetition_group.add_argument(
+        "--prefix-repetition-output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens per request, used only for prefix "
+        "repetition dataset.",
+    )
+
+
+def add_random_dataset_base_args(
+    parser_or_group: FlexibleArgumentParser | argparse._ArgumentGroup,
+) -> None:
+    """Add CLI arguments for base random dataset options.
+
+    This function adds arguments needed for:
+    - random (random dataset)
+    - random-mm (random multimodal dataset)
+    - random-rerank (random dataset for reranking)
+
+    Args:
+        parser_or_group: Either a parser or an argument group to add arguments to.
+    """
+    parser_or_group.add_argument(
         "--random-input-len",
         type=int,
         default=1024,
         help="Number of input tokens per request, used only for random sampling.",
     )
-    random_group.add_argument(
+    parser_or_group.add_argument(
         "--random-output-len",
         type=int,
         default=128,
         help="Number of output tokens per request, used only for random sampling.",
     )
-    random_group.add_argument(
+    parser_or_group.add_argument(
         "--random-range-ratio",
         type=float,
         default=0.0,
@@ -1458,7 +1538,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "a symmetric sampling range"
         "[length * (1 - range_ratio), length * (1 + range_ratio)].",
     )
-    random_group.add_argument(
+    parser_or_group.add_argument(
         "--random-prefix-len",
         type=int,
         default=0,
@@ -1471,13 +1551,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
             "input_len * (1 + range_ratio)]."
         ),
     )
-    random_group.add_argument(
+    parser_or_group.add_argument(
         "--random-batch-size",
         type=int,
         default=1,
         help=("Batch size for random sampling. Only used for embeddings benchmark."),
     )
-    random_group.add_argument(
+    parser_or_group.add_argument(
         "--no-reranker",
         action="store_true",
         help=(
@@ -1486,11 +1566,19 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         ),
     )
 
-    # random multimodal dataset options
-    random_mm_group = parser.add_argument_group(
-        "random multimodal dataset options extended from random dataset"
-    )
-    random_mm_group.add_argument(
+
+def add_random_multimodal_dataset_args(
+    parser_or_group: FlexibleArgumentParser | argparse._ArgumentGroup,
+) -> None:
+    """Add CLI arguments for random multimodal dataset options.
+
+    This function adds arguments needed for:
+    - random-mm (random multimodal dataset)
+
+    Args:
+        parser_or_group: Either a parser or an argument group to add arguments to.
+    """
+    parser_or_group.add_argument(
         "--random-mm-base-items-per-request",
         type=int,
         default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST,
@@ -1500,7 +1588,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
             "--random-mm-num-mm-items-range-ratio."
         ),
     )
-    random_mm_group.add_argument(
+    parser_or_group.add_argument(
         "--random-mm-num-mm-items-range-ratio",
         type=float,
         default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
@@ -1515,7 +1603,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
             "An error is raised if the computed min exceeds the max."
         ),
     )
-    random_mm_group.add_argument(
+    parser_or_group.add_argument(
         "--random-mm-limit-mm-per-prompt",
         type=json.loads,
         default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT,
@@ -1559,7 +1647,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
             return normalize(parsed)
         raise ValueError("Unsupported value for --random-mm-bucket-config.")
 
-    random_mm_group.add_argument(
+    parser_or_group.add_argument(
         "--random-mm-bucket-config",
         type=_parse_mm_bucket_config,
         default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG,
@@ -1580,63 +1668,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         ),
     )
 
-    hf_group = parser.add_argument_group("hf dataset options")
-    hf_group.add_argument(
-        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
-    )
-    hf_group.add_argument(
-        "--hf-split", type=str, default=None, help="Split of the HF dataset."
-    )
-    hf_group.add_argument(
-        "--hf-name",
-        type=str,
-        default=None,
-        help=(
-            "Name of the dataset on HuggingFace "
-            "(e.g., 'lmarena-ai/VisionArena-Chat'). "
-            "Specify this if your dataset-path is a local path."
-        ),
-    )
-    hf_group.add_argument(
-        "--hf-output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the output lengths "
-        "from the sampled HF dataset.",
-    )
-
-    prefix_repetition_group = parser.add_argument_group(
-        "prefix repetition dataset options"
-    )
-    prefix_repetition_group.add_argument(
-        "--prefix-repetition-prefix-len",
-        type=int,
-        default=256,
-        help="Number of prefix tokens per request, used only for prefix "
-        "repetition dataset.",
-    )
-    prefix_repetition_group.add_argument(
-        "--prefix-repetition-suffix-len",
-        type=int,
-        default=256,
-        help="Number of suffix tokens per request, used only for prefix "
-        "repetition dataset. Total input length is prefix_len + suffix_len.",
-    )
-    prefix_repetition_group.add_argument(
-        "--prefix-repetition-num-prefixes",
-        type=int,
-        default=10,
-        help="Number of prefixes to generate, used only for prefix repetition "
-        "dataset. Prompts per prefix is num_requests // num_prefixes.",
-    )
-    prefix_repetition_group.add_argument(
-        "--prefix-repetition-output-len",
-        type=int,
-        default=128,
-        help="Number of output tokens per request, used only for prefix "
-        "repetition dataset.",
-    )
-
 
 def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
     if not hasattr(args, "request_id_prefix"):
@@ -1847,7 +1878,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
                 random_seed=args.seed,
                 dataset_path=args.dataset_path,
                 disable_shuffle=args.disable_shuffle,
-                prefix_len=args.common_prefix_len,
             ).sample(
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
@@ -1930,10 +1960,12 @@ class CustomDataset(BenchmarkDataset):
     Implements the Custom dataset.  Loads data from a JSONL file and generates
     sample requests based on conversation turns. E.g.,
     ```
-    {"prompt": "What is the capital of India?"}
-    {"prompt": "What is the capital of Iran?"}
-    {"prompt": "What is the capital of China?"}
+    {"prompt": "What is the capital of India?", "output_tokens": 10}
+    {"prompt": "What is the capital of Iran?", "output_tokens": 1520}
+    {"prompt": "What is the capital of China?", "output_tokens": 819}
     ```
+    Note that 'output_tokens' column is optional and has to be provided only if
+    'custom-output-len' argument is None or -1.
     """
 
     def __init__(self, **kwargs) -> None:
@@ -2003,6 +2035,23 @@ class CustomDataset(BenchmarkDataset):
                 break
             prompt = item["prompt"]
 
+            new_output_len = output_len
+            if output_len is None or output_len == -1:
+                # check that the request has an 'output_tokens' field
+                if "output_tokens" not in item:
+                    raise ValueError(
+                        "If no output length is provided the "
+                        "custom dataset must contain an 'output_tokens' field."
+                    )
+                # Use number of output tokens from the request data
+                try:
+                    new_output_len = int(item["output_tokens"])
+                except (ValueError, TypeError) as e:
+                    raise ValueError(
+                        f"Invalid value for 'output_tokens' in custom dataset: "
+                        f"'{item['output_tokens']}'. Must be an integer."
+                    ) from e
+
             # apply template
             if not skip_chat_template:
                 prompt = tokenizer.apply_chat_template(
@@ -2016,7 +2065,7 @@ class CustomDataset(BenchmarkDataset):
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=prompt_len,
-                    expected_output_len=output_len,
+                    expected_output_len=new_output_len,
                     request_id=request_id_prefix + str(i),
                 )
             )
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 99c1c846f19afaf3ab0e9adb232a782816cc5c11..a9d149666e8ba5a6571b342a338368ac1d7109e4 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -79,10 +79,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
 
 def main(args: argparse.Namespace):
     engine_args = EngineArgs.from_cli_args(args)
-    if args.profile and not engine_args.profiler_config.profiler == "torch":
-        raise ValueError(
-            "The torch profiler is not enabled. Please provide profiler_config."
-        )
 
     # Lazy import to avoid importing LLM when the bench command is not selected.
     from vllm import LLM, SamplingParams
@@ -125,8 +121,8 @@ def main(args: argparse.Namespace):
                 ),
             )
 
-    def run_to_completion(profile_dir: str | None = None):
-        if profile_dir:
+    def run_to_completion(do_profile: bool = False):
+        if do_profile:
             llm.start_profile()
             llm_generate()
             llm.stop_profile()
@@ -139,18 +135,24 @@ def main(args: argparse.Namespace):
 
     print("Warming up...")
     for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_dir=None)
+        run_to_completion(do_profile=False)
 
     if args.profile:
-        profile_dir = engine_args.profiler_config.torch_profiler_dir
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
-        run_to_completion(profile_dir=profile_dir)
+        profiler_config = engine_args.profiler_config
+        if profiler_config.profiler == "torch":
+            print(
+                "Profiling with torch profiler (results will be saved to"
+                f" {profiler_config.torch_profiler_dir})..."
+            )
+        elif profiler_config.profiler == "cuda":
+            print("Profiling with cuda profiler ...")
+        run_to_completion(do_profile=True)
         return
 
     # Benchmark.
     latencies = []
-    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
+    for _ in tqdm(range(args.num_iters), desc="Bench iterations"):
+        latencies.append(run_to_completion(do_profile=False))
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
diff --git a/vllm/benchmarks/mm_processor.py b/vllm/benchmarks/mm_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e65a255393567f03864b94a53ad436c6e19e106
--- /dev/null
+++ b/vllm/benchmarks/mm_processor.py
@@ -0,0 +1,363 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+r"""Benchmark multimodal processor latency.
+
+This benchmark measures the latency of the mm processor module
+using multimodal prompts from datasets.
+MM processor stats are automatically enabled.
+
+Run:
+    vllm bench mm-processor \
+        --model <your_model> \
+        --dataset-name random-mm \
+        --num-prompts 10 \
+"""
+
+import argparse
+import dataclasses
+import json
+import time
+from datetime import datetime
+from typing import Any
+
+import numpy as np
+
+from vllm.benchmarks.throughput import get_requests
+from vllm.engine.arg_utils import EngineArgs
+from vllm.multimodal.processing import (
+    get_timing_stats_from_engine_client,
+)
+from vllm.utils.gc_utils import freeze_gc_heap
+from vllm.utils.import_utils import PlaceholderModule
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = PlaceholderModule("pandas")
+
+
+def collect_mm_processor_stats(
+    llm_engine: Any,
+) -> dict[str, list[float]]:
+    """
+    Collect multimodal processor timing stats.
+    Returns a dictionary mapping stage names to lists of timing values (in seconds).
+    """
+    all_stats = get_timing_stats_from_engine_client(llm_engine)
+
+    stats_by_stage = {
+        "hf_processor_time": [],
+        "hashing_time": [],
+        "cache_lookup_time": [],
+        "prompt_update_time": [],
+        "total_time": [],
+    }
+
+    for stats_dict in all_stats.values():
+        stats_by_stage["hf_processor_time"].append(
+            stats_dict.get("hf_processor_time", 0.0)
+        )
+        stats_by_stage["hashing_time"].append(stats_dict.get("hashing_time", 0.0))
+        stats_by_stage["cache_lookup_time"].append(
+            stats_dict.get("cache_lookup_time", 0.0)
+        )
+        stats_by_stage["prompt_update_time"].append(
+            stats_dict.get("prompt_update_time", 0.0)
+        )
+        stats_by_stage["total_time"].append(stats_dict.get("total_time", 0.0))
+
+    return stats_by_stage
+
+
+def calculate_mm_processor_metrics(
+    stats_by_stage: dict[str, list[float]],
+    selected_percentiles: list[float],
+) -> dict[str, dict[str, float]]:
+    """
+    Calculate aggregate metrics from stats by stage.
+    """
+    metrics = {}
+
+    for stage_name, times in stats_by_stage.items():
+        if not times:
+            metrics[stage_name] = {
+                "mean": 0.0,
+                "median": 0.0,
+                "std": 0.0,
+                **{f"p{p}": 0.0 for p in selected_percentiles},
+            }
+            continue
+
+        times_ms = [t * 1000 for t in times]
+        metrics[stage_name] = {
+            "mean": float(np.mean(times_ms)),
+            "median": float(np.median(times_ms)),
+            "std": float(np.std(times_ms)),
+            **{
+                f"p{p}": float(np.percentile(times_ms, p)) for p in selected_percentiles
+            },
+        }
+
+    return metrics
+
+
+def validate_args(args):
+    """
+    Validate command-line arguments for mm_processor benchmark.
+    """
+    if not getattr(args, "tokenizer", None):
+        args.tokenizer = args.model
+    if not hasattr(args, "dataset_path"):
+        args.dataset_path = None
+    if not hasattr(args, "lora_path"):
+        args.lora_path = None
+    if not hasattr(args, "max_loras"):
+        args.max_loras = None
+
+
+def benchmark_multimodal_processor(
+    args: argparse.Namespace,
+) -> dict[str, Any]:
+    """
+    Run the multimodal processor benchmark.
+    """
+    from vllm import LLM, SamplingParams
+
+    validate_args(args)
+
+    if args.seed is None:
+        args.seed = 0
+
+    engine_args = EngineArgs.from_cli_args(args)
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    tokenizer = llm.get_tokenizer()
+    requests = get_requests(args, tokenizer)
+
+    assert all(
+        llm.llm_engine.model_config.max_model_len
+        >= (request.prompt_len + request.expected_output_len)
+        for request in requests
+    ), (
+        "Please ensure that max_model_len is greater than the sum of "
+        "prompt_len and expected_output_len for all requests."
+    )
+
+    prompts = [request.prompt for request in requests]
+    expected_output_lens = [request.expected_output_len for request in requests]
+
+    sampling_params = [
+        SamplingParams(
+            n=1,
+            temperature=0.0,
+            max_tokens=output_len,
+            detokenize=True,
+        )
+        for output_len in expected_output_lens
+    ]
+
+    selected_percentiles = [
+        float(p) for p in getattr(args, "metric_percentiles", "99").split(",")
+    ]
+
+    freeze_gc_heap()
+
+    print(f"Processing {len(prompts)} requests...")
+    start_time = time.perf_counter()
+
+    outputs = llm.chat(
+        prompts, sampling_params, use_tqdm=not getattr(args, "disable_tqdm", False)
+    )
+
+    end_time = time.perf_counter()
+    total_time = end_time - start_time
+
+    mm_stats_by_stage = collect_mm_processor_stats(
+        llm.llm_engine,
+    )
+
+    if not any(mm_stats_by_stage.values()):
+        print(
+            "\n⚠️  Warning: No MM processor stats found in registry.\n"
+            "   This may indicate that:\n"
+            "   - No multimodal requests were processed\n"
+            "   - Stats were already retrieved (registry is cleared after retrieval)\n"
+        )
+
+    mm_processor_metrics = calculate_mm_processor_metrics(
+        mm_stats_by_stage, selected_percentiles
+    )
+
+    completed = len([o for o in outputs if o.finished])
+    failed = len(outputs) - completed
+
+    e2el_times = []
+    for output in outputs:
+        if not output.finished or output.metrics is None:
+            continue
+        metrics = output.metrics
+        for attr in ("finished_time", "last_token_time"):
+            if (
+                getattr(metrics, attr, None) is not None
+                and getattr(metrics, "arrival_time", None) is not None
+            ):
+                e2el_times.append(
+                    (getattr(metrics, attr) - metrics.arrival_time) * 1000
+                )
+                break
+
+    if not e2el_times and completed > 0:
+        avg_time_per_request = total_time / completed
+        e2el_times = [avg_time_per_request * 1000] * completed
+
+    if e2el_times:
+        mean_e2el_ms = float(np.mean(e2el_times))
+        median_e2el_ms = float(np.median(e2el_times))
+        std_e2el_ms = float(np.std(e2el_times))
+        percentiles_e2el_ms = [
+            (p, float(np.percentile(e2el_times, p))) for p in selected_percentiles
+        ]
+    else:
+        mean_e2el_ms = 0.0
+        median_e2el_ms = 0.0
+        std_e2el_ms = 0.0
+        percentiles_e2el_ms = [(p, 0.0) for p in selected_percentiles]
+
+    benchmark_result = {
+        "completed": completed,
+        "failed": failed,
+        "mean_e2el_ms": mean_e2el_ms,
+        "median_e2el_ms": median_e2el_ms,
+        "std_e2el_ms": std_e2el_ms,
+        "percentiles_e2el_ms": percentiles_e2el_ms,
+        "mm_processor_stats": mm_processor_metrics,
+    }
+
+    return benchmark_result
+
+
+def add_cli_args(parser: argparse.ArgumentParser) -> None:
+    """Add CLI arguments for the multimodal processor benchmark."""
+    from vllm.engine.arg_utils import EngineArgs
+
+    EngineArgs.add_cli_args(parser)
+
+    parser.set_defaults(enable_mm_processor_stats=True)
+
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="random-mm",
+        choices=["random-mm", "random-rerank"],
+        help="Name of the dataset to benchmark on. Defaults to 'random-mm'.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=10,
+        help="Number of prompts to process.",
+    )
+
+    from vllm.benchmarks.datasets import (
+        add_random_dataset_base_args,
+        add_random_multimodal_dataset_args,
+    )
+
+    add_random_dataset_base_args(parser)
+    add_random_multimodal_dataset_args(parser)
+
+    parser.add_argument(
+        "--output-json",
+        type=str,
+        default=None,
+        help="Path to save the benchmark results in JSON format.",
+    )
+    parser.add_argument(
+        "--metric-percentiles",
+        type=str,
+        default="99",
+        help="Comma-separated list of percentiles to calculate (e.g., '50,90,99').",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Disable tqdm progress bar.",
+    )
+
+
+def main(args: argparse.Namespace) -> None:
+    """Main entry point for the multimodal processor benchmark."""
+
+    print("Starting multimodal processor benchmark...")
+    result = benchmark_multimodal_processor(args)
+
+    print("\n" + "=" * 80)
+    print("Multimodal Processor Benchmark Results")
+    print("=" * 80)
+
+    if "mm_processor_stats" in result:
+        print("\nMM Processor Timing (ms):")
+        selected_percentiles = [
+            float(p) for p in getattr(args, "metric_percentiles", "99").split(",")
+        ]
+        mm_data = []
+        for stage, metrics in result["mm_processor_stats"].items():
+            row = {
+                "Stage": stage,
+                "Mean": f"{metrics['mean']:.2f}",
+                "Median": f"{metrics['median']:.2f}",
+                "Std": f"{metrics['std']:.2f}",
+            }
+            for p in selected_percentiles:
+                row[f"P{p}"] = f"{metrics.get(f'p{p}', 0.0):.2f}"
+            mm_data.append(row)
+
+        mm_df = pd.DataFrame(mm_data)
+        print(mm_df.to_string(index=False))
+
+    if "mean_e2el_ms" in result:
+        print("\nEnd-to-End Latency (ms):")
+        selected_percentiles = [
+            float(p) for p in getattr(args, "metric_percentiles", "99").split(",")
+        ]
+
+        e2el_data = [
+            {"Metric": "Mean", "Value (ms)": f"{result['mean_e2el_ms']:.2f}"},
+            {"Metric": "Median", "Value (ms)": f"{result['median_e2el_ms']:.2f}"},
+            {"Metric": "Std", "Value (ms)": f"{result['std_e2el_ms']:.2f}"},
+        ]
+
+        for p in selected_percentiles:
+            percentile_value = next(
+                (val for pct, val in result["percentiles_e2el_ms"] if pct == p),
+                0.0,
+            )
+            e2el_data.append(
+                {
+                    "Metric": f"P{p}",
+                    "Value (ms)": f"{percentile_value:.2f}",
+                }
+            )
+
+        e2el_df = pd.DataFrame(e2el_data)
+        print(e2el_df.to_string(index=False))
+
+    if args.output_json:
+        result["config"] = {
+            "model": args.model,
+            "num_prompts": args.num_prompts,
+            "input_len": getattr(args, "random_input_len", None),
+            "output_len": getattr(args, "random_output_len", None),
+        }
+        result["timestamp"] = datetime.now().isoformat()
+
+        with open(args.output_json, "w") as f:
+            json.dump(result, f, indent=2)
+        print(f"\nResults saved to {args.output_json}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Benchmark mm processor latency")
+    add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index f5d8ea5a975a9316b7f3dcdfb4e52b106eb8e69d..9d6298927b921191368b02ed681fc22d2e06532a 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -10,8 +10,10 @@ On the client side, run:
     vllm bench serve \
         --backend <backend or endpoint type. Default 'openai'> \
         --label <benchmark result label. Default using backend> \
-        --model <your_model> \
+        --model <your_model. Optional, defaults to first model from server> \
         --dataset-name <dataset_name. Default 'random'> \
+        --input-len <general input length. Optional, maps to dataset-specific args> \
+        --output-len <general output length. Optional, maps to dataset-specific args> \
         --request-rate <request_rate. Default inf> \
         --num-prompts <num_prompts. Default 1000>
 """
@@ -57,6 +59,103 @@ TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) a
 )
 
 
+async def get_first_model_from_server(
+    base_url: str, headers: dict | None = None
+) -> tuple[str, str]:
+    """Fetch the first model from the server's /v1/models endpoint."""
+    models_url = f"{base_url}/v1/models"
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.get(models_url, headers=headers) as response:
+                response.raise_for_status()
+                data = await response.json()
+                if "data" in data and len(data["data"]) > 0:
+                    return data["data"][0]["id"], data["data"][0]["root"]
+                else:
+                    raise ValueError(
+                        f"No models found on the server at {base_url}. "
+                        "Make sure the server is running and has models loaded."
+                    )
+        except (aiohttp.ClientError, json.JSONDecodeError) as e:
+            raise RuntimeError(
+                f"Failed to fetch models from server at {models_url}. "
+                "Check that:\n"
+                "1. The server is running\n"
+                "2. The server URL is correct\n"
+                f"Error: {e}"
+            ) from e
+
+
+@dataclass
+class SpecDecodeMetrics:
+    """Speculative decoding metrics from the server's Prometheus endpoint."""
+
+    num_drafts: int
+    num_draft_tokens: int
+    num_accepted_tokens: int
+    accepted_per_pos: dict[int, int]
+
+
+async def fetch_spec_decode_metrics(
+    base_url: str, session: aiohttp.ClientSession
+) -> SpecDecodeMetrics | None:
+    """Fetch speculative decoding metrics from the server's Prometheus endpoint.
+
+    Returns None if speculative decoding is not enabled or metrics are not available.
+    """
+    metrics_url = f"{base_url}/metrics"
+    try:
+        async with session.get(metrics_url) as response:
+            if response.status != 200:
+                return None
+            text = await response.text()
+
+            num_drafts = 0
+            num_draft_tokens = 0
+            num_accepted_tokens = 0
+            accepted_per_pos: dict[int, int] = {}
+            found_spec_decode = False
+
+            for line in text.split("\n"):
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+
+                if line.startswith("vllm:spec_decode"):
+                    found_spec_decode = True
+                    parts = line.split()
+                    if parts:
+                        with contextlib.suppress(ValueError):
+                            if "num_drafts" in line:
+                                num_drafts += int(float(parts[-1]))
+                            elif "num_draft_tokens" in line:
+                                num_draft_tokens += int(float(parts[-1]))
+                            elif "num_accepted_tokens_per_pos" in line:
+                                pos_label = 'position="'
+                                if pos_label in line:
+                                    start = line.index(pos_label) + len(pos_label)
+                                    end = line.index('"', start)
+                                    pos = int(line[start:end])
+                                    val = int(float(parts[-1]))
+                                    accepted_per_pos[pos] = (
+                                        accepted_per_pos.get(pos, 0) + val
+                                    )
+                            elif "num_accepted_tokens" in line:
+                                num_accepted_tokens += int(float(parts[-1]))
+
+            if not found_spec_decode:
+                return None
+
+            return SpecDecodeMetrics(
+                num_drafts=num_drafts,
+                num_draft_tokens=num_draft_tokens,
+                num_accepted_tokens=num_accepted_tokens,
+                accepted_per_pos=accepted_per_pos,
+            )
+    except (aiohttp.ClientError, asyncio.TimeoutError):
+        return None
+
+
 class TaskType(Enum):
     GENERATION = "generation"
     POOLING = "pooling"
@@ -383,6 +482,12 @@ def calculate_metrics(
     # Find the time range across all successful requests
     successful_outputs = [output for output in outputs if output.success]
     failed_outputs = [output for output in outputs if not output.success]
+
+    if len(failed_outputs) > 0:
+        print("Failed requests during benchmark run detected (capping to 10):")
+        for i, err in enumerate(failed_outputs[:10]):
+            print(f"Error {i}: {err.error}")
+
     if successful_outputs:
         min_start_time = min(output.start_time for output in successful_outputs)
         max_end_time = max(
@@ -650,6 +755,8 @@ async def benchmark(
     print(f"Burstiness factor: {burstiness} ({distribution})")
     print(f"Maximum request concurrency: {max_concurrency}")
 
+    spec_decode_metrics_before = await fetch_spec_decode_metrics(base_url, session)
+
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     semaphore = (
@@ -733,6 +840,48 @@ async def benchmark(
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
 
+    spec_decode_metrics_after = await fetch_spec_decode_metrics(base_url, session)
+    spec_decode_stats: dict[str, Any] | None = None
+    if spec_decode_metrics_before is not None and spec_decode_metrics_after is not None:
+        delta_drafts = (
+            spec_decode_metrics_after.num_drafts - spec_decode_metrics_before.num_drafts
+        )
+        delta_draft_tokens = (
+            spec_decode_metrics_after.num_draft_tokens
+            - spec_decode_metrics_before.num_draft_tokens
+        )
+        delta_accepted = (
+            spec_decode_metrics_after.num_accepted_tokens
+            - spec_decode_metrics_before.num_accepted_tokens
+        )
+        per_pos_rates: list[float] = []
+        if delta_drafts > 0:
+            positions = sorted(
+                set(spec_decode_metrics_before.accepted_per_pos.keys())
+                | set(spec_decode_metrics_after.accepted_per_pos.keys())
+            )
+            for pos in positions:
+                before_val = spec_decode_metrics_before.accepted_per_pos.get(pos, 0)
+                after_val = spec_decode_metrics_after.accepted_per_pos.get(
+                    pos, before_val
+                )
+                delta_pos = after_val - before_val
+                per_pos_rates.append(delta_pos / delta_drafts)
+
+        if delta_draft_tokens > 0:
+            acceptance_rate = (delta_accepted / delta_draft_tokens) * 100
+            acceptance_length = (
+                1 + delta_accepted / delta_drafts if delta_drafts > 0 else 0.0
+            )
+            spec_decode_stats = {
+                "num_drafts": delta_drafts,
+                "draft_tokens": delta_draft_tokens,
+                "accepted_tokens": delta_accepted,
+                "acceptance_rate": acceptance_rate,
+                "acceptance_length": acceptance_length,
+                "per_position_acceptance_rates": per_pos_rates,
+            }
+
     if task_type == TaskType.GENERATION:
         metrics, actual_output_lens = calculate_metrics(
             input_requests=input_requests,
@@ -828,6 +977,18 @@ async def benchmark(
     if rps_change_events:
         result["rps_change_events"] = rps_change_events
 
+    if spec_decode_stats is not None:
+        result["spec_decode_acceptance_rate"] = spec_decode_stats["acceptance_rate"]
+        result["spec_decode_acceptance_length"] = spec_decode_stats["acceptance_length"]
+        result["spec_decode_num_drafts"] = int(spec_decode_stats["num_drafts"])
+        result["spec_decode_draft_tokens"] = int(spec_decode_stats["draft_tokens"])
+        result["spec_decode_accepted_tokens"] = int(
+            spec_decode_stats["accepted_tokens"]
+        )
+        result["spec_decode_per_position_acceptance_rates"] = spec_decode_stats.get(
+            "per_position_acceptance_rates", []
+        )
+
     def process_one_metric(
         # E.g., "ttft"
         metric_attribute_name: str,
@@ -873,6 +1034,35 @@ async def benchmark(
         process_one_metric("itl", "ITL", "Inter-token Latency")
     process_one_metric("e2el", "E2EL", "End-to-end Latency")
 
+    if spec_decode_stats is not None:
+        print("{s:{c}^{n}}".format(s="Speculative Decoding", n=50, c="-"))
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Acceptance rate (%):", spec_decode_stats["acceptance_rate"]
+            )
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Acceptance length:", spec_decode_stats["acceptance_length"]
+            )
+        )
+        print("{:<40} {:<10}".format("Drafts:", int(spec_decode_stats["num_drafts"])))
+        print(
+            "{:<40} {:<10}".format(
+                "Draft tokens:", int(spec_decode_stats["draft_tokens"])
+            )
+        )
+        print(
+            "{:<40} {:<10}".format(
+                "Accepted tokens:", int(spec_decode_stats["accepted_tokens"])
+            )
+        )
+        per_pos = spec_decode_stats.get("per_position_acceptance_rates", [])
+        if per_pos:
+            print("Per-position acceptance (%):")
+            for i, rate in enumerate(per_pos):
+                print("{:<40} {:<10.2f}".format(f"  Position {i}:", rate * 100))
+
     print("=" * 50)
 
     if profile:
@@ -1025,8 +1215,26 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--model",
         type=str,
-        required=True,
-        help="Name of the model.",
+        required=False,
+        default=None,
+        help="Name of the model. If not specified, will fetch the first model "
+        "from the server's /v1/models endpoint.",
+    )
+    parser.add_argument(
+        "--input-len",
+        type=int,
+        default=None,
+        help="General input length for datasets. Maps to dataset-specific "
+        "input length arguments (e.g., --random-input-len, --sonnet-input-len). "
+        "If not specified, uses dataset defaults.",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=None,
+        help="General output length for datasets. Maps to dataset-specific "
+        "output length arguments (e.g., --random-output-len, --sonnet-output-len). "
+        "If not specified, uses dataset defaults.",
     )
     parser.add_argument(
         "--tokenizer",
@@ -1110,7 +1318,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--save-detailed",
         action="store_true",
         help="When saving the results, whether to include per request "
-        "information such as response, error, ttfs, tpots, etc.",
+        "information such as response, error, ttfts, tpots, etc.",
     )
     parser.add_argument(
         "--append-result",
@@ -1234,12 +1442,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Repetition penalty sampling parameter. Only has effect on "
         "openai-compatible backends.",
     )
-    sampling_group.add_argument(
-        "--common-prefix-len",
-        type=int,
-        default=None,
-        help="Common prefix length shared by all prompts (used by random dataset)",
-    )
 
     parser.add_argument(
         "--served-model-name",
@@ -1286,10 +1488,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--ready-check-timeout-sec",
         type=int,
-        default=600,
+        default=0,
         help="Maximum time to wait for the endpoint to become ready "
-        "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
-        "the ready check will be skipped.",
+        "in seconds. Ready check will be skipped by default.",
     )
 
     parser.add_argument(
@@ -1332,10 +1533,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
 
     label = args.label
-    model_id = args.model
-    model_name = args.served_model_name
-    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
-    tokenizer_mode = args.tokenizer_mode
 
     if args.base_url is not None:
         api_url = f"{args.base_url}{args.endpoint}"
@@ -1356,6 +1553,18 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             else:
                 raise ValueError("Invalid header format. Please use KEY=VALUE format.")
 
+    # Fetch model from server if not specified
+    if args.model is None:
+        print("Model not specified, fetching first model from server...")
+        model_name, model_id = await get_first_model_from_server(base_url, headers)
+        print(f"First model name: {model_name}, first model id: {model_id}")
+    else:
+        model_name = args.served_model_name
+        model_id = args.model
+
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id
+    tokenizer_mode = args.tokenizer_mode
+
     tokenizer = get_tokenizer(
         tokenizer_id,
         tokenizer_mode=tokenizer_mode,
@@ -1368,6 +1577,20 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             "'--dataset-path' if required."
         )
 
+    # Map general --input-len and --output-len to all dataset-specific arguments
+    if args.input_len is not None:
+        args.random_input_len = args.input_len
+        args.sonnet_input_len = args.input_len
+
+    if args.output_len is not None:
+        args.random_output_len = args.output_len
+        args.sonnet_output_len = args.output_len
+        args.sharegpt_output_len = args.output_len
+        args.custom_output_len = args.output_len
+        args.hf_output_len = args.output_len
+        args.spec_bench_output_len = args.output_len
+        args.prefix_repetition_output_len = args.output_len
+
     # when using random datasets, default to ignoring EOS
     # so generation runs to the requested length
     if (
diff --git a/vllm/benchmarks/startup.py b/vllm/benchmarks/startup.py
index 086f7bf62f838a6ece2dc11ba7e6be3fea7a8031..d0d46a96385d36f9f6f9d2a98789430252034846 100644
--- a/vllm/benchmarks/startup.py
+++ b/vllm/benchmarks/startup.py
@@ -55,7 +55,7 @@ def cold_startup():
             os.environ.pop("VLLM_CACHE_ROOT", None)
 
 
-def run_startup_in_subprocess(engine_args_dict, result_queue):
+def run_startup_in_subprocess(engine_args, result_queue):
     """
     Run LLM startup in a subprocess and return timing metrics via a queue.
     This ensures complete isolation between iterations.
@@ -63,9 +63,6 @@ def run_startup_in_subprocess(engine_args_dict, result_queue):
     try:
         # Import inside the subprocess to avoid issues with forking
         from vllm import LLM
-        from vllm.engine.arg_utils import EngineArgs
-
-        engine_args = EngineArgs(**engine_args_dict)
 
         # Measure total startup time
         start_time = time.perf_counter()
@@ -200,15 +197,13 @@ def main(args: argparse.Namespace):
         Create LLM instance in a subprocess and measure startup time.
         Returns timing metrics, using subprocess for complete isolation.
         """
-        # Convert engine_args to dictionary for pickling
-        engine_args_dict = dataclasses.asdict(engine_args)
 
         # Create a queue for inter-process communication
         result_queue = multiprocessing.Queue()
         process = multiprocessing.Process(
             target=run_startup_in_subprocess,
             args=(
-                engine_args_dict,
+                engine_args,
                 result_queue,
             ),
         )
diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py
index a438a328880fde38edbdd18b3a0651e3f7fc8d57..f20134cfcb2eb3866a12a28c49a335303ca76266 100644
--- a/vllm/benchmarks/sweep/param_sweep.py
+++ b/vllm/benchmarks/sweep/param_sweep.py
@@ -74,7 +74,8 @@ class ParameterSweepItem(dict[str, object]):
         representation of all parameters.
         """
         if "_benchmark_name" in self:
-            return self["_benchmark_name"]
+            return str(self["_benchmark_name"])
+
         return self.as_text(sep="-")
 
     # In JSON, we prefer "_"
diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_sla.py
index 0403d1ddfd6c115f14e94f6c342e750b6e5064c8..26f0d6bf652efac4b88da6c1cbf5ae33cd666663 100644
--- a/vllm/benchmarks/sweep/serve_sla.py
+++ b/vllm/benchmarks/sweep/serve_sla.py
@@ -3,14 +3,11 @@
 import argparse
 import contextlib
 import json
-import math
 from dataclasses import asdict, dataclass
 from datetime import datetime
 from pathlib import Path
 from typing import ClassVar, Literal, get_args
 
-from typing_extensions import assert_never
-
 from vllm.utils.import_utils import PlaceholderModule
 
 from .param_sweep import ParameterSweep, ParameterSweepItem
@@ -24,6 +21,15 @@ try:
 except ImportError:
     pd = PlaceholderModule("pandas")
 
+try:
+    from scipy.interpolate import PchipInterpolator
+except ImportError:
+    PchipInterpolator = (
+        PlaceholderModule("scipy")
+        .placeholder_attr("interpolate")
+        .placeholder_attr("PchipInterpolator")
+    )
+
 
 def _get_sla_base_path(
     output_dir: Path,
@@ -59,6 +65,14 @@ def _get_sla_run_path(iter_path: Path, run_number: int | None):
     return iter_path / f"run={run_number}.json"
 
 
+def _iter_sla_val_paths(base_path: Path, sla_variable: str):
+    for iter_path in base_path.glob(f"{sla_variable}=*"):
+        sla_value = int(iter_path.name.removeprefix(f"{sla_variable}="))
+        summary_path = iter_path / "summary.json"
+        if summary_path.exists():
+            yield sla_value, summary_path
+
+
 def _sla_needs_server(
     serve_comb: ParameterSweepItem,
     bench_combs: ParameterSweep,
@@ -118,81 +132,55 @@ def run_sla(
 SLAVariable = Literal["request_rate", "max_concurrency"]
 
 
-def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable):
-    request_throughput = float(run_data["request_throughput"])  # type: ignore
-    if sla_variable == "request_rate":
-        return request_throughput
-    if sla_variable == "max_concurrency":
-        mean_latency_ms = float(run_data["mean_e2el_ms"])  # type: ignore
-        return request_throughput * mean_latency_ms / 1000
+class SLAHistory(dict[int, float]):
+    def __init__(self, min_value: int, max_value: int) -> None:
+        super().__init__()
 
-    assert_never(sla_variable)
+        self.min_value = min_value
+        self.max_value = max_value
 
+    def get_xy(self) -> tuple[list[int], list[float]]:
+        xs = list[int]()
+        ys = list[float]()
+        for x, y in sorted(self.items()):
+            xs.append(x)
+            ys.append(y)
 
-def _estimate_sla_bounds(
-    server: ServerProcess | None,
-    bench_cmd: list[str],
-    *,
-    serve_comb: ParameterSweepItem,
-    bench_comb: ParameterSweepItem,
-    sla_comb: SLASweepItem,
-    base_path: Path,
-    num_runs: int,
-    dry_run: bool,
-    sla_variable: SLAVariable,
-    init_value: int,
-    max_value: int,
-):
-    sla_data = list[dict[str, object]]()
-
-    max_passing: int = 0
-    min_failing: int = 0
-
-    val: int = init_value
-    assert val > 0
-
-    while True:
-        print(f"Testing {sla_variable}: {val} req/s")
+        return xs, ys
 
-        iter_data = run_sla(
-            server,
-            bench_cmd,
-            serve_comb=serve_comb,
-            bench_comb=bench_comb | {sla_variable: val},
-            iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val),
-            num_runs=num_runs,
-            dry_run=dry_run,
+    def get_max_passing(self) -> float:
+        return max(
+            (val for val, margin in self.items() if margin <= 0),
+            default=self.min_value,
         )
 
-        assert iter_data is not None
-        sla_data.extend(iter_data)
+    def get_min_failing(self) -> float:
+        return min(
+            (val for val, margin in self.items() if margin > 0),
+            default=self.max_value,
+        )
 
-        iter_data_mean = {
-            k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data)  # type: ignore
-            for k in sla_comb
-        }
 
-        sla_results = [
-            criterion.print_and_validate(iter_data_mean, k)
-            for k, criterion in sla_comb.items()
-        ]
+def _compute_margin(
+    sla_comb: SLASweepItem,
+    iter_data: list[dict[str, object]],
+):
+    assert iter_data, "Summary should not be empty"
 
-        if all(sla_results):
-            print("SLA criteria are met.")
-            max_passing = val
-            val *= 2
-        else:
-            print("SLA criteria are not met.")
-            min_failing = val
-            break
+    iter_data_mean = {
+        k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data)  # type: ignore
+        for k in sla_comb
+    }
 
-        if val >= max_value:
-            break
+    sla_margins = [
+        criterion.print_and_compute_margin(iter_data_mean, k)
+        for k, criterion in sla_comb.items()
+    ]
 
-    return sla_data, (max_passing, min_failing)
+    return max(sla_margins)
 
 
-def _find_sla_value(
+def solve_sla(
     server: ServerProcess | None,
     bench_cmd: list[str],
     *,
@@ -203,16 +191,40 @@ def _find_sla_value(
     num_runs: int,
     dry_run: bool,
     sla_variable: SLAVariable,
-    min_value: int,
-    max_value: int,
+    sla_min_value: int = 1,
+    sla_max_value: int = 8192,  # The value that represents infinite QPS
 ):
     sla_data = list[dict[str, object]]()
+    history = SLAHistory(min_value=sla_min_value, max_value=sla_max_value)
+
+    # Use results from previous runs
+    for past_sla_value, path in _iter_sla_val_paths(base_path, sla_variable):
+        with path.open("rb") as f:
+            past_iter_data = json.load(f)
 
-    left: int = min_value
-    right: int = max_value
+        history[past_sla_value] = _compute_margin(sla_comb, past_iter_data)
 
-    while True:
-        val = (left + right) // 2
+    # NOTE: We don't use equality here to be more robust against noisy results
+    while history.get_max_passing() + 1 < history.get_min_failing():
+        if max(history, default=sla_min_value) < sla_max_value:
+            val = sla_max_value
+        elif min(history, default=sla_max_value) > sla_min_value:
+            val = sla_min_value
+        else:
+            spl = PchipInterpolator(*history.get_xy(), extrapolate=False)
+            spl_roots = spl.solve()
+            if len(spl_roots) == 0:
+                # Fallback to binary search
+                val = int((history.get_max_passing() + history.get_min_failing()) / 2)
+            else:
+                val = int(spl_roots[0])
+
+            if val in history:
+                # Cover both sides (floor and ceil) of the root to be sure
+                # that it is indeed the target value
+                val += 1
+
+        val = max(sla_min_value, min(val, sla_max_value))
         print(f"Testing {sla_variable}: {val} req/s")
 
         iter_data = run_sla(
@@ -224,31 +236,19 @@ def _find_sla_value(
             num_runs=num_runs,
             dry_run=dry_run,
         )
+        if iter_data is None:
+            return None
 
-        assert iter_data is not None
-        sla_data.extend(iter_data)
-
-        iter_data_mean = {
-            k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data)  # type: ignore
-            for k in sla_comb
-        }
-
-        sla_results = [
-            criterion.print_and_validate(iter_data_mean, k)
-            for k, criterion in sla_comb.items()
-        ]
-
-        if all(sla_results):
-            print("SLA criteria are met.")
-            left = val
+        margin = _compute_margin(sla_comb, iter_data)
+        if margin <= 0:
+            print(f"SLA criteria are met. ({margin=:.2f})")
         else:
-            print("SLA criteria are not met.")
-            right = val
+            print(f"SLA criteria are not met. ({margin=:.2f})")
 
-        if right - left <= 1:
-            break
+        sla_data.extend(iter_data)
+        history[val] = margin
 
-    return sla_data, left
+    return sla_data, history
 
 
 def search_sla(
@@ -259,7 +259,6 @@ def search_sla(
     bench_comb: ParameterSweepItem,
     sla_comb: SLASweepItem,
     sla_variable: SLAVariable,
-    sla_inf_value: int = 65536,  # The value that represents infinite QPS
     base_path: Path,
     num_runs: int,
     dry_run: bool,
@@ -267,28 +266,7 @@ def search_sla(
     print("[SLA START]")
     print(f"SLA criteria: {sla_comb.as_text()}")
 
-    sla_data_0 = run_sla(
-        server,
-        bench_cmd,
-        serve_comb=serve_comb,
-        bench_comb=bench_comb | {sla_variable: sla_inf_value},
-        iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value),
-        num_runs=num_runs,
-        dry_run=dry_run,
-    )
-    if sla_data_0 is None:
-        assert dry_run
-        print("Omitting SLA search.")
-        print("[SLA END]")
-        return None
-
-    sla_init_value = math.ceil(
-        sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0)
-        / len(sla_data_0)
-    )
-    print(f"Initial {sla_variable} to search: {sla_init_value} req/s.")
-
-    sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds(
+    result = solve_sla(
         server,
         bench_cmd,
         serve_comb=serve_comb,
@@ -298,26 +276,15 @@ def search_sla(
         num_runs=num_runs,
         dry_run=dry_run,
         sla_variable=sla_variable,
-        init_value=sla_init_value,
-        max_value=sla_inf_value,
-    )
-    print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.")
-
-    sla_data_2, sla_value = _find_sla_value(
-        server,
-        bench_cmd,
-        serve_comb=serve_comb,
-        bench_comb=bench_comb,
-        sla_comb=sla_comb,
-        base_path=base_path,
-        num_runs=num_runs,
-        dry_run=dry_run,
-        sla_variable=sla_variable,
-        min_value=sla_min,
-        max_value=sla_max,
     )
+    if result is None:
+        assert dry_run
+        print("Omitting SLA search.")
+        print("[SLA END]")
+        return
 
-    sla_data = sla_data_0 + sla_data_1 + sla_data_2
+    sla_data, sla_history = result
+    sla_value = sla_history.get_max_passing()
     print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.")
 
     with _get_sla_iter_path(
diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py
index 327e3c7c5897a7ce3f512243ab7d501cecea38d1..0a780860df270f9018a900696c51bf7fd8ad774a 100644
--- a/vllm/benchmarks/sweep/sla_sweep.py
+++ b/vllm/benchmarks/sweep/sla_sweep.py
@@ -7,39 +7,45 @@ from dataclasses import dataclass
 
 from typing_extensions import override
 
+SLA_EPS = 1e-8
+"""Offset used to differentiate margins for equality checks."""
+
 
 @dataclass
 class SLACriterionBase(ABC):
     target: float
 
     @abstractmethod
-    def validate(self, actual: float) -> bool:
-        """Return `True` if this criterion is met; otherwise `False`."""
+    def compute_margin(self, actual: float) -> float:
+        """
+        Return a negative value or `0` if this criterion is met;
+        otherwise a positive value indicating the distance to the target.
+        """
         raise NotImplementedError
 
     @abstractmethod
     def format_cond(self, lhs: str) -> str:
         raise NotImplementedError
 
-    def print_and_validate(
+    def print_and_compute_margin(
         self,
         metrics: dict[str, float],
         metrics_key: str,
-    ) -> bool:
+    ) -> float:
         metric = metrics[metrics_key]
-        result = self.validate(metric)
+        margin = self.compute_margin(metric)
 
         cond = self.format_cond(f"{metrics_key} = {metric:.2f}")
-        print(f"Validating SLA: {cond} | " + ("PASSED" if result else "FAILED"))
+        print(f"Validating SLA: {cond} | " + ("PASSED" if margin <= 0 else "FAILED"))
 
-        return result
+        return margin
 
 
 @dataclass
 class SLALessThan(SLACriterionBase):
     @override
-    def validate(self, actual: float) -> bool:
-        return actual < self.target
+    def compute_margin(self, actual: float) -> float:
+        return actual + SLA_EPS - self.target
 
     @override
     def format_cond(self, lhs: str) -> str:
@@ -49,8 +55,8 @@ class SLALessThan(SLACriterionBase):
 @dataclass
 class SLALessThanOrEqualTo(SLACriterionBase):
     @override
-    def validate(self, actual: float) -> bool:
-        return actual <= self.target
+    def compute_margin(self, actual: float) -> float:
+        return actual - self.target
 
     @override
     def format_cond(self, lhs: str) -> str:
@@ -60,8 +66,8 @@ class SLALessThanOrEqualTo(SLACriterionBase):
 @dataclass
 class SLAGreaterThan(SLACriterionBase):
     @override
-    def validate(self, actual: float) -> bool:
-        return actual > self.target
+    def compute_margin(self, actual: float) -> float:
+        return self.target + SLA_EPS - actual
 
     @override
     def format_cond(self, lhs: str) -> str:
@@ -71,8 +77,8 @@ class SLAGreaterThan(SLACriterionBase):
 @dataclass
 class SLAGreaterThanOrEqualTo(SLACriterionBase):
     @override
-    def validate(self, actual: float) -> bool:
-        return actual >= self.target
+    def compute_margin(self, actual: float) -> float:
+        return self.target - actual
 
     @override
     def format_cond(self, lhs: str) -> str:
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index d824e982b74891c1ad9ae8563b11ef993c4d83d9..3c0fea8e01118474ce6eeaee8ff34d53c5341987 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -24,10 +24,14 @@ from vllm.benchmarks.datasets import (
     MultiModalConversationDataset,
     PrefixRepetitionRandomDataset,
     RandomDataset,
+    RandomDatasetForReranking,
+    RandomMultiModalDataset,
     SampleRequest,
     ShareGPTDataset,
     SonnetDataset,
     VisionArenaDataset,
+    add_random_dataset_base_args,
+    add_random_multimodal_dataset_args,
 )
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
@@ -342,18 +346,33 @@ def get_requests(args, tokenizer):
         "lora_path": args.lora_path,
         "max_loras": args.max_loras,
         "num_requests": args.num_prompts,
-        "input_len": args.input_len,
-        "output_len": args.output_len,
     }
 
-    if args.dataset_path is None or args.dataset_name == "random":
+    if args.dataset_name == "random" or (
+        args.dataset_path is None
+        and args.dataset_name not in {"prefix_repetition", "random-mm", "random-rerank"}
+    ):
         sample_kwargs["range_ratio"] = args.random_range_ratio
-        sample_kwargs["prefix_len"] = args.prefix_len
+        # prefer random_* arguments, fall back to regular arguments
+        random_prefix_len = getattr(args, "random_prefix_len", None)
+        sample_kwargs["prefix_len"] = (
+            random_prefix_len if random_prefix_len is not None else args.prefix_len
+        )
+        random_input_len = getattr(args, "random_input_len", None)
+        sample_kwargs["input_len"] = (
+            random_input_len if random_input_len is not None else args.input_len
+        )
+        random_output_len = getattr(args, "random_output_len", None)
+        sample_kwargs["output_len"] = (
+            random_output_len if random_output_len is not None else args.output_len
+        )
         dataset_cls = RandomDataset
     elif args.dataset_name == "sharegpt":
         dataset_cls = ShareGPTDataset
         if args.backend == "vllm-chat":
             sample_kwargs["enable_multimodal_chat"] = True
+        if args.output_len is not None:
+            sample_kwargs["output_len"] = args.output_len
     elif args.dataset_name == "sonnet":
         assert tokenizer.chat_template or tokenizer.default_chat_template, (
             "Tokenizer/model must have chat template for sonnet dataset."
@@ -361,9 +380,15 @@ def get_requests(args, tokenizer):
         dataset_cls = SonnetDataset
         sample_kwargs["prefix_len"] = args.prefix_len
         sample_kwargs["return_prompt_formatted"] = True
+        if args.input_len is not None:
+            sample_kwargs["input_len"] = args.input_len
+        if args.output_len is not None:
+            sample_kwargs["output_len"] = args.output_len
     elif args.dataset_name == "burstgpt":
         dataset_cls = BurstGPTDataset
     elif args.dataset_name == "hf":
+        if args.output_len is not None:
+            sample_kwargs["output_len"] = args.output_len
         if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = VisionArenaDataset
             common_kwargs["dataset_subset"] = None
@@ -392,6 +417,56 @@ def get_requests(args, tokenizer):
         sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len
         sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes
         sample_kwargs["output_len"] = args.prefix_repetition_output_len
+    elif args.dataset_name == "random-mm":
+        dataset_cls = RandomMultiModalDataset
+        # prefer random_* arguments, fall back to regular arguments
+        random_input_len = getattr(args, "random_input_len", None)
+        sample_kwargs["input_len"] = (
+            random_input_len
+            if random_input_len is not None
+            else getattr(args, "input_len", None)
+        )
+        random_output_len = getattr(args, "random_output_len", None)
+        sample_kwargs["output_len"] = (
+            random_output_len
+            if random_output_len is not None
+            else getattr(args, "output_len", None)
+        )
+        sample_kwargs["base_items_per_request"] = getattr(
+            args, "random_mm_base_items_per_request", None
+        )
+        sample_kwargs["num_mm_items_range_ratio"] = getattr(
+            args, "random_mm_num_mm_items_range_ratio", None
+        )
+        sample_kwargs["limit_mm_per_prompt"] = getattr(
+            args, "random_mm_limit_mm_per_prompt", None
+        )
+        sample_kwargs["bucket_config"] = getattr(args, "random_mm_bucket_config", None)
+        sample_kwargs["enable_multimodal_chat"] = True
+        random_prefix_len = getattr(args, "random_prefix_len", None)
+        prefix_len = getattr(args, "prefix_len", None)
+        sample_kwargs["prefix_len"] = (
+            random_prefix_len if random_prefix_len is not None else prefix_len
+        )
+        sample_kwargs["range_ratio"] = args.random_range_ratio
+    elif args.dataset_name == "random-rerank":
+        dataset_cls = RandomDatasetForReranking
+        # prefer random_* arguments, fall back to regular arguments
+        random_input_len = getattr(args, "random_input_len", None)
+        sample_kwargs["input_len"] = (
+            random_input_len
+            if random_input_len is not None
+            else getattr(args, "input_len", None)
+        )
+        random_output_len = getattr(args, "random_output_len", None)
+        sample_kwargs["output_len"] = (
+            random_output_len
+            if random_output_len is not None
+            else getattr(args, "output_len", None)
+        )
+        sample_kwargs["batchsize"] = getattr(args, "random_batch_size", 1)
+        sample_kwargs["is_reranker"] = not getattr(args, "no_reranker", False)
+        sample_kwargs["range_ratio"] = args.random_range_ratio
     else:
         raise ValueError(f"Unknown dataset name: {args.dataset_name}")
     # Remove None values
@@ -448,8 +523,12 @@ def validate_args(args):
     ):
         print("When dataset path is not set, it will default to random dataset")
         args.dataset_name = "random"
-        if args.input_len is None:
-            raise ValueError("input_len must be provided for a random dataset")
+        random_input_len = getattr(args, "random_input_len", None)
+        if args.input_len is None and random_input_len is None:
+            raise ValueError(
+                "Either --input-len or --random-input-len must be provided "
+                "for a random dataset"
+            )
 
     # === Dataset Name Specific Checks ===
     # --hf-subset and --hf-split: only used
@@ -482,26 +561,79 @@ def validate_args(args):
         else:
             raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
 
-    # --random-range-ratio: only used when dataset_name is 'random'
-    if args.dataset_name != "random" and args.random_range_ratio is not None:
+    # --random-range-ratio: only used when dataset_name is 'random',
+    # 'random-mm', or 'random-rerank'
+    if (
+        args.dataset_name not in {"random", "random-mm", "random-rerank"}
+        and args.random_range_ratio is not None
+    ):
         warnings.warn(
             "--random-range-ratio will be ignored since \
-                --dataset-name is not 'random'.",
+                --dataset-name is not 'random', 'random-mm', or 'random-rerank'.",
             stacklevel=2,
         )
 
-    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
-    # set.
+    # --random-batch-size: only used when dataset_name is 'random-rerank'
     if (
-        args.dataset_name not in {"random", "sonnet", None}
+        args.dataset_name != "random-rerank"
+        and getattr(args, "random_batch_size", None) is not None
+    ) and args.random_batch_size != 1:
+        warnings.warn(
+            "--random-batch-size will be ignored since \
+                    --dataset-name is not 'random-rerank'.",
+            stacklevel=2,
+        )
+
+    # --no-reranker: only used when dataset_name is 'random-rerank'
+    if args.dataset_name != "random-rerank" and getattr(args, "no_reranker", False):
+        warnings.warn(
+            "--no-reranker will be ignored since \
+                --dataset-name is not 'random-rerank'.",
+            stacklevel=2,
+        )
+
+    # --prefix-len: only used when dataset_name is 'random', 'random-mm',
+    # 'sonnet', or not set.
+    if (
+        args.dataset_name not in {"random", "random-mm", "sonnet", None}
         and args.prefix_len is not None
     ):
         warnings.warn(
             "--prefix-len will be ignored since --dataset-name\
-                 is not 'random', 'sonnet', or not set.",
+                 is not 'random', 'random-mm', 'sonnet', or not set.",
             stacklevel=2,
         )
 
+    # === Random Dataset Argument Conflict Detection ===
+    # Check for conflicts between regular and random arguments when using
+    # random datasets
+    if args.dataset_name in {"random", "random-mm", "random-rerank"}:
+        random_input_len = getattr(args, "random_input_len", None)
+        random_output_len = getattr(args, "random_output_len", None)
+        random_prefix_len = getattr(args, "random_prefix_len", None)
+
+        if args.input_len is not None and random_input_len is not None:
+            warnings.warn(
+                "Both --input-len and --random-input-len are specified. "
+                "The random version (--random-input-len) will be preferred "
+                "in this run.",
+                stacklevel=2,
+            )
+        if args.output_len is not None and random_output_len is not None:
+            warnings.warn(
+                "Both --output-len and --random-output-len are specified. "
+                "The random version (--random-output-len) will be preferred "
+                "in this run.",
+                stacklevel=2,
+            )
+        if args.prefix_len is not None and random_prefix_len is not None:
+            warnings.warn(
+                "Both --prefix-len and --random-prefix-len are specified. "
+                "The random version (--random-prefix-len) will be preferred "
+                "in this run.",
+                stacklevel=2,
+            )
+
     # === LoRA Settings ===
     if getattr(args, "enable_lora", False) and args.backend != "vllm":
         raise ValueError("LoRA benchmarking is only supported for vLLM backend")
@@ -551,7 +683,16 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--dataset-name",
         type=str,
-        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition"],
+        choices=[
+            "sharegpt",
+            "random",
+            "sonnet",
+            "burstgpt",
+            "hf",
+            "prefix_repetition",
+            "random-mm",
+            "random-rerank",
+        ],
         help="Name of the dataset to benchmark on.",
         default="sharegpt",
     )
@@ -633,23 +774,19 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Number of fixed prefix tokens before the random "
         "context in a request (default: 0).",
     )
-    # random dataset
-    parser.add_argument(
-        "--random-range-ratio",
-        type=float,
-        default=0.0,
-        help="Range ratio for sampling input/output length, "
-        "used only for RandomDataset. Must be in the range [0, 1) to define "
-        "a symmetric sampling range "
-        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
-    )
 
     # hf dtaset
     parser.add_argument(
-        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
+        "--hf-subset",
+        type=str,
+        default=None,
+        help="Subset of the HF dataset.",
     )
     parser.add_argument(
-        "--hf-split", type=str, default=None, help="Split of the HF dataset."
+        "--hf-split",
+        type=str,
+        default=None,
+        help="Split of the HF dataset.",
     )
     parser.add_argument(
         "--profile",
@@ -659,31 +796,28 @@ def add_cli_args(parser: argparse.ArgumentParser):
     )
 
     # prefix repetition dataset
-    prefix_repetition_group = parser.add_argument_group(
-        "prefix repetition dataset options"
-    )
-    prefix_repetition_group.add_argument(
+    parser.add_argument(
         "--prefix-repetition-prefix-len",
         type=int,
         default=None,
         help="Number of prefix tokens per request, used only for prefix "
         "repetition dataset.",
     )
-    prefix_repetition_group.add_argument(
+    parser.add_argument(
         "--prefix-repetition-suffix-len",
         type=int,
         default=None,
         help="Number of suffix tokens per request, used only for prefix "
         "repetition dataset. Total input length is prefix_len + suffix_len.",
     )
-    prefix_repetition_group.add_argument(
+    parser.add_argument(
         "--prefix-repetition-num-prefixes",
         type=int,
         default=None,
         help="Number of prefixes to generate, used only for prefix repetition "
         "dataset. Prompts per prefix is num_requests // num_prefixes.",
     )
-    prefix_repetition_group.add_argument(
+    parser.add_argument(
         "--prefix-repetition-output-len",
         type=int,
         default=None,
@@ -691,6 +825,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "repetition dataset.",
     )
 
+    # (random, random-mm, random-rerank)
+    add_random_dataset_base_args(parser)
+    add_random_multimodal_dataset_args(parser)
+
     parser = AsyncEngineArgs.add_cli_args(parser)
 
 
diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py
index ad894d6e82768dad5a8ddd9535a9bcffd6f81bcf..bdd3433b9d27e9dbfcc2773c9d1f845e09a67c1d 100644
--- a/vllm/compilation/activation_quant_fusion.py
+++ b/vllm/compilation/activation_quant_fusion.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
+from typing import Any
 
 import torch
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
@@ -52,7 +53,7 @@ class ActivationQuantPattern(ABC):
     def __init__(
         self,
         quant_key: QuantKey,
-    ):
+    ) -> None:
         self.quant_key = quant_key
         self.quant_dtype = quant_key.dtype
 
@@ -68,12 +69,12 @@ class ActivationQuantPattern(ABC):
 
         self.silu_and_mul_matcher = MatcherSiluAndMul()
 
-    def empty_quant(self, *args, **kwargs):
+    def empty_quant(self, *args: Any, **kwargs: Any) -> torch.Tensor:
         kwargs = {"dtype": self.quant_dtype, "device": "cuda", **kwargs}
         return torch.empty(*args, **kwargs)
 
     @abstractmethod
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         raise NotImplementedError
 
 
@@ -82,15 +83,22 @@ class SiluMulFp8StaticQuantPattern(ActivationQuantPattern):
     Fusion for SiluMul+Fp8StaticQuant Pattern
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(kFp8StaticTensorSym)
         self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def get_inputs(self) -> list[torch.Tensor]:
+        scale = self.quant_matcher.inputs()[1]
+        return [
+            *self.silu_and_mul_matcher.inputs(),  # input
+            scale,
+        ]
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> torch.Tensor:
             result_silu_mul = self.silu_and_mul_matcher(input)
             result_quant = self.quant_matcher(result_silu_mul, scale)
             return result_quant[0]
@@ -98,7 +106,7 @@ class SiluMulFp8StaticQuantPattern(ActivationQuantPattern):
         def replacement(
             input: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> torch.Tensor:
             d = input.shape[-1] // 2
             output_shape = input.shape[:-1] + (d,)
             result = torch.empty(
@@ -109,13 +117,10 @@ class SiluMulFp8StaticQuantPattern(ActivationQuantPattern):
             )
             return at[1]
 
-        inputs = [
-            *self.silu_and_mul_matcher.inputs(),  # input
-            self.quant_matcher.inputs()[1],  # scale
-        ]
-        pattern(*inputs)
+        inps = self.get_inputs()
+        pattern(*inps)
 
-        register_replacement(pattern, replacement, inputs, fwd_only, pm_pass)
+        register_replacement(pattern, replacement, inps, fwd_only, pm_pass)
 
 
 class SiluMulNvfp4QuantPattern(ActivationQuantPattern):
@@ -123,16 +128,23 @@ class SiluMulNvfp4QuantPattern(ActivationQuantPattern):
     Fusion for SiluMul+Nvfp4Quant Pattern
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(kNvfp4Quant)
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def get_inputs(self) -> list[torch.Tensor]:
+        result = self.empty_quant(5, 32)
+        output_scale = empty_i32(128, 4)
+        input_ = empty_bf16(5, 64)
+        scale = empty_fp32(1, 1)
+        return [result, output_scale, input_, scale]
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             result: torch.Tensor,
             output_scale: torch.Tensor,
             input: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             result_silu_mul = self.silu_and_mul_matcher(input)
             at = auto_functionalized(
                 self.QUANT_OP,
@@ -148,7 +160,7 @@ class SiluMulNvfp4QuantPattern(ActivationQuantPattern):
             output_scale: torch.Tensor,
             input: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             at = auto_functionalized(
                 self.FUSED_OP,
                 result=result,
@@ -158,14 +170,7 @@ class SiluMulNvfp4QuantPattern(ActivationQuantPattern):
             )
             return at[1], at[2]
 
-        inputs = [
-            self.empty_quant(5, 32),  # result
-            empty_i32(128, 4),  # output_scale
-            empty_bf16(5, 64),  # input
-            empty_fp32(1, 1),  # scale
-        ]
-
-        register_replacement(pattern, replacement, inputs, fwd_only, pm_pass)
+        register_replacement(pattern, replacement, self.get_inputs(), fwd_only, pm_pass)
 
 
 class ActivationQuantFusionPass(VllmPatternMatcherPass):
@@ -179,7 +184,7 @@ class ActivationQuantFusionPass(VllmPatternMatcherPass):
     """
 
     @enable_fake_mode
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
         self.patterns: PatternMatcherPass = PatternMatcherPass(
@@ -196,11 +201,11 @@ class ActivationQuantFusionPass(VllmPatternMatcherPass):
         self.dump_patterns(config, self.patterns)
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: torch.fx.Graph):
+    def __call__(self, graph: torch.fx.Graph) -> None:
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", self.matched_count)
 
-    def uuid(self):
+    def uuid(self) -> str:
         return VllmInductorPass.hash_source(
             self,
             ActivationQuantPattern,
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index a1eec7d74483fd4104e3f65ae76311bc85c0ff3d..dec20cdc806ab64d998fb08707e638061eb7c226 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -9,7 +9,7 @@ import operator
 import os
 import pprint
 import time
-from collections.abc import Callable, Sequence
+from collections.abc import Callable, Generator, Sequence
 from contextlib import contextmanager
 from copy import deepcopy
 from functools import partial
@@ -90,7 +90,7 @@ class CompilerManager:
     support int as key.
     """
 
-    def __init__(self, compilation_config: CompilationConfig):
+    def __init__(self, compilation_config: CompilationConfig) -> None:
         self.cache: dict[tuple[Range, int, str], Any] = dict()
         self.is_cache_updated = False
         self.compilation_config = compilation_config
@@ -100,7 +100,7 @@ class CompilerManager:
         return self.compiler.compute_hash(vllm_config)
 
     @contextmanager
-    def compile_context(self, compile_range: Range):
+    def compile_context(self, compile_range: Range) -> Generator[None, None, None]:
         """Provide compilation context for the duration of compilation to set
         any torch global properties we want to scope to a single Inductor
         compilation (e.g. partition rules, pass context)."""
@@ -115,7 +115,7 @@ class CompilerManager:
 
     def initialize_cache(
         self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
-    ):
+    ) -> None:
         """
         Initialize the cache directory for the compiler.
 
@@ -143,7 +143,7 @@ class CompilerManager:
                 # do not use eval(), it is unsafe.
                 cache = ast.literal_eval(f.read())
 
-            def check_type(value, ty):
+            def check_type(value: Any, ty: type) -> None:
                 if not isinstance(value, ty):
                     raise TypeError(f"Expected {ty} but got {type(value)} for {value}")
 
@@ -165,7 +165,7 @@ class CompilerManager:
             cache_dir=cache_dir, disable_cache=disable_cache, prefix=prefix
         )
 
-    def save_to_file(self):
+    def save_to_file(self) -> None:
         if self.disable_cache or not self.is_cache_updated:
             return
         printer = pprint.PrettyPrinter(indent=4)
@@ -179,7 +179,7 @@ class CompilerManager:
         example_inputs: list[Any],
         graph_index: int,
         compile_range: Range,
-    ) -> Callable | None:
+    ) -> Callable[..., Any] | None:
         if (compile_range, graph_index, self.compiler.name) not in self.cache:
             return None
         handle = self.cache[(compile_range, graph_index, self.compiler.name)]
@@ -198,8 +198,8 @@ class CompilerManager:
     def compile(
         self,
         graph: fx.GraphModule,
-        example_inputs,
-        additional_inductor_config,
+        example_inputs: list[Any],
+        additional_inductor_config: dict[str, Any],
         compilation_config: CompilationConfig,
         compile_range: Range,
         graph_index: int = 0,
@@ -355,7 +355,7 @@ def split_graph(
 compilation_start_time = 0.0
 
 
-class PiecewiseCompileInterpreter(torch.fx.Interpreter):
+class PiecewiseCompileInterpreter(torch.fx.Interpreter):  # type: ignore[misc]
     """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.
     It runs the given graph with fake inputs, and compile some
     submodules specified by `compile_submod_names` with the given
@@ -373,7 +373,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
         compile_submod_names: list[str],
         vllm_config: VllmConfig,
         vllm_backend: "VllmBackend",
-    ):
+    ) -> None:
         super().__init__(module)
         from torch._guards import detect_fake_mode
 
@@ -385,7 +385,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
         # When True, it annoyingly dumps the torch.fx.Graph on errors.
         self.extra_traceback = False
 
-    def run(self, *args):
+    def run(self, *args: Any) -> Any:
         # maybe instead just assert inputs are fake?
         fake_args = [
             self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
@@ -467,7 +467,7 @@ model_is_encoder: bool = False
 
 
 @contextmanager
-def set_model_tag(tag: str, is_encoder: bool = False):
+def set_model_tag(tag: str, is_encoder: bool = False) -> Generator[None, None, None]:
     """Context manager to set the model tag."""
     global model_tag
     global model_is_encoder
@@ -506,9 +506,9 @@ class VllmBackend:
     # the stiching graph module for all the piecewise graphs
     split_gm: fx.GraphModule
     piecewise_graphs: list[SplitItem]
-    returned_callable: Callable
+    returned_callable: Callable[..., Any]
     # Inductor passes to run on the graph pre-defunctionalization
-    post_grad_passes: Sequence[Callable]
+    post_grad_passes: Sequence[Callable[..., Any]]
     sym_tensor_indices: list[int]
     input_buffers: list[torch.Tensor]
     compiler_manager: CompilerManager
@@ -520,7 +520,8 @@ class VllmBackend:
         self,
         vllm_config: VllmConfig,
         prefix: str = "",
-    ):
+        is_encoder: bool = False,
+    ) -> None:
         # if the model is initialized with a non-empty prefix,
         # then usually it's enough to use that prefix,
         # e.g. language_model, vision_model, etc.
@@ -530,7 +531,7 @@ class VllmBackend:
         self.prefix = prefix or model_tag
 
         # Mark compilation for encoder.
-        self.is_encoder = model_is_encoder
+        self.is_encoder = is_encoder or model_is_encoder
 
         # Passes to run on the graph post-grad.
         self.pass_manager = resolve_obj_by_qualname(
@@ -557,7 +558,7 @@ class VllmBackend:
         # `torch.compile` is JIT compiled, so we don't need to
         # do anything here
 
-    def configure_post_pass(self):
+    def configure_post_pass(self) -> None:
         self.pass_manager.configure(self.vllm_config)
 
         # Post-grad custom passes are run using the post_grad_custom_post_pass
@@ -579,7 +580,7 @@ class VllmBackend:
         self.inductor_config[self.pass_key] = self.pass_manager
 
     def __call__(
-        self, graph: fx.GraphModule, example_inputs
+        self, graph: fx.GraphModule, example_inputs: Sequence[Any]
     ) -> VllmSerializableFunction:
         vllm_config = self.vllm_config
         # Minimal hashing here with existing utilities, reused below.
@@ -605,7 +606,7 @@ class VllmBackend:
             try:
                 with open(filepath) as f:
                     hash_content.append(f.read())
-            except Exception:
+            except OSError:
                 logger.warning("Failed to read file %s", filepath)
                 continue
         code_hash = hashlib.sha256("\n".join(hash_content).encode()).hexdigest()
@@ -629,7 +630,7 @@ class VllmBackend:
         os.makedirs(cache_dir, exist_ok=True)
         self.compilation_config.cache_dir = cache_dir
         rank = vllm_config.parallel_config.rank
-        dp_rank = vllm_config.parallel_config.data_parallel_rank
+        dp_rank = vllm_config.parallel_config.data_parallel_index
         local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}", self.prefix)
         os.makedirs(local_cache_dir, exist_ok=True)
         self.compilation_config.local_cache_dir = local_cache_dir
@@ -797,7 +798,7 @@ class VllmBackend:
             or not self.compilation_config.cudagraph_copy_inputs
         ):
             return VllmSerializableFunction(
-                graph, example_inputs, self.prefix, self.split_gm
+                graph, example_inputs, self.prefix, self.split_gm, self.is_encoder
             )
 
         # index of tensors that have symbolic shapes (batch size)
@@ -820,7 +821,7 @@ class VllmBackend:
         ]
 
         # this is the callable we return to Dynamo to run
-        def copy_and_call(*args):
+        def copy_and_call(*args: Any) -> Any:
             list_args = list(args)
             for i, index in enumerate(self.sym_tensor_indices):
                 runtime_tensor = list_args[index]
@@ -835,5 +836,5 @@ class VllmBackend:
             return self.split_gm(*list_args)
 
         return VllmSerializableFunction(
-            graph, example_inputs, self.prefix, copy_and_call
+            graph, example_inputs, self.prefix, copy_and_call, self.is_encoder
         )
diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py
index ce482572b401b2684cbca436091b16c5af719ec5..3d945e2ddd5fe8cbbc70abc480cb56387dd3779e 100644
--- a/vllm/compilation/caching.py
+++ b/vllm/compilation/caching.py
@@ -4,6 +4,8 @@
 import inspect
 import os
 import pickle
+from collections.abc import Callable, Sequence
+from typing import Any, Literal
 from unittest.mock import patch
 
 import torch
@@ -25,7 +27,7 @@ assert isinstance(SerializableCallable, type)
 logger = init_logger(__name__)
 
 
-class VllmSerializableFunction(SerializableCallable):
+class VllmSerializableFunction(SerializableCallable):  # type: ignore[misc]
     """
     A wrapper around a compiled function by vllm. It will forward the tensor
     inputs to the compiled function and return the result.
@@ -37,12 +39,20 @@ class VllmSerializableFunction(SerializableCallable):
     serializing the Dynamo fx graph plus example inputs.
     """
 
-    def __init__(self, graph_module, example_inputs, prefix, optimized_call):
+    def __init__(
+        self,
+        graph_module: torch.fx.GraphModule,
+        example_inputs: Sequence[Any],
+        prefix: str,
+        optimized_call: Callable[..., Any],
+        is_encoder: bool = False,
+    ) -> None:
         assert isinstance(graph_module, torch.fx.GraphModule)
         self.graph_module = graph_module
         self.example_inputs = example_inputs
         self.prefix = prefix
         self.optimized_call = optimized_call
+        self.is_encoder = is_encoder
         self.shape_env = None
         sym_input = next(
             (i for i in self.example_inputs if isinstance(i, torch.SymInt)), None
@@ -50,7 +60,7 @@ class VllmSerializableFunction(SerializableCallable):
         if sym_input is not None:
             self.shape_env = sym_input.node.shape_env
 
-    def __call__(self, *args, **kwargs):
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
         return self.optimized_call(*args, **kwargs)
 
     @classmethod
@@ -70,7 +80,9 @@ class VllmSerializableFunction(SerializableCallable):
 
         graph_reducer_override = GraphPickler.reducer_override
 
-        def _graph_reducer_override(self, obj):
+        def _graph_reducer_override(
+            self: GraphPickler, obj: Any
+        ) -> tuple[Callable[..., Any], tuple[Any, ...]] | Any:
             if (
                 inspect.isclass(obj)
                 and issubclass(obj, sympy.Function)
@@ -104,10 +116,14 @@ class VllmSerializableFunction(SerializableCallable):
         state = pickle.loads(data)
         fake_mode = FakeTensorMode(shape_env=ShapeEnv())
         state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode)
+        state["graph_module"].recompile()
         state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode)
-        vllm_backend = VllmBackend(get_current_vllm_config(), state["prefix"])
+        is_encoder = state.get("is_encoder", False)
+        vllm_backend = VllmBackend(
+            get_current_vllm_config(), state["prefix"], is_encoder
+        )
 
-        def optimized_call(*example_inputs):
+        def optimized_call(*example_inputs: Any) -> Any:
             """
             On the first run of the optimized call, we rerun the compiler
             backend which should result in a cache hit. After the backend
@@ -129,7 +145,7 @@ class VllmSerializableFunction(SerializableCallable):
         return fn
 
     @property
-    def co_name(self):
+    def co_name(self) -> Literal["VllmSerializableFunction"]:
         """
         Used for depyf debugging.
         """
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 57bd94c7e8ad63addbb904dfe4e6885fb17c4e1d..4200071310acd8a70a0a73ff750c49f56c78143d 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from importlib.util import find_spec
+from types import ModuleType
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -33,15 +34,15 @@ if find_spec("flashinfer"):
     try:
         import flashinfer.comm as flashinfer_comm
 
-        flashinfer_comm = (
+        flashinfer_comm: ModuleType | None = (  # type: ignore[no-redef]
             flashinfer_comm
             if hasattr(flashinfer_comm, "trtllm_allreduce_fusion")
             else None
         )
     except ImportError:
-        flashinfer_comm = None
+        flashinfer_comm = None  # type: ignore[assignment]
 else:
-    flashinfer_comm = None
+    flashinfer_comm = None  # type: ignore[assignment]
 
 logger = init_logger(__name__)
 
@@ -50,7 +51,7 @@ if hasattr(torch.ops._C, "scaled_fp4_quant"):
 
 
 class BasePattern:
-    def __init__(self, dtype: torch.dtype, device: str):
+    def __init__(self, dtype: torch.dtype, device: str | None) -> None:
         self.dtype = dtype
         self.device = device
         self.tp = get_tp_group()
@@ -58,13 +59,13 @@ class BasePattern:
 
 
 class GEMMReduceScatterPattern(BasePattern):
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         mul = torch.empty([16, 4], device=self.device, dtype=self.dtype)
         mm_weight = torch.empty([4, 4], device=self.device, dtype=self.dtype)
         return [mul, mm_weight]
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def pattern(mul: torch.Tensor, mm_weight: torch.Tensor):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(mul: torch.Tensor, mm_weight: torch.Tensor) -> torch.Tensor:
             mm = torch.ops.aten.mm.default(mul, mm_weight)
             reduce_scatter = torch.ops.vllm.reduce_scatter.default(
                 mm,
@@ -74,7 +75,7 @@ class GEMMReduceScatterPattern(BasePattern):
             )
             return reduce_scatter
 
-        def replacement(mul: torch.Tensor, mm_weight: torch.Tensor):
+        def replacement(mul: torch.Tensor, mm_weight: torch.Tensor) -> torch.Tensor:
             gemm_rs = torch.ops.symm_mem.fused_matmul_reduce_scatter(
                 mul,
                 mm_weight,
@@ -91,17 +92,17 @@ class GEMMReduceScatterPattern(BasePattern):
 
 
 class AllGatherGEMMPattern(BasePattern):
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         x = torch.empty([4, 4], device=self.device, dtype=self.dtype)
         weight = torch.empty([4, 4], device=self.device, dtype=self.dtype)
 
         return [x, weight]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             x: torch.Tensor,
             weight: torch.Tensor,
-        ) -> tuple[torch.Tensor, torch.Tensor]:
+        ) -> torch.Tensor:
             all_gather = torch.ops.vllm.all_gather.default(
                 x,
                 dim=0,
@@ -111,9 +112,7 @@ class AllGatherGEMMPattern(BasePattern):
 
             return torch.ops.aten.mm.default(all_gather, weight)
 
-        def replacement(
-            x: torch.Tensor, weight: torch.Tensor
-        ) -> tuple[torch.Tensor, torch.Tensor]:
+        def replacement(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
             ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_matmul(
                 x,
                 [weight],
@@ -128,7 +127,7 @@ class AllGatherGEMMPattern(BasePattern):
 
 
 class ScaledMMReduceScatterPattern(BasePattern):
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         input = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
         mm_weight = (
             torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
@@ -139,7 +138,7 @@ class ScaledMMReduceScatterPattern(BasePattern):
         scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32)
         return [input, mm_weight, scale_a, scale_b]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             mat2: torch.Tensor,
@@ -196,7 +195,7 @@ class ScaledMMReduceScatterPattern(BasePattern):
 
 
 class AllGatherScaledMMPattern(BasePattern):
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
         weight = (
             torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
@@ -211,7 +210,7 @@ class AllGatherScaledMMPattern(BasePattern):
 
         return [x, weight, scale_a, scale_b]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             x: torch.Tensor,
             weight: torch.Tensor,
@@ -258,7 +257,7 @@ class AllGatherScaledMMPattern(BasePattern):
 
 
 class CutlassScaledMMReduceScatterPattern(BasePattern):
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         input = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
         mm_weight = (
             torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
@@ -271,7 +270,7 @@ class CutlassScaledMMReduceScatterPattern(BasePattern):
         cutlass_mm_output = torch.empty([16, 16], device=self.device, dtype=self.dtype)
         return [input, mm_weight, scale_a, scale_b, cutlass_mm_output]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             weight: torch.Tensor,
@@ -331,7 +330,7 @@ class CutlassScaledMMReduceScatterPattern(BasePattern):
 
 
 class AllGatherCutlassScaledMMPattern(BasePattern):
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
         weight = (
             torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
@@ -349,7 +348,7 @@ class AllGatherCutlassScaledMMPattern(BasePattern):
 
         return [x, weight, scale_a, scale_b, output]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             x: torch.Tensor,
             weight: torch.Tensor,
@@ -400,7 +399,7 @@ class AllGatherCutlassScaledMMPattern(BasePattern):
 
 class AsyncTPPass(VllmPatternMatcherPass):
     @enable_fake_mode
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
         # Enable symmetric memory for the TP process group
@@ -445,7 +444,7 @@ class AsyncTPPass(VllmPatternMatcherPass):
         return compile_range.is_single_size() and compile_range.end % tp_size == 0
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: fx.Graph):
+    def __call__(self, graph: fx.Graph) -> None:
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", self.matched_count)
 
@@ -512,11 +511,13 @@ if flashinfer_comm is not None:
             f"max token num {max_token_num} * hidden size {hidden_size} * "
             f"element size {element_size}"
         )
-        device_capability = current_platform.get_device_capability().to_int()
+        curr_device = current_platform.get_device_capability()
+        device_capability = curr_device.to_int() if curr_device is not None else None
         # Get one shot input size limit for the current world size
         # for the current device capability
         max_one_shot_size = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB.get(
-            device_capability, {}
+            device_capability,  # type: ignore[arg-type]
+            {},
         ).get(world_size, None)
         # Use one shot if no max size is specified
         use_oneshot = (
@@ -606,7 +607,7 @@ class FlashInferFusedAllReduceParams:
         world_size: int,
         use_fp32_lamport: bool = False,
         max_token_num: int = 1024,
-    ):
+    ) -> None:
         self.rank = rank
         self.world_size = world_size
         self.use_fp32_lamport = use_fp32_lamport
@@ -615,7 +616,7 @@ class FlashInferFusedAllReduceParams:
         self.fp32_acc = True
         self.max_token_num = max_token_num
 
-    def get_trtllm_fused_allreduce_kwargs(self):
+    def get_trtllm_fused_allreduce_kwargs(self) -> dict[str, bool | int]:
         return {
             "world_rank": self.rank,
             "world_size": self.world_size,
@@ -637,28 +638,32 @@ class AllReduceRMSNormPattern(BasePattern):
         self,
         epsilon: float,
         dtype: torch.dtype,
-        device: str,
+        device: str | None,
         allreduce_params: FlashInferFusedAllReduceParams,
-    ):
+    ) -> None:
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
         self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
 
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         input, weight = self.rmsnorm_matcher.inputs()
 
         # input goes through allreduce first, always 16-bit
         return [input.to(self.dtype), weight]
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def pattern(input: torch.Tensor, weight: torch.Tensor):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             allreduce_output = tensor_model_parallel_all_reduce(input)
             rms = self.rmsnorm_matcher(allreduce_output, weight)
 
             return rms, allreduce_output
 
-        def replacement(input: torch.Tensor, weight: torch.Tensor):
+        def replacement(
+            input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             residual = torch.zeros_like(input)
             rms_result = torch.empty_like(input)
             allreduce = auto_functionalized(
@@ -692,29 +697,31 @@ class AllReduceFusedAddRMSNormPattern(BasePattern):
         self,
         epsilon: float,
         dtype: torch.dtype,
-        device: str,
+        device: str | None,
         allreduce_params: FlashInferFusedAllReduceParams,
-    ):
+    ) -> None:
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
         self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         input, residual, weight = self.rmsnorm_matcher.inputs()
 
         # input goes through allreduce first, always 16-bit
         return [residual, input.to(self.dtype), weight]
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def pattern(residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             allreduce_output = tensor_model_parallel_all_reduce(input)
             rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
             return rms, residual
 
         def replacement(
             residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
                 allreduce_in=input,
@@ -739,8 +746,8 @@ class AllReduceFusedAddRMSNormPattern(BasePattern):
         first_return_only = lambda fn: lambda a, b, c: fn(a, b, c)[0]
 
         pm.register_replacement(
-            first_return_only(pattern),
-            first_return_only(replacement),
+            first_return_only(pattern),  # type: ignore[no-untyped-call]
+            first_return_only(replacement),  # type: ignore[no-untyped-call]
             self.get_inputs(),
             pm.fwd_only,
             pm_pass,
@@ -759,9 +766,9 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
         self,
         epsilon: float,
         dtype: torch.dtype,
-        device: str,
+        device: str | None,
         allreduce_params: FlashInferFusedAllReduceParams,
-    ):
+    ) -> None:
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
@@ -769,25 +776,27 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
         self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
         self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def get_inputs():
-            input, weight = self.rmsnorm_matcher.inputs()
-            _, scale = self.quant_matcher.inputs()
+    def get_inputs(self) -> list[torch.Tensor]:
+        input, weight = self.rmsnorm_matcher.inputs()
+        _, scale = self.quant_matcher.inputs()
 
-            # input goes through allreduce first, always 16-bit
-            return [input.to(self.dtype), weight, scale]
+        # input goes through allreduce first, always 16-bit
+        return [input.to(self.dtype), weight, scale]
 
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             all_reduce = tensor_model_parallel_all_reduce(input)
             rms = self.rmsnorm_matcher(all_reduce, weight)
             quant, _ = self.quant_matcher(rms, scale)
             return quant, all_reduce
 
-        def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
+        def replacement(
+            input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             residual = torch.zeros_like(input)
             result_rms = torch.empty_like(input)
             result_quant = torch.empty_like(input, dtype=self.quant_dtype)
@@ -812,7 +821,7 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
             return allreduce[4], allreduce[1]
 
         pm.register_replacement(
-            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
         )
 
 
@@ -828,9 +837,9 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
         self,
         epsilon: float,
         dtype: torch.dtype,
-        device: str,
+        device: str | None,
         allreduce_params: FlashInferFusedAllReduceParams,
-    ):
+    ) -> None:
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
@@ -839,20 +848,20 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
         self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
         self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def get_inputs():
-            input, residual, weight = self.rmsnorm_matcher.inputs()
-            _, scale = self.quant_matcher.inputs()
+    def get_inputs(self) -> list[torch.Tensor]:
+        input, residual, weight = self.rmsnorm_matcher.inputs()
+        _, scale = self.quant_matcher.inputs()
 
-            # input goes through allreduce first, always 16-bit
-            return [residual, input.to(self.dtype), weight, scale]
+        # input goes through allreduce first, always 16-bit
+        return [residual, input.to(self.dtype), weight, scale]
 
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             residual: torch.Tensor,
             input: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             allreduce_output = tensor_model_parallel_all_reduce(input)
             rms, res = self.rmsnorm_matcher(allreduce_output, weight, residual)
             quant, _ = self.quant_matcher(rms, scale)
@@ -864,7 +873,7 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
             input: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             result_quant = torch.empty_like(input, dtype=self.quant_dtype)
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
@@ -886,7 +895,7 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
             return allreduce[4], allreduce[2]
 
         pm.register_replacement(
-            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
         )
 
 
@@ -902,33 +911,33 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
         self,
         epsilon: float,
         dtype: torch.dtype,
-        device: str,
+        device: str | None,
         allreduce_params: FlashInferFusedAllReduceParams,
-    ):
+    ) -> None:
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
         self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def get_inputs():
-            input = torch.empty([1, 16, 16], device=self.device, dtype=self.dtype)
-            quant_result = torch.empty((16, 8), device=self.device, dtype=torch.uint8)
-            input_global_scale = torch.empty(
-                [1, 1], device=self.device, dtype=torch.float32
-            )
-            weight = torch.empty([16], device=self.device, dtype=self.dtype)
-            output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)
+    def get_inputs(self) -> list[torch.Tensor]:
+        input = torch.empty([1, 16, 16], device=self.device, dtype=self.dtype)
+        quant_result = torch.empty((16, 8), device=self.device, dtype=torch.uint8)
+        input_global_scale = torch.empty(
+            [1, 1], device=self.device, dtype=torch.float32
+        )
+        weight = torch.empty([16], device=self.device, dtype=self.dtype)
+        output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)
 
-            return [input, quant_result, weight, input_global_scale, output_scale]
+        return [input, quant_result, weight, input_global_scale, output_scale]
 
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             quant_result: torch.Tensor,
             weight: torch.Tensor,
             input_global_scale: torch.Tensor,
             output_scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             all_reduce = tensor_model_parallel_all_reduce(input)
             rms = self.rmsnorm_matcher(all_reduce, weight)
             quant_out_tuple = auto_functionalized(
@@ -948,7 +957,7 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
             weight: torch.Tensor,
             input_global_scale: torch.Tensor,
             output_scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             residual = torch.zeros_like(input)
             result_rms = torch.empty_like(input)
             allreduce = auto_functionalized(
@@ -972,7 +981,7 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
             return allreduce[4], allreduce[1], allreduce[5]
 
         pm.register_replacement(
-            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
         )
 
 
@@ -988,35 +997,35 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
         self,
         epsilon: float,
         dtype: torch.dtype,
-        device: str,
+        device: str | None,
         allreduce_params: FlashInferFusedAllReduceParams,
-    ):
+    ) -> None:
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
         self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def get_inputs():
-            input = torch.empty([16, 16], device=self.device, dtype=self.dtype)
-
-            residual = torch.empty([16, 16], device=self.device, dtype=self.dtype)
-            weight = torch.empty([16, 16], device=self.device, dtype=self.dtype)
-            quant_result = torch.empty((16, 8), device=self.device, dtype=torch.uint8)
-            input_global_scale = torch.empty(
-                [1, 1], device=self.device, dtype=torch.float32
-            )
-            output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)
-
-            return [
-                quant_result,
-                residual,
-                input,
-                output_scale,
-                weight,
-                input_global_scale,
-            ]
+    def get_inputs(self) -> list[torch.Tensor]:
+        input = torch.empty([16, 16], device=self.device, dtype=self.dtype)
 
+        residual = torch.empty([16, 16], device=self.device, dtype=self.dtype)
+        weight = torch.empty([16, 16], device=self.device, dtype=self.dtype)
+        quant_result = torch.empty((16, 8), device=self.device, dtype=torch.uint8)
+        input_global_scale = torch.empty(
+            [1, 1], device=self.device, dtype=torch.float32
+        )
+        output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)
+
+        return [
+            quant_result,
+            residual,
+            input,
+            output_scale,
+            weight,
+            input_global_scale,
+        ]
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             quant_result: torch.Tensor,
             residual: torch.Tensor,
@@ -1024,7 +1033,7 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
             output_scale: torch.Tensor,
             weight: torch.Tensor,
             input_global_scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             allreduce_output = tensor_model_parallel_all_reduce(input)
             rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
             quant_out_tuple = auto_functionalized(
@@ -1045,7 +1054,7 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
             output_scale: torch.Tensor,
             weight: torch.Tensor,
             input_global_scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
                 allreduce_in=input,
@@ -1066,12 +1075,12 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
             return allreduce[4], allreduce[2], allreduce[5]
 
         pm.register_replacement(
-            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
         )
 
 
 class AllReduceFusionPass(VllmPatternMatcherPass):
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
         self.disabled = True
         self.tp_size = get_tensor_model_parallel_world_size()
@@ -1122,7 +1131,7 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         )
 
         self.ipc_handles, workspace_tensor = (
-            flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
+            flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(  # type: ignore[misc]
                 tp_rank=rank,
                 tp_size=self.tp_size,
                 max_token_num=self.max_token_num,
@@ -1145,7 +1154,7 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         self.dump_patterns(config, self.patterns)
 
     @enable_fake_mode
-    def register_patterns(self):
+    def register_patterns(self) -> None:
         for epsilon in [1e-5, 1e-6]:
             AllReduceFusedRMSNormStaticQuantFP8Pattern(
                 epsilon,
@@ -1198,7 +1207,7 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         return compile_range.end <= self.max_token_num
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: fx.Graph):
+    def __call__(self, graph: fx.Graph) -> None:
         if self.disabled:
             logger.debug("AllReduceFusionPass disabled")
             return
@@ -1206,7 +1215,7 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", self.matched_count)
 
-    def __del__(self):
+    def __del__(self) -> None:
         if getattr(self, "disabled", True):
             return
         if flashinfer_comm is not None:
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index ab56d3561c569f3d189bb6f6aac604ddc007d271..bb478fceb125bec115948fcfd452b2216a57dad8 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -31,7 +31,7 @@ class CompilerInterface:
 
     def initialize_cache(
         self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
-    ):
+    ) -> None:
         """
         when the vLLM process uses `cache_dir` as the cache directory,
         the compiler should initialize itself with the cache directory,
@@ -66,7 +66,7 @@ class CompilerInterface:
         compiler_config: dict[str, Any],
         compile_range: Range,
         key: str | None = None,
-    ) -> tuple[Callable | None, Any | None]:
+    ) -> tuple[Callable[..., Any] | None, Any | None]:
         """
         Compile the graph with the given example inputs and compiler config,
         with a range. The `compile_range` specifies the range of the inputs,
@@ -100,7 +100,7 @@ class CompilerInterface:
         example_inputs: list[Any],
         graph_index: int,
         compile_range: Range,
-    ) -> Callable:
+    ) -> Callable[..., Any]:
         """
         Load the compiled function from the handle.
         Raises an error if the handle is invalid.
@@ -138,13 +138,13 @@ class AlwaysHitShapeEnv:
     def __init__(self) -> None:
         self.guards: list[Any] = []
 
-    def evaluate_guards_expression(self, *args, **kwargs):
+    def evaluate_guards_expression(self, *args: Any, **kwargs: Any) -> Literal[True]:
         return True
 
-    def get_pruned_guards(self, *args, **kwargs):
+    def get_pruned_guards(self, *args: Any, **kwargs: Any) -> list[Any]:
         return []
 
-    def produce_guards_expression(self, *args, **kwargs):
+    def produce_guards_expression(self, *args: Any, **kwargs: Any) -> Literal[""]:
         return ""
 
 
@@ -193,7 +193,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
 
     name = "inductor_standalone"
 
-    def __init__(self, save_format: Literal["binary", "unpacked"]):
+    def __init__(self, save_format: Literal["binary", "unpacked"]) -> None:
         self.save_format = save_format
 
     def compute_hash(self, vllm_config: VllmConfig) -> str:
@@ -205,7 +205,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
 
     def initialize_cache(
         self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
-    ):
+    ) -> None:
         self.cache_dir = cache_dir
 
     def compile(
@@ -215,7 +215,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
         compiler_config: dict[str, Any],
         compile_range: Range,
         key: str | None = None,
-    ) -> tuple[Callable | None, Any | None]:
+    ) -> tuple[Callable[..., Any] | None, Any | None]:
         compilation_counter.num_inductor_compiles += 1
         current_config = {}
         if compiler_config is not None:
@@ -252,7 +252,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
         example_inputs: list[Any],
         graph_index: int,
         compile_range: Range,
-    ) -> Callable:
+    ) -> Callable[..., Any]:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
         assert isinstance(handle[1], str)
@@ -264,7 +264,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
 
         returns_tuple = graph_returns_tuple(graph)
 
-        def compiled_graph_wrapper(*args):
+        def compiled_graph_wrapper(*args: Any) -> tuple[Any, ...] | Any:
             graph_output = inductor_compiled_graph(*args)
             # unpack the tuple if needed
             # TODO(rzou): the implication is that we're not
@@ -293,7 +293,7 @@ class InductorAdaptor(CompilerInterface):
 
     def initialize_cache(
         self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
-    ):
+    ) -> None:
         self.cache_dir = cache_dir
         self.prefix = prefix
         self.base_cache_dir = cache_dir[: -len(prefix)] if prefix else cache_dir
@@ -317,7 +317,7 @@ class InductorAdaptor(CompilerInterface):
         compiler_config: dict[str, Any],
         compile_range: Range,
         key: str | None = None,
-    ) -> tuple[Callable | None, Any | None]:
+    ) -> tuple[Callable[..., Any] | None, Any | None]:
         compilation_counter.num_inductor_compiles += 1
         from torch._inductor.compile_fx import compile_fx
 
@@ -348,7 +348,7 @@ class InductorAdaptor(CompilerInterface):
             original_load = FxGraphCache.load
             original_load_name = "torch._inductor.codecache.FxGraphCache.load"
 
-            def hijack_load(*args, **kwargs):
+            def hijack_load(*args: Any, **kwargs: Any) -> Any:
                 inductor_compiled_graph = original_load(*args, **kwargs)
                 nonlocal file_path
                 compiled_fn = inductor_compiled_graph.current_callable
@@ -375,7 +375,7 @@ class InductorAdaptor(CompilerInterface):
             # function renamed in 2.6
             original_load_name = None
 
-            def hijacked_compile_fx_inner(*args, **kwargs):
+            def hijacked_compile_fx_inner(*args: Any, **kwargs: Any) -> Any:
                 output = torch._inductor.compile_fx.compile_fx_inner(*args, **kwargs)
                 nonlocal hash_str
                 inductor_compiled_graph = output
@@ -401,13 +401,13 @@ class InductorAdaptor(CompilerInterface):
                     hash_str = inductor_compiled_graph._fx_graph_cache_key
                 return output
 
-        def hijack_compiled_fx_graph_hash(*args, **kwargs):
+        def hijack_compiled_fx_graph_hash(*args: Any, **kwargs: Any) -> Any:
             out = compiled_fx_graph_hash(*args, **kwargs)
             nonlocal hash_str
             hash_str = out[0]
             return out
 
-        def _check_can_cache(*args, **kwargs):
+        def _check_can_cache(*args: Any, **kwargs: Any) -> None:
             # no error means it can be cached.
             # Inductor refuses to cache the graph outside of Dynamo
             # tracing context, and also disables caching for graphs
@@ -513,7 +513,7 @@ class InductorAdaptor(CompilerInterface):
         example_inputs: list[Any],
         graph_index: int,
         compile_range: Range,
-    ) -> Callable:
+    ) -> Callable[..., Any]:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
         assert isinstance(handle[1], str)
@@ -546,7 +546,7 @@ class InductorAdaptor(CompilerInterface):
                     hash_str, example_inputs, True, False
                 )
                 assert inductor_compiled_graph is not None, (
-                    "Inductor cache lookup failed. Please remove"
+                    "Inductor cache lookup failed. Please remove "
                     f"the cache directory and try again."  # noqa
                 )
             elif torch.__version__ >= "2.6":
@@ -557,7 +557,7 @@ class InductorAdaptor(CompilerInterface):
                     hash_str, example_inputs, True, None, constants
                 )
                 assert inductor_compiled_graph is not None, (
-                    "Inductor cache lookup failed. Please remove"
+                    "Inductor cache lookup failed. Please remove "
                     f"the cache directory and try again."  # noqa
                 )
 
@@ -572,7 +572,7 @@ class InductorAdaptor(CompilerInterface):
         returns_tuple = graph_returns_tuple(graph)
 
         # this is the callable we return to Dynamo to run
-        def compiled_graph(*args):
+        def compiled_graph(*args: Any) -> tuple[Any, ...] | Any:
             # convert args to list
             list_args = list(args)
             graph_output = inductor_compiled_graph(list_args)
@@ -584,7 +584,7 @@ class InductorAdaptor(CompilerInterface):
 
         return compiled_graph
 
-    def metrics_context(self) -> contextlib.AbstractContextManager:
+    def metrics_context(self) -> contextlib.AbstractContextManager[Any]:
         """
         This method returns the Dynamo metrics context (if it exists,
         otherwise a null context). It is used by various compile components.
@@ -603,12 +603,12 @@ class InductorAdaptor(CompilerInterface):
         if is_torch_equal_or_newer("2.6"):
             import torch._dynamo.utils
 
-            return torch._dynamo.utils.get_metrics_context()
+            return torch._dynamo.utils.get_metrics_context()  # type: ignore[no-any-return]
         else:
             return contextlib.nullcontext()
 
 
-def set_inductor_config(config, compile_range: Range):
+def set_inductor_config(config: dict[str, Any], compile_range: Range) -> None:
     if compile_range.is_single_size():
         # for a specific batch size, tuning triton kernel parameters
         # can be beneficial
@@ -618,7 +618,7 @@ def set_inductor_config(config, compile_range: Range):
         )
 
 
-def set_functorch_config():
+def set_functorch_config() -> None:
     torch._functorch.config.bundled_autograd_cache = False
 
 
@@ -632,7 +632,7 @@ class EagerAdaptor(CompilerInterface):
         compiler_config: dict[str, Any],
         compile_range: Range,
         key: str | None = None,
-    ) -> tuple[Callable | None, Any | None]:
+    ) -> tuple[Callable[..., Any] | None, Any | None]:
         compilation_counter.num_eager_compiles += 1
         # we don't need to compile the graph, just return the graph itself.
         # It does not support caching, return None for the handle.
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index 20918099f169d8c6af0dfc1c018614dde7b93296..29d3045aac64bb28119c2dd0407f79eafb8193dc 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -3,7 +3,9 @@
 
 import copy
 import dataclasses
+from collections.abc import Generator
 from contextlib import contextmanager
+from typing import Any
 
 
 @dataclasses.dataclass
@@ -34,7 +36,7 @@ class CompilationCounter:
         return copy.deepcopy(self)
 
     @contextmanager
-    def expect(self, **kwargs):
+    def expect(self, **kwargs: Any) -> Generator[None, None, None]:
         old = self.clone()
         yield
         for k, v in kwargs.items():
diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
index 0748643a5299ff8dc72d7707f55666d7a761a79d..7ffa74d0d7e6f6d04a741cb1c5f0904aeb544109 100644
--- a/vllm/compilation/cuda_graph.py
+++ b/vllm/compilation/cuda_graph.py
@@ -18,7 +18,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import set_graph_poo
 from vllm.forward_context import BatchDescriptor, get_forward_context
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils.torch_utils import weak_ref_tensors
+from vllm.utils.torch_utils import current_stream, weak_ref_tensors
 
 logger = init_logger(__name__)
 
@@ -42,7 +42,9 @@ class CUDAGraphLogging:
         "Count",
     ]
 
-    def __init__(self, cg_mode: CUDAGraphMode, cg_capture_sizes: list[int] | None):
+    def __init__(
+        self, cg_mode: CUDAGraphMode, cg_capture_sizes: list[int] | None
+    ) -> None:
         self.reset()
         self.cg_mode = str(cg_mode)
         self.cg_capture_sizes = str(cg_capture_sizes or [])
@@ -54,10 +56,10 @@ class CUDAGraphLogging:
             "**CUDAGraph Stats:**\n\n"
         )
 
-    def reset(self):
-        self.stats = []
+    def reset(self) -> None:
+        self.stats: list[CUDAGraphStat] = []
 
-    def observe(self, cudagraph_stat: CUDAGraphStat):
+    def observe(self, cudagraph_stat: CUDAGraphStat) -> None:
         self.stats.append(cudagraph_stat)
 
     def generate_metric_table(self) -> str:
@@ -109,7 +111,7 @@ class CUDAGraphLogging:
             + "\n"
         )
 
-    def log(self, log_fn=logger.info):
+    def log(self, log_fn: Callable[..., Any] = logger.info) -> None:
         if not self.stats:
             return
         log_fn(self.generate_metric_table())
@@ -161,11 +163,11 @@ class CUDAGraphWrapper:
 
     def __init__(
         self,
-        runnable: Callable,
+        runnable: Callable[..., Any],
         vllm_config: VllmConfig,
         runtime_mode: CUDAGraphMode,
         cudagraph_options: CUDAGraphOptions | None = None,
-    ):
+    ) -> None:
         self.runnable = runnable
         self.vllm_config = vllm_config
         self.runtime_mode = runtime_mode
@@ -189,7 +191,7 @@ class CUDAGraphWrapper:
         # cudagraphs for.
         self.concrete_cudagraph_entries: dict[BatchDescriptor, CUDAGraphEntry] = {}
 
-    def __getattr__(self, key: str):
+    def __getattr__(self, key: str) -> Any:
         # allow accessing the attributes of the runnable.
         if hasattr(self.runnable, key):
             return getattr(self.runnable, key)
@@ -198,11 +200,11 @@ class CUDAGraphWrapper:
             f"cudagraph wrapper: {self.runnable}"
         )
 
-    def unwrap(self) -> Callable:
+    def unwrap(self) -> Callable[..., Any]:
         # in case we need to access the original runnable.
         return self.runnable
 
-    def __call__(self, *args, **kwargs):
+    def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
         forward_context = get_forward_context()
         batch_descriptor = forward_context.batch_descriptor
         cudagraph_runtime_mode = forward_context.cudagraph_runtime_mode
@@ -219,6 +221,7 @@ class CUDAGraphWrapper:
             # runtime modes.
             return self.runnable(*args, **kwargs)
 
+        assert batch_descriptor is not None
         if batch_descriptor not in self.concrete_cudagraph_entries:
             # create a new entry for this batch descriptor
             self.concrete_cudagraph_entries[batch_descriptor] = CUDAGraphEntry(
@@ -263,7 +266,11 @@ class CUDAGraphWrapper:
                 else:
                     set_graph_pool_id(current_platform.graph_pool_handle())
                 # mind-exploding: carefully manage the reference and memory.
-                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
+                with torch.cuda.graph(
+                    cudagraph,
+                    pool=self.graph_pool,
+                    stream=current_stream(),
+                ):
                     # `output` is managed by pytorch's cudagraph pool
                     output = self.runnable(*args, **kwargs)
                     if self.cudagraph_options.weak_ref_output:
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 887c24e6cb08c98826d17efc0d3547a0051e7f4a..55527b4a887d177838c55b56e3860f8b97b1ac5e 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -6,8 +6,8 @@ import hashlib
 import inspect
 import os
 import sys
-from collections.abc import Callable
-from typing import TypeVar, overload
+from collections.abc import Callable, Generator
+from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload
 from unittest.mock import patch
 
 import torch
@@ -33,6 +33,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer, supports_dynamo
 from .monitor import start_monitoring_torch_compile
 from vllm.forward_context import get_profilling
 
+if TYPE_CHECKING:
+    # Only added on nightly/2.10 so wrap
+    try:
+        from torch._dynamo.package import SourceInfo
+    except ImportError:
+        # Fallback for old versions not supporting
+        SourceInfo = Any
+
 logger = init_logger(__name__)
 
 IGNORE_COMPILE_KEY = "_ignore_compile_vllm"
@@ -60,7 +68,7 @@ def ignore_torch_compile(cls: _T) -> _T:
     return cls
 
 
-def _should_ignore_torch_compile(cls) -> bool:
+def _should_ignore_torch_compile(cls: _T) -> bool:
     """
     Check if the class should be ignored for torch.compile.
     """
@@ -225,7 +233,7 @@ def support_torch_compile(
     return cls_decorator_helper
 
 
-def _model_hash_key(fn) -> str:
+def _model_hash_key(fn: Callable[..., Any]) -> str:
     import vllm
 
     sha256_hash = hashlib.sha256()
@@ -235,7 +243,9 @@ def _model_hash_key(fn) -> str:
     return sha256_hash.hexdigest()
 
 
-def _verify_source_unchanged(source_info, vllm_config) -> None:
+def _verify_source_unchanged(
+    source_info: "SourceInfo", vllm_config: VllmConfig
+) -> None:
     from .caching import _compute_code_hash, _compute_code_hash_with_content
 
     file_contents = {}
@@ -276,8 +286,12 @@ def _support_torch_compile(
     setattr(cls, IGNORE_COMPILE_KEY, False)
 
     def __init__(
-        self, *, vllm_config: VllmConfig | None = None, prefix: str = "", **kwargs
-    ):
+        self: _T,
+        *,
+        vllm_config: VllmConfig | None = None,
+        prefix: str = "",
+        **kwargs: Any,
+    ) -> None:
         if vllm_config is None:
             vllm_config = get_current_vllm_config()
 
@@ -310,13 +324,17 @@ def _support_torch_compile(
 
         compilation_counter.num_models_seen += 1
         self.compiled = False
-        TorchCompileWithNoGuardsWrapper.__init__(self)
+
+        # Handled by monkeypatching `TorchCompileWithNoGuardsWrapper` into base class
+        TorchCompileWithNoGuardsWrapper.__init__(self)  # type: ignore[arg-type]
 
     cls.__init__ = __init__
 
-    def _mark_dynamic_inputs(mod, type, *args, **kwargs):
-        def mark_dynamic(arg, dims):
-            if type == DynamicShapesType.UNBACKED:
+    def _mark_dynamic_inputs(
+        mod: _T, ds_type: DynamicShapesType, *args: Any, **kwargs: Any
+    ) -> None:
+        def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None:
+            if ds_type == DynamicShapesType.UNBACKED:
                 if is_torch_equal_or_newer("2.10.0.dev"):
                     for dim in dims:
                         torch._dynamo.decorators.mark_unbacked(
@@ -327,7 +345,7 @@ def _support_torch_compile(
             else:
                 torch._dynamo.mark_dynamic(arg, dims)
 
-        sig = inspect.signature(mod.__class__.forward)
+        sig = inspect.signature(mod.__class__.forward)  # type: ignore[attr-defined]
         bound_args = sig.bind(mod, *args, **kwargs)
         bound_args.apply_defaults()
         for k, dims in dynamic_arg_dims.items():
@@ -365,16 +383,19 @@ def _support_torch_compile(
                         else:
                             torch._dynamo.decorators.mark_unbacked(arg, dims)
 
-    def __call__(self, *args, **kwargs):
+    def __call__(self: _T, *args: Any, **kwargs: Any) -> Any:
         # torch.compiler.is_compiling() means we are inside the compilation
         # e.g. TPU has the compilation logic in model runner, so we don't
         # need to compile the model inside.
         if self.do_not_compile or torch.compiler.is_compiling() or get_profilling():
             return self.forward(*args, **kwargs)
 
-        # if aot_compiled_fn is set, just call it.
+        # if aot_compiled_fn is set, call it with partition wrapper context.
+        # The partition wrapper must be active at runtime for CUDA graph
+        # capture to work correctly with inductor graph partitioning.
         if getattr(self, "aot_compiled_fn", None) is not None:
-            return self.aot_compiled_fn(self, *args, **kwargs)
+            with maybe_use_cudagraph_partition_wrapper(self.vllm_config):
+                return self.aot_compiled_fn(self, *args, **kwargs)
 
         ds_type = self.compilation_config.dynamic_shapes_config.type
         cache_dir = None
@@ -404,7 +425,7 @@ def _support_torch_compile(
             )
 
             rank = self.vllm_config.parallel_config.rank
-            dp_rank = self.vllm_config.parallel_config.data_parallel_rank
+            dp_rank = self.vllm_config.parallel_config.data_parallel_index
             cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}")
             aot_compilation_path = os.path.join(cache_dir, "model")
             try:
@@ -433,11 +454,16 @@ def _support_torch_compile(
                 logger.info(
                     "Directly load AOT compilation from path %s", aot_compilation_path
                 )
-                return self.aot_compiled_fn(self, *args, **kwargs)
+                # Apply partition wrapper context for proper CUDA graph capture
+                with maybe_use_cudagraph_partition_wrapper(self.vllm_config):
+                    return self.aot_compiled_fn(self, *args, **kwargs)
 
         if self.compiled:
-            assert not envs.VLLM_USE_AOT_COMPILE
-            return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
+            assert (
+                not envs.VLLM_USE_AOT_COMPILE
+                or self.vllm_config.compilation_config.backend == "eager"
+            )
+            return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)  # type: ignore[arg-type]
 
         # This is the path for the first compilation.
         # the first compilation needs to have dynamic shapes marked
@@ -470,7 +496,7 @@ def _support_torch_compile(
         # during Dynamo tracing, and their corresponding files
         inline_call = InliningInstructionTranslator.inline_call_
 
-        def patched_inline_call(self_):
+        def patched_inline_call(self_: Any) -> Any:
             code = self_.f_code
             self.compilation_config.traced_files.add(code.co_filename)
             return inline_call(self_)
@@ -497,7 +523,9 @@ def _support_torch_compile(
         # assume_32bit_indexing is only available in torch 2.10.0.dev+
         inductor_config_patches = {}
         if is_torch_equal_or_newer("2.10.0.dev"):
-            inductor_config_patches["assume_32bit_indexing"] = True
+            inductor_config_patches["assume_32bit_indexing"] = (
+                self.compilation_config.dynamic_shapes_config.assume_32_bit_indexing
+            )
 
         with (
             patch.object(
@@ -509,7 +537,11 @@ def _support_torch_compile(
             _torch27_patch_tensor_subclasses(),
             torch._inductor.config.patch(**inductor_config_patches),
         ):
-            if envs.VLLM_USE_AOT_COMPILE:
+            use_aot_compile = envs.VLLM_USE_AOT_COMPILE
+            if self.vllm_config.compilation_config.backend == "eager":
+                logger.warning("Detected eager backend, disabling AOT compile.")
+                use_aot_compile = False
+            if use_aot_compile:
                 self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
                 output = self.aot_compiled_fn(self, *args, **kwargs)
                 assert aot_compilation_path is not None
@@ -524,7 +556,7 @@ def _support_torch_compile(
                         str(e),
                     )
             else:
-                output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
+                output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)  # type: ignore[arg-type]
 
         self.compiled = True
         return output
@@ -534,7 +566,9 @@ def _support_torch_compile(
 
 
 @contextlib.contextmanager
-def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
+def maybe_use_cudagraph_partition_wrapper(
+    vllm_config: VllmConfig,
+) -> Generator[None, None, None]:
     """
     Context manager to set/unset customized cudagraph partition wrappers.
 
@@ -561,7 +595,9 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
             current_platform.get_static_graph_wrapper_cls()
         )
 
-        def customized_cudagraph_wrapper(f, metadata: CUDAGraphWrapperMetadata):
+        def customized_cudagraph_wrapper(
+            f: Callable[..., Any], metadata: CUDAGraphWrapperMetadata
+        ) -> Any:
             partition_id = metadata.partition_index
             num_partitions = metadata.num_partitions
             return static_graph_wrapper_class(
@@ -589,7 +625,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
 
 
 @contextlib.contextmanager
-def _torch27_patch_tensor_subclasses():
+def _torch27_patch_tensor_subclasses() -> Generator[None, None, None]:
     """
     Add support for using tensor subclasses (ie `BasevLLMParameter`, ect) when
     using torch 2.7.0. This enables using weight_loader_v2 and the use of
@@ -603,7 +639,7 @@ def _torch27_patch_tensor_subclasses():
         _ColumnvLLMParameter,
     )
 
-    def return_false(*args, **kwargs):
+    def return_false(*args: Any, **kwargs: Any) -> Literal[False]:
         return False
 
     if version.parse("2.7") <= version.parse(torch.__version__) < version.parse("2.8"):
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index ed9e83b589f8b6d3468296b9049acc5dd6039e70..c7de3a39cd38c8a72e2ab09280f7f6494a57f0cc 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -26,7 +26,7 @@ class FixFunctionalizationPass(VllmInductorPass):
     """
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: torch.fx.Graph):
+    def __call__(self, graph: torch.fx.Graph) -> None:
         # XPU does not support auto-functionalization yet.
         # Will enable this when switch to vllm-xpu-kernels.
         if current_platform.is_xpu():
@@ -179,7 +179,7 @@ class FixFunctionalizationPass(VllmInductorPass):
         )
         self.nodes_to_remove.clear()
 
-    def _remove(self, node_or_nodes: torch.fx.Node | Iterable[torch.fx.Node]):
+    def _remove(self, node_or_nodes: torch.fx.Node | Iterable[torch.fx.Node]) -> None:
         """
         Stage a node (or nodes) for removal at the end of the pass.
         """
@@ -194,7 +194,7 @@ class FixFunctionalizationPass(VllmInductorPass):
         node: torch.fx.Node,
         mutated_args: dict[int, torch.fx.Node | str],
         args: tuple[torch.fx.Node | str, ...] | None = None,
-    ):
+    ) -> None:
         """
         De-functionalize a node by replacing it with a call to the original.
         It also replaces the getitem users with the mutated arguments.
@@ -206,7 +206,7 @@ class FixFunctionalizationPass(VllmInductorPass):
 
     def replace_users_with_mutated_args(
         self, node: torch.fx.Node, mutated_args: dict[int, torch.fx.Node | str]
-    ):
+    ) -> None:
         """
         Replace all getitem users of the auto-functionalized node with the
         mutated arguments.
@@ -237,7 +237,7 @@ class FixFunctionalizationPass(VllmInductorPass):
         graph: torch.fx.Graph,
         node: torch.fx.Node,
         args: tuple[torch.fx.Node | str, ...] | None = None,
-    ):
+    ) -> None:
         """
         Insert a new defunctionalized node into the graph before node.
         If one of the kwargs is 'out', provide args directly,
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index 7460bb7006ece0c055f529caaf02f8ebe70fb02c..a55d48a67d43067f877fb985366cc47764ed43c7 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -38,19 +38,19 @@ FP8_DTYPE = current_platform.fp8_dtype()
 FP4_DTYPE = torch.uint8
 
 
-def empty_bf16(*args, **kwargs):
+def empty_bf16(*args: Any, **kwargs: Any) -> torch.Tensor:
     return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda")
 
 
-def empty_fp32(*args, **kwargs):
+def empty_fp32(*args: Any, **kwargs: Any) -> torch.Tensor:
     return torch.empty(*args, **kwargs, dtype=torch.float32, device="cuda")
 
 
-def empty_i32(*args, **kwargs):
+def empty_i32(*args: Any, **kwargs: Any) -> torch.Tensor:
     return torch.empty(*args, **kwargs, dtype=torch.int32, device="cuda")
 
 
-def empty_i64(*args, **kwargs):
+def empty_i64(*args: Any, **kwargs: Any) -> torch.Tensor:
     return torch.empty(*args, **kwargs, dtype=torch.int64, device="cuda")
 
 
@@ -79,7 +79,7 @@ class FusedRMSQuantKey(NamedTuple):
     quant: QuantKey
     fused_add: bool
 
-    def __str__(self):
+    def __str__(self) -> str:
         return (
             f"FusedQuantKey({self.quant}, with"
             f"{'' if self.fused_add else 'out'} residual)"
@@ -121,7 +121,7 @@ class RMSNormQuantPattern:
         key: FusedRMSQuantKey,
         has_col_major_scales: bool = False,
         is_e8m0: bool = False,
-    ):
+    ) -> None:
         self.epsilon = epsilon
         self.quant_dtype = key.quant.dtype
         config = get_current_vllm_config()
@@ -141,7 +141,9 @@ class RMSNormQuantPattern:
 
 
 class RMSNormStaticQuantPattern(RMSNormQuantPattern):
-    def __init__(self, epsilon: float, quant_dtype: torch.dtype, symmetric=True):
+    def __init__(
+        self, epsilon: float, quant_dtype: torch.dtype, symmetric: bool = True
+    ) -> None:
         fused_key = FusedRMSQuantKey(
             fused_add=False,
             quant=QuantKey(
@@ -150,13 +152,17 @@ class RMSNormStaticQuantPattern(RMSNormQuantPattern):
         )
         super().__init__(epsilon, fused_key)
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         # Cannot use methods, as the self argument affects tracing
-        def pattern(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
+        def pattern(
+            input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor
+        ) -> torch.Tensor:
             result_rms = self.rmsnorm_matcher(input, weight)
             return self.quant_matcher(result_rms, scale)[0]
 
-        def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
+        def replacement(
+            input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor
+        ) -> torch.Tensor:
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
             input = input.to(dtype=self.model_dtype)
@@ -187,7 +193,9 @@ class RMSNormStaticQuantPattern(RMSNormQuantPattern):
 
 
 class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern):
-    def __init__(self, epsilon: float, quant_dtype: torch.dtype, symmetric=True):
+    def __init__(
+        self, epsilon: float, quant_dtype: torch.dtype, symmetric: bool = True
+    ) -> None:
         key = FusedRMSQuantKey(
             fused_add=True,
             quant=QuantKey(
@@ -196,13 +204,13 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern):
         )
         super().__init__(epsilon, key)
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             weight: torch.Tensor,
             residual: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
             result, _ = self.quant_matcher(result_rms, scale)
 
@@ -213,7 +221,7 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern):
             weight: torch.Tensor,
             residual: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
             input = input.to(dtype=self.model_dtype)
@@ -253,10 +261,10 @@ class FusedAddRMSNormGroupQuantPattern(RMSNormQuantPattern):
         epsilon: float,
         quant_dtype: torch.dtype,
         group_shape: GroupShape,
-        symmetric=True,
+        symmetric: bool = True,
         has_col_major_scales: bool = False,
         is_e8m0: bool = False,
-    ):
+    ) -> None:
         scale = ScaleDesc(torch.float32, False, group_shape)
         key = FusedRMSQuantKey(
             fused_add=True,
@@ -269,15 +277,17 @@ class FusedAddRMSNormGroupQuantPattern(RMSNormQuantPattern):
             epsilon, key, has_col_major_scales=has_col_major_scales, is_e8m0=is_e8m0
         )
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def pattern(input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
             result, scale = self.quant_matcher(result_rms)
             return result, residual, scale
 
         def replacement(
             input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
             input = input.to(dtype=self.model_dtype)
@@ -315,10 +325,10 @@ class RMSNormGroupQuantPattern(RMSNormQuantPattern):
         epsilon: float,
         quant_dtype: torch.dtype,
         group_shape: GroupShape,
-        symmetric=True,
+        symmetric: bool = True,
         has_col_major_scales: bool = False,
         is_e8m0: bool = False,
-    ):
+    ) -> None:
         scale = ScaleDesc(torch.float32, False, group_shape)
         key = FusedRMSQuantKey(
             fused_add=False,
@@ -329,13 +339,17 @@ class RMSNormGroupQuantPattern(RMSNormQuantPattern):
             epsilon, key, has_col_major_scales=has_col_major_scales, is_e8m0=is_e8m0
         )
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def pattern(input: torch.Tensor, weight: torch.Tensor):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             result_rms = self.rmsnorm_matcher(input, weight)
             result, scale = self.quant_matcher(result_rms)
             return result, scale
 
-        def replacement(input: torch.Tensor, weight: torch.Tensor):
+        def replacement(
+            input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
             input = input.to(dtype=self.model_dtype)
@@ -375,8 +389,8 @@ class RMSNormDynamicQuantPattern(RMSNormQuantPattern):
         epsilon: float,
         quant_dtype: torch.dtype,
         group_shape: GroupShape = GroupShape.PER_TOKEN,
-        symmetric=True,
-    ):
+        symmetric: bool = True,
+    ) -> None:
         scale = ScaleDesc(torch.float32, False, group_shape)
         key = FusedRMSQuantKey(
             fused_add=False,
@@ -384,13 +398,17 @@ class RMSNormDynamicQuantPattern(RMSNormQuantPattern):
         )
         super().__init__(epsilon, key)
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def pattern(input: torch.Tensor, weight: torch.Tensor):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             result_rms = self.rmsnorm_matcher(input, weight)
             # result, scale
-            return self.quant_matcher(result_rms)
+            return self.quant_matcher(result_rms)  # type: ignore[no-any-return]
 
-        def replacement(input: torch.Tensor, weight: torch.Tensor):
+        def replacement(
+            input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
             input = input.to(dtype=self.model_dtype)
@@ -426,8 +444,8 @@ class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern):
         epsilon: float,
         quant_dtype: torch.dtype,
         group_shape: GroupShape = GroupShape.PER_TOKEN,
-        symmetric=True,
-    ):
+        symmetric: bool = True,
+    ) -> None:
         scale = ScaleDesc(torch.float32, False, group_shape)
         key = FusedRMSQuantKey(
             fused_add=True,
@@ -435,8 +453,10 @@ class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern):
         )
         super().__init__(epsilon, key)
 
-    def register(self, pm_pass: PatternMatcherPass):
-        def pattern(input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
             result, scale = self.quant_matcher(result_rms)
 
@@ -444,7 +464,7 @@ class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern):
 
         def replacement(
             input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
             input = input.to(dtype=self.model_dtype)
@@ -481,7 +501,7 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
     """
 
     @enable_fake_mode
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
         self.patterns: PatternMatcherPass = PatternMatcherPass(
@@ -533,11 +553,11 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
         self.dump_patterns(config, self.patterns)
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: fx.Graph):
+    def __call__(self, graph: fx.Graph) -> None:
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", self.matched_count)
 
-    def uuid(self) -> Any:
+    def uuid(self) -> str:
         return self.hash_source(
             self,
             RMSNormGroupQuantPattern,
diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py
index 6dcbbd85d7031e78a335392d452a7b30ae1cbd70..57448aa0b096f574ab11cae6bb42f33d99daf920 100644
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -3,6 +3,7 @@
 
 from abc import ABC, abstractmethod
 from collections.abc import Callable
+from typing import Any, ParamSpec
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -28,7 +29,7 @@ from .matcher_utils import MatcherQuantFP8
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 logger = init_logger(__name__)
-
+P = ParamSpec("P")
 FP8_DTYPE = current_platform.fp8_dtype()
 FP4_DTYPE = torch.uint8
 
@@ -47,7 +48,7 @@ class AttentionQuantPattern(ABC):
         layer: Attention,
         quant_key: QuantKey,
         dtype: torch.dtype,
-    ):
+    ) -> None:
         self.layer = layer
         self.layer_name = layer.layer_name
         self.num_heads = layer.num_heads
@@ -61,17 +62,20 @@ class AttentionQuantPattern(ABC):
         )
         self.QUANT_OP = QUANT_OPS[self.quant_key]
 
-    def empty(self, *args, **kwargs):
+    def empty(self, *args: Any, **kwargs: Any) -> torch.Tensor:
         kwargs = {"dtype": self.dtype, "device": "cuda", **kwargs}
         return torch.empty(*args, **kwargs)
 
-    def empty_quant(self, *args, **kwargs):
+    def empty_quant(self, *args: Any, **kwargs: Any) -> torch.Tensor:
         kwargs = {"dtype": self.quant_dtype, "device": "cuda", **kwargs}
         return torch.empty(*args, **kwargs)
 
     @staticmethod
-    def wrap_trace_fn(trace_fn, *process_fx_fns: Callable[[fx.GraphModule], None]):
-        def wrapped(*args, **kwargs):
+    def wrap_trace_fn(
+        trace_fn: Callable[P, fx.GraphModule],
+        *process_fx_fns: Callable[[fx.GraphModule], None],
+    ) -> Callable[P, fx.GraphModule]:
+        def wrapped(*args: P.args, **kwargs: P.kwargs) -> fx.GraphModule:
             gm = trace_fn(*args, **kwargs)
             for process_fx in process_fx_fns:
                 process_fx(gm)
@@ -81,13 +85,13 @@ class AttentionQuantPattern(ABC):
         return wrapped
 
     @staticmethod
-    def fx_view_to_reshape(gm: torch.fx.GraphModule):
+    def fx_view_to_reshape(gm: torch.fx.GraphModule) -> None:
         from torch._inductor.fx_passes.post_grad import view_to_reshape
 
         view_to_reshape(gm)
 
     @staticmethod
-    def remove_noop_permutes(gm: torch.fx.GraphModule):
+    def remove_noop_permutes(gm: torch.fx.GraphModule) -> None:
         for node in gm.graph.nodes:
             if not is_func(node, torch.ops.aten.permute.default):
                 continue
@@ -100,12 +104,12 @@ class AttentionQuantPattern(ABC):
             node.replace_all_uses_with(node.args[0])
             gm.graph.erase_node(node)
 
-    def register_if_supported(self, pm_pass: PatternMatcherPass):
+    def register_if_supported(self, pm_pass: PatternMatcherPass) -> None:
         if self.layer.impl.fused_output_quant_supported(self.quant_key):
             self._register(pm_pass)
 
     @abstractmethod
-    def _register(self, pm_pass: PatternMatcherPass):
+    def _register(self, pm_pass: PatternMatcherPass) -> None:
         raise NotImplementedError
 
 
@@ -124,21 +128,21 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
         layer: Attention,
         dtype: torch.dtype,
         symmetric: bool = True,
-    ):
+    ) -> None:
         quant_key = QuantKey(
             dtype=FP8_DTYPE, scale=kStaticTensorScale, symmetric=symmetric
         )
         super().__init__(layer, quant_key, dtype)
         self.quant_matcher = MatcherQuantFP8(quant_key)
 
-    def _register(self, pm_pass: PatternMatcherPass):
+    def _register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             q: torch.Tensor,
             k: torch.Tensor,
             v: torch.Tensor,
             output_attn: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> torch.Tensor:
             at1 = auto_functionalized(
                 ATTN_OP,
                 query=q,
@@ -161,7 +165,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
             v: torch.Tensor,
             output_attn: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> torch.Tensor:
             # attn output in quant_dtype
             output_attn = torch.ops.aten.full.default(
                 [q.shape[0], self.num_heads, self.head_size],
@@ -212,10 +216,10 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
     will be passed into Attention op as the `output_scale` argument.
     """
 
-    def __init__(self, layer: Attention, dtype: torch.dtype):
+    def __init__(self, layer: Attention, dtype: torch.dtype) -> None:
         super().__init__(layer, kNvfp4Quant, dtype)
 
-    def _register(self, pm_pass: PatternMatcherPass):
+    def _register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             q: torch.Tensor,
             k: torch.Tensor,
@@ -224,7 +228,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
             output_quant: torch.Tensor,
             output_scale: torch.Tensor,
             input_scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             at1 = auto_functionalized(
                 ATTN_OP,
                 query=q,
@@ -256,7 +260,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
             output_quant: torch.Tensor,
             output_scale: torch.Tensor,
             input_scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             # attention output in quant_dtype
             output_attn = torch.ops.aten.full.default(
                 [q.shape[0], self.num_heads, self.head_size // 2],
@@ -318,7 +322,7 @@ class AttnFusionPass(VllmPatternMatcherPass):
     """
 
     @enable_fake_mode
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
         self.patterns = PatternMatcherPass(pass_name="attn_fusion_pass")
@@ -350,7 +354,7 @@ class AttnFusionPass(VllmPatternMatcherPass):
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Fused quant onto %s attention nodes", self.matched_count)
 
-    def uuid(self):
+    def uuid(self) -> str:
         return VllmInductorPass.hash_source(
             self,
             AttentionQuantPattern,
diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py
index 3650ee6b41745186fc828b0dfbd1cd06f4a15868..5c2e7ac93e66818a2b3552188aab28e36401b24d 100644
--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
@@ -7,10 +7,11 @@ from collections.abc import Iterable, Iterator
 from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._ops import OpOverload, OpOverloadPacket
+from torch.fx.node import Target
 
 
-def is_func(node: fx.Node, target) -> bool:
-    return node.op == "call_function" and node.target == target
+def is_func(node: fx.Node, target: Target) -> bool:
+    return bool(node.op == "call_function" and node.target == target)
 
 
 def is_auto_func(node: fx.Node, op: OpOverload) -> bool:
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index dbf154eeb86a444c5f5e529a2f5c51a9c72d0798..21723b6d3fff6f885e3ea9e4eb8e106b419ec9fe 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -8,9 +8,9 @@ import hashlib
 import inspect
 import json
 import types
-from collections.abc import Callable
+from collections.abc import Callable, Generator
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar
 
 import torch
 from torch import fx
@@ -29,7 +29,12 @@ else:
         Torch25CustomGraphPass as CustomGraphPass,
     )
 
+# Re-export CustomGraphPass for external usage
+__all__ = ["CustomGraphPass"]
+
 _pass_context = None
+P = ParamSpec("P")
+R = TypeVar("R")
 
 
 class PassContext:
@@ -44,7 +49,7 @@ def get_pass_context() -> PassContext:
 
 
 @contextmanager
-def pass_context(compile_range: Range):
+def pass_context(compile_range: Range) -> Generator[None, None, None]:
     """A context manager that stores the current pass context,
     usually it is a list of sizes to specialize.
     """
@@ -57,13 +62,13 @@ def pass_context(compile_range: Range):
         _pass_context = prev_context
 
 
-class InductorPass(CustomGraphPass):
+class InductorPass(CustomGraphPass):  # type: ignore[misc]
     """
     A custom graph pass that uses a hash of its source as the UUID.
     This is defined as a convenience and should work in most cases.
     """
 
-    def uuid(self) -> Any:
+    def uuid(self) -> str:
         """
         Provide a unique identifier for the pass, used in Inductor code cache.
         This should depend on the pass implementation, so that changes to the
@@ -73,7 +78,7 @@ class InductorPass(CustomGraphPass):
         return InductorPass.hash_source(self)
 
     @staticmethod
-    def hash_source(*srcs: str | Any):
+    def hash_source(*srcs: str | Any) -> str:
         """
         Utility method to hash the sources of functions or objects.
         :param srcs: strings or objects to add to the hash.
@@ -93,7 +98,7 @@ class InductorPass(CustomGraphPass):
         return hasher.hexdigest()
 
     @staticmethod
-    def hash_dict(dict_: dict[Any, Any]):
+    def hash_dict(dict_: dict[Any, Any]) -> str:
         """
         Utility method to hash a dictionary, can alternatively be used for uuid.
         :return: A sha256 hash of the json rep of the dictionary.
@@ -101,7 +106,7 @@ class InductorPass(CustomGraphPass):
         encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
         return hashlib.sha256(encoded).hexdigest()
 
-    def is_applicable_for_range(self, compile_range: Range):
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
         return True
 
 
@@ -111,25 +116,27 @@ class CallableInductorPass(InductorPass):
     implementation of the UUID.
     """
 
-    def __init__(self, callable: Callable[[fx.Graph], None], uuid: Any | None = None):
+    def __init__(
+        self, callable: Callable[[fx.Graph], None], uuid: Any | None = None
+    ) -> None:
         self.callable = callable
         self._uuid = self.hash_source(callable) if uuid is None else uuid
 
-    def __call__(self, graph: torch.fx.Graph):
+    def __call__(self, graph: torch.fx.Graph) -> None:
         self.callable(graph)
 
     def uuid(self) -> Any:
         return self._uuid
 
 
-def enable_fake_mode(fn: Callable[..., Any]) -> Callable[..., Any]:
+def enable_fake_mode(fn: Callable[P, R]) -> Callable[P, R]:
     """
     Applies a FakeTensorMode context. This is useful when you don't want to
     create or run things with real tensors.
     """
 
     @functools.wraps(fn)
-    def fn_new(*args, **kwargs) -> Any:
+    def fn_new(*args: P.args, **kwargs: P.kwargs) -> R:
         with torch._guards.tracing(None), unset_fake_temporarily(), FakeTensorMode():
             result = fn(*args, **kwargs)
 
diff --git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py
index a4536363c538ffabd41ee0ea7004e360ce003088..bd0ae708f83654d756fa2d19bc7d65ef35f64f7f 100644
--- a/vllm/compilation/matcher_utils.py
+++ b/vllm/compilation/matcher_utils.py
@@ -1,16 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
+from typing import Any
 
 import torch
 from torch._higher_order_ops import auto_functionalized
 from torch._ops import OpOverload
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import get_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
     QuantKey,
     _normalize_quant_group_shape,
     kFp8Dynamic64Sym,
@@ -45,7 +48,7 @@ SILU_MUL_OP = torch.ops._C.silu_and_mul.default
 
 
 class MatcherCustomOp(ABC):
-    def __init__(self, enabled: bool):
+    def __init__(self, enabled: bool) -> None:
         config = get_current_vllm_config()
         self.model_dtype = config.model_config.dtype if config.model_config else None
         self.device = config.device_config.device if config.device_config else None
@@ -54,24 +57,24 @@ class MatcherCustomOp(ABC):
         self.forward = self.forward_custom if enabled else self.forward_native
 
     @abstractmethod
-    def forward_custom(self, *args, **kws):
+    def forward_custom(self, *args: Any, **kwargs: Any) -> Any:
         pass
 
     @abstractmethod
-    def forward_native(self, *args, **kws):
+    def forward_native(self, *args: Any, **kwargs: Any) -> Any:
         pass
 
-    def __call__(self, *args, **kws):
-        return self.forward(*args, **kws)
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        return self.forward(*args, **kwargs)
 
-    def empty(self, *args, **kws):
-        return torch.empty(*args, dtype=self.model_dtype, device=self.device, **kws)
+    def empty(self, *args: Any, **kwargs: Any) -> torch.Tensor:
+        return torch.empty(*args, dtype=self.model_dtype, device=self.device, **kwargs)
 
-    def empty_int64(self, *args, **kws):
-        return torch.empty(*args, dtype=torch.int64, device=self.device, **kws)
+    def empty_int64(self, *args: Any, **kwargs: Any) -> torch.Tensor:
+        return torch.empty(*args, dtype=torch.int64, device=self.device, **kwargs)
 
-    def empty_f32(self, *args, **kws):
-        return torch.empty(*args, dtype=torch.float32, device=self.device, **kws)
+    def empty_f32(self, *args: Any, **kwargs: Any) -> torch.Tensor:
+        return torch.empty(*args, dtype=torch.float32, device=self.device, **kwargs)
 
     def inputs(self) -> list[torch.Tensor]:
         """Utility for inputs to the pattern"""
@@ -150,26 +153,50 @@ class MatcherRotaryEmbedding(MatcherCustomOp):
 
 
 class MatcherRMSNorm(MatcherCustomOp):
-    def __init__(self, epsilon: float, enabled: bool | None = None):
+    def __init__(
+        self,
+        epsilon: float,
+        enabled: bool | None = None,
+        match_rocm_aiter: bool = False,
+    ) -> None:
         if enabled is None:
             enabled = RMSNorm.enabled()
 
         super().__init__(enabled)
         self.epsilon = epsilon
+        self._rmsnorm_op = RMS_OP
+        self.match_rocm_aiter = match_rocm_aiter
+
+        if match_rocm_aiter:
+            self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_op()
 
-    def inputs(self):
+    def inputs(self) -> list[torch.Tensor]:
         input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
         weight = self.empty(16)
         return [input, weight]
 
+    def forward_rocm_aiter(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+    ) -> torch.Tensor:
+        return self._rmsnorm_op(
+            x=input,
+            weight=weight,
+            variance_epsilon=self.epsilon,
+        )
+
     def forward_custom(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
     ) -> torch.Tensor:
+        if self.match_rocm_aiter:
+            return self.forward_rocm_aiter(input, weight)
+
         result = torch.empty_like(input)
         _, result = auto_functionalized(
-            RMS_OP,
+            self._rmsnorm_op,
             result=result,
             input=input,
             weight=weight,
@@ -189,27 +216,51 @@ class MatcherRMSNorm(MatcherCustomOp):
 
 
 class MatcherFusedAddRMSNorm(MatcherCustomOp):
-    def __init__(self, epsilon: float, enabled: bool | None = None):
+    def __init__(
+        self,
+        epsilon: float,
+        enabled: bool | None = None,
+        match_rocm_aiter: bool = False,
+    ) -> None:
         if enabled is None:
             enabled = RMSNorm.enabled()
 
         super().__init__(enabled)
         self.epsilon = epsilon
+        self.match_rocm_aiter = match_rocm_aiter
 
-    def inputs(self):
+        self._rmsnorm_op = RMS_ADD_OP
+
+        if match_rocm_aiter:
+            self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_fused_add_op()
+
+    def inputs(self) -> list[torch.Tensor]:
         input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
         weight = self.empty(16)
         residual = self.empty(5, 16)
         return [input, weight, residual]
 
+    def forward_rocm_aiter(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self._rmsnorm_op(  # type: ignore[no-any-return]
+            x=input, residual=residual, weight=weight, variance_epsilon=self.epsilon
+        )
+
     def forward_custom(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         residual: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.match_rocm_aiter:
+            return self.forward_rocm_aiter(input, weight, residual)
+
         _, result, residual = auto_functionalized(
-            RMS_ADD_OP,
+            self._rmsnorm_op,
             input=input,
             residual=residual,
             weight=weight,
@@ -236,22 +287,46 @@ class MatcherQuantFP8(MatcherCustomOp):
         enabled: bool | None = None,
         has_col_major_scales: bool = False,
         is_e8m0: bool = False,
-    ):
+        match_rocm_aiter: bool = False,
+    ) -> None:
         if enabled is None:
             enabled = QuantFP8.enabled()
 
         super().__init__(enabled)
         self.quant_key = quant_key
-        assert quant_key in QUANT_OPS, f"unsupported quantization scheme {quant_key}"
-        self.QUANT_OP = QUANT_OPS[quant_key]
-
         self.has_col_major_scales = has_col_major_scales
         self.is_e8m0 = is_e8m0
+        self.match_rocm_aiter = match_rocm_aiter
+
+        if match_rocm_aiter:
+            assert not quant_key.scale.group_shape.is_per_tensor(), (
+                "ROCm aiter fusion pass does not support per tensor quantization"
+            )
+            if quant_key.scale.group_shape.is_per_token():
+                self.QUANT_OP = rocm_aiter_ops.get_per_token_quant_op()
+            else:
+                assert quant_key.scale.group_shape.col == 128, (
+                    "ROCm aiter fusion pass currently supports "
+                    "quantization operation with group_size 128"
+                )
+                if current_platform.is_fp8_fnuz():
+                    self.QUANT_OP = rocm_aiter_ops.get_group_quant_op()
+                else:
+                    self.QUANT_OP = (
+                        torch.ops.vllm.triton_per_token_group_quant_fp8.default
+                    )
+
+        else:
+            assert quant_key in QUANT_OPS, (
+                f"unsupported quantization scheme {quant_key}"
+            )
+            self.QUANT_OP = QUANT_OPS[quant_key]
+
+            assert quant_key.dtype == current_platform.fp8_dtype(), (
+                "Only QuantFP8 supported by"
+            )
+            assert quant_key.scale2 is None
 
-        assert quant_key.dtype == current_platform.fp8_dtype(), (
-            "Only QuantFP8 supported by"
-        )
-        assert quant_key.scale2 is None
         self.quant_fp8 = QuantFP8(
             quant_key.scale.static,
             quant_key.scale.group_shape,
@@ -259,11 +334,29 @@ class MatcherQuantFP8(MatcherCustomOp):
             use_ue8m0=is_e8m0,
         )
 
+    def forward_rocm_aiter(
+        self,
+        input: torch.Tensor,
+        scale: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        quant_key_group_shape = self.quant_key.scale.group_shape
+        if quant_key_group_shape == GroupShape.PER_TOKEN:
+            return self.QUANT_OP(  # type: ignore[no-any-return]
+                x=input,
+                quant_dtype=self.quant_key.dtype,
+                scale=scale,
+            )
+        else:
+            return self.QUANT_OP(input, quant_key_group_shape.col)  # type: ignore[no-any-return]
+
     def forward_custom(
         self,
         input: torch.Tensor,
         scale: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.match_rocm_aiter:
+            return self.forward_rocm_aiter(input, scale)
+
         result = torch.empty(
             input.shape, device=input.device, dtype=self.quant_key.dtype
         )
@@ -308,9 +401,9 @@ class MatcherQuantFP8(MatcherCustomOp):
         input: torch.Tensor,
         scale: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        return self.quant_fp8(input, scale)
+        return self.quant_fp8(input, scale)  # type: ignore[no-any-return]
 
-    def make_scale(self, input: torch.Tensor, transposed: bool = False):
+    def make_scale(self, input: torch.Tensor, transposed: bool = False) -> torch.Tensor:
         normalized_group_shape = _normalize_quant_group_shape(
             input, self.quant_key.scale.group_shape
         )
@@ -335,7 +428,7 @@ class MatcherQuantFP8(MatcherCustomOp):
 
 
 class MatcherSiluAndMul(MatcherCustomOp):
-    def __init__(self, enabled: bool | None = None):
+    def __init__(self, enabled: bool | None = None) -> None:
         if enabled is None:
             enabled = SiluAndMul.enabled()
         super().__init__(enabled)
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index 660fb9887e2cd1ff360b9734270640bf7c0c2cc1..2bad5f0a16fc81a0231945053d66b1aadaea50ed 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -12,7 +12,7 @@ context_manager = None
 torch_compile_start_time: float = 0.0
 
 
-def start_monitoring_torch_compile(vllm_config: VllmConfig):
+def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
     global torch_compile_start_time
     torch_compile_start_time = time.time()
 
@@ -28,7 +28,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
         context_manager.__enter__()
 
 
-def end_monitoring_torch_compile(vllm_config: VllmConfig):
+def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
     compilation_config: CompilationConfig = vllm_config.compilation_config
     if compilation_config.mode == CompilationMode.VLLM_COMPILE:
         logger.info_once(
@@ -45,7 +45,7 @@ def end_monitoring_torch_compile(vllm_config: VllmConfig):
 cudagraph_capturing_enabled: bool = True
 
 
-def validate_cudagraph_capturing_enabled():
+def validate_cudagraph_capturing_enabled() -> None:
     # used to monitor whether a cudagraph capturing is legal at runtime.
     # should be called before any cudagraph capturing.
     # if an illegal cudagraph capturing happens, raise an error.
@@ -57,6 +57,6 @@ def validate_cudagraph_capturing_enabled():
         )
 
 
-def set_cudagraph_capturing_enabled(enabled: bool):
+def set_cudagraph_capturing_enabled(enabled: bool) -> None:
     global cudagraph_capturing_enabled
     cudagraph_capturing_enabled = enabled
diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py
index 06e1771bac96066cba3b6235b2324f634b2966f3..9af904b457a67a25af3cf27011f24322670d3559 100644
--- a/vllm/compilation/noop_elimination.py
+++ b/vllm/compilation/noop_elimination.py
@@ -65,7 +65,7 @@ class NoOpEliminationPass(VllmInductorPass):
     """
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: torch.fx.Graph):
+    def __call__(self, graph: torch.fx.Graph) -> None:
         count = 0
         # Remove no-op reshapes/views:
         for node in graph.nodes:
@@ -117,7 +117,7 @@ class NoOpEliminationPass(VllmInductorPass):
         2. The dimensions both correspond to the same SymInt
         """
         # Case 1
-        return statically_known_true(dim == i_dim)
+        return statically_known_true(dim == i_dim)  # type: ignore[no-any-return]
 
     def all_dims_equivalent(
         self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt]
diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
index 08bd27e8095268a1d5a62aeca8090ba05371a6e9..18ebb15d1112345a229eb92ebf49ae028db0d476 100644
--- a/vllm/compilation/partition_rules.py
+++ b/vllm/compilation/partition_rules.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
+from collections.abc import Generator
 
 import torch
 
@@ -38,7 +39,9 @@ def should_split(node: torch.fx.Node, splitting_ops: list[str]) -> bool:
 
 
 @contextlib.contextmanager
-def inductor_partition_rule_context(splitting_ops: list[str]):
+def inductor_partition_rule_context(
+    splitting_ops: list[str] | None,
+) -> Generator[None, None, None]:
     """Context manager to temporarily register Inductor partition rules.
 
     Registers custom partition rules for specified operators, forcing the
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 4ebb386f75ed80c2e30d36025e1ad5688e765c8a..a207edd93905bf8928b2c07d08e2d60b0765c028 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
+from collections.abc import Callable
+from typing import Any, ParamSpec, TypeVar
 
 from torch import fx as fx
 
@@ -16,7 +18,7 @@ from .vllm_inductor_pass import VllmInductorPass
 
 if rocm_aiter_ops.is_enabled():
     from vllm.compilation.rocm_aiter_fusion import (
-        RocmAiterRMSNormFp8GroupQuantFusionPass,
+        RocmAiterRMSNormFusionPass,
         RocmAiterSiluMulFp8GroupQuantFusionPass,
     )
 
@@ -40,8 +42,11 @@ from .noop_elimination import NoOpEliminationPass
 
 logger = init_logger(__name__)
 
+P = ParamSpec("P")
+R = TypeVar("R")
 
-def with_pattern_match_debug(fn):
+
+def with_pattern_match_debug(fn: Callable[P, R]) -> Callable[P, R]:
     """
     Function decorator that turns on inductor pattern match debug
     for the duration of the call.
@@ -49,7 +54,7 @@ def with_pattern_match_debug(fn):
     """
 
     @functools.wraps(fn)
-    def wrapper(*args, **kwargs):
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         if (debug_val := envs.VLLM_PATTERN_MATCH_DEBUG) is not None:
             # optionally check rank here
             with set_env_var("TORCHINDUCTOR_PATTERN_MATCH_DEBUG", debug_val):
@@ -59,7 +64,7 @@ def with_pattern_match_debug(fn):
     return wrapper
 
 
-class PostGradPassManager(CustomGraphPass):
+class PostGradPassManager(CustomGraphPass):  # type: ignore[misc]
     """
     The pass manager for post-grad passes.
     It handles configuration, adding custom passes, and running passes.
@@ -74,11 +79,11 @@ class PostGradPassManager(CustomGraphPass):
     This way, all passes operate on a functionalized graph.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.passes: list[InductorPass] = []
 
     @with_pattern_match_debug
-    def __call__(self, graph: fx.Graph):
+    def __call__(self, graph: fx.Graph) -> None:
         VllmInductorPass.dump_prefix = 0  # reset dump index
 
         compile_range = get_pass_context().compile_range
@@ -98,7 +103,7 @@ class PostGradPassManager(CustomGraphPass):
         self.fix_functionalization(graph)
         VllmInductorPass.dump_prefix = None  # Cleanup index
 
-    def configure(self, config: VllmConfig):
+    def configure(self, config: VllmConfig) -> None:
         self.pass_config = config.compilation_config.pass_config
 
         # Set the current vllm config to allow tracing CustomOp instances
@@ -117,7 +122,9 @@ class PostGradPassManager(CustomGraphPass):
             if self.pass_config.fuse_norm_quant:
                 self.passes += [RMSNormQuantFusionPass(config)]
                 if rocm_aiter_ops.is_enabled():
-                    self.passes += [RocmAiterRMSNormFp8GroupQuantFusionPass(config)]
+                    self.passes += [
+                        RocmAiterRMSNormFusionPass(config),
+                    ]
             if self.pass_config.fuse_act_quant:
                 self.passes += [ActivationQuantFusionPass(config)]
                 if rocm_aiter_ops.is_enabled():
@@ -133,23 +140,25 @@ class PostGradPassManager(CustomGraphPass):
             self.post_cleanup = PostCleanupPass(config)
             self.fix_functionalization = FixFunctionalizationPass(config)
 
-    def add(self, pass_: InductorPass):
+    def add(self, pass_: InductorPass) -> None:
         assert isinstance(pass_, InductorPass)
         self.passes.append(pass_)
 
-    def uuid(self):
+    def uuid(self) -> str:
         """
         The PostGradPassManager is set as a custom pass in the Inductor and
         affects compilation caching. Its uuid depends on the UUIDs of all
         dependent passes and the pass config. See InductorPass for more info.
         """
-        state = {"pass_config": self.pass_config.compute_hash(), "passes": []}
+        passes = []
+
+        state: dict[str, Any] = {"pass_config": self.pass_config.compute_hash()}
         for pass_ in self.passes:
-            state["passes"].append(pass_.uuid())
-        state["passes"].append(self.fix_functionalization.uuid())
+            passes.append(pass_.uuid())
+        passes.append(self.fix_functionalization.uuid())
 
         # Include the compile range in the uuid to ensure that inductor
         # recompiles the graph for the new dynamic compile range.
         state["compile_range"] = str(get_pass_context().compile_range)
-
+        state["passes"] = passes
         return InductorPass.hash_dict(state)
diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py
index 58d3e2a14b22ac88e053bd2b2554686ff4371fad..29d6f89990cd87390a27e6a522dc75523214c1d5 100644
--- a/vllm/compilation/piecewise_backend.py
+++ b/vllm/compilation/piecewise_backend.py
@@ -86,27 +86,36 @@ class PiecewiseBackend:
         self.to_be_compiled_ranges: set[Range] = set(self.compile_ranges)
 
         # We only keep compilation management inside this class directly.
-        for size in self.compile_sizes:
-            range = Range(start=size, end=size)
-            if range not in self.compile_ranges:
-                self.range_entries[range] = RangeEntry(
-                    compile_range=range,
-                )
-                self.to_be_compiled_ranges.add(range)
+        if self.compile_sizes is not None:
+            for size in self.compile_sizes:
+                if isinstance(size, str):
+                    assert size == "cudagraph_capture_sizes"
+                    raise NotImplementedError(
+                        "cudagraph_capture_sizes not supported in compile_sizes."
+                        "This should be handled in `post_init_cudagraph_sizes`."
+                    )
+                else:
+                    assert isinstance(size, int)
+                    range = Range(start=size, end=size)
+                    if range not in self.compile_ranges:
+                        self.range_entries[range] = RangeEntry(
+                            compile_range=range,
+                        )
+                        self.to_be_compiled_ranges.add(range)
 
         for range in self.compile_ranges:
             self.range_entries[range] = RangeEntry(
                 compile_range=range,
             )
 
-    def check_for_ending_compilation(self):
+    def check_for_ending_compilation(self) -> None:
         if self.is_last_graph and not self.to_be_compiled_ranges:
             # no specific sizes to compile
             # save the hash of the inductor graph for the next run
             self.vllm_backend.compiler_manager.save_to_file()
             end_monitoring_torch_compile(self.vllm_config)
 
-    def _fakify_args(self, args: list[Any]) -> list[Any]:
+    def _fakify_args(self, args: tuple[Any, ...]) -> list[Any]:
         # We need to pass fake example_inputs, otherwise torch.compile
         # will fakify the example_inputs potentially causing some non dynamic
         # dimension to be be duck shaped to other existing shapes that have hints
@@ -127,7 +136,9 @@ class PiecewiseBackend:
         assert len(fake_example_inputs) == len(args)
         return fake_example_inputs
 
-    def _maybe_compile_for_range_entry(self, range_entry: RangeEntry, args) -> Any:
+    def _maybe_compile_for_range_entry(
+        self, range_entry: RangeEntry, args: tuple[Any, ...]
+    ) -> Any:
         if not range_entry.compiled:
             range_entry.compiled = True
             self.to_be_compiled_ranges.remove(range_entry.compile_range)
@@ -136,14 +147,14 @@ class PiecewiseBackend:
             # fakify for range, real args for concrete size.
             # For concrete size, we clear the shape env in
             # compiler_manager.compile() so no need to fakify.
-            args = (
+            args_list = (
                 self._fakify_args(args)
                 if not range_entry.compile_range.is_single_size()
-                else args
+                else list(args)
             )
             range_entry.runnable = self.vllm_backend.compiler_manager.compile(
                 self.graph,
-                args,
+                args_list,
                 self.vllm_backend.inductor_config,
                 self.compilation_config,
                 compile_range=range_entry.compile_range,
@@ -153,10 +164,13 @@ class PiecewiseBackend:
 
             self.check_for_ending_compilation()
 
-    def _find_range_for_shape(self, runtime_shape: int) -> Range | None:
+    def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None:
         # First we try to find the range entry for the concrete compile size
         # If not found, we search for the range entry
         # that contains the runtime shape.
+        if self.compile_sizes is None:
+            return None
+
         if runtime_shape in self.compile_sizes:
             return self.range_entries[Range(start=runtime_shape, end=runtime_shape)]
         else:
@@ -165,13 +179,12 @@ class PiecewiseBackend:
                     return self.range_entries[range]
         return None
 
-    def __call__(self, *args) -> Any:
+    def __call__(self, *args: Any) -> Any:
         runtime_shape = args[self.sym_shape_indices[0]]
         range_entry = self._find_range_for_shape(runtime_shape)
 
         assert range_entry is not None, (
-            f"Shape out of considered range: {runtime_shape} "
-            "[1, max_num_batched_tokens]"
+            f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}"
         )
 
         self._maybe_compile_for_range_entry(range_entry, args)
diff --git a/vllm/compilation/qk_norm_rope_fusion.py b/vllm/compilation/qk_norm_rope_fusion.py
index 794cd8e3fce568e387481a10d9809843bc676582..bc95b7238af308abcea024c0cd2fb89514ed3577 100644
--- a/vllm/compilation/qk_norm_rope_fusion.py
+++ b/vllm/compilation/qk_norm_rope_fusion.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Callable
+from typing import ParamSpec
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -23,6 +24,8 @@ logger = init_logger(__name__)
 
 FUSED_QK_ROPE_OP = torch.ops._C.fused_qk_norm_rope.default
 
+P = ParamSpec("P")
+
 
 class QkNormRopePattern:
     """
@@ -72,7 +75,7 @@ class QkNormRopePattern:
             use_flashinfer=self.rope_flashinfer,
         )
 
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         # Sample inputs to help pattern tracing
         T = 5
         qkv = empty_bf16(T, self.q_size + 2 * self.kv_size)
@@ -92,8 +95,11 @@ class QkNormRopePattern:
         ]
 
     @staticmethod
-    def wrap_trace_fn(trace_fn, *process_fx_fns: Callable[[fx.GraphModule], None]):
-        def wrapped(*args, **kwargs):
+    def wrap_trace_fn(
+        trace_fn: Callable[P, fx.GraphModule],
+        *process_fx_fns: Callable[[fx.GraphModule], None],
+    ) -> Callable[P, fx.GraphModule]:
+        def wrapped(*args: P.args, **kwargs: P.kwargs) -> fx.GraphModule:
             gm = trace_fn(*args, **kwargs)
             for process_fx in process_fx_fns:
                 process_fx(gm)
@@ -103,19 +109,19 @@ class QkNormRopePattern:
         return wrapped
 
     @staticmethod
-    def fx_view_to_reshape(gm: torch.fx.GraphModule):
+    def fx_view_to_reshape(gm: torch.fx.GraphModule) -> None:
         from torch._inductor.fx_passes.post_grad import view_to_reshape
 
         view_to_reshape(gm)
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             qkv: torch.Tensor,
             positions: torch.Tensor,
             q_weight: torch.Tensor,
             k_weight: torch.Tensor,
             cos_sin_cache: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             # split qkv -> q,k,v
             q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
@@ -143,7 +149,7 @@ class QkNormRopePattern:
             q_weight: torch.Tensor,
             k_weight: torch.Tensor,
             cos_sin_cache: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             # Run fused qk_norm_rope op
             result = auto_functionalized(
                 FUSED_QK_ROPE_OP,
@@ -162,7 +168,7 @@ class QkNormRopePattern:
             result_qkv = result[1]
 
             # Split back to q,k,v and return
-            return result_qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+            return result_qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)  # type: ignore[no-any-return]
 
         # NOTE: use fx_view_to_reshape to unify view/reshape to simplify
         # pattern and increase matching opportunities
@@ -182,7 +188,7 @@ class QKNormRoPEFusionPass(VllmPatternMatcherPass):
     """Fuse Q/K RMSNorm + RoPE into fused_qk_norm_rope when the custom op exists."""
 
     @enable_fake_mode
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
         self.patterns: PatternMatcherPass = PatternMatcherPass(
             pass_name="qk_norm_rope_fusion_pass"
@@ -234,5 +240,5 @@ class QKNormRoPEFusionPass(VllmPatternMatcherPass):
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Fused QK Norm+RoPE on %s sites", self.matched_count)
 
-    def uuid(self):
+    def uuid(self) -> str:
         return VllmInductorPass.hash_source(self, QkNormRopePattern)
diff --git a/vllm/compilation/rocm_aiter_fusion.py b/vllm/compilation/rocm_aiter_fusion.py
index 8b5db9de38181a635f6a8ff24632f682265614f2..7a300cf5077785734b899882219a7429028314c7 100644
--- a/vllm/compilation/rocm_aiter_fusion.py
+++ b/vllm/compilation/rocm_aiter_fusion.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -9,60 +8,195 @@ from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch._ops import OpOverload
 
 import vllm.model_executor.layers.quantization.utils.fp8_utils  # noqa: F401
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.compilation.activation_quant_fusion import ActivationQuantPattern
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
+    QuantKey,
+    ScaleDesc,
+)
 from vllm.platforms import current_platform
 
-from .fusion import empty_bf16
+from .fusion import (
+    FusedRMSQuantKey,
+)
 from .inductor_pass import enable_fake_mode
-from .matcher_utils import MatcherSiluAndMul
+from .matcher_utils import (
+    MatcherFusedAddRMSNorm,
+    MatcherQuantFP8,
+    MatcherRMSNorm,
+    MatcherSiluAndMul,
+)
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 logger = init_logger(__name__)
 FP8_DTYPE = current_platform.fp8_dtype()
 
-AITER_RMS_GROUP_QUANT_OP = torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default
-AITER_RMS_ADD_GROUP_QUANT_OP = (
-    torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default
-)
 
-AITER_RMS_OP = torch.ops.vllm.rocm_aiter_rms_norm.default
-AITER_RMS_ADD_OP = torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default
+class AiterRMSNormQuantPattern:
+    def __init__(
+        self, epsilon: float, key: FusedRMSQuantKey, match_aiter_quant: bool = True
+    ):
+        self.epsilon = epsilon
+        self.quant_dtype = key.quant.dtype
+
+        self.rmsnorm_matcher = (
+            MatcherRMSNorm(epsilon, match_rocm_aiter=True)
+            if not key.fused_add
+            else MatcherFusedAddRMSNorm(epsilon, match_rocm_aiter=True)
+        )
+        self.quant_matcher = MatcherQuantFP8(
+            key.quant,
+            match_rocm_aiter=match_aiter_quant,
+        )
+
+
+class AiterRMSNormDynamicQuantPattern(AiterRMSNormQuantPattern):
+    """AITER RMSNorm + Dynamic Quantization pattern."""
+
+    FUSED_OP = rocm_aiter_ops.get_rmsnorm_fused_dynamic_quant_op()
+
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        match_aiter_quant: bool = True,
+        group_shape: GroupShape = GroupShape.PER_TOKEN,
+        symmetric: bool = True,
+    ) -> None:
+        scale = ScaleDesc(torch.float32, False, group_shape)
+        key = FusedRMSQuantKey(
+            fused_add=False,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
+
+        super().__init__(epsilon, key, match_aiter_quant)
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            result_rms = self.rmsnorm_matcher(input, weight)
+            result, scale = self.quant_matcher(result_rms)
+            return result, scale
+
+        def replacement(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            result = self.FUSED_OP(
+                x=input,
+                weight=weight,
+                epsilon=self.epsilon,
+                quant_dtype=self.quant_dtype,
+            )
+
+            return result[0], result[1]
+
+        pm.register_replacement(
+            pattern,
+            replacement,
+            self.rmsnorm_matcher.inputs(),
+            pm.fwd_only,
+            pm_pass,
+        )
+
+
+class AiterFusedAddRMSNormDynamicQuantPattern(AiterRMSNormQuantPattern):
+    """AITER RMSNorm Fused Add + Dynamic Quantization pattern."""
+
+    FUSED_OP = rocm_aiter_ops.get_rmsnorm_fused_add_dynamic_quant_op()
+
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        match_aiter_quant: bool = True,
+        group_shape: GroupShape = GroupShape.PER_TOKEN,
+        symmetric: bool = True,
+    ) -> None:
+        scale = ScaleDesc(torch.float32, False, group_shape)
+        key = FusedRMSQuantKey(
+            fused_add=True,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
+
+        super().__init__(epsilon, key, match_aiter_quant)
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+            residual: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual)
+            result, scale = self.quant_matcher(result_rms)
+
+            return result, residual_out, scale
+
+        def replacement(
+            input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            result = self.FUSED_OP(
+                x=input,
+                residual=residual,
+                weight=weight,
+                epsilon=self.epsilon,
+                quant_dtype=self.quant_dtype,
+            )
 
-AITER_GROUP_FP8_QUANT_OP = torch.ops.vllm.rocm_aiter_group_fp8_quant.default
-TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default
+            return result[0], result[1], result[2]
 
-FUSED_SILU_MUL_QUANT_OP = torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default
+        pm.register_replacement(
+            pattern,
+            replacement,
+            self.rmsnorm_matcher.inputs(),
+            pm.fwd_only,
+            pm_pass,
+        )
 
 
-class AiterRMSFp8GroupQuantPattern:
+class AiterRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern):
     """
     This pattern fuses aiter rms_norm & group fp8 quant custom
     ops into an aiter rms_norm_group_fp8_quant op.
     """
 
-    def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload):
-        self.epsilon = epsilon
-        self.quant_dtype = quant_dtype
-        self.quant_op = quant_op
+    FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_fused_quant_op()
+
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        group_shape: GroupShape,
+        match_aiter_quant: bool = True,
+        symmetric: bool = True,
+    ) -> None:
+        scale = ScaleDesc(torch.float32, False, group_shape)
+        key = FusedRMSQuantKey(
+            fused_add=False,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
 
-    def register(self, pm_pass: PatternMatcherPass):
+        super().__init__(epsilon, key, match_aiter_quant)
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             weight: torch.Tensor,
-        ):
-            at1 = AITER_RMS_OP(x=input, weight=weight, variance_epsilon=self.epsilon)
-
-            at2 = self.quant_op(at1, 128)
-
-            return at2[0], at2[1]
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            result_rms = self.rmsnorm_matcher(input, weight)
+            result, scale = self.quant_matcher(result_rms)
+            return result, scale
 
         def replacement(
             input: torch.Tensor,
             weight: torch.Tensor,
-        ):
-            at = AITER_RMS_GROUP_QUANT_OP(
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            at = self.FUSED_OP(
                 x=input,
                 weight=weight,
                 variance_epsilon=self.epsilon,
@@ -71,49 +205,52 @@ class AiterRMSFp8GroupQuantPattern:
 
             return at[0], at[1]
 
-        inputs = [
-            empty_bf16(5, 4),  # input
-            empty_bf16(1, 5),  # weight
-        ]
-
-        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass
+        )
 
 
-class AiterFusedAddRMSFp8GroupQuantPattern:
+class AiterFusedAddRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern):
     """
     This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops
     into a aiter rms_norm_with_add_group_fp8_quant op.
     """
 
-    def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload):
-        self.epsilon = epsilon
-        self.quant_dtype = quant_dtype
-        self.quant_op = quant_op
+    FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_add_fused_quant_op()
+
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        group_shape: GroupShape,
+        match_aiter_quant: bool = True,
+        symmetric: bool = True,
+    ) -> None:
+        scale = ScaleDesc(torch.float32, False, group_shape)
+        key = FusedRMSQuantKey(
+            fused_add=True,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
 
-    def register(self, pm_pass: PatternMatcherPass):
+        super().__init__(epsilon, key, match_aiter_quant)
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
-            residual: torch.Tensor,
             weight: torch.Tensor,
-        ):
-            at1 = AITER_RMS_ADD_OP(
-                x=input,
-                residual=residual,
-                weight=weight,
-                variance_epsilon=self.epsilon,
-            )
-
-            at2 = self.quant_op(at1[0], 128)
+            residual: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual)
+            result, scale = self.quant_matcher(result_rms)
 
-            # result, scale, residual
-            return at2[0], at2[1], at1[1]
+            return result, residual_out, scale
 
         def replacement(
             input: torch.Tensor,
-            residual: torch.Tensor,
             weight: torch.Tensor,
-        ):
-            at = AITER_RMS_ADD_GROUP_QUANT_OP(
+            residual: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            at = self.FUSED_OP(
                 x=input,
                 residual=residual,
                 weight=weight,
@@ -124,51 +261,63 @@ class AiterFusedAddRMSFp8GroupQuantPattern:
             # result, scale, residual
             return at[0], at[1], at[2]
 
-        inputs = [
-            empty_bf16(5, 4),  # input
-            empty_bf16(5, 4),  # residual
-            empty_bf16(1, 5),  # weight
-        ]
-
-        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass
+        )
 
 
-class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass):
+class RocmAiterRMSNormFusionPass(VllmPatternMatcherPass):
     """
-    This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op.
+    This pass fuses aiter rms_norm & vllm/aiter quant custom ops
+    into a fused rms_norm_quant op.
     It also supports fused_add_rms_norm.
     """
 
     @enable_fake_mode
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
         self.patterns: PatternMatcherPass = PatternMatcherPass(
-            pass_name="rocm_aiter_rms_norm_fp8_group_quant_fusion_pass"
+            pass_name="rocm_aiter_rms_norm_quant_fusion_pass"
         )
 
         # Make sure fused add patterns are before simple rms norm,
         # as the latter is a subset of the former in torch ops
         for epsilon in [1e-5, 1e-6]:
-            # Fuse rms_norm + dynamic group fp8 quant
-            for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]:
-                AiterRMSFp8GroupQuantPattern(epsilon, FP8_DTYPE, quant_op).register(
-                    self.patterns
-                )
-
-                AiterFusedAddRMSFp8GroupQuantPattern(
-                    epsilon, FP8_DTYPE, quant_op
+            #  Fuse aiter rms_norm + aiter dynamic group fp8 quant
+            AiterRMSFp8GroupQuantPattern(
+                epsilon, FP8_DTYPE, GroupShape(1, 128)
+            ).register(self.patterns)
+
+            # Fuse aiter fused_add_rms_norm + aiter dynamic group fp8 quant
+            AiterFusedAddRMSFp8GroupQuantPattern(
+                epsilon, FP8_DTYPE, GroupShape(1, 128)
+            ).register(self.patterns)
+
+            for match_aiter_quant in [True, False]:
+                # Fuse aiter rms_norm + (aiter / vllm built-in)
+                # dynamic per-token fp8 quant
+                AiterRMSNormDynamicQuantPattern(
+                    epsilon, FP8_DTYPE, match_aiter_quant=match_aiter_quant
+                ).register(self.patterns)
+
+                # Fuse aiter fused_add_rms_norm + (aiter / vllm built-in)
+                # dynamic per-token fp8 quant
+                AiterFusedAddRMSNormDynamicQuantPattern(
+                    epsilon, FP8_DTYPE, match_aiter_quant=match_aiter_quant
                 ).register(self.patterns)
 
         self.dump_patterns(config, self.patterns)
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: fx.Graph):
+    def __call__(self, graph: fx.Graph) -> None:
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", self.matched_count)
 
-    def uuid(self) -> Any:
+    def uuid(self) -> str:
         fusion_patterns = [
+            AiterRMSNormDynamicQuantPattern,
+            AiterFusedAddRMSNormDynamicQuantPattern,
             AiterRMSFp8GroupQuantPattern,
             AiterFusedAddRMSFp8GroupQuantPattern,
         ]
@@ -181,29 +330,34 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern):
     ops into an aiter silu_and_mul_group_fp8_quant op.
     """
 
-    def __init__(self, quant_op: OpOverload):
+    FUSED_SILU_MUL_QUANT_OP = rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_op()
+
+    def __init__(self, quant_op: OpOverload) -> None:
         self.silu_and_mul_matcher = MatcherSiluAndMul()
         self.quant_op = quant_op
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def get_inputs(self) -> list[torch.Tensor]:
+        return [
+            self.silu_and_mul_matcher.inputs()[0],
+        ]
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             at1 = self.silu_and_mul_matcher(input)
             at2 = self.quant_op(at1, 128)
             return at2[0], at2[1]
 
         def replacement(
             input: torch.Tensor,
-        ):
-            at = FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128)
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            at = self.FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128)
             return at[0], at[1]
 
-        inputs = [
-            self.silu_and_mul_matcher.inputs()[0],
-        ]
-
-        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass):
@@ -216,25 +370,30 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass):
     https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980
     """
 
+    AITER_GROUP_FP8_QUANT_OP = rocm_aiter_ops.get_group_quant_op()
+    TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default
+
+    QUANT_OPS = [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]
+
     @enable_fake_mode
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
         self.patterns: PatternMatcherPass = PatternMatcherPass(
             pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass"
         )
 
-        for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]:
+        for quant_op in self.QUANT_OPS:
             AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns)
 
         self.dump_patterns(config, self.patterns)
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: torch.fx.Graph):
+    def __call__(self, graph: torch.fx.Graph) -> None:
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", self.matched_count)
 
-    def uuid(self):
+    def uuid(self) -> str:
         fusion_patterns = [
             ActivationQuantPattern,
             AiterSiluMulFp8GroupQuantPattern,
diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py
index 9fb8cca9760a3d0845758f6dd8bcc8f587d5c4c4..7d83f4863d425d59d9ca120bec148788f2b3fcff 100644
--- a/vllm/compilation/sequence_parallelism.py
+++ b/vllm/compilation/sequence_parallelism.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import functools
+from collections.abc import Callable, Sequence
+from typing import Any
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -26,9 +28,11 @@ from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 logger = init_logger(__name__)
 
 
-def get_first_out_wrapper(fn):
+def get_first_out_wrapper(
+    fn: Callable[..., Sequence[torch.Tensor]],
+) -> Callable[..., torch.Tensor]:
     @functools.wraps(fn)
-    def wrapper(*args):
+    def wrapper(*args: Any) -> torch.Tensor:
         return fn(*args)[0]
 
     return wrapper
@@ -41,8 +45,8 @@ class _SequenceParallelPatternHelper:
         self,
         epsilon: float,
         dtype: torch.dtype,
-        device: str,
-    ):
+        device: str | None,
+    ) -> None:
         self.epsilon = epsilon
         self.dtype = dtype
         self.device = device
@@ -64,21 +68,21 @@ class _SequenceParallelPatternHelper:
 
 
 class FirstAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str):
+    def __init__(self, epsilon: float, dtype: torch.dtype, device: str | None) -> None:
         super().__init__(epsilon, dtype, device)
         self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
 
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         input = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype)
         arg3_1 = torch.empty([4], device=self.device, dtype=self.dtype)
 
         return [input, arg3_1]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             arg3_1: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             all_reduce = self._all_reduce(input)
             rmsnorm = self.rmsnorm_matcher(all_reduce, arg3_1)
 
@@ -87,7 +91,7 @@ class FirstAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
         def replacement(
             input: torch.Tensor,
             arg3_1: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             reduce_scatter = self._reduce_scatter(input)
 
             rmsnorm = self.rmsnorm_matcher(reduce_scatter, arg3_1)
@@ -100,11 +104,11 @@ class FirstAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
 
 
 class MiddleAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str):
+    def __init__(self, epsilon: float, dtype: torch.dtype, device: str | None) -> None:
         super().__init__(epsilon, dtype, device)
         self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype)
 
         residual = torch.empty([4, 4], device=self.device, dtype=self.dtype)
@@ -116,7 +120,7 @@ class MiddleAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
             rms_norm_weights,
         ]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             residual: torch.Tensor,
             mm_1: torch.Tensor,
@@ -162,24 +166,24 @@ class FirstAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
         self,
         epsilon: float,
         dtype: torch.dtype,
-        device: str,
-    ):
+        device: str | None,
+    ) -> None:
         super().__init__(epsilon, dtype, device)
         self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
         self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         input = torch.zeros([1, 8, 4], device=self.device, dtype=self.dtype)
         weight = torch.empty([4], device=self.device, dtype=self.dtype)
         scale = torch.tensor(1.0, device=self.device, dtype=torch.float32)
         return [input, weight, scale]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             all_reduce = self._all_reduce(input)
             rms = self.rmsnorm_matcher(all_reduce, weight)
             quant, _ = self.quant_matcher(rms, scale)
@@ -189,7 +193,7 @@ class FirstAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
             input: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
-        ):
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             reduce_scatter = self._reduce_scatter(input)
             rms = self.rmsnorm_matcher(reduce_scatter, weight)
             quant, _ = self.quant_matcher(rms, scale)
@@ -203,12 +207,12 @@ class FirstAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
 
 
 class MiddleAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str):
+    def __init__(self, epsilon: float, dtype: torch.dtype, device: str | None) -> None:
         super().__init__(epsilon, dtype, device)
         self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
         self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
-    def get_inputs(self):
+    def get_inputs(self) -> list[torch.Tensor]:
         mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype)
         residual = torch.empty([4, 4], device=self.device, dtype=self.dtype)
         rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype)
@@ -216,7 +220,7 @@ class MiddleAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
 
         return [residual, mm_1, rms_norm_weights, scale]
 
-    def register(self, pm_pass: PatternMatcherPass):
+    def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             residual: torch.Tensor,
             mm_1: torch.Tensor,
@@ -302,7 +306,7 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
     """
 
     @enable_fake_mode
-    def __init__(self, config: VllmConfig):
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
         # Used to clean up redundant views created temporarily
@@ -357,7 +361,7 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
         return (compile_range.is_single_size()) and (compile_range.end % tp_size == 0)
 
     @VllmInductorPass.time_and_log
-    def __call__(self, graph: fx.Graph):
+    def __call__(self, graph: fx.Graph) -> None:
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", self.matched_count)
         # Clean up reshape nodes
diff --git a/vllm/compilation/torch25_custom_graph_pass.py b/vllm/compilation/torch25_custom_graph_pass.py
index 1031856cdf008037732c59675b13013aa256229a..2da4190c416a8d050a13c2c8bc06a3cad6c3b8b0 100644
--- a/vllm/compilation/torch25_custom_graph_pass.py
+++ b/vllm/compilation/torch25_custom_graph_pass.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, NoReturn
 
 import torch
 
@@ -29,14 +29,14 @@ class Torch25CustomGraphPass(ABC):  # noqa (redefinition)
         Return None to skip inductor code caching entirely.
         """
 
-    def __getstate__(self):
+    def __getstate__(self) -> Any | None:
         """
         Pickling is used instead of uuid() in torch<2.6. Just return uuid()
          to enable subclasses to only have to implement uuid.
         """
         return self.uuid()
 
-    def __setstate__(self, state):
+    def __setstate__(self, state: Any) -> NoReturn:
         raise ValueError(
             "Cannot unpickle CustomGraphPass because pickling"
             " is used for cache key uuid. Use torch>=2.6 with"
diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py
index 08721e3ae4a249c99a19ce86ed9ec30fae2b7f45..b64c892881f5dc9ed1884262caf204dc8d466388 100644
--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -3,6 +3,7 @@
 import functools
 import operator
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from typing import ClassVar
 
@@ -43,13 +44,17 @@ class VllmInductorPass(InductorPass):
         )
         self.pass_config = config.compilation_config.pass_config
         self.model_dtype = config.model_config.dtype if config.model_config else None
-        self.device = config.device_config.device if config.device_config else None
+        self.device: str | None = (
+            config.device_config.device if config.device_config else None
+        )
         self.pass_name = self.__class__.__name__
 
     @staticmethod
-    def time_and_log(call_fn):
+    def time_and_log(
+        call_fn: Callable[["VllmInductorPass", torch.fx.Graph], None],
+    ) -> Callable[["VllmInductorPass", torch.fx.Graph], None]:
         @functools.wraps(call_fn)
-        def wrapped(self: VllmInductorPass, graph: torch.fx.Graph):
+        def wrapped(self: VllmInductorPass, graph: torch.fx.Graph) -> None:
             self.begin()
             self.dump_graph(graph, "before")
             call_fn(self, graph)
@@ -58,17 +63,17 @@ class VllmInductorPass(InductorPass):
 
         return wrapped
 
-    def dump_graph(self, graph: torch.fx.Graph, stage: str):
+    def dump_graph(self, graph: torch.fx.Graph, stage: str) -> None:
         i = VllmInductorPass.dump_prefix
         i_str = "" if i is None else f".{i}"
         lazy_format_graph_code(
             f"post_grad{i_str}.{self.pass_name}.{stage}", graph.owning_module
         )
 
-    def begin(self):
+    def begin(self) -> None:
         self._start_time = time.perf_counter_ns()
 
-    def end_and_log(self):
+    def end_and_log(self) -> None:
         self._end_time = time.perf_counter_ns()
         duration_ms = float(self._end_time - self._start_time) / 1.0e6
         logger.debug("%s completed in %.1f ms", self.pass_name, duration_ms)
@@ -92,12 +97,14 @@ class VllmPatternMatcherPass(VllmInductorPass):
 
     def _replace_op_overloads(self, string: str) -> str:
         """Replace <OpOverload(..., ...)> with nicer formulations"""
-        return self._OP_OVERLOAD_PATTERN.sub(
-            lambda m: f"torch.ops.{m.group(1)}.{m.group(2)}",
-            string,
+        return str(
+            self._OP_OVERLOAD_PATTERN.sub(
+                lambda m: f"torch.ops.{m.group(1)}.{m.group(2)}",
+                string,
+            )
         )
 
-    def dump_patterns(self, config: VllmConfig, pm_pass: PatternMatcherPass):
+    def dump_patterns(self, config: VllmConfig, pm_pass: PatternMatcherPass) -> None:
         """
         If debug dumping is enabled, dump the Inductor pattern-matcher patterns
         into the debug_dump_path folder next to the dumped fx graphs.
@@ -165,9 +172,9 @@ class VllmPatternMatcherPass(VllmInductorPass):
 
 
 class PrinterInductorPass(VllmInductorPass):
-    def __init__(self, name: str, config: VllmConfig):
+    def __init__(self, name: str, config: VllmConfig) -> None:
         super().__init__(config)
         self.name = name
 
-    def __call__(self, graph: torch.fx.Graph):
+    def __call__(self, graph: torch.fx.Graph) -> None:
         self.dump_graph(graph, self.name)
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 02e974b0f9e8c5e7ca0c757d564ea18e8bf0339c..62574d8072d2ed241608e874a09c2d49631abb3d 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -4,9 +4,10 @@
 import os
 import sys
 from abc import abstractmethod
+from collections.abc import Callable, Generator
 from contextlib import contextmanager, nullcontext
 from types import CodeType
-from typing import Any
+from typing import Any, ParamSpec, TypeVar
 
 import torch
 import torch._C._dynamo.guards
@@ -19,19 +20,26 @@ from vllm.utils.nvtx_pytorch_hooks import layerwise_nvtx_marker_context
 
 logger = init_logger(__name__)
 
+R = TypeVar("R")
+P = ParamSpec("P")
 
-def _noop_add_global_state_guard(self, *args, **kwargs):
+
+def _noop_add_global_state_guard(
+    self: torch._C._dynamo.guards.GuardManager, *args: Any, **kwargs: Any
+) -> None:
     """No-op to skip the GLOBAL_STATE guard entirely"""
     pass
 
 
-def _noop_add_torch_function_mode_stack_guard(self, *args, **kwargs):
+def _noop_add_torch_function_mode_stack_guard(
+    self: torch._C._dynamo.guards.GuardManager, *args: Any, **kwargs: Any
+) -> None:
     """No-op to skip the TORCH_FUNCTION_MODE_STACK guard entirely"""
     pass
 
 
 @contextmanager
-def _compilation_context():
+def _compilation_context() -> Generator[None, None, None]:
     """Context manager for compilation settings and patches.
 
     This manager:
@@ -88,13 +96,15 @@ class TorchCompileWithNoGuardsWrapper:
     since we drop all guards.
     """
 
-    def check_invariants_and_forward(self, *args, **kwargs):
+    def check_invariants_and_forward(self, *args: Any, **kwargs: Any) -> Any:
         assert hasattr(self, "_check_shape_invariants")
         self._check_shape_invariants(*args, **kwargs)
 
         return self.forward(*args, **kwargs)
 
-    def _call_with_optional_nvtx_range(self, callable_fn, *args, **kwargs):
+    def _call_with_optional_nvtx_range(
+        self, callable_fn: Callable[P, R], *args: P.args, **kwargs: P.kwargs
+    ) -> Any:
         if self.layerwise_nvtx_tracing_enabled:
             args_list = list(args)
             kwargs_dict = dict(kwargs)
@@ -108,7 +118,7 @@ class TorchCompileWithNoGuardsWrapper:
             return ctx.result
         return callable_fn(*args, **kwargs)
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.compiled = False
 
         vllm_config = get_current_vllm_config()
@@ -192,9 +202,9 @@ class TorchCompileWithNoGuardsWrapper:
 
         if envs.VLLM_USE_BYTECODE_HOOK and mode != CompilationMode.STOCK_TORCH_COMPILE:
             torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook)
-            self._compiled_bytecode = None
+            self._compiled_bytecode: CodeType | None = None
 
-    def aot_compile(self, *args, **kwargs):
+    def aot_compile(self, *args: Any, **kwargs: Any) -> Any:
         if not hasattr(self._compiled_callable, "aot_compile"):
             raise RuntimeError(
                 "aot_compile is not supported by the current configuration. "
@@ -203,7 +213,7 @@ class TorchCompileWithNoGuardsWrapper:
             )
         return self._compiled_callable.aot_compile((args, kwargs))
 
-    def __call__(self, *args, **kwargs):
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
         if envs.VLLM_USE_BYTECODE_HOOK:
             if (
                 self.vllm_config.compilation_config.mode
@@ -236,13 +246,13 @@ class TorchCompileWithNoGuardsWrapper:
                 )
 
     @abstractmethod
-    def forward(self, *args, **kwargs): ...
+    def forward(self, *args: Any, **kwargs: Any) -> Any: ...
 
     def original_code_object(self) -> CodeType:
         """Return the original code object of the forward method."""
         return self.__class__.forward.__code__
 
-    def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
+    def bytecode_hook(self, old_code: CodeType, new_code: CodeType) -> None:
         """Hook to save the compiled bytecode for direct execution."""
         if old_code is not self.original_code_object():
             return
@@ -299,7 +309,7 @@ class TorchCompileWithNoGuardsWrapper:
             raise RuntimeError(msg)
 
     @contextmanager
-    def _dispatch_to_compiled_code(self):
+    def _dispatch_to_compiled_code(self) -> Generator[None, None, None]:
         # noqa: E501
         """
         Context manager to dispatch to internally compiled code for torch<2.8.
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 0e91dd57420a8bfcac58838153a8836dfdce2013..7f6565053ee69911f977ae0e9e53225a9d6c8d86 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -18,6 +18,7 @@ from vllm.config.lora import LoRAConfig
 from vllm.config.model import (
     ModelConfig,
     iter_architecture_defaults,
+    str_dtype_to_torch_dtype,
     try_match_architecture_defaults,
 )
 from vllm.config.multimodal import MultiModalConfig
@@ -41,6 +42,7 @@ from vllm.config.vllm import (
     VllmConfig,
     get_cached_compilation_config,
     get_current_vllm_config,
+    get_current_vllm_config_or_none,
     get_layers_from_vllm_config,
     set_current_vllm_config,
 )
@@ -72,6 +74,7 @@ __all__ = [
     # From vllm.config.model
     "ModelConfig",
     "iter_architecture_defaults",
+    "str_dtype_to_torch_dtype",
     "try_match_architecture_defaults",
     # From vllm.config.multimodal
     "MultiModalConfig",
@@ -103,6 +106,7 @@ __all__ = [
     "VllmConfig",
     "get_cached_compilation_config",
     "get_current_vllm_config",
+    "get_current_vllm_config_or_none",
     "set_current_vllm_config",
     "get_layers_from_vllm_config",
 ]
diff --git a/vllm/config/attention.py b/vllm/config/attention.py
index dd62d88826bd602a0bc30352bcaceb19f488ede8..293045787a1c0166b06ca615b1d3f70fd3a85db7 100644
--- a/vllm/config/attention.py
+++ b/vllm/config/attention.py
@@ -6,9 +6,9 @@ from typing import Any, Literal
 from pydantic import field_validator
 from pydantic.dataclasses import dataclass
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.utils import config
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 logger = init_logger(__name__)
 
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 01ac8854d93ba53ca3a1f047cd7ceb9e826264a9..1c342db52f0d455215d9a1bf8919ad3281514aa3 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import math
 from dataclasses import field
 from typing import TYPE_CHECKING, Any, Literal
 
@@ -10,8 +11,9 @@ from pydantic.dataclasses import dataclass
 from vllm.config.utils import config
 from vllm.logger import init_logger
 from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import get_cpu_memory
+
 from vllm import envs
+from vllm.utils.mem_utils import format_gib, get_cpu_memory
 
 if TYPE_CHECKING:
     from vllm.config.parallel import ParallelConfig
@@ -154,13 +156,13 @@ class CacheConfig:
     kv_offloading_size: float | None = None
     """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
     the total buffer size summed across all TP ranks. By default, this is set
-    to None, which means no KV offloading is enabled. When set with
-    kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""
+    to None, which means no KV offloading is enabled. When set, vLLM will
+    enable KV cache offloading to CPU using the kv_offloading_backend."""
 
-    kv_offloading_backend: KVOffloadingBackend | None = None
+    kv_offloading_backend: KVOffloadingBackend = "native"
     """The backend to use for KV cache offloading. Supported backends include
-    'native' (vLLM native CPU offloading), 'lmcache' This option must be used
-    together with kv_offloading_size."""
+    'native' (vLLM native CPU offloading), 'lmcache'.
+    KV offloading is only activated when kv_offloading_size is set."""
 
     def compute_hash(self) -> str:
         """
@@ -217,7 +219,7 @@ class CacheConfig:
         self,
         parallel_config: ParallelConfig,
     ) -> None:
-        swap_space_bytes = self.swap_space * GiB_bytes
+        swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
         total_cpu_memory = get_cpu_memory()
         # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
         # group are in the same node. However, the GPUs may span multiple nodes.
@@ -225,8 +227,8 @@ class CacheConfig:
         cpu_memory_usage = swap_space_bytes * num_gpus_per_node
 
         msg = (
-            f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
-            f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
+            f"{format_gib(cpu_memory_usage)} GiB out of the "
+            f"{format_gib(total_cpu_memory)} GiB total CPU memory "
             "is allocated for the swap space."
         )
         if cpu_memory_usage > 0.7 * total_cpu_memory:
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index b5d17ea7a939fd936018eb87bc978f95f87f5208..9907bb4f65511590abb7a5f67a331416184e4a66 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -33,6 +33,9 @@ else:
 
 logger = init_logger(__name__)
 
+# Explicitly exports Range
+__all__ = ["Range"]
+
 
 class CompilationMode(enum.IntEnum):
     """The compilation approach used for torch.compile-based compilation of the
@@ -276,7 +279,11 @@ class DynamicShapesConfig:
     artifacts also.
     When type is backed, aot_compile must be disabled for this mode to work.
     until this change picked up https://github.com/pytorch/pytorch/pull/169239.
+    """
 
+    assume_32_bit_indexing: bool = True
+    """
+    whether all tensor sizes can use 32 bit indexing.
     """
 
     def compute_hash(self) -> str:
@@ -405,7 +412,8 @@ class CompilationConfig:
     - 'none,+op1,+op2' to enable only op1 and op2
 
     By default, all custom ops are enabled when running without Inductor and
-    disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
+    disabled when running with Inductor: mode>CompilationMode.NONE and
+    backend="inductor".
     Inductor generates (fused) Triton kernels for disabled custom ops."""
     splitting_ops: list[str] | None = None
     """A list of ops to exclude from cudagraphs, used in piecewise compilation.
@@ -427,8 +435,9 @@ class CompilationConfig:
     If empty list [], no ops are excluded (suitable for full cudagraphs)."""
     compile_mm_encoder: bool = False
     """Whether or not to compile the multimodal encoder.
-    Currently, this only works for `Qwen2_5_vl` on selected platforms.
-    Disabled by default until more models are supported/tested to work."""
+    Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models
+    on selected platforms. Disabled by default until more models
+    are supported/tested to work."""
 
     # Inductor capture
     compile_sizes: list[int | str] | None = None
@@ -438,14 +447,14 @@ class CompilationConfig:
 
     compile_ranges_split_points: list[int] | None = None
     """Split points that represent compile ranges for inductor.
-    The compile ranges are 
-    [1, split_points[0]], 
-    [split_points[0] + 1, split_points[1]], ..., 
+    The compile ranges are
+    [1, split_points[0]],
+    [split_points[0] + 1, split_points[1]], ...,
     [split_points[-1] + 1, max_num_batched_tokens].
     Compile sizes are also used single element ranges,
     the range is represented as [compile_sizes[i], compile_sizes[i]].
-    
-    If a range overlaps with the compile size, graph for compile size 
+
+    If a range overlaps with the compile size, graph for compile size
     will be prioritized, i.e. if we have a range [1, 8] and a compile size 4,
     graph for compile size 4 will be compiled and used instead of the graph
     for range [1, 8].
@@ -636,6 +645,7 @@ class CompilationConfig:
             "compilation_time",
             "static_forward_context",
             "pass_config",  # handled separately below
+            "dynamic_shapes_config",  # handled separately below
         }
 
         from vllm.config.utils import get_hash_factors, hash_factors
@@ -643,6 +653,7 @@ class CompilationConfig:
         factors = get_hash_factors(self, ignored_factors)
 
         factors["pass_config"] = self.pass_config.compute_hash()
+        factors["dynamic_shapes_config"] = self.dynamic_shapes_config.compute_hash()
         return hash_factors(factors)
 
     def __repr__(self) -> str:
@@ -840,9 +851,9 @@ class CompilationConfig:
         """
         if self.mode is None:
             raise ValueError(
-                "No compilation mode is set. This method should only be \
-                called via vllm config where the level is set if none is \
-                provided."
+                "No compilation mode is set. This method should only be "
+                "called via vllm config where the level is set if none is "
+                "provided."
             )
         if self.mode == CompilationMode.NONE:
             raise ValueError("No compilation mode is set.")
@@ -900,7 +911,7 @@ class CompilationConfig:
         self.compute_bs_to_padded_graph_size()
 
     def set_splitting_ops_for_v1(
-        self, all2all_backend: str | None = None, data_parallel_size: int | None = None
+        self, all2all_backend: str, data_parallel_size: int = 1
     ):
         # To compatible with OOT hardware plugin platform (for example vllm-ascend)
         # which currently only supports sequence parallelism in eager mode.
@@ -935,12 +946,12 @@ class CompilationConfig:
                     or self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
                 ):
                     logger.warning_once(
-                        "Using piecewise compilation with empty splitting_ops"
+                        "Using piecewise cudagraph with empty splitting_ops"
                     )
                 if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
                     logger.warning_once(
-                        "Piecewise compilation with empty splitting_ops do not"
-                        "contains piecewise cudagraph. Setting cudagraph_"
+                        "Piecewise compilation with empty splitting_ops does not "
+                        "contain piecewise cudagraph. Setting cudagraph_"
                         "mode to NONE. Hint: If you are using attention "
                         "backends that support cudagraph, consider manually "
                         "setting cudagraph_mode to FULL or FULL_DECODE_ONLY "
@@ -949,19 +960,17 @@ class CompilationConfig:
                     self.cudagraph_mode = CUDAGraphMode.NONE
                 elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
                     logger.warning_once(
-                        "Piecewise compilation with empty splitting_ops do "
-                        "not contains piecewise cudagraph. Setting "
+                        "Piecewise compilation with empty splitting_ops does "
+                        "not contain piecewise cudagraph. Setting "
                         "cudagraph_mode to FULL."
                     )
                     self.cudagraph_mode = CUDAGraphMode.FULL
                 self.splitting_ops = []
 
         # Disable CUDA graphs for DeepEP high-throughput since its not CG compatible
-        backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND
-        dp_size = data_parallel_size if data_parallel_size is not None else 1
         if (
-            backend == "deepep_high_throughput"
-            and dp_size > 1
+            all2all_backend == "deepep_high_throughput"
+            and data_parallel_size > 1
             and self.cudagraph_mode != CUDAGraphMode.NONE
         ):
             # TODO: Piecewise Cuda graph might be enabled
diff --git a/vllm/config/lora.py b/vllm/config/lora.py
index 6a8fd6359aadd01d913ad82d26f236084ffe83e2..56aa08fcb273a9b8be64e9701e0ff1c0bbee8d95 100644
--- a/vllm/config/lora.py
+++ b/vllm/config/lora.py
@@ -55,6 +55,11 @@ class LoRAConfig:
     per prompt. When run in offline mode, the lora IDs for n modalities
     will be automatically assigned to 1-n with the names of the modalities
     in alphabetic order."""
+    enable_tower_connector_lora: bool = False
+    """If `True`, LoRA support for the tower (vision encoder) and connector 
+    of multimodal models will be enabled. This is an experimental feature and 
+    currently only supports some MM models such as the Qwen VL series. The default 
+    is False."""
 
     def compute_hash(self) -> str:
         """
@@ -73,6 +78,7 @@ class LoRAConfig:
         factors.append(self.max_loras)
         factors.append(self.fully_sharded_loras)
         factors.append(self.lora_dtype)
+        factors.append(self.enable_tower_connector_lora)
 
         hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
@@ -84,7 +90,7 @@ class LoRAConfig:
         elif self.max_cpu_loras < self.max_loras:
             raise ValueError(
                 f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
-                f"max_loras ({self.max_loras})"
+                f"max_loras ({self.max_loras})."
             )
 
         return self
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 9c9efc0a6da9a9b45bc3c123ade08eec8fa18032..8aed94df1a8bd678dd43514e4349654b2262833b 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -11,11 +11,11 @@ from typing import TYPE_CHECKING, Any, Literal, cast, get_args
 import torch
 from pydantic import ConfigDict, Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
-from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
-from transformers.configuration_utils import ALLOWED_LAYER_TYPES
 
 import vllm.envs as envs
-from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config.model_arch import (
+    ModelArchitectureConfig,
+)
 from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig
 from vllm.config.pooler import PoolerConfig
 from vllm.config.scheduler import RunnerType
@@ -30,9 +30,9 @@ from vllm.transformers_utils.config import (
     get_pooling_config,
     get_sentence_transformer_tokenizer_config,
     is_encoder_decoder,
+    is_rope_parameters_nested,
     try_get_dense_modules,
     try_get_generation_config,
-    try_get_safetensors_metadata,
     try_get_tokenizer_config,
     uses_mrope,
     uses_xdrope_dim,
@@ -43,10 +43,14 @@ from vllm.transformers_utils.gguf_utils import (
     maybe_patch_hf_config_from_gguf,
     split_remote_gguf,
 )
+from vllm.transformers_utils.model_arch_config_convertor import (
+    MODEL_ARCH_CONFIG_CONVERTORS,
+    ModelArchConfigConvertorBase,
+)
 from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
 from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.import_utils import LazyLoader
-from vllm.utils.torch_utils import common_broadcastable_dtype
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
@@ -72,7 +76,7 @@ else:
 logger = init_logger(__name__)
 
 RunnerOption = Literal["auto", RunnerType]
-ConvertType = Literal["none", "embed", "classify", "reward"]
+ConvertType = Literal["none", "embed", "classify", "reward", "mm_encoder_only"]
 ConvertOption = Literal["auto", ConvertType]
 TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
@@ -105,6 +109,10 @@ class ModelConfig:
     """Name or path of the Hugging Face model to use. It is also used as the
     content for `model_name` tag in metrics output when `served_model_name` is
     not specified."""
+    model_weights: str = ""
+    """Original model weights path. Used when the model is pulled from object 
+    storage (e.g., RunAI) to preserve the original URI while `model` points to 
+    the local directory."""
     runner: RunnerOption = "auto"
     """The type of model runner to use. Each vLLM instance only supports one
     model runner, even if the same model can be used for multiple types."""
@@ -167,7 +175,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: int = Field(default=None, gt=0)
+    max_model_len: int = Field(default=None, ge=-1)
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
@@ -175,7 +183,10 @@ class ModelConfig:
     format. Examples:\n
     - 1k -> 1000\n
     - 1K -> 1024\n
-    - 25.6k -> 25,600"""
+    - 25.6k -> 25,600\n
+    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
+    GPU memory. This will use the model's maximum context length if it fits,
+    otherwise it will find the largest length that can be accommodated."""
     spec_target_max_model_len: int | None = None
     """Specify the maximum length for spec decoding draft models."""
     quantization: QuantizationMethods | str | None = None
@@ -183,11 +194,15 @@ class ModelConfig:
     `quantization_config` attribute in the model config file. If that is
     `None`, we assume the model weights are not quantized and use `dtype` to
     determine the data type of the weights."""
+    allow_deprecated_quantization: bool = False
+    """Whether to allow deprecated quantization methods."""
     enforce_eager: bool = False
     """Whether to always use eager-mode PyTorch. If True, we will disable CUDA
     graph and always execute the model in eager mode. If False, we will use
     CUDA graph and eager execution in hybrid for maximal performance and
     flexibility."""
+    enable_return_routed_experts: bool = False
+    """Whether to return routed experts."""
     max_logprobs: int = 20
     """Maximum number of log probabilities to return when `logprobs` is
     specified in `SamplingParams`. The default value comes the default for the
@@ -487,6 +502,7 @@ class ModelConfig:
         self.hf_image_processor_config = get_hf_image_processor_config(
             self.model, hf_token=self.hf_token, revision=self.revision
         )
+        self.model_arch_config = self.get_model_arch_config()
 
         architectures = self.architectures
         registry = self.registry
@@ -532,9 +548,12 @@ class ModelConfig:
                     if getattr(self.pooler_config, k) is None:
                         setattr(self.pooler_config, k, v)
 
-            default_pooling_type = self._model_info.default_pooling_type
-            if self.pooler_config.pooling_type is None:
-                self.pooler_config.pooling_type = default_pooling_type
+            default_seq_pooling_type = self._model_info.default_seq_pooling_type
+            if self.pooler_config.seq_pooling_type is None:
+                self.pooler_config.seq_pooling_type = default_seq_pooling_type
+            default_tok_pooling_type = self._model_info.default_tok_pooling_type
+            if self.pooler_config.tok_pooling_type is None:
+                self.pooler_config.tok_pooling_type = default_tok_pooling_type
 
         self.dtype: torch.dtype = _get_and_verify_dtype(
             self.model,
@@ -599,11 +618,20 @@ class ModelConfig:
 
         # Avoid running try_verify_and_update_config multiple times
         self.config_updated = False
-
+        self._try_verify_and_update_model_config()
         self._verify_quantization()
         self._verify_cuda_graph()
         self._verify_bnb_config()
 
+    def get_model_arch_config(
+        self,
+    ) -> ModelArchitectureConfig:
+        convertor_cls = MODEL_ARCH_CONFIG_CONVERTORS.get(
+            self.hf_config.model_type, ModelArchConfigConvertorBase
+        )
+        convertor = convertor_cls(self.hf_config, self.hf_text_config)
+        return convertor.convert()
+
     @field_validator("tokenizer", "max_model_len", mode="wrap")
     @classmethod
     def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
@@ -646,7 +674,7 @@ class ModelConfig:
         cls = "Transformers"
         # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal
         cls += "MultiModal" if self.hf_config != self.hf_text_config else ""
-        cls += "MoE" if self.get_num_experts() > 1 else ""
+        cls += "MoE" if self.is_moe else ""
         # Check if the architecture we're wrapping has defaults
         runner = None
         task = None
@@ -679,7 +707,7 @@ class ModelConfig:
 
     @property
     def architectures(self) -> list[str]:
-        return getattr(self.hf_config, "architectures", [])
+        return self.model_arch_config.architectures
 
     @property
     def architecture(self) -> str:
@@ -695,6 +723,10 @@ class ModelConfig:
             tokenizer: Tokenizer name or path
         """
 
+        # Skip if model_weights is already set (model already pulled)
+        if self.model_weights:
+            return
+
         if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
             return
 
@@ -839,50 +871,16 @@ class ModelConfig:
 
         return convert_type
 
-    def _parse_quant_hf_config(self, hf_config: PretrainedConfig):
-        quant_cfg = getattr(hf_config, "quantization_config", None)
-        if quant_cfg is None:
-            # compressed-tensors uses a "compression_config" key
-            quant_cfg = getattr(hf_config, "compression_config", None)
-
-        else:
-            # Set quant_method for ModelOpt models.
-            producer_name = quant_cfg.get("producer", {}).get("name")
-            if producer_name == "modelopt":
-                quant_algo = quant_cfg.get("quantization", {}).get("quant_algo")
-                if quant_algo == "FP8":
-                    quant_cfg["quant_method"] = "modelopt"
-                elif quant_algo == "NVFP4":
-                    quant_cfg["quant_method"] = "modelopt_fp4"
-                elif quant_algo is not None:
-                    raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}")
-
-        return quant_cfg
-
     def _verify_quantization(self) -> None:
         supported_quantization = me_quant.QUANTIZATION_METHODS
         if self.quantization is not None:
             self.quantization = cast(me_quant.QuantizationMethods, self.quantization)
 
         # Parse quantization method from the HF model config, if available.
-        quant_cfg = self._parse_quant_hf_config(self.hf_config)
-        if quant_cfg is None and (
-            text_config := getattr(self.hf_config, "text_config", None)
-        ):
-            # Check the text config as well for multi-modal models.
-            quant_cfg = self._parse_quant_hf_config(text_config)
+        quant_cfg = self.model_arch_config.quantization_config
 
         if quant_cfg is not None:
-            # Use the community standard 'quant_method'
-            quant_method = quant_cfg.get("quant_method", "").lower()
-
-            # Normalize library names
-            quant_method = quant_method.replace(
-                "compressed_tensors", "compressed-tensors"
-            )
-
-            quant_cfg["quant_method"] = quant_method
-
+            quant_method = quant_cfg["quant_method"]
             # Quantization methods which are overrides (i.e. they have a
             # `override_quantization_method` method) must be checked in order
             # of preference (this is particularly important for GPTQ).
@@ -900,7 +898,6 @@ class ModelConfig:
                 # Ensure heavy backends are probed last to avoid unnecessary
                 # imports during override detection (e.g., MXFP4 imports Triton)
                 "mxfp4",
-                "cpu_gptq",
                 "cpu_awq",
                 "slimquant_w4a8_marlin",
                 "slimquant_compressed_tensors_marlin",
@@ -959,6 +956,21 @@ class ModelConfig:
 
             current_platform.verify_quantization(self.quantization)
 
+        if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
+            if self.allow_deprecated_quantization:
+                logger.warning(
+                    "The quantization method %s is deprecated "
+                    "and will be removed in future versions of vLLM.",
+                    self.quantization,
+                )
+            else:
+                raise ValueError(
+                    "The quantization method %s is deprecated "
+                    "and will be removed in future versions of vLLM. To bypass, "
+                    "set `--allow-deprecated-quantization`.",
+                    self.quantization,
+                )
+
     def _verify_cuda_graph(self) -> None:
         # CUDAGraph capture not supported for encoder-decoder models on ROCm
         unsupported_rocm = self.is_encoder_decoder
@@ -966,7 +978,7 @@ class ModelConfig:
             logger.warning(
                 "CUDA graph is not supported for %s on ROCm yet, fallback "
                 "to eager mode.",
-                self.hf_config.model_type,
+                self.model_arch_config.model_type,
             )
             self.enforce_eager = True
 
@@ -977,11 +989,9 @@ class ModelConfig:
         # TODO Remove this when bitsandbytes supports.
         """
         is_bitsandbytes = self.quantization == "bitsandbytes"
-        has_quantization_config = (
-            getattr(self.hf_config, "quantization_config", None) is not None
-        )
+        has_quantization_config = self.model_arch_config.quantization_config is not None
         is_8bit = (
-            self.hf_config.quantization_config.get("load_in_8bit", False)
+            self.model_arch_config.quantization_config.get("load_in_8bit", False)
             if has_quantization_config
             else False
         )
@@ -1001,13 +1011,29 @@ class ModelConfig:
             self.enforce_eager = True
 
     def _verify_with_expert_parallelism(self) -> None:
-        num_experts = self.get_num_experts()
-        if num_experts < 1:
+        if not self.is_moe:
             raise ValueError(
                 "Number of experts in the model must be greater than 0 "
                 "when expert parallelism is enabled."
             )
 
+    def _try_verify_and_update_model_config(self):
+        # Avoid running try_verify_and_update_config multiple times
+        if getattr(self, "config_updated", False):
+            return
+
+        architecture = self.architecture
+        if architecture is None:
+            return
+
+        from vllm.model_executor.models.config import (
+            MODELS_CONFIG_MAP,
+        )
+
+        cls = MODELS_CONFIG_MAP.get(architecture, None)
+        if cls is not None:
+            cls.verify_and_update_model_config(self)
+
     def verify_dual_chunk_attention_config(
         self,
         load_config: LoadConfig,
@@ -1035,9 +1061,7 @@ class ModelConfig:
         self,
         parallel_config: ParallelConfig,
     ) -> None:
-        total_num_attention_heads = getattr(
-            self.hf_text_config, "num_attention_heads", 0
-        )
+        total_num_attention_heads = self.model_arch_config.total_num_attention_heads
         tensor_parallel_size = parallel_config.tensor_parallel_size
         if total_num_attention_heads % tensor_parallel_size != 0:
             raise ValueError(
@@ -1088,160 +1112,41 @@ class ModelConfig:
         return getattr(self.hf_text_config, "sliding_window", None)
 
     def get_vocab_size(self) -> int:
-        return getattr(self.hf_text_config, "vocab_size", 0)
+        return self.model_arch_config.vocab_size
 
     def get_hidden_size(self) -> int:
-        return getattr(self.hf_text_config, "hidden_size", 0)
+        return self.model_arch_config.hidden_size
 
     def get_inputs_embeds_size(self) -> int:
         # The size of inputs_embeds is usually identical to the size
         # of the hidden states, however there are exceptions, such as
         # embedding models like CLIP and SigLIP
-        for target_attr in ("projection_dim", "projection_size"):
-            if hasattr(self.hf_text_config, target_attr):
-                return getattr(self.hf_text_config, target_attr)
-
-        return self.get_hidden_size()
+        names = ("projection_dim", "projection_size")
+        return getattr_iter(
+            self.hf_text_config, names, default_factory=self.get_hidden_size
+        )
 
     @property
     def is_deepseek_mla(self) -> bool:
-        if not hasattr(self.hf_text_config, "model_type"):
-            return False
-        elif self.hf_text_config.model_type in (
-            "deepseek_v2",
-            "deepseek_v3",
-            "deepseek_v32",
-            "deepseek_mtp",
-            "kimi_k2",
-            "kimi_linear",
-            "longcat_flash",
-            "pangu_ultra_moe",
-            "pangu_ultra_moe_mtp",
-        ):
-            return self.hf_text_config.kv_lora_rank is not None
-        elif self.hf_text_config.model_type == "eagle":
-            # if the model is an EAGLE module, check for the
-            # underlying architecture
-            return (
-                self.hf_text_config.model.model_type
-                in ("deepseek_v2", "deepseek_v3", "deepseek_v32")
-                and self.hf_text_config.kv_lora_rank is not None
-            )
-        return False
+        return self.model_arch_config.is_deepseek_mla
 
     @cached_property
     def is_mm_prefix_lm(self) -> bool:
         """Whether to use bidirectional attention for mm positions."""
         MM_PREFIX_LM_MODELS = (
             "gemma3",
-            # TODO(Isotr0py): Disable paligemma for now before
-            # we supports soft cap attention for FlexAttention
-            # "paligemma",
+            "paligemma",
         )
         if not hasattr(self.hf_config, "model_type"):
             return False
         return self.hf_config.model_type in MM_PREFIX_LM_MODELS
 
     def get_head_size(self) -> int:
-        # TODO remove hard code
-        if self.is_deepseek_mla:
-            qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0)
-            if self.use_mla:
-                return self.hf_text_config.kv_lora_rank + qk_rope_head_dim
-            else:
-                qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0)
-                if qk_rope_head_dim and qk_nope_head_dim:
-                    return qk_rope_head_dim + qk_nope_head_dim
-
-        if hasattr(self.hf_text_config, "model_type") and (
-            self.hf_text_config.model_type == "zamba2"
-        ):
-            return self.hf_text_config.attention_head_dim
-
-        if self.is_attention_free:
-            return 0
-
-        # NOTE: Some configs may set head_dim=None in the config
-        if getattr(self.hf_text_config, "head_dim", None) is not None:
-            return self.hf_text_config.head_dim
-
-        # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head`
-        if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None:
-            return self.hf_text_config.hidden_size_per_head
-
-        # FIXME(woosuk): This may not be true for all models.
-        return (
-            self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads
-        )
+        return self.model_arch_config.head_size
 
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
-        # For GPTBigCode & Falcon:
-        # NOTE: for falcon, when new_decoder_architecture is True, the
-        # multi_query flag is ignored and we use n_head_kv for the number of
-        # KV heads.
-        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
-        new_decoder_arch_falcon = (
-            self.hf_config.model_type in falcon_model_types
-            and getattr(self.hf_config, "new_decoder_architecture", False)
-        )
-        if not new_decoder_arch_falcon and getattr(
-            self.hf_text_config, "multi_query", False
-        ):
-            # Multi-query attention, only one KV head.
-            # Currently, tensor parallelism is not supported in this case.
-            return 1
-
-        # For DBRX and MPT
-        if self.hf_config.model_type == "mpt":
-            if "kv_n_heads" in self.hf_config.attn_config:
-                return self.hf_config.attn_config["kv_n_heads"]
-            return self.hf_config.num_attention_heads
-        if self.hf_config.model_type == "dbrx":
-            return getattr(
-                self.hf_config.attn_config,
-                "kv_n_heads",
-                self.hf_config.num_attention_heads,
-            )
-
-        if self.hf_config.model_type == "nemotron-nas":
-            for block in self.hf_config.block_configs:
-                if not block.attention.no_op:
-                    return (
-                        self.hf_config.num_attention_heads
-                        // block.attention.n_heads_in_group
-                    )
-
-            raise RuntimeError(
-                "Could not determine the number of key-value attention heads "
-                "from model configuration. "
-                f"Model: {self.model}, Architecture: {self.architectures}. "
-                "This usually indicates an unsupported model architecture or "
-                "missing configuration. "
-                "Please check if your model is supported at: "
-                "https://docs.vllm.ai/en/latest/models/supported_models.html"
-            )
-
-        if self.is_attention_free:
-            return 0
-
-        attributes = [
-            # For Falcon:
-            "n_head_kv",
-            "num_kv_heads",
-            # For LLaMA-2:
-            "num_key_value_heads",
-            # For ChatGLM:
-            "multi_query_group_num",
-        ]
-        for attr in attributes:
-            num_kv_heads = getattr(self.hf_text_config, attr, None)
-            if num_kv_heads is not None:
-                return num_kv_heads
-
-        # For non-grouped-query attention models, the number of KV heads is
-        # equal to the number of attention heads.
-        return self.hf_text_config.num_attention_heads
+        return self.model_arch_config.total_num_kv_heads
 
     def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
         """Returns the number of KV heads per GPU."""
@@ -1257,46 +1162,14 @@ class ModelConfig:
         return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size)
 
     def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int:
-        num_heads = getattr(self.hf_text_config, "num_attention_heads", 0)
+        num_heads = self.model_arch_config.total_num_attention_heads
         return num_heads // parallel_config.tensor_parallel_size
 
     def get_num_experts(self) -> int:
-        """Returns the number of experts in the model."""
-        num_expert_names = [
-            "num_experts",  # Jamba
-            "moe_num_experts",  # Dbrx
-            "n_routed_experts",  # DeepSeek
-            "num_local_experts",  # Mixtral
-        ]
-        num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0)
-        if isinstance(num_experts, list):
-            # Ernie VL's remote code uses list[int]...
-            # The values are always the same so we just take the first one.
-            return num_experts[0]
-        # Coerce to 0 if explicitly set to None
-        return num_experts or 0
+        return self.model_arch_config.num_experts
 
     def get_total_num_hidden_layers(self) -> int:
-        if (
-            self.hf_text_config.model_type == "deepseek_mtp"
-            or self.hf_config.model_type == "mimo_mtp"
-            or self.hf_config.model_type == "glm4_moe_mtp"
-            or self.hf_config.model_type == "ernie_mtp"
-            or self.hf_config.model_type == "qwen3_next_mtp"
-            or self.hf_config.model_type == "pangu_ultra_moe_mtp"
-        ):
-            total_num_hidden_layers = getattr(
-                self.hf_text_config, "num_nextn_predict_layers", 0
-            )
-        elif self.hf_config.model_type == "longcat_flash_mtp":
-            total_num_hidden_layers = getattr(
-                self.hf_text_config, "num_nextn_predict_layers", 1
-            )
-        else:
-            total_num_hidden_layers = getattr(
-                self.hf_text_config, "num_hidden_layers", 0
-            )
-        return total_num_hidden_layers
+        return self.model_arch_config.total_num_hidden_layers
 
     def get_layers_start_end_indices(
         self, parallel_config: ParallelConfig
@@ -1347,9 +1220,7 @@ class ModelConfig:
                 self.hf_text_config, "layers_block_type", None
             )
             if layers_block_type_value is not None:
-                if hasattr(self.hf_text_config, "model_type") and (
-                    self.hf_text_config.model_type == "zamba2"
-                ):
+                if self.model_arch_config.text_model_type == "zamba2":
                     if attn_block_type:
                         return sum(
                             t == "hybrid" for t in layers_block_type_value[start:end]
@@ -1545,6 +1416,10 @@ class ModelConfig:
     def is_multimodal_raw_input_only_model(self) -> bool:
         return self._model_info.supports_multimodal_raw_input_only
 
+    @property
+    def requires_raw_input_tokens(self) -> bool:
+        return self._model_info.requires_raw_input_tokens
+
     @property
     def is_cross_encoder(self) -> bool:
         return (
@@ -1561,14 +1436,14 @@ class ModelConfig:
 
     @property
     def is_hybrid(self) -> bool:
+        if not self._model_info.is_hybrid:
+            return False
         # Handle granite-4.0-micro case which uses hybrid config but does not
         # actually contain any non-attention layers.
         layer_types = getattr(self.hf_config, "layer_types", None)
-        if layer_types is not None and all(
+        return layer_types is None or not all(
             layer == "attention" for layer in layer_types
-        ):
-            return False
-        return self._model_info.is_hybrid
+        )
 
     @property
     def has_noops(self) -> bool:
@@ -1597,10 +1472,18 @@ class ModelConfig:
         return getattr(self.hf_config, "matryoshka_dimensions", None)
 
     @property
-    def use_pad_token(self) -> bool:
-        # cross_encoder models defaults to using pad_token.
-        # `llm as reranker` models defaults to not using pad_token.
-        return getattr(self.hf_config, "use_pad_token", True)
+    def use_sep_token(self) -> bool:
+        # cross_encoder models defaults to using separating token.
+        # `llm as reranker` defaults to not using separating token.
+
+        use_pad_token = getattr(self.hf_config, "use_pad_token", None)
+        if use_pad_token is not None:
+            logger.warning_once(
+                "use_pad_token has been deprecated; please use use_sep_token instead."
+            )
+            return use_pad_token
+
+        return getattr(self.hf_config, "use_sep_token", True)
 
     @property
     def head_dtype(self) -> torch.dtype:
@@ -1620,7 +1503,7 @@ class ModelConfig:
 
         if self.runner_type != "pooling" and head_dtype != self.dtype:
             logger.warning_once(
-                "`head_dtype` currently only supports pooling models."
+                "`head_dtype` currently only supports pooling models, "
                 "fallback to model dtype [%s].",
                 self.dtype,
             )
@@ -1660,6 +1543,7 @@ class ModelConfig:
             )
         max_model_len = _get_and_verify_max_len(
             hf_config=self.hf_text_config,
+            model_arch_config=self.model_arch_config,
             tokenizer_config=tokenizer_config,
             max_model_len=max_model_len,
             disable_sliding_window=self.disable_sliding_window,
@@ -1673,8 +1557,8 @@ class ModelConfig:
     @property
     def attn_type(self) -> AttnTypeStr:
         if self.pooler_config is not None:
-            pooling_type = self._model_info.default_pooling_type.lower()
-            if pooling_type == "cls":
+            seq_pooling_type = self._model_info.default_seq_pooling_type
+            if seq_pooling_type == "CLS":
                 return "encoder_only"
             else:
                 is_causal = getattr(self.hf_config, "is_causal", True)
@@ -1691,99 +1575,112 @@ class ModelConfig:
     @property
     def is_chunked_prefill_supported(self) -> bool:
         attn_type = self.attn_type
-        if self.pooler_config is not None:
+
+        if pooler_config := self.pooler_config:
             # for pooling models
             if attn_type == "encoder_only":
                 logger.debug(
-                    "Pooling models with bidirectional attn does not support "
-                    "chunked prefill."
+                    "Pooling models with bidirectional attn "
+                    "do not support chunked prefill."
                 )
                 return False
-            elif attn_type == "decoder":
-                pooling_type = self.pooler_config.pooling_type.lower()
-                if pooling_type in ["mean", "step", "cls"]:
+
+            if attn_type == "decoder":
+                if (
+                    pooler_config.seq_pooling_type in ("MEAN", "CLS")
+                    or pooler_config.tok_pooling_type == "STEP"
+                ):
                     logger.debug(
-                        "Pooling models with %s pooling does not "
-                        "support chunked prefill.",
-                        pooling_type,
+                        "Pooling models with causal attn and %s/%s pooling "
+                        "do not support chunked prefill.",
+                        pooler_config.seq_pooling_type,
+                        pooler_config.tok_pooling_type,
                     )
                     return False
-                elif pooling_type in ["all", "last"]:
+                else:
                     logger.debug(
-                        "Pooling models with causal attn and %s pooling support "
-                        "chunked prefill.",
-                        pooling_type,
+                        "Pooling models with causal attn and %s/%s pooling "
+                        "support chunked prefill.",
+                        pooler_config.seq_pooling_type,
+                        pooler_config.tok_pooling_type,
                     )
                     return True
-                else:
-                    raise ValueError(f"{pooling_type=} not supported.")
+
             # vllm currently does not have pooling models using hybrid,
             # attention_free or encoder_decoder attn types.
             return attn_type != "encoder_decoder"
         else:
+            # for generative models
             if attn_type == "encoder_decoder":
-                logger.debug("Encoder decoder models does not support chunked prefill.")
+                logger.debug("Encoder decoder models do not support chunked prefill.")
                 return False
+
             logger.debug("Generative models support chunked prefill.")
             return True
 
     @property
     def is_prefix_caching_supported(self) -> bool:
         attn_type = self.attn_type
-        if self.pooler_config is not None:
+
+        if pooler_config := self.pooler_config:
             # for pooling models
             if attn_type == "encoder_only":
                 logger.debug(
-                    "Pooling models with bidirectional attn does not "
-                    "support prefix caching."
+                    "Pooling models with bidirectional attn "
+                    "do not support prefix caching."
                 )
                 return False
-            elif attn_type == "decoder":
-                pooling_type = self.pooler_config.pooling_type.lower()
-                if pooling_type in ["mean", "step", "cls"]:
+
+            if attn_type == "decoder":
+                if (
+                    pooler_config.seq_pooling_type in ("MEAN", "CLS")
+                    or pooler_config.tok_pooling_type == "STEP"
+                ):
                     logger.debug(
-                        "Pooling models with %s pooling does not "
-                        "support prefix caching.",
-                        pooling_type,
+                        "Pooling models with causal attn and %s/%s pooling "
+                        "do not support prefix caching.",
+                        pooler_config.seq_pooling_type,
+                        pooler_config.tok_pooling_type,
                     )
                     return False
-                elif pooling_type in ["all", "last"]:
+                else:
                     logger.debug(
-                        "Pooling models with causal attn and %s pooling support "
-                        "prefix caching.",
-                        pooling_type,
+                        "Pooling models with causal attn and %s/%s pooling "
+                        "support prefix caching.",
+                        pooler_config.seq_pooling_type,
+                        pooler_config.tok_pooling_type,
                     )
                     return True
-                else:
-                    raise ValueError(f"{pooling_type=} not supported.")
+
             # vllm currently does not have pooling models using hybrid,
             # attention_free or encoder_decoder attn types.
             return False
         else:
+            # for generative models
             if attn_type == "hybrid":
                 logger.debug(
-                    "Hybrid models does not support prefix caching since the feature "
+                    "Hybrid models do not support prefix caching since the feature "
                     "is still experimental."
                 )
                 return False
             elif attn_type == "attention_free":
                 logger.debug(
-                    "Attention free models does not support prefix caching since the "
+                    "Attention free models do not support prefix caching since the "
                     "feature is still experimental."
                 )
                 return False
             elif attn_type == "encoder_decoder":
-                logger.debug("Encoder decoder models does not support prefix caching.")
+                logger.debug("Encoder decoder models do not support prefix caching.")
                 return False
             else:  # attn_type == "decoder"
                 logger.debug("Generative models support prefix caching.")
                 return True
 
-    def is_model_moe(
-        self,
-    ) -> bool:
-        return self.get_num_experts() > 1
+    @property
+    def is_moe(self) -> bool:
+        return self.get_num_experts() > 0
 
+    @property
     def is_quantized(self) -> bool:
         return getattr(self.hf_config, "quantization_config", None) is not None
 
@@ -1858,6 +1755,11 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
     "bfloat16": torch.bfloat16,
 }
 
+
+def str_dtype_to_torch_dtype(type: str):
+    return _STR_DTYPE_TO_TORCH_DTYPE.get(type)
+
+
 # model_type -> reason
 _FLOAT16_NOT_SUPPORTED_MODELS = {
     "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",
@@ -1885,46 +1787,6 @@ def _check_valid_dtype(model_type: str, dtype: torch.dtype):
     return True
 
 
-def _find_dtype(
-    model_id: str,
-    config: PretrainedConfig,
-    *,
-    revision: str | None,
-):
-    # NOTE: getattr(config, "dtype", torch.float32) is not correct
-    # because config.dtype can be None.
-    config_dtype = getattr(config, "dtype", None)
-
-    # Fallbacks for multi-modal models if the root config
-    # does not define dtype
-    if config_dtype is None:
-        config_dtype = getattr(config.get_text_config(), "dtype", None)
-    if config_dtype is None and hasattr(config, "vision_config"):
-        config_dtype = getattr(config.vision_config, "dtype", None)
-    if config_dtype is None and hasattr(config, "encoder_config"):
-        config_dtype = getattr(config.encoder_config, "dtype", None)
-
-    # Try to read the dtype of the weights if they are in safetensors format
-    if config_dtype is None:
-        repo_mt = try_get_safetensors_metadata(model_id, revision=revision)
-
-        if repo_mt and (files_mt := repo_mt.files_metadata):
-            param_dtypes: set[torch.dtype] = {
-                _SAFETENSORS_TO_TORCH_DTYPE[dtype_str]
-                for file_mt in files_mt.values()
-                for dtype_str in file_mt.parameter_count
-                if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE
-            }
-
-            if param_dtypes:
-                return common_broadcastable_dtype(param_dtypes)
-
-    if config_dtype is None:
-        config_dtype = torch.float32
-
-    return config_dtype
-
-
 def _resolve_auto_dtype(
     model_type: str,
     config_dtype: torch.dtype,
@@ -1979,7 +1841,9 @@ def _get_and_verify_dtype(
     is_pooling_model: bool,
     revision: str | None = None,
 ) -> torch.dtype:
-    config_dtype = _find_dtype(model_id, config, revision=revision)
+    config_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
+        config, model_id, revision=revision
+    )
     model_type = config.model_type
 
     if isinstance(dtype, str):
@@ -2042,6 +1906,7 @@ def _get_head_dtype(
 
 def _get_and_verify_max_len(
     hf_config: PretrainedConfig,
+    model_arch_config: ModelArchitectureConfig,
     tokenizer_config: dict | None,
     max_model_len: int | None,
     disable_sliding_window: bool,
@@ -2050,36 +1915,9 @@ def _get_and_verify_max_len(
     encoder_config: Any | None = None,
 ) -> int:
     """Get and verify the model's maximum length."""
-    derived_max_model_len = float("inf")
-    possible_keys = [
-        # OPT
-        "max_position_embeddings",
-        # GPT-2
-        "n_positions",
-        # MPT
-        "max_seq_len",
-        # ChatGLM2
-        "seq_length",
-        # Command-R
-        "model_max_length",
-        # Whisper
-        "max_target_positions",
-        # Others
-        "max_sequence_length",
-        "max_seq_length",
-        "seq_len",
-    ]
-    # Choose the smallest "max_length" from the possible keys
-    max_len_key = None
-    for key in possible_keys:
-        max_len = getattr(hf_config, key, None)
-        if max_len is not None:
-            max_len_key = key if max_len < derived_max_model_len else max_len_key
-            derived_max_model_len = min(derived_max_model_len, max_len)
-    # For Command-R / Cohere, Cohere2 / Aya Vision models
-    if tmp_max_len := getattr(hf_config, "model_max_length", None):
-        max_len_key = "model_max_length"
-        derived_max_model_len = tmp_max_len
+    (derived_max_model_len, max_len_key) = (
+        model_arch_config.derived_max_model_len_and_key
+    )
 
     # If sliding window is manually disabled, max_length should be less
     # than the sliding window length in the model config.
@@ -2112,10 +1950,9 @@ def _get_and_verify_max_len(
 
         default_max_len = 2048
         logger.warning(
-            "The model's config.json does not contain any of the following "
-            "keys to determine the original maximum length of the model: "
-            "%s. Assuming the model's maximum length is %d.",
-            possible_keys,
+            "The model's config.json does not contain any of the keys "
+            "to determine the original maximum length of the model. "
+            "Assuming the model's maximum length is %d.",
             default_max_len,
         )
         derived_max_model_len = default_max_len
@@ -2123,9 +1960,7 @@ def _get_and_verify_max_len(
     # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict].
     # To simplify the verification, we convert it to dict[str, TypedDict].
     rope_parameters = getattr(hf_config, "rope_parameters", None)
-    if rope_parameters and not set(rope_parameters.keys()).issubset(
-        ALLOWED_LAYER_TYPES
-    ):
+    if rope_parameters and not is_rope_parameters_nested(rope_parameters):
         rope_parameters = {"": rope_parameters}
 
     # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE
@@ -2150,9 +1985,10 @@ def _get_and_verify_max_len(
     if encoder_config and "max_seq_length" in encoder_config:
         derived_max_model_len = encoder_config["max_seq_length"]
 
-    # If the user didn't specify `max_model_len`, then use that derived from
-    # the model config as a default value.
-    if max_model_len is None:
+    # If the user didn't specify `max_model_len` or specified -1 (auto-fit),
+    # then use that derived from the model config as a default value.
+    # When -1 is specified, the engine will later auto-fit to available memory.
+    if max_model_len is None or max_model_len == -1:
         # For LongRoPE, default to original_max_position_embeddings to avoid
         # performance degradation for shorter sequences
         if rope_parameters is not None and any(
diff --git a/vllm/config/model_arch.py b/vllm/config/model_arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..d55e2a3399b39a637157112bcc4ba31e46d56441
--- /dev/null
+++ b/vllm/config/model_arch.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class ModelArchitectureConfig:
+    """
+    Configuration for model architecture that required by vLLM runtime
+    """
+
+    architectures: list[str] | None
+    """List of model architecture class names (e.g., ['LlamaForCausalLM']).
+       It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""
+
+    model_type: str
+    """Model type identifier (e.g., 'llama', 'gpt_oss')."""
+
+    text_model_type: str | None
+    """Text model type identifier (e.g., 'llama4_text')."""
+
+    hidden_size: int
+    """Hidden size of the model."""
+
+    total_num_hidden_layers: int
+    """Number of hidden layers in the model."""
+
+    total_num_attention_heads: int
+    """Number of attention heads in the model."""
+
+    head_size: int
+    """Head dimension of the model."""
+
+    vocab_size: int
+    """Vocabulary size of the model."""
+
+    total_num_kv_heads: int
+    """Number of key value heads in the model."""
+
+    num_experts: int
+    """Number of experts in the model."""
+
+    quantization_config: dict[str, Any] | None
+    """Quantization configuration dictionary containing quantization parameters."""
+
+    is_deepseek_mla: bool
+    """Whether the model is a DeepSeek MLA model."""
+
+    derived_max_model_len_and_key: tuple[float, str | None]
+    """Derived maximum model length and key from the hf config."""
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index 8a2936de96d6fa24db653350d8e692d0abb26195..ecb346af8f3c99e673a0ca5c480e5f6322e38960 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -7,9 +7,9 @@ from typing import Any, Literal, TypeAlias
 from pydantic import ConfigDict, Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.utils import config
 from vllm.utils.hashing import safe_hash
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 @dataclass
@@ -124,7 +124,7 @@ class MultiModalConfig:
     mm_encoder_attn_backend: AttentionBackendEnum | None = None
     """Optional override for the multi-modal encoder attention backend when
     using vision transformers. Accepts any value from
-    `vllm.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
+    `vllm.v1.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
     interleave_mm_strings: bool = False
     """Enable fully interleaved support for multimodal prompts, while using
     --chat-template-content-format=string."""
diff --git a/vllm/config/observability.py b/vllm/config/observability.py
index e40bf18a00ce22cc3cdc66dae26a62445c9723d9..dae7032bc8c25f946f937345d0c2a60a5e81b26f 100644
--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@@ -64,6 +64,23 @@ class ObservabilityConfig:
     module in the model and attach informations such as input/output shapes to
     nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
 
+    enable_mfu_metrics: bool = False
+    """Enable Model FLOPs Utilization (MFU) metrics."""
+
+    enable_mm_processor_stats: bool = False
+    """Enable collection of timing statistics for multimodal processor operations.
+    This is for internal use only (e.g., benchmarks) and is not exposed as a CLI
+    argument."""
+
+    enable_mfu_metrics: bool = False
+    """Enable Model FLOPs Utilization (MFU) metrics."""
+
+    enable_logging_iteration_details: bool = False
+    """Enable detailed logging of iteration details.
+    If set, vllm EngineCore will log iteration details
+    This includes number of context/generation requests and tokens
+    and the elapsed cpu time for the iteration."""
+
     @cached_property
     def collect_model_forward_time(self) -> bool:
         """Whether to collect model forward time for the request."""
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index ca717f6e3e3e84145e09e551defcf9d5ac500ca3..706f4cc42a39e2d5de3c025f1b0a85e4695873d5 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -2,10 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
+from collections.abc import Callable
 from typing import TYPE_CHECKING, Any, Literal
 
 import torch
-from pydantic import Field, model_validator
+from pydantic import Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
 from torch.distributed import ProcessGroup, ReduceOp
 from typing_extensions import Self
@@ -36,6 +37,14 @@ ExpertPlacementStrategy = Literal["linear", "round_robin"]
 DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
 DataParallelBackend = Literal["ray", "mp"]
 EPLBPolicyOption = Literal["default"]
+All2AllBackend = Literal[
+    "naive",
+    "pplx",
+    "deepep_high_throughput",
+    "deepep_low_latency",
+    "allgather_reducescatter",
+    "flashinfer_all2allv",
+]
 
 
 @config
@@ -61,6 +70,10 @@ class EPLBConfig:
     Log the balancedness each step of expert parallelism.
     This is turned off by default since it will cause communication overhead.
     """
+    log_balancedness_interval: int = 1
+    """
+    Interval for logging the balancedness.
+    """
     use_async: bool = False
     """
     Whether to use non-blocking EPLB.
@@ -69,6 +82,14 @@ class EPLBConfig:
     policy: EPLBPolicyOption = "default"
     """The policy type for expert parallel load balancing (EPLB)."""
 
+    @model_validator(mode="after")
+    def _validate_eplb_config(self) -> Self:
+        if self.use_async and self.policy != "default":
+            raise ValueError("Async EPLB is only supported with the default policy.")
+        if self.log_balancedness and self.log_balancedness_interval <= 0:
+            raise ValueError("log_balancedness_interval must be greater than 0.")
+        return self
+
 
 @config
 @dataclass
@@ -111,6 +132,8 @@ class ParallelConfig:
     between local data parallel ranks, but an external LB balances
     between vLLM nodes/replicas. Set explicitly in conjunction with
     --data-parallel-start-rank."""
+    is_moe_model: bool | None = None
+    """Whether the deployed model is MoE (if known)."""
     enable_expert_parallel: bool = False
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
     enable_eplb: bool = False
@@ -126,24 +149,14 @@ class ParallelConfig:
       with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
       will have experts [1, 3]. This strategy can help improve load balancing
       for grouped expert models with no redundant experts."""
-    all2all_backend: (
-        Literal[
-            "naive",
-            "pplx",
-            "deepep_high_throughput",
-            "deepep_low_latency",
-            "allgather_reducescatter",
-            "flashinfer_all2allv",
-        ]
-        | None
-    ) = None
-    """All2All backend for MoE expert parallel communication. If not set, uses
-    the value from VLLM_ALL2ALL_BACKEND environment variable. Available options:
-    - "naive": Naive all2all implementation using broadcasts
-    - "allgather_reducescatter": All2all based on allgather and reducescatter
-    - "pplx": Use pplx kernels
-    - "deepep_high_throughput": Use deepep high-throughput kernels
-    - "deepep_low_latency": Use deepep low-latency kernels
+    all2all_backend: All2AllBackend = "allgather_reducescatter"
+    """All2All backend for MoE expert parallel communication. Available options:
+
+    - "naive": Naive all2all implementation using broadcasts\n
+    - "allgather_reducescatter": All2all based on allgather and reducescatter\n
+    - "pplx": Use pplx kernels\n
+    - "deepep_high_throughput": Use deepep high-throughput kernels\n
+    - "deepep_low_latency": Use deepep low-latency kernels\n
     - "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""
 
     max_parallel_loading_workers: int | None = None
@@ -156,6 +169,8 @@ class ParallelConfig:
 
     enable_dbo: bool = False
     """Enable dual batch overlap for the model executor."""
+    ubatch_size: int = 0
+    """Number of ubatch size."""
 
     dbo_decode_token_threshold: int = 32
     """The threshold for dual batch overlap for batches only containing decodes.
@@ -168,9 +183,12 @@ class ParallelConfig:
     threshold, microbatching will be used. Otherwise, the request will be
     processed in a single batch."""
 
-    disable_nccl_for_dp_synchronization: bool = False
+    disable_nccl_for_dp_synchronization: bool = Field(default=None)
     """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py 
-    to use Gloo instead of NCCL for its all reduce"""
+    to use Gloo instead of NCCL for its all reduce.
+
+    Defaults to True when async scheduling is enabled, False otherwise.
+    """
 
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
@@ -255,6 +273,10 @@ class ParallelConfig:
     Block_size should be divisible by cp_kv_cache_interleave_size.
     """
 
+    data_parallel_index: int = Field(init=False)
+    """Equal to the data parallel rank but not used for torch process groups
+    and not overridden for dense models."""
+
     _api_process_count: int = Field(default=1, gt=0)
     """
     The number of API processes initialized.
@@ -274,6 +296,12 @@ class ParallelConfig:
         should only be set by API server scale-out.
     """
 
+    @field_validator("disable_nccl_for_dp_synchronization", mode="wrap")
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        return None if value is None else handler(value)
+
     @model_validator(mode="after")
     def _validate_parallel_config(self) -> Self:
         if self._api_process_rank >= self._api_process_count:
@@ -325,6 +353,14 @@ class ParallelConfig:
         including data parallelism."""
         return self.world_size * self.data_parallel_size
 
+    @property
+    def use_ubatching(self) -> bool:
+        return self.enable_dbo or self.ubatch_size > 1
+
+    @property
+    def num_ubatches(self) -> int:
+        return 2 if self.enable_dbo else self.ubatch_size
+
     def get_next_dp_init_port(self) -> int:
         """
         We might need to initialize process groups in multiple
@@ -457,6 +493,8 @@ class ParallelConfig:
             # Derived/runtime topology, networking, or launch details
             "data_parallel_rank",
             "data_parallel_rank_local",
+            "data_parallel_size_local",
+            "data_parallel_index",
             "data_parallel_backend",
             "data_parallel_external_lb",
             "data_parallel_hybrid_lb",
@@ -485,20 +523,17 @@ class ParallelConfig:
         from vllm.config.utils import get_hash_factors, hash_factors
 
         factors = get_hash_factors(self, ignored_factors)
-        # Explicitly include backend affecting env factor as before
-        factors["VLLM_ALL2ALL_BACKEND"] = str(envs.VLLM_ALL2ALL_BACKEND)
         return hash_factors(factors)
 
     def __post_init__(self) -> None:
         # Set all2all_backend from env var if not specified, with deprecation warning
-        if self.all2all_backend is None:
+        if envs.is_set("VLLM_ALL2ALL_BACKEND"):
+            logger.warning_once(
+                "VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
+                "will be removed in v0.15.0. Please use the "
+                "--all2all-backend command-line argument instead."
+            )
             self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND
-            if envs.is_set("VLLM_ALL2ALL_BACKEND"):
-                logger.warning_once(
-                    "VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
-                    "will be removed in a future release. Please use the "
-                    "--all2all-backend command-line argument instead."
-                )
 
         # Continue with the rest of the initialization
         self.world_size = (
@@ -540,6 +575,14 @@ class ParallelConfig:
             self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
             self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
 
+            if self.data_parallel_size > 1 and self.is_moe_model is False:
+                raise ValueError(
+                    "Offline data parallel mode is not supported/useful"
+                    " for dense models."
+                )
+
+        self.data_parallel_index = self.data_parallel_rank
+
         if self.distributed_executor_backend == "external_launcher":
             os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
             logger.info("Disabling V1 multiprocessing for external launcher.")
diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py
index 976ae8c063eb7fd645e3b491014fea4bb1243cc0..a3b1f1cbee719bf7a80336880caa0dcaad2aea28 100644
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Literal
+from typing import Any, Literal, get_args
 
 from pydantic.dataclasses import dataclass
 
@@ -11,7 +11,11 @@ from vllm.utils.hashing import safe_hash
 
 logger = init_logger(__name__)
 
-PoolingTypeStr = Literal["LAST", "ALL", "CLS", "STEP", "MEAN"]
+SequencePoolingType = Literal["CLS", "LAST", "MEAN"]
+SEQ_POOLING_TYPES: tuple[SequencePoolingType, ...] = get_args(SequencePoolingType)
+
+TokenPoolingType = Literal["ALL", "STEP"]
+TOK_POOLING_TYPES: tuple[TokenPoolingType, ...] = get_args(TokenPoolingType)
 
 
 @config
@@ -19,16 +23,32 @@ PoolingTypeStr = Literal["LAST", "ALL", "CLS", "STEP", "MEAN"]
 class PoolerConfig:
     """Controls the behavior of output pooling in pooling models."""
 
-    pooling_type: PoolingTypeStr | None = None
+    pooling_type: SequencePoolingType | TokenPoolingType | None = None
     """
-    The pooling method of the pooling model. This should be a key in
-    [`vllm.model_executor.layers.pooler.PoolingType`][].
+    The pooling method used for pooling.
+
+    If set, `seq_pooling_type` or `tok_pooling_type` are automatically populated
+    with this field. Alternatively, users can set `seq_pooling_type` and
+    `tok_pooling_type` explicitly.
+
+    This field is mainly for user convenience. Internal code should always use
+    `seq_pooling_type` or `tok_pooling_type` instead of `pooling_type`.
+    """
+
+    seq_pooling_type: SequencePoolingType | None = None
+    """
+    The pooling method used for sequence pooling.
+    """
+
+    tok_pooling_type: TokenPoolingType | None = None
+    """
+    The pooling method used for tokenwise pooling.
     """
 
     ## for embeddings models
     normalize: bool | None = None
     """
-    Whether to normalize the embeddings outputs. Defaults to True.
+    DEPRECATED: please use `use_activation` instead.
     """
     dimensions: int | None = None
     """
@@ -55,11 +75,11 @@ class PoolerConfig:
     ## for classification models
     softmax: float | None = None
     """
-    softmax will be deprecated, please use use_activation instead.
+    DEPRECATED: please use `use_activation` instead.
     """
     activation: float | None = None
     """
-    activation will be deprecated, please use use_activation instead.
+    DEPRECATED: please use `use_activation` instead.
     """
     use_activation: bool | None = None
     """
@@ -89,6 +109,41 @@ class PoolerConfig:
         # raise deprecated warning for softmax and activation
         self.use_activation = get_use_activation(self)
 
+        if pooling_type := self.pooling_type:
+            if self.seq_pooling_type is not None:
+                raise ValueError(
+                    "Cannot set both `pooling_type` and `seq_pooling_type`"
+                )
+            if self.tok_pooling_type is not None:
+                raise ValueError(
+                    "Cannot set both `pooling_type` and `tok_pooling_type`"
+                )
+
+            if pooling_type in SEQ_POOLING_TYPES:
+                logger.debug(
+                    "Resolved `pooling_type=%r` to `seq_pooling_type=%r`.",
+                    pooling_type,
+                    pooling_type,
+                )
+                self.seq_pooling_type = pooling_type
+            elif pooling_type in TOK_POOLING_TYPES:
+                logger.debug(
+                    "Resolved `pooling_type=%r` to `tok_pooling_type=%r`.",
+                    pooling_type,
+                    pooling_type,
+                )
+                self.tok_pooling_type = pooling_type
+            else:
+                raise NotImplementedError(pooling_type)
+
+    def get_seq_pooling_type(self) -> SequencePoolingType:
+        assert self.seq_pooling_type is not None, "Should be resolved by ModelConfig"
+        return self.seq_pooling_type
+
+    def get_tok_pooling_type(self) -> TokenPoolingType:
+        assert self.tok_pooling_type is not None, "Should be resolved by ModelConfig"
+        return self.tok_pooling_type
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -109,17 +164,24 @@ class PoolerConfig:
 
 
 def get_use_activation(o: object):
-    if softmax := getattr(o, "softmax", None) is not None:
+    if (normalize := getattr(o, "normalize", None)) is not None:
+        logger.warning_once(
+            "`normalize` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
+        )
+        return normalize
+
+    if (softmax := getattr(o, "softmax", None)) is not None:
         logger.warning_once(
-            "softmax will be deprecated and will be removed in v0.15. "
-            "Please use use_activation instead."
+            "`softmax` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
         )
         return softmax
 
-    if activation := getattr(o, "activation", None) is not None:
+    if (activation := getattr(o, "activation", None)) is not None:
         logger.warning_once(
-            "activation will be deprecated and will be removed in v0.15. "
-            "Please use use_activation instead."
+            "`activation` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
         )
         return activation
 
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 8abbe8ba0103e0da9b9f7bfc2089bb875ed8d51f..1bceaa933e66e4a161d9a659423bf3ede0b5dbec 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -130,11 +130,12 @@ class SchedulerConfig:
     and starting configuration.
     """
 
-    async_scheduling: bool = False
-    """If set to True, perform async scheduling. This helps to avoid gaps in
-    GPU utilization, leading to better latency and throughput.
-    Async scheduling is currently not supported with some features such as
-    speculative decoding and pipeline parallelism.
+    async_scheduling: bool = Field(default=None)
+    """If set to False, disable async scheduling. Async scheduling helps to
+    avoid gaps in GPU utilization, leading to better latency and throughput.
+    It is currently not supported with some features such as
+    speculative decoding and pipeline parallelism, and will be automatically
+    disabled in those cases.
     """
 
     stream_interval: int = Field(default=1, ge=1)
@@ -208,9 +209,7 @@ class SchedulerConfig:
     @classmethod
     def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
         """Skip validation if the value is `None` when initialisation is delayed."""
-        if value is None:
-            return value
-        return handler(value)
+        return None if value is None else handler(value)
 
     def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
         if is_encoder_decoder:
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 34f999ce7d5742ea24c7285335e83b0c249c0062..324aedaa7a93310f91e66a48ef57737203919eeb 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -33,6 +33,7 @@ MTPModelTypes = Literal[
     "mimo_mtp",
     "glm4_moe_mtp",
     "ernie_mtp",
+    "exaone_moe_mtp",
     "qwen3_next_mtp",
     "longcat_flash_mtp",
     "mtp",
@@ -199,7 +200,6 @@ class SpeculativeConfig:
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
             hf_config.update(
                 {
-                    "num_hidden_layers": 0,
                     "n_predict": n_predict,
                     "architectures": ["Glm4MoeMTPModel"],
                 }
@@ -220,6 +220,15 @@ class SpeculativeConfig:
             hf_config.update(
                 {"n_predict": n_predict, "architectures": ["Qwen3NextMTP"]}
             )
+
+        if hf_config.model_type == "exaone_moe":
+            hf_config.model_type = "exaone_moe_mtp"
+        if hf_config.model_type == "exaone_moe_mtp":
+            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+            hf_config.update(
+                {"n_predict": n_predict, "architectures": ["ExaoneMoeMTP"]}
+            )
+
         if hf_config.model_type == "longcat_flash":
             hf_config.model_type = "longcat_flash_mtp"
             n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
@@ -401,6 +410,9 @@ class SpeculativeConfig:
                             model_type="eagle",
                         )
                         self.draft_model_config.hf_config = eagle_config
+                        self.draft_model_config.model_arch_config = (
+                            self.draft_model_config.get_model_arch_config()
+                        )
 
                 if self.num_speculative_tokens is not None and hasattr(
                     self.draft_model_config.hf_config, "num_lookahead_tokens"
diff --git a/vllm/config/speech_to_text.py b/vllm/config/speech_to_text.py
index 3eafff1a30609945f76e6122973b82b88eb104ae..fe3532c9742dedfe01737afac42dd9c495ffe33a 100644
--- a/vllm/config/speech_to_text.py
+++ b/vllm/config/speech_to_text.py
@@ -17,10 +17,11 @@ class SpeechToTextConfig:
     16kHz audio input. The input audio will be automatically resampled to this
     rate before processing."""
 
-    max_audio_clip_s: int = 30
+    max_audio_clip_s: int | None = 30
     """Maximum duration in seconds for a single audio clip without chunking.
     Audio longer than this will be split into smaller chunks if
-    `allow_audio_chunking` evaluates to True, otherwise it will be rejected."""
+    `allow_audio_chunking` evaluates to True, otherwise it will be rejected. 
+    `None` means audio duration can be unlimited and won't be chunked."""
 
     overlap_chunk_second: int = 1
     """Overlap duration in seconds between consecutive audio chunks when
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 470296517deb116458349997a4cd780d7025c5e1..614373782d12fc61171e43cf8d17ddb65752f94a 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -9,7 +9,7 @@ import inspect
 import json
 import pathlib
 import textwrap
-from collections.abc import Iterable, Mapping, Sequence, Set
+from collections.abc import Callable, Iterable, Mapping, Sequence, Set
 from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
 from itertools import pairwise
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar
@@ -74,7 +74,11 @@ def get_field(cls: ConfigType, name: str) -> Field:
 
 
 def getattr_iter(
-    object: object, names: Iterable[str], default: Any, warn: bool = False
+    object: object,
+    names: Iterable[str],
+    default: Any | None = None,
+    default_factory: Callable[[], Any] | None = None,
+    warn: bool = False,
 ) -> Any:
     """
     A helper function that retrieves an attribute from an object which may
@@ -96,7 +100,7 @@ def getattr_iter(
                     names[0],
                 )
             return getattr(object, name)
-    return default
+    return default_factory() if default_factory is not None else default
 
 
 def contains_object_print(text: str) -> bool:
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index d056bd7fe6371723eada3148550eb4c77f59f14e..b6c1339ff5ae95bed1f26e01b3e801c1f95d0e07 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -343,6 +343,29 @@ class VllmConfig:
         # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size
         return self.compilation_config.bs_to_padded_graph_size[batch_size]
 
+    @property
+    def needs_dp_coordinator(self) -> bool:
+        """
+        Determine if the DPCoordinator process is needed.
+
+        The DPCoordinator is needed in two cases:
+        1. For MoE models with DP > 1: to handle wave coordination
+           (even in external LB mode, since wave coordination runs in the coordinator)
+        2. For non-MoE models in internal/hybrid LB mode: to collect and publish
+           queue stats for load balancing across DP ranks
+
+        Returns:
+            True if DPCoordinator process is needed, False otherwise.
+        """
+
+        # For non-MoE models, only need coordinator in internal/hybrid LB mode
+        # (for stats collection).
+        return self.parallel_config.data_parallel_size > 1 and (
+            self.model_config is None
+            or self.model_config.is_moe
+            or not self.parallel_config.data_parallel_external_lb
+        )
+
     def enable_trace_function_call_for_thread(self) -> None:
         """
         Set up function tracing for the current thread,
@@ -421,6 +444,7 @@ class VllmConfig:
 
         model_config = copy.deepcopy(self.model_config)
         model_config.hf_config = hf_config
+        model_config.model_arch_config = model_config.get_model_arch_config()
 
         return replace(self, model_config=model_config)
 
@@ -474,17 +498,15 @@ class VllmConfig:
         Right now, this function reads the offloading settings from
         CacheConfig and configures the KVTransferConfig accordingly.
         """
-        if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None:
+        # KV offloading is only activated when kv_offloading_size is set.
+        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
             return
 
+        kv_offloading_backend = self.cache_config.kv_offloading_backend
+
         # If no KVTransferConfig is provided, create a default one.
         if self.kv_transfer_config is None:
             self.kv_transfer_config = KVTransferConfig()
-
-        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
-            raise ValueError(
-                "You must set kv_offloading_size when kv_offloading_backend is set."
-            )
         num_kv_ranks = (
             self.parallel_config.tensor_parallel_size
             * self.parallel_config.pipeline_parallel_size
@@ -492,12 +514,8 @@ class VllmConfig:
 
         if kv_offloading_backend == "native":
             self.kv_transfer_config.kv_connector = "OffloadingConnector"
-            kv_bytes_per_rank = kv_offloading_size * (1 << 30) / num_kv_ranks
-
-            # NOTE(ApostaC): the actual calculation for num_cpu_blocks should be
-            # done after the model's KV cache is initialized
             self.kv_transfer_config.kv_connector_extra_config.update(
-                {"kv_bytes_per_rank": kv_bytes_per_rank, "num_cpu_blocks": 0}
+                {"cpu_bytes_to_use": kv_offloading_size * (1 << 30)}
             )
         elif kv_offloading_backend == "lmcache":
             self.kv_transfer_config.kv_connector = "LMCacheConnectorV1"
@@ -522,6 +540,8 @@ class VllmConfig:
             self.model_config.verify_with_parallel_config(self.parallel_config)
             self.model_config.verify_dual_chunk_attention_config(self.load_config)
 
+            self.parallel_config.is_moe_model = self.model_config.is_moe
+
         self.cache_config.verify_with_parallel_config(self.parallel_config)
 
         if self.lora_config is not None:
@@ -552,15 +572,12 @@ class VllmConfig:
                 if self.speculative_config.method not in get_args(EagleModelTypes):
                     raise ValueError(
                         "Currently, async scheduling is only supported "
-                        "with EAGLE/MTP kind of speculative decoding"
+                        "with EAGLE/MTP kind of speculative decoding."
                     )
                 if self.speculative_config.disable_padded_drafter_batch:
                     raise ValueError(
-                        "async scheduling for EAGLE/MTP kind of speculative "
-                        "decoding is enabled, but disable_padded_drafter_batch=True "
-                        "disable_padded_drafter_batch=True is not supported for "
-                        "this situation now. please set "
-                        "disable_padded_drafter_batch=Fasle"
+                        "Async scheduling is not compatible with "
+                        "disable_padded_drafter_batch=True."
                     )
             if not executor_supports_async_sched:
                 raise ValueError(
@@ -570,35 +587,61 @@ class VllmConfig:
                 )
         elif self.scheduler_config.async_scheduling is None:
             # Enable async scheduling unless there is an incompatible option.
-            # NOTE: we won't reach here until async scheduling is enabled by default.
-            if (
-                self.parallel_config.pipeline_parallel_size > 1
-                or self.speculative_config is not None
+            if self.parallel_config.pipeline_parallel_size > 1:
+                logger.warning_once(
+                    "Async scheduling is not yet supported with "
+                    "pipeline_parallel_size > 1 and will be disabled.",
+                    scope="local",
+                )
+                self.scheduler_config.async_scheduling = False
+            elif (
+                self.speculative_config is not None
+                and self.speculative_config.method not in get_args(EagleModelTypes)
             ):
-                logger.warning(
-                    "Async scheduling is not yet supported with speculative decoding "
-                    " or pipeline_parallel_size > 1 and will be disabled."
+                logger.warning_once(
+                    "Async scheduling not supported with %s-based "
+                    "speculative decoding and will be disabled.",
+                    self.speculative_config.method,
+                    scope="local",
+                )
+                self.scheduler_config.async_scheduling = False
+            elif (
+                self.speculative_config is not None
+                and self.speculative_config.disable_padded_drafter_batch
+            ):
+                logger.warning_once(
+                    "Async scheduling is not compatible with "
+                    "disable_padded_drafter_batch=True and will be disabled.",
+                    scope="local",
                 )
                 self.scheduler_config.async_scheduling = False
             elif not executor_supports_async_sched:
-                logger.warning(
+                logger.warning_once(
                     "Async scheduling will be disabled because it is not supported "
                     "with the `%s` distributed executor backend (only `mp`, `uni`, and "
                     "`external_launcher` are supported).",
                     executor_backend,
+                    scope="local",
                 )
                 self.scheduler_config.async_scheduling = False
             else:
                 self.scheduler_config.async_scheduling = True
 
-        if (
-            self.scheduler_config.async_scheduling
-            and not self.parallel_config.disable_nccl_for_dp_synchronization
-        ):
-            logger.info(
-                "Disabling NCCL for DP synchronization when using async scheduling."
-            )
-            self.parallel_config.disable_nccl_for_dp_synchronization = True
+        logger.info_once(
+            "Asynchronous scheduling is %s.",
+            "enabled" if self.scheduler_config.async_scheduling else "disabled",
+        )
+
+        if self.parallel_config.disable_nccl_for_dp_synchronization is None:
+            if self.scheduler_config.async_scheduling:
+                logger.info_once(
+                    "Disabling NCCL for DP synchronization "
+                    "when using async scheduling.",
+                    scope="local",
+                )
+                self.parallel_config.disable_nccl_for_dp_synchronization = True
+            else:
+                self.parallel_config.disable_nccl_for_dp_synchronization = False
 
         from vllm.platforms import current_platform
 
@@ -627,9 +670,9 @@ class VllmConfig:
             and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
         ):
             logger.warning(
-                "Inductor compilation was disabled by user settings,"
-                "Optimizations settings that are only active during"
-                "Inductor compilation will be ignored."
+                "Inductor compilation was disabled by user settings, "
+                "optimizations settings that are only active during "
+                "inductor compilation will be ignored."
             )
 
         def has_blocked_weights():
@@ -748,7 +791,7 @@ class VllmConfig:
 
             logger.warning_once(
                 "--kv-sharing-fast-prefill requires changes on model side for "
-                "correctness and to realize prefill savings. "
+                "correctness and to realize prefill savings."
             )
         # TODO: Move after https://github.com/vllm-project/vllm/pull/26847 lands
         self._set_compile_ranges()
@@ -771,7 +814,7 @@ class VllmConfig:
             and not self.cache_config.enable_prefix_caching
         ):
             logger.warning(
-                "KV cache events are on, but prefix caching is not enabled."
+                "KV cache events are on, but prefix caching is not enabled. "
                 "Use --enable-prefix-caching to enable."
             )
         if (
@@ -780,9 +823,9 @@ class VllmConfig:
             and not self.kv_events_config.enable_kv_cache_events
         ):
             logger.warning(
-                "KV cache events are disabled,"
-                "but the scheduler is configured to publish them."
-                "Modify KVEventsConfig.enable_kv_cache_events"
+                "KV cache events are disabled, "
+                "but the scheduler is configured to publish them. "
+                "Modify KVEventsConfig.enable_kv_cache_events "
                 "to True to enable."
             )
         current_platform.check_and_update_config(self)
@@ -814,9 +857,14 @@ class VllmConfig:
             )
 
         # Do this after all the updates to compilation_config.mode
+        effective_dp_size = (
+            self.parallel_config.data_parallel_size
+            if self.model_config is None or self.model_config.is_moe
+            else 1
+        )
         self.compilation_config.set_splitting_ops_for_v1(
             all2all_backend=self.parallel_config.all2all_backend,
-            data_parallel_size=self.parallel_config.data_parallel_size,
+            data_parallel_size=effective_dp_size,
         )
 
         if self.compilation_config.pass_config.enable_sp:
@@ -846,7 +894,7 @@ class VllmConfig:
                         else "pipeline parallelism"
                     )
                     logger.warning_once(
-                        "Sequence parallelism not supported with"
+                        "Sequence parallelism not supported with "
                         "native rms_norm when using %s, "
                         "this will likely lead to an error.",
                         regime,
@@ -863,7 +911,7 @@ class VllmConfig:
                 logger.warning_once(
                     "No piecewise cudagraph for executing cascade attention."
                     " Will fall back to eager execution if a batch runs "
-                    "into cascade attentions"
+                    "into cascade attentions."
                 )
 
             if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
@@ -873,9 +921,12 @@ class VllmConfig:
                     f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                 )
 
-        if self.parallel_config.enable_dbo:
+        if self.parallel_config.use_ubatching:
             a2a_backend = self.parallel_config.all2all_backend
-            assert a2a_backend in ["deepep_low_latency", "deepep_high_throughput"], (
+            assert a2a_backend in [
+                "deepep_low_latency",
+                "deepep_high_throughput",
+            ], (
                 "Microbatching currently only supports the deepep_low_latency and "
                 f"deepep_high_throughput all2all backend. {a2a_backend} is not "
                 "supported. To fix use --all2all-backend=deepep_low_latency or "
@@ -1217,12 +1268,6 @@ class VllmConfig:
             computed_compile_ranges_split_points
         )
 
-    def recalculate_max_model_len(self, max_model_len: int):
-        # Can only be called in try_verify_and_update_config
-        model_config = self.model_config
-        max_model_len = model_config.get_and_verify_max_len(max_model_len)
-        self.model_config.max_model_len = max_model_len
-
     def try_verify_and_update_config(self):
         if self.model_config is None:
             return
@@ -1281,13 +1326,8 @@ class VllmConfig:
         if self.compilation_config.debug_dump_path is None:
             return None
         tp_rank = self.parallel_config.rank
-        dp_rank = self.parallel_config.data_parallel_rank
-        data_parallel_size = self.parallel_config.data_parallel_size
-        append_path = (
-            f"rank_{tp_rank}"
-            if data_parallel_size == 1
-            else f"rank_{tp_rank}_dp_{dp_rank}"
-        )
+        dp_rank = self.parallel_config.data_parallel_index
+        append_path = f"rank_{tp_rank}_dp_{dp_rank}"
         path = self.compilation_config.debug_dump_path / append_path
         return path
 
@@ -1311,6 +1351,7 @@ class VllmConfig:
             f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
             f"quantization={self.model_config.quantization}, "
             f"enforce_eager={self.model_config.enforce_eager}, "
+            f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, "  # noqa
             f"kv_cache_dtype={self.cache_config.cache_dtype}, "
             f"device_config={self.device_config.device}, "
             f"structured_outputs_config={self.structured_outputs_config!r}, "
@@ -1360,6 +1401,11 @@ def set_current_vllm_config(
 
     num_models_seen = compilation_counter.num_models_seen
     try:
+        # Clear the compilation config cache when context changes.
+        # This is needed since the old config may have been accessed
+        # and cached before the new config is set.
+        get_cached_compilation_config.cache_clear()
+
         _current_vllm_config = vllm_config
         _current_prefix = prefix
         yield
@@ -1400,11 +1446,18 @@ def get_cached_compilation_config():
 
 def get_current_vllm_config() -> VllmConfig:
     if _current_vllm_config is None:
-        # in ci, usually when we test custom ops/modules directly,
-        # we don't set the vllm config. In that case, we set a default
-        # config.
-        logger.warning("Current vLLM config is not set.")
-        return VllmConfig()
+        raise AssertionError(
+            "Current vLLM config is not set. This typically means "
+            "get_current_vllm_config() was called outside of a "
+            "set_current_vllm_config() context, or a CustomOp was instantiated "
+            "at module import time or model forward time when config is not set. "
+            "For tests that directly test custom ops/modules, use the "
+            "'default_vllm_config' pytest fixture from tests/conftest.py."
+        )
+    return _current_vllm_config
+
+
+def get_current_vllm_config_or_none() -> VllmConfig | None:
     return _current_vllm_config
 
 
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index e9695698bb493784022ebe13b6e6688bcb1b8b55..2f97288b649218c63f6e3509afeecb0ed4cc7cfd 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -19,37 +19,11 @@ import torch
 
 from vllm.logger import init_logger
 from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.system_utils import find_loaded_library
 
 logger = init_logger(__name__)
 
 
-def find_loaded_library(lib_name) -> str | None:
-    """
-    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
-    the file `/proc/self/maps` contains the memory maps of the process, which includes the
-    shared libraries loaded by the process. We can use this file to find the path of the
-    a loaded library.
-    """  # noqa
-    found_line = None
-    with open("/proc/self/maps") as f:
-        for line in f:
-            if lib_name in line:
-                found_line = line
-                break
-    if found_line is None:
-        # the library is not loaded in the current process
-        return None
-    # if lib_name is libcudart, we need to match a line with:
-    # address /path/to/libcudart-hash.so.11.0
-    start = found_line.index("/")
-    path = found_line[start:].strip()
-    filename = path.split("/")[-1]
-    assert filename.rpartition(".so")[0].startswith(lib_name), (
-        f"Unexpected filename: {filename} for library {lib_name}"
-    )
-    return path
-
-
 cumem_available = False
 try:
     from vllm.cumem_allocator import (
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 1ea1e6d084398e87449fc4bbb50dd28b23a0ca2a..7a4e81cf967de45bdd83d114b8d593613cf449b1 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -64,7 +64,12 @@ class NaiveAll2AllManager(All2AllManagerBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
+        extra_tensors: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        if extra_tensors is not None:
+            raise NotImplementedError(
+                "extra_tensors is not supported for NaiveAll2AllManager"
+            )
         sp_size = self.tp_group.world_size if is_sequence_parallel else 1
         dp_metadata = get_forward_context().dp_metadata
         assert dp_metadata is not None
@@ -76,6 +81,7 @@ class NaiveAll2AllManager(All2AllManagerBase):
         router_logits = self.naive_multicast(
             router_logits, cu_tokens_across_sp_cpu, is_sequence_parallel
         )
+
         return hidden_states, router_logits
 
     def combine(
@@ -113,7 +119,11 @@ class AgRsAll2AllManager(All2AllManagerBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        extra_tensors: list[torch.Tensor] | None = None,
+    ) -> (
+        tuple[torch.Tensor, torch.Tensor]
+        | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]
+    ):
         """
         Gather hidden_states and router_logits from all dp ranks.
         """
@@ -121,16 +131,23 @@ class AgRsAll2AllManager(All2AllManagerBase):
         assert dp_metadata is not None
         sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
         assert sizes is not None
-
         dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
         assert sizes[dist_group.rank_in_group] == hidden_states.shape[0]
-        hidden_states, router_logits = dist_group.all_gatherv(
-            [hidden_states, router_logits],
+
+        tensors_to_gather = [hidden_states, router_logits]
+        if extra_tensors is not None:
+            tensors_to_gather.extend(extra_tensors)
+
+        gathered_tensors = dist_group.all_gatherv(
+            tensors_to_gather,
             dim=0,
             sizes=sizes,
         )
-        return hidden_states, router_logits
-    
+
+        if extra_tensors is not None:
+            return (gathered_tensors[0], gathered_tensors[1], gathered_tensors[2:])
+        return gathered_tensors[0], gathered_tensors[1]
+
     def combine(
         self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
     ) -> torch.Tensor:
@@ -204,6 +221,7 @@ class PPLXAll2AllManager(All2AllManagerBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
+        extra_tensors: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
 
@@ -251,6 +269,7 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
+        extra_tensors: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
 
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index 3a849da70e4cb34b52076e9306c62b125212766e..8bc361741cae1dc9bd7867ba914904b046ef4055 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
+from typing import Any
 from weakref import WeakValueDictionary
 
 import torch
@@ -68,7 +69,11 @@ class All2AllManagerBase:
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
-    ):
+        extra_tensors: list[torch.Tensor] | None = None,
+    ) -> Any:
+        # Subclasses should either:
+        # - implement handling for extra_tensors, or
+        # - raise a clear error if extra_tensors is not supported.
         raise NotImplementedError
 
     def set_num_sms(self, num_sms: int):
@@ -112,9 +117,9 @@ class DeviceCommunicatorBase:
 
         use_ep = False
         all2all_backend = None
-        from vllm.config import get_current_vllm_config
+        from vllm.config import get_current_vllm_config_or_none
 
-        config = get_current_vllm_config()
+        config = get_current_vllm_config_or_none()
         if config is not None:
             # as long as we use data parallel (coupled data parallel
             # where all data parallel ranks execute forward together),
@@ -280,6 +285,7 @@ class DeviceCommunicatorBase:
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
+        extra_tensors: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Dispatch the hidden states and router logits to the appropriate device.
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index cd9c267beb5b5a57879955ddd4ab61786a2af1b1..9542498c453ec8aa02ec0ffdb643dbbb9c99321f 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -318,17 +318,23 @@ class CudaCommunicator(DeviceCommunicatorBase):
 
         return output_list
 
-    def dispatch(
+    def dispatch(  # type: ignore[override]
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        extra_tensors: list[torch.Tensor] | None = None,
+    ) -> (
+        tuple[torch.Tensor, torch.Tensor]
+        | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]
+    ):
         assert self.all2all_manager is not None
-        hidden_states, router_logits = self.all2all_manager.dispatch(
-            hidden_states, router_logits, is_sequence_parallel
+        return self.all2all_manager.dispatch(
+            hidden_states,
+            router_logits,
+            is_sequence_parallel,
+            extra_tensors,  # type: ignore[call-arg]
         )
-        return hidden_states, router_logits
 
     def combine(
         self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index 6aadab33e31324ba4c3881e6949cb7c0dd7c009a..422991ca93e6068129ecd58741c5d682d80da696 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -15,6 +15,7 @@ import torch  # noqa
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.utils.system_utils import find_loaded_library
 
 logger = init_logger(__name__)
 
@@ -37,33 +38,6 @@ class Function:
     argtypes: list[Any]
 
 
-def find_loaded_library(lib_name) -> str | None:
-    """
-    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
-    the file `/proc/self/maps` contains the memory maps of the process, which includes the
-    shared libraries loaded by the process. We can use this file to find the path of the
-    a loaded library.
-    """  # noqa
-    found = False
-    with open("/proc/self/maps") as f:
-        for line in f:
-            if lib_name in line:
-                found = True
-                break
-    if not found:
-        # the library is not loaded in the current process
-        return None
-    # if lib_name is libcudart, we need to match a line with:
-    # address /path/to/libcudart-hash.so.11.0
-    start = line.index("/")
-    path = line[start:].strip()
-    filename = path.split("/")[-1]
-    assert filename.rpartition(".so")[0].startswith(lib_name), (
-        f"Unexpected filename: {filename} for library {lib_name}"
-    )
-    return path
-
-
 class CudaRTLibrary:
     exported_functions = [
         # ​cudaError_t cudaSetDevice ( int  device )
diff --git a/vllm/distributed/device_communicators/pynccl_allocator.py b/vllm/distributed/device_communicators/pynccl_allocator.py
index 2e5d94de9d016d19dd8111597f7e03f74cde7a45..0ce307bc596c1ae41db34eef3bfd8eb9949522de 100644
--- a/vllm/distributed/device_communicators/pynccl_allocator.py
+++ b/vllm/distributed/device_communicators/pynccl_allocator.py
@@ -60,7 +60,7 @@ def is_symmetric_memory_tensor(tensor: torch.Tensor):
     return False
 
 
-def set_graph_pool_id(graph_pool_id):
+def set_graph_pool_id(graph_pool_id: Any) -> None:
     global _graph_pool_id
     _graph_pool_id = graph_pool_id
 
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index c8e2f730e0f068e6b3dd6f294188897698e02bd8..a4bc3d90c849f60e495f3dc656afc8f01d9dcd3d 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -92,7 +92,10 @@ class ncclDataTypeEnum:
             return cls.ncclFloat64
         if dtype == torch.bfloat16:
             return cls.ncclBfloat16
-        raise ValueError(f"Unsupported dtype: {dtype}")
+        raise ValueError(
+            f"Unsupported dtype {dtype}: should be one of "
+            f"int8, uint8, int32, int64, float16, float32, float64, bfloat16."
+        )
 
 
 ncclRedOp_t = ctypes.c_int
diff --git a/vllm/distributed/device_communicators/quick_all_reduce.py b/vllm/distributed/device_communicators/quick_all_reduce.py
index 9c7765883cfd1340081d2d2d8593c7d7ef90778a..7670ec134b53358ab619e139d87df7b29645424c 100644
--- a/vllm/distributed/device_communicators/quick_all_reduce.py
+++ b/vllm/distributed/device_communicators/quick_all_reduce.py
@@ -9,7 +9,7 @@ from torch.distributed import ProcessGroup
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
-from vllm.config import get_current_vllm_config
+from vllm.config import get_current_vllm_config_or_none
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -184,7 +184,7 @@ class QuickAllReduce:
             )
             return
         self.qr_quant_level = QuickReduceRegime[regime_str]
-        vllm_config = get_current_vllm_config()
+        vllm_config = get_current_vllm_config_or_none()
         if (
             vllm_config is not None
             and hasattr(vllm_config, "model_config")
diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py
deleted file mode 100644
index fa99078e9ff0d095f7a9d6dacba7154a17155b8a..0000000000000000000000000000000000000000
--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-
-import torch
-from torch.distributed import ProcessGroup
-
-from vllm.config import get_current_vllm_config
-from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.platforms.tpu import USE_TPU_INFERENCE
-
-from .base_device_communicator import DeviceCommunicatorBase
-
-USE_RAY = parallel_config = (
-    get_current_vllm_config().parallel_config.distributed_executor_backend == "ray"
-)
-
-logger = init_logger(__name__)
-
-if not USE_TPU_INFERENCE:
-    logger.info("tpu_inference not found, using vLLM's TpuCommunicator")
-    if current_platform.is_tpu():
-        import torch_xla
-        import torch_xla.core.xla_model as xm
-        import torch_xla.runtime as xr
-        from torch_xla._internal import pjrt
-        from torch_xla.distributed.xla_multiprocessing import (
-            create_optimized_replica_groups,
-        )
-
-        if USE_RAY:
-            from vllm.v1.executor import ray_utils
-
-
-class TpuCommunicator(DeviceCommunicatorBase):
-    def __init__(
-        self,
-        cpu_group: ProcessGroup,
-        device: torch.device | None = None,
-        device_group: ProcessGroup | None = None,
-        unique_name: str = "",
-    ):
-        super().__init__(cpu_group, device, device_group, unique_name)
-
-        # NOTE(woosuk): When using TP > 1 on TPUs, every TPU on the same node
-        # must be used together. Therefore, the local rank and world size can
-        # be simply calculated as follows.
-        global_rank = self.global_rank
-        global_world_size = self.global_world_size
-
-        if USE_RAY:
-            logger.info("TpuCommunicator initialized with RAY")
-            # Calculate how many TPU nodes are in the current deployment. This
-            # is the Ray placement group if it is deployed with Ray. Default
-            # to the number of TPU nodes in the Ray cluster. The number of TPU
-            # nodes is computed by the total number of TPUs divided by the
-            # number of TPU accelerators per node, to account for clusters
-            # with both CPUs and TPUs.
-            num_nodes = ray_utils.get_num_tpu_nodes()
-            num_nodes_in_pg = ray_utils.get_num_nodes_in_placement_group()
-            if num_nodes_in_pg > 0:
-                num_nodes = num_nodes_in_pg
-
-            local_world_size = global_world_size // num_nodes
-            local_rank = global_rank % local_world_size
-        else:
-            logger.info("TpuCommunicator initialized with MP")
-            # Sanity: Verify we run on a single host
-            num_hosts = torch_xla.tpu.num_tpu_workers()
-            assert num_hosts == 1
-
-            # Get the current number of TPUs (we have locally)
-            local_world_size = torch_xla.tpu.num_available_chips()
-
-            # Get current rank
-            local_rank = global_rank % local_world_size
-
-        # Ensure environment variables are set for multihost deployments.
-        # On GKE, this is needed for libtpu and TPU driver to know which TPU
-        # chip is actually visible. Otherwise the TPU driver will fail to
-        # initialize because the number of devices would be different from
-        # the number of visible worker addresses.
-        os.environ["CLOUD_TPU_TASK_ID"] = str(global_rank)
-        os.environ["TPU_VISIBLE_CHIPS"] = str(local_rank)
-
-        pjrt.initialize_multiprocess(local_rank, local_world_size)
-        xr._init_world_size_ordinal()
-        self.groups = create_optimized_replica_groups()
-
-    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
-        # TODO: Remove the groups specification after XLA compiler can support
-        # auto-reordering the ring order for all-reduce.
-        return xm.all_reduce(xm.REDUCE_SUM, input_, groups=self.groups)
-
-    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
-        assert dim == -1, "TPUs only support dim=-1 for all-gather."
-        return xm.all_gather(input_, dim=dim)
diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py
index ad61fdfb8ea526b3bf24247c6081330a886c0501..f3d9262d20cf47992fa77359c9e0bcdba1cbd3f3 100644
--- a/vllm/distributed/device_communicators/xpu_communicator.py
+++ b/vllm/distributed/device_communicators/xpu_communicator.py
@@ -23,11 +23,11 @@ class XpuCommunicator(DeviceCommunicatorBase):
     ):
         super().__init__(cpu_group, device, device_group, unique_name)
         if self.use_all2all:
-            if self.all2all_backend != "naive":
+            if self.all2all_backend != "naive":  # type: ignore[has-type]
                 logger.warning(
                     "`%s` all2all manager is not supported on XPU. "
                     "Falling back to `naive` all2all manager for XPU.",
-                    self.all2all_backend,
+                    self.all2all_backend,  # type: ignore[has-type]
                 )
                 self.all2all_backend = "naive"
             if self.all2all_backend == "naive":
@@ -78,12 +78,15 @@ class XpuCommunicator(DeviceCommunicatorBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
+        extra_tensors: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         assert self.all2all_manager is not None
-        hidden_states, router_logits = self.all2all_manager.dispatch(
-            hidden_states, router_logits, is_sequence_parallel
+        return self.all2all_manager.dispatch(
+            hidden_states,
+            router_logits,
+            is_sequence_parallel,
+            extra_tensors,  # type: ignore[call-arg]
         )
-        return hidden_states, router_logits
 
     def combine(
         self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
diff --git a/vllm/distributed/ec_transfer/ec_connector/example_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py
index c9aad9e9fc8f3e11922a6e0150b07d491881f774..48a7d41908fd4a09d1f3066a43925cd04f88dbab 100644
--- a/vllm/distributed/ec_transfer/ec_connector/example_connector.py
+++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py
@@ -73,6 +73,7 @@ class ECExampleConnector(ECConnectorBase):
                 data hashes (`mm_hash`) to encoder cache tensors.
             kwargs (dict): Additional keyword arguments for the connector.
         """
+        from vllm.platforms import current_platform
 
         # Get the metadata
         metadata: ECConnectorMetadata = self._get_connector_metadata()
@@ -80,10 +81,7 @@ class ECExampleConnector(ECConnectorBase):
         assert encoder_cache is not None
         if metadata is None:
             logger.warning(
-                (
-                    "In connector.start_load_caches, ",
-                    "but the connector metadata is None",
-                )
+                "In connector.start_load_caches, but the connector metadata is None"
             )
             return
         # Load the EC for each mm data
@@ -91,7 +89,9 @@ class ECExampleConnector(ECConnectorBase):
             if mm_data.mm_hash in encoder_cache:
                 continue
             filename = self._generate_filename_debug(mm_data.mm_hash)
-            ec_cache = safetensors.torch.load_file(filename)["ec_cache"].cuda()
+            ec_cache = safetensors.torch.load_file(
+                filename, device=current_platform.device_type
+            )["ec_cache"]
             encoder_cache[mm_data.mm_hash] = ec_cache
             logger.debug("Success load encoder cache for hash %s", mm_data.mm_hash)
 
diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py
index e4b4fc92eeaaaaae28bb34010b32293f00892f94..9d7366996e3b2f24518eb4ad85fd4bbaa22af266 100644
--- a/vllm/distributed/eplb/async_worker.py
+++ b/vllm/distributed/eplb/async_worker.py
@@ -89,7 +89,7 @@ async def transfer_run_periodically(
                         (
                             model_state.is_unchanged,
                             model_state.is_received_locally,
-                            model_state.experts_recv_loc,
+                            model_state.recv_metadata,
                         ) = await transfer_layer(
                             old_global_expert_indices=model_state.physical_to_logical_map,
                             new_global_expert_indices=model_state.new_physical_to_logical_map,
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index c5654659b79d6b00e46020531e868dba4109b259..a482c6f55cafb6cf2a71d23687175a00ddb9dbac 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -27,10 +27,10 @@ physical experts.
 """
 
 import threading
-import time
 from collections.abc import Sequence
 from dataclasses import dataclass
 
+import numpy as np
 import torch
 from torch.distributed import ProcessGroup, all_reduce
 
@@ -46,7 +46,11 @@ from vllm.model_executor.models.interfaces import MixtureOfExperts
 
 from .async_worker import start_async_worker
 from .policy import EPLB_POLICIES, AbstractEplbPolicy, DefaultEplbPolicy
-from .rebalance_execute import move_from_buffer, rearrange_expert_weights_inplace
+from .rebalance_execute import (
+    RecvMetadata,
+    move_from_buffer,
+    rearrange_expert_weights_inplace,
+)
 
 logger = init_logger(__name__)
 
@@ -164,20 +168,19 @@ class EplbModelState:
     """
     Whether the async EPLB needs to poll peers for buffer readiness.
     """
-    is_unchanged: list[bool]
+    is_unchanged: np.ndarray
     """
     intermediate variable between `move_to_buffer` and `move_to_workspace`.
     The size is same as the num of physical experts in the current layer.
     """
-    is_received_locally: list[bool]
+    is_received_locally: np.ndarray
     """
     intermediate variable between `move_to_buffer` and `move_to_workspace`.
     The size is same as the num of physical experts in the current layer.
     """
-    experts_recv_loc: dict[int, int]
+    recv_metadata: RecvMetadata
     """
     intermediate variable between `move_to_buffer` and `move_to_workspace`.
-    The size is same as the num of physical experts in the current layer.
     """
     is_async_enabled: bool
     """
@@ -423,7 +426,7 @@ class EplbState:
         # Set the policy based on the selected eplb algorithm type.
         policy_type = self.parallel_config.eplb_config.policy
         self.policy = EPLB_POLICIES[policy_type]
-        logger.debug("Selected EPLB policy: %d", policy_type)
+        logger.debug("Selected EPLB policy: %s", policy_type)
         if global_expert_load is not None:
             ep_group = get_ep_group().device_group
             assert global_expert_load.shape == (
@@ -507,9 +510,14 @@ class EplbState:
             layer_to_transfer=0,
             rebalanced=False,
             pending_global_ready_check=False,
-            is_unchanged=[],
-            is_received_locally=[],
-            experts_recv_loc={},
+            is_unchanged=np.array([]),
+            is_received_locally=np.array([]),
+            recv_metadata=RecvMetadata(
+                recv_primary_mask=np.array([]),
+                recv_count=0,
+                recv_expert_ids=np.array([]),
+                recv_dst_rows=np.array([]),
+            ),
             is_async_enabled=self.is_async,
             cuda_device_index=self.cuda_device_index,
             new_physical_to_logical_map=new_physical_to_logical_map,
@@ -553,7 +561,12 @@ class EplbState:
             for eplb_model_state in self.model_states.values():
                 eplb_model_state.expert_load_pass.zero_()
 
-        if log_stats:
+        if (
+            log_stats
+            and self.expert_rearrangement_step
+            % self.parallel_config.eplb_config.log_balancedness_interval
+            == 0
+        ):
             # Sync the expert load pass for each model (main and drafter).
             # expert_load_pass: (num_moe_layers, num_physical_experts)
             expert_load_pass_list = self._sync_load_pass()
@@ -586,12 +599,15 @@ class EplbState:
                 if ep_group.rank() == 0:
                     logger.info(
                         "EPLB step: %d for model %s: avg_tokens=%.2f, "
-                        "max_tokens=%d, balancedness=%.4f",
+                        "max_tokens=%d, balancedness=%.4f, "
+                        "steps until the next rearrangement: %d",
                         self.expert_rearrangement_step,
                         eplb_model_state.model_name,
                         avg_tokens,
                         max_tokens,
                         balancedness,
+                        self.expert_rearrangement_step_interval
+                        - self.expert_rearrangement_step,
                     )
 
         # Update the expert load sliding window
@@ -684,11 +700,14 @@ class EplbState:
         ep_group = get_ep_group().device_group
         ep_rank = ep_group.rank()
 
-        time_start = None
+        start_event = None
+        end_event = None
         is_main_rank = ep_rank == 0
         if is_main_rank:
-            torch.cuda.synchronize()
-            time_start = time.perf_counter()
+            if not self.is_async or is_profile:
+                start_event = torch.cuda.Event(enable_timing=True)
+                end_event = torch.cuda.Event(enable_timing=True)
+                start_event.record()
             logger.info(
                 "Rearranging experts %s %s...",
                 "(async mode)" if self.is_async else "sync mode",
@@ -800,6 +819,7 @@ class EplbState:
                 num_groups,
                 num_nodes,
                 num_gpus,
+                eplb_model_state.physical_to_logical_map,
             )
 
             if not eplb_model_state.is_async_enabled or is_profile:
@@ -848,17 +868,17 @@ class EplbState:
                         new_logical_replica_count
                     )
                 if is_main_rank:
-                    assert time_start is not None
-                    torch.cuda.synchronize()
-                    time_end = time.perf_counter()
+                    assert start_event is not None
+                    assert end_event is not None
+                    end_event.record()
+                    end_event.synchronize()
+                    gpu_elapsed = start_event.elapsed_time(end_event) / 1000.0
                     logger.info(
-                        "Rearranged experts%sin %.2f seconds.",
+                        "Rearranged experts %s in %.2f s.",
                         " (profile) " if is_profile else " ",
-                        time_end - time_start,
+                        gpu_elapsed,
                     )
             else:
-                device = eplb_model_state.physical_to_logical_map.device
-                new_physical = new_physical_to_logical_map.to(device)
                 max_slots = eplb_model_state.logical_to_physical_map.shape[-1]
                 padded_logical = torch.nn.functional.pad(
                     new_logical_to_physical_map,
@@ -869,7 +889,10 @@ class EplbState:
                     eplb_model_state.logical_replica_count.device
                 )
 
-                eplb_model_state.new_physical_to_logical_map = new_physical
+                # Move map to cpu in advance
+                eplb_model_state.new_physical_to_logical_map = (
+                    new_physical_to_logical_map.cpu()
+                )
                 eplb_model_state.new_logical_to_physical_map = padded_logical
                 eplb_model_state.new_logical_replica_count = new_replica
 
@@ -968,25 +991,30 @@ class EplbState:
                 stream = torch.cuda.current_stream(device=device_index)
                 stream.wait_event(model_state.buffer_ready_event)
                 model_state.buffer_ready_event = None
+            expert_weights = model_state.model.expert_weights[
+                model_state.layer_to_transfer
+            ]
+            expert_weights_buffer = model_state.expert_buffer
+            new_indices = (
+                model_state.new_physical_to_logical_map[model_state.layer_to_transfer]
+                .cpu()
+                .numpy()
+            )
             move_from_buffer(
-                expert_weights=model_state.model.expert_weights[
-                    model_state.layer_to_transfer
-                ],
-                expert_weights_buffer=model_state.expert_buffer,
+                expert_weights=expert_weights,
+                expert_weights_buffers=expert_weights_buffer,
                 is_unchanged=model_state.is_unchanged,
                 is_received_locally=model_state.is_received_locally,
-                experts_recv_loc=model_state.experts_recv_loc,
-                new_indices=model_state.new_physical_to_logical_map[
-                    model_state.layer_to_transfer
-                ].tolist(),
-                ep_group=ep_group,
+                recv_metadata=model_state.recv_metadata,
+                new_indices=new_indices,
+                ep_rank=ep_group.rank(),
             )
             transferred_layer = model_state.layer_to_transfer
             self._update_layer_mapping_from_new(model_state, transferred_layer)
             # After the main thread consumes, advance layer_to_transfer
             model_state.layer_to_transfer += 1
             model_state.ep_buffer_ready = 0
-            logger.info(
+            logger.debug(
                 "model %s successfully move_to_workspace layer %d",
                 model_state.model_name,
                 transferred_layer,
diff --git a/vllm/distributed/eplb/policy/abstract.py b/vllm/distributed/eplb/policy/abstract.py
index 40ed621c84892a2777aa5a3b2be0decb1bdb4849..f4435f11bd57b7afb1d75ff01e8db4e4eabc56a9 100644
--- a/vllm/distributed/eplb/policy/abstract.py
+++ b/vllm/distributed/eplb/policy/abstract.py
@@ -16,6 +16,7 @@ class AbstractEplbPolicy(ABC):
         num_groups: int,
         num_nodes: int,
         num_ranks: int,
+        old_global_expert_indices: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Entry point for expert-parallelism load balancer.
@@ -28,7 +29,9 @@ class AbstractEplbPolicy(ABC):
             num_groups: number of expert groups
             num_nodes: number of server nodes
             num_ranks: number of ranks, must be a multiple of `num_nodes`
-
+            old_global_expert_indices: [layers, num_logical_experts], the old global
+                expert indices. Used to avoid unnecessary weight copying
+                for experts moving within one rank.
         Returns:
             physical_to_logical_map: [layers, num_replicas], the expert
                 index of each replica
diff --git a/vllm/distributed/eplb/policy/default.py b/vllm/distributed/eplb/policy/default.py
index 6127ec703184a23a60049438c202705c51ba4045..b9cfcae0141083f9718c8adc63208608ad786aed 100644
--- a/vllm/distributed/eplb/policy/default.py
+++ b/vllm/distributed/eplb/policy/default.py
@@ -21,8 +21,8 @@ from .abstract import AbstractEplbPolicy
 class DefaultEplbPolicy(AbstractEplbPolicy):
     @classmethod
     def balanced_packing(
-        cls, weight: torch.Tensor, num_packs: int
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        cls, weight: np.ndarray, num_packs: int
+    ) -> tuple[np.ndarray, np.ndarray]:
         """
         Pack n weighted objects to m packs, such that each bin contains exactly
         n/m objects and the weights of all packs are as balanced as possible.
@@ -39,50 +39,43 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
         assert num_groups % num_packs == 0
         groups_per_pack = num_groups // num_packs
 
-        device = weight.device
-
         if groups_per_pack == 1:
-            pack_index = torch.arange(
-                weight.size(-1), dtype=torch.int64, device=device
-            ).expand(weight.shape)
-            rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device)
+            pack_index = np.tile(np.arange(num_groups, dtype=np.int64), (num_layers, 1))
+            rank_in_pack = np.zeros_like(pack_index, dtype=np.int64)
             return pack_index, rank_in_pack
 
-        weight_np = weight.cpu().numpy()
-
         # Sort and get indices in decending order
-        indices_np = np.argsort(-weight_np, axis=-1)
-
-        pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
-        rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
+        indices = np.argsort(-weight, axis=-1)
 
-        # Run the packing algorithm
-        for i in range(num_layers):
-            pack_weights = [0.0] * num_packs
-            pack_items = [0] * num_packs
-
-            for group in indices_np[i]:
-                # Find a pack with capacity that has the lowest weight
-                pack = min(
-                    (j for j in range(num_packs) if pack_items[j] < groups_per_pack),
-                    key=pack_weights.__getitem__,
-                )
+        pack_index = np.full((num_layers, num_groups), -1, dtype=np.int64)
+        rank_in_pack = np.full((num_layers, num_groups), -1, dtype=np.int64)
 
-                assert pack_items[pack] < groups_per_pack
-                pack_index_np[i, group] = pack
-                rank_in_pack_np[i, group] = pack_items[pack]
-                pack_weights[pack] += weight_np[i, group]
-                pack_items[pack] += 1
+        pack_weights = np.zeros((num_layers, num_packs), dtype=np.float64)
+        pack_items = np.zeros((num_layers, num_packs), dtype=np.int64)
 
-        pack_index = torch.from_numpy(pack_index_np).to(device)
-        rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device)
+        # Run the packing algorithm
+        for layer_idx in range(num_layers):
+            weights_row = pack_weights[layer_idx]
+            items_row = pack_items[layer_idx]
+
+            for group in indices[layer_idx]:
+                # Pick the lightest pack; full packs are masked out by inf.
+                pack = int(np.argmin(weights_row))
+
+                pack_index[layer_idx, group] = pack
+                rank_in_pack[layer_idx, group] = items_row[pack]
+                weights_row[pack] += weight[layer_idx, group]
+                items_row[pack] += 1
+                if items_row[pack] == groups_per_pack:
+                    # Mark as unavailable for future selections.
+                    weights_row[pack] = np.inf
 
         return pack_index, rank_in_pack
 
     @classmethod
     def replicate_experts(
-        cls, weight: torch.Tensor, num_phy: int
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        cls, weight: np.ndarray, num_phy: int
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
         """
         Replicate `num_log` experts to `num_phy` replicas, such that the maximum
         load of all replicas is minimized.
@@ -93,33 +86,32 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
 
         Returns:
             phy2log: [X, num_phy], logical expert id of each physical expert
-            rank: [X, num_phy], the replica rank
+            replica_idx: [X, num_phy], the index of the replica for each logical expert
             logcnt: [X, num_log], number of replicas for each logical expert
         """
         n, num_log = weight.shape
         num_redundant = num_phy - num_log
         assert num_redundant >= 0
-        device = weight.device
-        phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
-        rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
-        logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
-        arangen = torch.arange(n, dtype=torch.int64, device=device)
+        phy2log = np.tile(np.arange(num_phy, dtype=np.int64), (n, 1))
+        replica_idx = np.zeros((n, num_phy), dtype=np.int64)
+        logcnt = np.ones((n, num_log), dtype=np.int64)
+        arangen = np.arange(n, dtype=np.int64)
         for i in range(num_log, num_phy):
-            redundant_indices = (weight / logcnt).max(dim=-1).indices
+            redundant_indices = np.argmax(weight / logcnt, axis=-1)
             phy2log[:, i] = redundant_indices
-            rank[:, i] = logcnt[arangen, redundant_indices]
+            replica_idx[:, i] = logcnt[arangen, redundant_indices]
             logcnt[arangen, redundant_indices] += 1
-        return phy2log, rank, logcnt
+        return phy2log, replica_idx, logcnt
 
     @classmethod
     def rebalance_experts_hierarchical(
         cls,
-        weight: torch.Tensor,
+        weight: np.ndarray,
         num_physical_experts: int,
         num_groups: int,
         num_nodes: int,
         num_gpus: int,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
         """
         Parameters:
             weight: [num_moe_layers, num_logical_experts]
@@ -132,7 +124,7 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
         Returns:
             phy2log: [layers, num_replicas], the expert
                 index of each replica
-            log2phy: [layers, num_logical_experts, X],
+            pphy_replicas_idx: [layers, num_logical_experts, X],
                 the replica indices for each expert
             logcnt: [layers, num_logical_experts], number of
                 physical replicas for each logical expert
@@ -146,66 +138,160 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
         assert num_physical_experts % num_gpus == 0
         phy_experts_per_gpu = num_physical_experts // num_gpus
 
-        def inverse(perm: torch.Tensor) -> torch.Tensor:
-            inv = torch.empty_like(perm)
-            inv.scatter_(
-                1,
-                perm,
-                torch.arange(
-                    perm.size(1), dtype=torch.int64, device=perm.device
-                ).expand(perm.shape),
-            )
+        def inverse(perm: np.ndarray) -> np.ndarray:
+            inv = np.empty_like(perm)
+            row_idx = np.arange(perm.shape[0])[:, None]
+            col_idx = np.arange(perm.shape[1], dtype=np.int64)
+            inv[row_idx, perm] = col_idx
             return inv
 
         # Step 1: pack groups to nodes
-        tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
+        tokens_per_group = weight.reshape(num_layers, num_groups, group_size).sum(
+            axis=-1
+        )
         group_pack_index, group_rank_in_pack = cls.balanced_packing(
             tokens_per_group, num_nodes
         )
+        # Map each logical expert into a node-local ordering based on packed groups.
         log2mlog = (
             (
-                (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
-            ).unsqueeze(-1)
-            + torch.arange(
-                group_size, dtype=torch.int64, device=group_pack_index.device
+                (group_pack_index * groups_per_node + group_rank_in_pack)[..., None]
+                * group_size
             )
-        ).flatten(-2)
+            + np.arange(group_size, dtype=np.int64)
+        ).reshape(num_layers, num_logical_experts)
         mlog2log = inverse(log2mlog)
 
         # Step 2: construct redundant experts within nodes
-        # [num_layers * num_nodes, num_logical_experts // num_nodes]
-        tokens_per_mlog = weight.gather(-1, mlog2log).view(
+        # Reorder weights into the node-local layout so replication is done per node.
+        tokens_per_mlog = np.take_along_axis(weight, mlog2log, axis=1).reshape(
             -1, num_logical_experts // num_nodes
         )
-        phy2mlog, phyrank, mlogcnt = cls.replicate_experts(
+        phy2mlog, replicas_idx, mlogcnt = cls.replicate_experts(
             tokens_per_mlog, num_physical_experts // num_nodes
         )
 
         # Step 3: pack physical_experts to GPUs
-        # [num_layers * num_nodes, num_physical_experts // num_nodes]
-        tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
+        # Effective per-physical load = logical load divided by replica count.
+        tokens_per_phy = np.take_along_axis(tokens_per_mlog / mlogcnt, phy2mlog, axis=1)
         pack_index, rank_in_pack = cls.balanced_packing(
             tokens_per_phy, num_gpus // num_nodes
         )
         phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
         pphy2phy = inverse(phy2pphy)
 
-        pphy2mlog = phy2mlog.gather(
-            -1, pphy2phy
-        )  # [num_layers * num_nodes, num_log_per_nodes]
+        # Reorder node-local logical indices into the post-packing physical order.
+        pphy2mlog = np.take_along_axis(phy2mlog, pphy2phy, axis=1)
         pphy2mlog = (
-            pphy2mlog.view(num_layers, num_nodes, -1)
-            + torch.arange(
+            pphy2mlog.reshape(num_layers, num_nodes, -1)
+            + np.arange(
                 0,
                 num_logical_experts,
                 num_logical_experts // num_nodes,
-                device=group_pack_index.device,
-            ).view(1, -1, 1)
-        ).flatten(-2)
-        pphy2log = mlog2log.gather(-1, pphy2mlog)
-        pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
-        logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
-        return pphy2log, pphyrank, logcnt
+                dtype=np.int64,
+            )[None, :, None]
+        ).reshape(num_layers, -1)
+        # Map node-local logical indices back to global logical expert ids.
+        pphy2log = np.take_along_axis(mlog2log, pphy2mlog, axis=1)
+        # Reorder replica ranks to the post-packing physical ordering.
+        pphy_replicas_idx = np.take_along_axis(replicas_idx, pphy2phy, axis=1).reshape(
+            num_layers, -1
+        )
+        # Convert replica counts back to the original logical ordering.
+        logcnt = np.take_along_axis(mlogcnt.reshape(num_layers, -1), log2mlog, axis=1)
+        return pphy2log, pphy_replicas_idx, logcnt
+
+    @classmethod
+    def preserve_intragpu_slots(
+        cls,
+        phy2log: np.ndarray,
+        phy_replicas_idx: np.ndarray,
+        num_ranks: int,
+        old_phy2log: np.ndarray,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Reorder the new mapping per GPU so that experts that remain on the same GPU
+        keep their previous slot positions when possible. Incoming experts to that GPU
+        fill any remaining available slots. This is applied only when the number of GPUs
+        is unchanged and the slots per GPU remain the same between
+        the old and new mappings.
+        """
+        num_phy_experts = phy2log.shape[1]
+        if num_ranks <= 0 or num_phy_experts % num_ranks != 0:
+            return phy2log, phy_replicas_idx
+
+        # Move to CPU and convert to NumPy for processing
+        slots_per_gpu = num_phy_experts // num_ranks
+        num_layers = phy2log.shape[0]
+
+        post_phy2log = phy2log.copy()
+        post_phy_replicas_idx = phy_replicas_idx.copy()
+
+        for gpu_idx in range(num_ranks):
+            start = gpu_idx * slots_per_gpu
+            end = start + slots_per_gpu
+            # Experts across all layers for this GPU
+            old_local = old_phy2log[:, start:end]  # [layers, slots]
+            new_local = phy2log[:, start:end]  # [layers, slots]
+            new_ridx = phy_replicas_idx[:, start:end]  # [layers, slots]
+
+            used_new_indices = np.zeros((num_layers, slots_per_gpu), dtype=bool)
+            preserved_positions = np.zeros((num_layers, slots_per_gpu), dtype=bool)
+
+            # First pass: preserve same-logical experts in their previous slots
+            for slot_idx in range(slots_per_gpu):
+                # matches: [layers, slots], True where new local experts have
+                # the same logical value as the old from 'slot_idx' and not checked yet
+                matches = (new_local == old_local[:, slot_idx][:, None]) & (
+                    ~used_new_indices
+                )
+                has_any = matches.any(axis=1)
+                if np.any(has_any):
+                    first_idx = np.argmax(matches, axis=1)
+                    layer_indices = np.nonzero(has_any)[0]
+                    matched_new_positions = first_idx[layer_indices]
+                    post_phy2log[layer_indices, start + slot_idx] = new_local[
+                        layer_indices, matched_new_positions
+                    ]
+                    post_phy_replicas_idx[layer_indices, start + slot_idx] = new_ridx[
+                        layer_indices, matched_new_positions
+                    ]
+                    used_new_indices[layer_indices, matched_new_positions] = True
+                    preserved_positions[layer_indices, slot_idx] = True
+
+            # Second pass: fill remaining slots with remaining new experts
+            remaining_mask = ~used_new_indices  # [layers, slots]
+            fill_mask = ~preserved_positions  # [layers, slots]
+            if remaining_mask.any() and fill_mask.any():
+                idx_base = np.tile(np.arange(slots_per_gpu), (num_layers, 1))
+                # Sentinel value for unavailable positions.
+                large = slots_per_gpu + 1
+                # Priorities: keep original index for available spots, set sentinel
+                # for unavailable; lower is earlier.
+                remaining_priority = np.where(remaining_mask, idx_base, large)
+                fill_priority = np.where(fill_mask, idx_base, large)
+                # Sort to get ordered indices of available src/dst positions per layer.
+                remaining_indices = np.argsort(remaining_priority, axis=1)
+                fill_indices = np.argsort(fill_priority, axis=1)
+                # Fill count per layer (cannot exceed either side).
+                remaining_counts = remaining_mask.sum(axis=1)
+                fill_counts = fill_mask.sum(axis=1)
+                take_counts = np.minimum(remaining_counts, fill_counts)
+                # Assign remaining new experts to remaining slots per layer.
+                for layer_idx in range(num_layers):
+                    k = int(take_counts[layer_idx])
+                    if k <= 0:
+                        continue
+                    src_pos = remaining_indices[layer_idx, :k]
+                    dst_pos = fill_indices[layer_idx, :k]
+                    post_phy2log[layer_idx, start + dst_pos] = new_local[
+                        layer_idx, src_pos
+                    ]
+                    post_phy_replicas_idx[layer_idx, start + dst_pos] = new_ridx[
+                        layer_idx, src_pos
+                    ]
+
+        return post_phy2log, post_phy_replicas_idx
 
     @classmethod
     def rebalance_experts(
@@ -215,6 +301,7 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
         num_groups: int,
         num_nodes: int,
         num_ranks: int,
+        old_global_expert_indices: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Entry point for expert-parallelism load balancer.
@@ -228,7 +315,9 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
             num_nodes: number of server nodes, where the intra-node network
                 (e.g, NVLink) is faster
             num_ranks: number of ranks, must be a multiple of `num_nodes`
-
+            old_global_expert_indices: [layers, num_logical_experts], the old global
+                expert indices. Used to avoid unnecessary weight copying
+                for experts moving within one rank.
         Returns:
             phy2log: [layers, num_replicas], the expert
                 index of each replica
@@ -237,31 +326,51 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
             logcnt: [layers, num_logical_experts], number of
                 physical replicas for each logical expert
         """
+        device = weight.device
         num_layers, num_logical_experts = weight.shape
-        weight = weight.float()
+        weight_np = weight.float().cpu().numpy()
+        old_phy2log_np = (
+            old_global_expert_indices.cpu().numpy()
+            if old_global_expert_indices is not None
+            else None
+        )
+
         if num_groups % num_nodes == 0:
             # use hierarchical load-balance policy
-            phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical(
-                weight, num_replicas, num_groups, num_nodes, num_ranks
+            phy2log_np, phy_replicas_idx_np, logcnt_np = (
+                cls.rebalance_experts_hierarchical(
+                    weight_np, num_replicas, num_groups, num_nodes, num_ranks
+                )
             )
         else:
             # use global load-balance policy
-            phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical(
-                weight, num_replicas, 1, 1, num_ranks
+            phy2log_np, phy_replicas_idx_np, logcnt_np = (
+                cls.rebalance_experts_hierarchical(
+                    weight_np, num_replicas, 1, 1, num_ranks
+                )
+            )
+
+        # Optional postprocessing to preserve slots for experts moving
+        # within the same GPU
+        # Only apply when the number of GPUs and slots per GPU remain unchanged.
+        # Helps to avoid unnecessary weight copying when experts move
+        # within the same GPU.
+        if old_global_expert_indices is not None:
+            phy2log_np, phy_replicas_idx_np = cls.preserve_intragpu_slots(
+                phy2log_np, phy_replicas_idx_np, num_ranks, old_phy2log_np
             )
         num_redundant_experts = num_replicas - num_logical_experts
         maxlogcnt = num_redundant_experts + 1
-        log2phy: torch.Tensor = torch.full(
-            (num_layers, num_logical_experts, maxlogcnt),
-            -1,
-            dtype=torch.int64,
-            device=logcnt.device,
+        log2phy_np = np.full(
+            (num_layers, num_logical_experts, maxlogcnt), -1, dtype=np.int64
         )
-        log2phy.view(num_layers, -1).scatter_(
-            -1,
-            phy2log * maxlogcnt + phyrank,
-            torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
-                num_layers, -1
-            ),
+        layer_indices = np.arange(num_layers)[:, None]
+        replica_indices = np.tile(
+            np.arange(num_replicas, dtype=np.int64), (num_layers, 1)
         )
+        log2phy_np[layer_indices, phy2log_np, phy_replicas_idx_np] = replica_indices
+
+        phy2log = torch.from_numpy(phy2log_np).to(device)
+        log2phy = torch.from_numpy(log2phy_np).to(device)
+        logcnt = torch.from_numpy(logcnt_np).to(device)
         return phy2log, log2phy, logcnt
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index 55856d940f0018160985f66ccc3ca420516d67ba..b7b6c11b239ac2051b00c44e903e8dbb25fbfa48 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -6,9 +6,10 @@ The actual execution of the rearrangement.
 This involves the exchange of expert weights between GPUs.
 """
 
-from collections.abc import Iterable, MutableSequence, Sequence
-from functools import partial
+from collections.abc import Iterable, Sequence
+from dataclasses import dataclass
 
+import numpy as np
 import torch
 from torch.distributed import (
     P2POp,
@@ -18,213 +19,317 @@ from torch.distributed import (
     get_global_rank,
 )
 
+from vllm.logger import init_logger
 
-def idx_local_to_global(
-    local_idx: int,
-    local_cnt: int,
-    ep_rank: int,
-) -> int:
-    """
-    Convert a local expert index to a global expert index.
-    """
-    return ep_rank * local_cnt + local_idx
+logger = init_logger(__name__)
 
 
-def idx_global_to_local(
-    global_idx: int,
-    local_cnt: int,
-    ep_rank: int,
-) -> int:
-    """
-    Convert a global expert index to a local expert index.
-    """
-    return global_idx - ep_rank * local_cnt
+@dataclass
+class RecvMetadata:
+    """Metadata describing remote receives during EPLB rebalancing."""
 
+    recv_primary_mask: np.ndarray
+    """Mask of (num_local_experts,) indicating primary experts received."""
+    recv_count: int
+    """Number of received experts for the layer."""
+    recv_expert_ids: np.ndarray
+    """Expert ids (num_local_experts,) of remote primary experts."""
+    recv_dst_rows: np.ndarray
+    """Target expert indices (num_local_experts,) in local tensors to send."""
 
-def global_idx_to_rank(
-    global_idx: int,
-    local_cnt: int,
-) -> int:
-    """
-    Convert a global expert index to a rank index.
-    """
-    return global_idx // local_cnt
 
+# Type alias for the result of move_to_buffer or transfer_layer
+MoveToBufferResult = tuple[np.ndarray, np.ndarray, RecvMetadata]
 
-def get_ep_ranks_with_expert(
-    idx: int,
+
+def get_ep_ranks_with_experts_batch(
+    expert_ids: np.ndarray,
     num_local_experts: int,
-    old_indices: Sequence[int],
-    new_indices: Sequence[int],
-) -> tuple[MutableSequence[int], MutableSequence[int]]:
+    old_indices: np.ndarray,
+    new_indices: np.ndarray,
+) -> tuple[dict[int, list[int]], dict[int, list[int]]]:
     """
     Get the ranks of the experts that need to be exchanged.
 
     Args:
-        idx: The index of the expert.
+        expert_ids: 1D array of expert indices to query.
         num_local_experts: The number of local experts.
         old_indices: The old indices of the experts.
         new_indices: The new indices of the experts.
 
     Returns:
-        A tuple of two lists:
-        - The ranks of the experts that need to be sent.
-        - The ranks of the experts that need to be received.
+        A tuple of two dictionaries mapping expert_id to:
+        - ranks_to_send: The ranks that have this expert and need to send.
+        - ranks_to_recv: The ranks that need to receive this expert.
     """
-    global2rank = partial(
-        global_idx_to_rank,
-        local_cnt=num_local_experts,
-    )
+    ranks_to_send_map: dict[int, list[int]] = {}
+    ranks_to_recv_map: dict[int, list[int]] = {}
+
+    # Fast path: if no experts, return empty dicts
+    if expert_ids.size == 0:
+        return ranks_to_send_map, ranks_to_recv_map
+
+    unique_experts = np.unique(expert_ids)
+    num_positions = len(old_indices)
+    position_indices = np.arange(num_positions, dtype=np.int32)
+
+    # Vectorized approach: find all positions matching any query expert in one pass
+    # Use np.isin to get boolean masks for all relevant positions at once
+    old_relevant_mask = np.isin(old_indices, unique_experts)
+    new_relevant_mask = np.isin(new_indices, unique_experts)
+
+    # Process old_indices (send ranks)
+    if np.any(old_relevant_mask):
+        old_relevant_positions = position_indices[old_relevant_mask]
+        old_relevant_experts = old_indices[old_relevant_mask]
+        old_relevant_ranks = old_relevant_positions // num_local_experts
+
+        # Sort by expert first, then by position (to maintain first-appearance order)
+        sort_order = np.lexsort((old_relevant_positions, old_relevant_experts))
+        sorted_experts = old_relevant_experts[sort_order]
+        sorted_ranks = old_relevant_ranks[sort_order]
+
+        # Find boundaries where expert changes
+        expert_boundaries = np.concatenate(
+            [[0], np.where(np.diff(sorted_experts) != 0)[0] + 1, [len(sorted_experts)]]
+        )
 
-    ranks_to_send: list[int] = []
-    ranks_to_recv: list[int] = []
-
-    for i, e in enumerate(old_indices):
-        if e == idx:
-            rank = global2rank(i)
-            if not ranks_to_send or ranks_to_send[-1] != rank:
-                ranks_to_send.append(rank)
-
-    for i, e in enumerate(new_indices):
-        if e == idx:
-            rank = global2rank(i)
-            if not ranks_to_recv or ranks_to_recv[-1] != rank:
-                ranks_to_recv.append(rank)
-
-    # Remove those ranks that can get this expert locally.
-    ranks_to_send_set = set(ranks_to_send)
-    ranks_to_recv_actual = [
-        rank for rank in ranks_to_recv if rank not in ranks_to_send_set
-    ]
+        # For each expert, extract unique ranks in order of first appearance
+        for i in range(len(expert_boundaries) - 1):
+            start, end = expert_boundaries[i], expert_boundaries[i + 1]
+            expert = int(sorted_experts[start])
+            expert_ranks = sorted_ranks[start:end]
+
+            # Get unique ranks preserving order
+            _, unique_idx = np.unique(expert_ranks, return_index=True)
+            unique_ranks = expert_ranks[np.sort(unique_idx)]
+            ranks_to_send_map[expert] = unique_ranks.tolist()
+
+    # Process new_indices (recv ranks)
+    if np.any(new_relevant_mask):
+        new_relevant_positions = position_indices[new_relevant_mask]
+        new_relevant_experts = new_indices[new_relevant_mask]
+        new_relevant_ranks = new_relevant_positions // num_local_experts
+
+        # Sort by expert first, then by position
+        sort_order = np.lexsort((new_relevant_positions, new_relevant_experts))
+        sorted_experts = new_relevant_experts[sort_order]
+        sorted_ranks = new_relevant_ranks[sort_order]
+
+        # Find boundaries where expert changes
+        expert_boundaries = np.concatenate(
+            [[0], np.where(np.diff(sorted_experts) != 0)[0] + 1, [len(sorted_experts)]]
+        )
 
-    return ranks_to_send, ranks_to_recv_actual
+        # For each expert, extract unique ranks and exclude local copies
+        for i in range(len(expert_boundaries) - 1):
+            start, end = expert_boundaries[i], expert_boundaries[i + 1]
+            expert = int(sorted_experts[start])
+            expert_ranks = sorted_ranks[start:end]
+
+            # Get unique ranks preserving order
+            _, unique_idx = np.unique(expert_ranks, return_index=True)
+            unique_ranks = expert_ranks[np.sort(unique_idx)]
+
+            # Remove ranks that have local copies (in send map)
+            send_ranks_set = set(ranks_to_send_map.get(expert, []))
+            recv_ranks_actual = [
+                int(r) for r in unique_ranks if r not in send_ranks_set
+            ]
+            ranks_to_recv_map[expert] = recv_ranks_actual
+
+    # Handle experts that only appear in old (send only) or new (recv only)
+    for expert in unique_experts:
+        expert = int(expert)
+        if expert not in ranks_to_send_map:
+            ranks_to_send_map[expert] = []
+        if expert not in ranks_to_recv_map:
+            ranks_to_recv_map[expert] = []
+
+    return ranks_to_send_map, ranks_to_recv_map
 
 
 def move_to_buffer(
     num_local_experts: int,
-    old_indices: Sequence[int],
-    new_indices: Sequence[int],
+    old_indices: np.ndarray,
+    new_indices: np.ndarray,
     expert_weights: Iterable[torch.Tensor],
-    expert_weights_buffer: Sequence[torch.Tensor],
+    expert_weights_buffers: Sequence[torch.Tensor],
     cuda_stream: torch.cuda.Stream | None,
     ep_group: ProcessGroup,
-) -> tuple[list[bool], list[bool], dict[int, int]]:
+) -> MoveToBufferResult:
     """
-    Perform expert weights rearrangement of one layer.
+    Rearranges expert weights during EPLB rebalancing.
+
+    Args:
+        num_local_experts: Number of local experts.
+        old_indices: (num_experts_total,) ndarray of current (old)
+            global-to-local expert assignments.
+        new_indices: (num_experts_total,) ndarray of desired (new)
+            global-to-local assignments after rebalance.
+        expert_weights: Original expert weights for the layer.
+        expert_weights_buffers: Intermediate buffers (one per tensor).
+        cuda_stream: CUDA stream for async copies (can be None for sync mode).
+        ep_group: Distributed process group for expert parallel comms.
+
+    Returns:
+        is_unchanged (np.ndarray): (num_local_experts,), True where an expert row
+            is unchanged after rebalance.
+        is_received_locally (np.ndarray): (num_local_experts,), True where a row
+            can be updated from local data.
+        RecvMetadata: Metadata needed for completing remote weight transfers.
     """
+    assert old_indices.shape == new_indices.shape
     ep_rank = ep_group.rank()
-    local2global = partial(
-        idx_local_to_global,
-        local_cnt=num_local_experts,
-        ep_rank=ep_rank,
-    )
 
-    # 0. Do nothing for experts that did not change.
-    is_unchanged = [
-        old_indices[local2global(i)] == new_indices[local2global(i)]
-        for i in range(num_local_experts)
-    ]
+    recv_primary_mask = np.zeros((num_local_experts,), dtype=np.bool_)
+    send_expert_ids = np.full((num_local_experts,), -1, dtype=np.int64)
+    send_src_rows = np.full((num_local_experts,), -1, dtype=np.int32)
+    recv_expert_ids = np.full((num_local_experts,), -1, dtype=np.int64)
+    recv_dst_rows = np.full((num_local_experts,), -1, dtype=np.int32)
 
-    # 1. Perform weight copy inside the local rank.
-    is_received_locally = is_unchanged[:]
-    for src in range(num_local_experts):
-        src_global = local2global(src)
-        for dst in range(num_local_experts):
-            dst_global = local2global(dst)
-            if is_received_locally[dst]:
-                continue
-            if old_indices[src_global] == -1 or new_indices[dst_global] == -1:
-                continue
-            if old_indices[src_global] == new_indices[dst_global]:
-                is_received_locally[dst] = True
-                for weight, buffer in zip(expert_weights, expert_weights_buffer):
-                    with torch.cuda.stream(cuda_stream):
-                        buffer[dst].copy_(weight[src], non_blocking=True)
+    base = ep_rank * num_local_experts
+    local_rows = np.arange(num_local_experts, dtype=np.int32)
+    local_global = base + local_rows
+
+    old_local_expert_ids = old_indices[local_global]
+    new_local_expert_ids = new_indices[local_global]
+
+    # Unchanged mask
+    is_unchanged = old_local_expert_ids == new_local_expert_ids
+
+    # Local receive eligibility
+    new_valid = new_local_expert_ids != -1
+    can_recv_local = np.isin(
+        new_local_expert_ids, old_local_expert_ids, assume_unique=False
+    )
+    is_received_locally = np.logical_or(
+        is_unchanged, np.logical_and(new_valid, can_recv_local)
+    )
+
+    # Send map: first src row per unique expert present locally in old mapping
+    send_count = 0
+    valid_old = old_local_expert_ids != -1
+    if np.any(valid_old):
+        uniq_experts, first_idx = np.unique(
+            old_local_expert_ids[valid_old], return_index=True
+        )
+        filtered_rows = local_rows[valid_old]
+        src_rows = filtered_rows[first_idx]
+        send_count = int(uniq_experts.shape[0])
+        send_expert_ids[:send_count] = uniq_experts
+        send_src_rows[:send_count] = src_rows
+
+    # Recv map: primary dst per unique expert needed remotely
+    recv_count = 0
+    need_recv_mask = np.logical_and(~is_received_locally, new_valid)
+    if np.any(need_recv_mask):
+        desired_experts = new_local_expert_ids[need_recv_mask]
+        desired_dsts = local_rows[need_recv_mask]
+        uniq_recv_experts, uniq_indices = np.unique(desired_experts, return_index=True)
+        dst_rows = desired_dsts[uniq_indices]
+        recv_count = int(uniq_recv_experts.shape[0])
+        recv_expert_ids[:recv_count] = uniq_recv_experts
+        recv_dst_rows[:recv_count] = dst_rows
+        recv_primary_mask[dst_rows] = True
+
+    eligible_local_buffer_mask = np.logical_and(~is_unchanged, is_received_locally)
+
+    # 1. Local moves into tmp buffers
+    if bool(eligible_local_buffer_mask.any()) and send_count > 0:
+        dest_indices = np.nonzero(eligible_local_buffer_mask)[0].tolist()
+        expert_to_src_map = dict(
+            zip(send_expert_ids[:send_count], send_src_rows[:send_count])
+        )
+        for dst in dest_indices:
+            expert = new_local_expert_ids[dst]
+            src_local = expert_to_src_map.get(expert, -1)
+            if src_local != -1:
+                for w, b in zip(expert_weights, expert_weights_buffers):
+                    b[dst].copy_(w[src_local], non_blocking=True)
 
     p2p_ops: list[P2POp] = []
 
-    # 2. Initiate sending of weights.
-    experts_send_loc: dict[int, int] = {}
-    for src in range(num_local_experts):
-        expert = old_indices[local2global(src)]
-        if expert == -1:
-            continue
-        if expert in experts_send_loc:
-            continue
-        experts_send_loc[expert] = src
-
-    # We need to sort here to match send/recv
-    for expert, src in sorted(experts_send_loc.items()):
-        ranks_to_send, ranks_to_recv = get_ep_ranks_with_expert(
-            expert,
+    # Pre-compute global ranks mapping
+    ep_size = ep_group.size()
+    rank_to_global = {rank: get_global_rank(ep_group, rank) for rank in range(ep_size)}
+
+    # 2. Post sends
+    if send_count > 0:
+        experts = send_expert_ids[:send_count]
+        srcs = send_src_rows[:send_count]
+        order = np.argsort(experts, kind="stable")
+        experts = experts[order]
+        srcs = srcs[order]
+
+        send_map, recv_map = get_ep_ranks_with_experts_batch(
+            experts,
             num_local_experts,
             old_indices,
             new_indices,
         )
 
-        # Calculate the ranks to send by this rank
-        num_dst_per_sender = len(ranks_to_recv) // len(ranks_to_send)
-        sender_pos = ranks_to_send.index(ep_rank)
-        recv_begin = sender_pos * num_dst_per_sender
-        recv_end = recv_begin + num_dst_per_sender
-        recv_ranks = ranks_to_recv[recv_begin:recv_end]
-
-        # Tackle remainders
-        remainder_start = len(ranks_to_send) * num_dst_per_sender
-        recver_pos = remainder_start + sender_pos
-        if recver_pos < len(ranks_to_recv):
-            recv_ranks.append(ranks_to_recv[recver_pos])
-
-        for dst in recv_ranks:
-            dst_global = get_global_rank(ep_group, dst)
-            p2p_ops += [
-                P2POp(
-                    torch.distributed.isend,
-                    weight[src],
-                    dst_global,
-                )
-                for weight in expert_weights
-            ]
-
-    # 3. Initiate receiving of weights.
-    experts_recv_loc: dict[int, int] = {}
-    for dst in range(num_local_experts):
-        if is_received_locally[dst]:
-            continue
-        expert = new_indices[local2global(dst)]
-        if expert == -1:
-            continue
-        if expert in experts_recv_loc:
-            continue
-        experts_recv_loc[expert] = dst
-
-    # We need to sort here to match send/recv
-    for expert, dst in sorted(experts_recv_loc.items()):
-        ranks_to_send, ranks_to_recv = get_ep_ranks_with_expert(
-            expert,
+        for expert, src in zip(experts.tolist(), srcs.tolist()):
+            ranks_to_send = send_map[expert]
+            ranks_to_recv = recv_map[expert]
+            if not ranks_to_send or not ranks_to_recv:
+                continue
+            num_dst_per_sender = len(ranks_to_recv) // len(ranks_to_send)
+            sender_pos = ranks_to_send.index(ep_rank)
+            recv_begin = sender_pos * num_dst_per_sender
+            recv_end = recv_begin + num_dst_per_sender
+            recv_ranks = ranks_to_recv[recv_begin:recv_end]
+            remainder_start = len(ranks_to_send) * num_dst_per_sender
+            recver_pos = remainder_start + sender_pos
+            if recver_pos < len(ranks_to_recv):
+                recv_ranks.append(ranks_to_recv[recver_pos])
+            for dst in recv_ranks:
+                dst_global = rank_to_global[dst]
+                p2p_ops += [
+                    P2POp(
+                        torch.distributed.isend,
+                        w[src],
+                        dst_global,
+                    )
+                    for w in expert_weights
+                ]
+
+    # 3. Post recvs
+    if recv_count > 0:
+        experts = recv_expert_ids[:recv_count]
+        dsts = recv_dst_rows[:recv_count]
+        order = np.argsort(experts, kind="stable")
+        experts = experts[order]
+        dsts = dsts[order]
+
+        send_map, recv_map = get_ep_ranks_with_experts_batch(
+            experts,
             num_local_experts,
             old_indices,
             new_indices,
         )
 
-        # Calculate the rank to recv by this rank
-        num_dst_per_sender = len(ranks_to_recv) // len(ranks_to_send)
-        recver_pos = ranks_to_recv.index(ep_rank)
-        remainder_start = len(ranks_to_send) * num_dst_per_sender
-        if recver_pos < remainder_start:
-            src = ranks_to_send[recver_pos // num_dst_per_sender]
-        else:
-            src = ranks_to_send[recver_pos - remainder_start]
-
-        src_global = get_global_rank(ep_group, src)
-        p2p_ops += [
-            P2POp(
-                torch.distributed.irecv,
-                weight[dst],
-                src_global,
-            )
-            for weight in expert_weights_buffer
-        ]
+        for expert, dst in zip(experts.tolist(), dsts.tolist()):
+            ranks_to_send = send_map[expert]
+            ranks_to_recv = recv_map[expert]
+            if not ranks_to_send or not ranks_to_recv:
+                continue
+            num_dst_per_sender = len(ranks_to_recv) // len(ranks_to_send)
+            recver_pos = ranks_to_recv.index(ep_rank)
+            remainder_start = len(ranks_to_send) * num_dst_per_sender
+            if recver_pos < remainder_start:
+                src = ranks_to_send[recver_pos // num_dst_per_sender]
+            else:
+                src = ranks_to_send[recver_pos - remainder_start]
+            src_global = rank_to_global[src]
+            p2p_ops += [
+                P2POp(
+                    torch.distributed.irecv,
+                    b[dst],
+                    src_global,
+                )
+                for b in expert_weights_buffers
+            ]
 
     # 4. Execute the P2P operations. The real communication happens here.
     if p2p_ops and cuda_stream is not None:
@@ -237,38 +342,95 @@ def move_to_buffer(
         for req in reqs:
             req.wait()
     # wait for the communication to finish
-    return is_unchanged, is_received_locally, experts_recv_loc
+    return (
+        is_unchanged,
+        is_received_locally,
+        RecvMetadata(
+            recv_primary_mask=recv_primary_mask,
+            recv_count=recv_count,
+            recv_expert_ids=recv_expert_ids,
+            recv_dst_rows=recv_dst_rows,
+        ),
+    )
 
 
 def move_from_buffer(
     expert_weights: Iterable[torch.Tensor],
-    expert_weights_buffer: list[torch.Tensor],
-    is_unchanged: list[bool],
-    is_received_locally: list[bool],
-    experts_recv_loc: dict[int, int],
-    new_indices: Sequence[int],
-    ep_group: ProcessGroup,
+    expert_weights_buffers: list[torch.Tensor],
+    is_unchanged: np.ndarray,
+    is_received_locally: np.ndarray,
+    recv_metadata: RecvMetadata,
+    new_indices: np.ndarray,
+    ep_rank: int,
 ) -> None:
-    ep_rank = ep_group.rank()
-    num_local_experts = len(is_unchanged)
+    """
+    Copies expert weights from communication buffers back to the target weight tensors
+    after EPLB rebalancing.
+
+    Args:
+        expert_weights: List of the actual MoE layer weights used in the execution.
+        expert_weights_buffers: Intermediate buffers containing the experts weights
+            after the transfer is completed.
+        is_unchanged: (num_local_experts,), True where an expert row is unchanged.
+        is_received_locally: (num_local_experts,), True where a row is updated locally.
+        recv_metadata: RecvMetadata containing remote receive metadata.
+        new_indices: (num_experts_total,) mapping from local rows to desired
+            (possibly global) expert id, after rebalance.
+        ep_rank: Rank of the process in the expert parallel group.
+    """
+    recv_primary_mask = recv_metadata.recv_primary_mask
+    recv_count = recv_metadata.recv_count
+    recv_expert_ids = recv_metadata.recv_expert_ids
+    recv_dst_rows = recv_metadata.recv_dst_rows
+    num_local_experts = is_unchanged.shape[0]
+
+    # Mask for rows to copy back from buffers:
+    # copy if locally received OR remote primary recv
+    copy_mask = np.logical_or(is_received_locally, recv_primary_mask)
+    dest_mask_np = np.logical_and(~is_unchanged, copy_mask)
+    if bool(dest_mask_np.any()):
+        dest_indices = np.nonzero(dest_mask_np)[0].tolist()
+        for dst in dest_indices:
+            for w, b in zip(expert_weights, expert_weights_buffers):
+                w[dst].copy_(b[dst], non_blocking=True)
+
+    if recv_count == 0:
+        return
 
-    local2global = partial(
-        idx_local_to_global, local_cnt=num_local_experts, ep_rank=ep_rank
+    # Duplicate remote received rows to non-primary duplicate dsts
+    base = ep_rank * num_local_experts
+    local_experts = new_indices[base + np.arange(num_local_experts, dtype=np.int32)]
+    duplicate_mask = np.logical_and(
+        np.logical_and(~is_unchanged, ~is_received_locally),
+        np.logical_and(~recv_primary_mask, local_experts != -1),
     )
+    # All received experts are unique in the destination, so no need to copy duplicates
+    if not bool(duplicate_mask.any()):
+        return
 
-    for dst in range(num_local_experts):
-        if is_unchanged[dst]:
-            continue
-        if is_received_locally[dst]:
-            for weight, buffer in zip(expert_weights, expert_weights_buffer):
-                weight[dst].copy_(buffer[dst], non_blocking=True)
-        else:
-            expert = new_indices[local2global(dst)]
-            if expert == -1:
-                continue
-            src = experts_recv_loc[expert]
-            for weight, buffer in zip(expert_weights, expert_weights_buffer):
-                weight[dst].copy_(buffer[src], non_blocking=True)
+    dup_dst_rows = np.nonzero(duplicate_mask)[0]
+    dup_experts = local_experts[dup_dst_rows]
+
+    prim_experts = recv_expert_ids[:recv_count]
+    prim_dsts = recv_dst_rows[:recv_count]
+    order = np.argsort(prim_experts, kind="stable")
+    prim_experts_sorted = prim_experts[order]
+    prim_dsts_sorted = prim_dsts[order]
+    pos = np.searchsorted(prim_experts_sorted, dup_experts)
+    valid = np.logical_and(
+        pos < prim_experts_sorted.shape[0],
+        prim_experts_sorted[np.minimum(pos, prim_experts_sorted.shape[0] - 1)]
+        == dup_experts,
+    )
+    if not bool(valid.any()):
+        return
+
+    matched_dst_rows = dup_dst_rows[valid]
+    matched_src_rows = prim_dsts_sorted[pos[valid]]
+
+    for dst, src in zip(matched_dst_rows.tolist(), matched_src_rows.tolist()):
+        for w in expert_weights:
+            w[dst].copy_(w[src], non_blocking=True)
 
 
 async def transfer_layer(
@@ -281,7 +443,7 @@ async def transfer_layer(
     layer: int = 0,
     cuda_stream: torch.cuda.Stream | None = None,
     rank_mapping: dict[int, int] | None = None,
-) -> tuple[list[bool], list[bool], dict[int, int]]:
+) -> MoveToBufferResult:
     """
     Rearranges the expert weights in place according to the new expert indices.
 
@@ -299,6 +461,13 @@ async def transfer_layer(
         is_profile (bool): If `True`, do not perform any actual weight copy.
             This is used during profile run, where we only perform dummy
             communications to reserve enough memory for the buffers.
+
+    Returns:
+        is_unchanged (np.ndarray): (1, num_local_experts), True where expert
+            is left unchanged.
+        is_received_locally (np.ndarray): (1, num_local_experts), True where expert
+            can be received locally.
+        RecvMetadata: Metadata needed for completing remote weight transfers.
     """
     ep_size = ep_group.size()
     if rank_mapping is not None:
@@ -323,16 +492,19 @@ async def transfer_layer(
     assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
     assert num_physical_experts == ep_size * num_local_physical_experts
 
-    is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
+    old_global_expert_indices_np = old_global_expert_indices.cpu().numpy()
+    new_global_expert_indices_np = new_global_expert_indices.cpu().numpy()
+
+    is_unchanged, is_received_locally, recv_metadata = move_to_buffer(
         num_local_experts=num_local_physical_experts,
-        old_indices=old_global_expert_indices[layer].tolist(),
-        new_indices=new_global_expert_indices[layer].tolist(),
+        old_indices=old_global_expert_indices_np[layer],
+        new_indices=new_global_expert_indices_np[layer],
         expert_weights=expert_weights[layer],
-        expert_weights_buffer=expert_weights_buffer,
+        expert_weights_buffers=expert_weights_buffer,
         cuda_stream=cuda_stream,
         ep_group=ep_group,
     )
-    return is_unchanged, is_received_locally, experts_recv_loc
+    return is_unchanged, is_received_locally, recv_metadata
 
 
 def rearrange_expert_weights_inplace(
@@ -388,19 +560,17 @@ def rearrange_expert_weights_inplace(
     ep_size = ep_group.size()
     assert num_physical_experts == ep_size * num_local_physical_experts
 
-    # A buffer to hold the expert weights in one layer during the exchange.
+    first_layer_weights = list(expert_weights[0])
+    # Buffers to hold the expert weights during the exchange.
     # NOTE: Currently we assume the same weights across different layers
     # have the same shape.
-    expert_weights_buffer = [torch.empty_like(w) for w in expert_weights[0]]
-
+    weights_buffer: list[torch.Tensor] = [
+        torch.empty_like(w) for w in first_layer_weights
+    ]
     if is_profile:
-        # Maximum send size is to send all local experts to all ranks,
-        # So we use a dummy `all_gather` to reserve enough communication buffer
-        for weight, buffer in zip(expert_weights[0], expert_weights_buffer):
-            # A `/dev/null`-like buffer to avoid real memory allocation
+        # Reserve communication buffers via a minimal dummy all_gather on first layer
+        for weight, buffer in zip(expert_weights[0], weights_buffer):
             dummy_recv_buffer = [buffer for _ in range(ep_size)]
-            # NOTE(bowen): Needed this barrier to avoid OOM during actual
-            # execution. I'm not very sure why this is needed
             torch.distributed.barrier()
             all_gather(
                 dummy_recv_buffer,
@@ -409,32 +579,32 @@ def rearrange_expert_weights_inplace(
             )
         return
 
-    old_global_expert_indices_cpu = old_global_expert_indices.cpu()
-    new_global_expert_indices_cpu = new_global_expert_indices.cpu()
-
     # NOTE(bowen): We need this synchronize to run, but I don't know why.
     # If you figure out the reason, please let me know -- thank you!
     torch.cuda.synchronize()
 
-    for layer in range(num_moe_layers):
-        is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
+    old_global_expert_indices_cpu = old_global_expert_indices.cpu().numpy()
+    new_global_expert_indices_cpu = new_global_expert_indices.cpu().numpy()
+
+    for layer_idx in range(num_moe_layers):
+        is_unchanged, is_received_locally, recv_metadata = move_to_buffer(
             num_local_experts=num_local_physical_experts,
-            old_indices=old_global_expert_indices_cpu[layer].tolist(),
-            new_indices=new_global_expert_indices_cpu[layer].tolist(),
-            expert_weights=expert_weights[layer],
-            expert_weights_buffer=expert_weights_buffer,
+            old_indices=old_global_expert_indices_cpu[layer_idx],
+            new_indices=new_global_expert_indices_cpu[layer_idx],
+            expert_weights=expert_weights[layer_idx],
+            expert_weights_buffers=weights_buffer,
             cuda_stream=None,
             ep_group=ep_group,
         )
 
         move_from_buffer(
-            expert_weights=expert_weights[layer],
-            expert_weights_buffer=expert_weights_buffer,
+            expert_weights=expert_weights[layer_idx],
+            expert_weights_buffers=weights_buffer,
             is_unchanged=is_unchanged,
             is_received_locally=is_received_locally,
-            experts_recv_loc=experts_recv_loc,
-            new_indices=new_global_expert_indices[layer].tolist(),
-            ep_group=ep_group,
+            recv_metadata=recv_metadata,
+            new_indices=new_global_expert_indices_cpu[layer_idx],
+            ep_rank=ep_group.rank(),
         )
 
 
@@ -526,4 +696,4 @@ def _map_new_expert_indices_with_rank_mapping(
     return mapped_expert_indices
 
 
-__all__ = ["transfer_layer", "move_from_buffer"]
+__all__ = ["transfer_layer", "move_from_buffer", "RecvMetadata"]
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
index 3b76af75504de23141f51cfc48f31a2ba7fddb81..123af17ef09120ba5462d9d25dc912e88f63b051 100644
--- a/vllm/distributed/kv_events.py
+++ b/vllm/distributed/kv_events.py
@@ -51,8 +51,14 @@ class BlockStored(KVCacheEvent):
     parent_block_hash: ExternalBlockHash | None
     token_ids: list[int]
     block_size: int
+
     lora_id: int | None
+    """Deprecated: use `lora_name` for KV block key hash.
+    Retained for backward compatibility.
+    """
+
     medium: str | None
+    lora_name: str | None
 
     def __hash__(self) -> int:
         return hash(
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 02d9a1ec9599ec5c4f06d4ec6d260f72551955ad..f4113c91b60f6b2de810b3233e51303ab9ee1479 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -179,6 +179,12 @@ KVConnectorFactory.register_connector(
     "MultiConnector",
 )
 
+KVConnectorFactory.register_connector(
+    "MoRIIOConnector",
+    "vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector",
+    "MoRIIOConnector",
+)
+
 KVConnectorFactory.register_connector(
     "OffloadingConnector",
     "vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector",
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index 117d159e25e7168785862ebd41753122df8efb8e..fd833e293938c03402633b37fd00117330ea083c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -4,16 +4,17 @@
 KV cache helper for store.
 """
 
+from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.config import get_current_vllm_config
+from vllm.config import VllmConfig, get_current_vllm_config, get_layers_from_vllm_config
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
 
 if TYPE_CHECKING:
@@ -21,6 +22,8 @@ if TYPE_CHECKING:
 
 logger = init_logger(__name__)
 
+EngineId = str
+
 
 def get_kv_connector_cache_layout():
     # NOTE (NickLucche) When running disaggregated PD with NIXL, HND layout is
@@ -201,6 +204,104 @@ def copy_kv_blocks(
         copy_fn(src_tensor, dst_tensor, src_indices, dst_indices)
 
 
+def kv_postprocess_blksize_on_receive(cache, indices, block_size_ratio):
+    """
+    Transforms the layout of received KV cache blocks to the local block_size.
+    (Only works for local blocksize > remote blocksize)
+
+    example:
+    local blocksize = 16 tokens, remote blocksize = 4 tokens
+    local block[0] = remote block[0, 1, 2, 3]
+    remote is |h0-b0|h1-b0|h2-b0|h3-b0|h0-b1|h1-b1|h2-b1|h3-b1|...
+    local is  |h0-b0..................|h1-b0..................|...
+    permute is to:
+    1. view => view remote as n_blocks * remote_shape(H,remoteN,D)
+    2. permute => (H, nblocks, remoteN, D)
+    3. flatten => (H, localN, D)
+    """
+    blocks_to_update = cache.index_select(0, indices)
+    # use physical order
+    blocks_to_update = blocks_to_update.permute(0, 2, 1, 3)
+    n_kv_heads, block_size, head_size = blocks_to_update.shape[1:]
+    remote_block_size = block_size // block_size_ratio
+    n_blocks = block_size_ratio
+
+    permuted_blocks = (
+        blocks_to_update.reshape(-1, n_blocks, n_kv_heads, remote_block_size, head_size)
+        .permute(0, 2, 1, 3, 4)
+        .flatten(2, 3)
+    )
+    permuted_blocks = permuted_blocks.permute(0, 2, 1, 3)
+    cache.index_copy_(0, indices, permuted_blocks)
+
+
+def kv_postprocess_layout_on_receive(cache, indices):
+    """Transforms the layout of received KV cache blocks to the local format.
+
+    This method corrects layout mismatches from direct memory copies by
+    permuting the tensor dimensions.
+
+    - **Source Layout:** `[num_blocks, n_kv_head, block_size, head_dim]`
+    - **Target Layout:** `[num_blocks, block_size, n_kv_head, head_dim]`
+
+    Implementation:
+    - x = blocks_to_update.reshape(src_shape) # view local kv with sender layout
+    - permuted_blocks = x.permute(*inv_order) # transpose n_kv_heads, block_size
+    - cache.index_copy_(0, indices, permuted_blocks) # copy permuted kv back
+
+    """
+    blocks_to_update = cache.index_select(0, indices)
+    target_shape = list(blocks_to_update.shape)
+    target_shape[0] = -1
+    inv_order = [0, 2, 1, 3]
+    src_shape = tuple(target_shape[i] for i in inv_order)
+    blocks_to_update = cache.index_select(0, indices)
+    permuted_blocks = blocks_to_update.reshape(src_shape).permute(*inv_order)
+    cache.index_copy_(0, indices, permuted_blocks)
+
+
+def kv_postprocess_blksize_and_layout_on_receive(cache, indices, block_size_ratio):
+    """
+    Transforms the layout of received KV cache to the local block_size and HND.
+    (Only works for local blocksize > remote blocksize)
+
+    prefill is HND, smaller block_size
+    decode(local) is NHD, larger block_size
+    """
+    blocks_to_update = cache.index_select(0, indices)
+
+    block_size, n_kv_heads, head_size = blocks_to_update.shape[1:]
+    remote_block_size = block_size // block_size_ratio
+    n_blocks = block_size_ratio
+
+    permuted_blocks = (
+        blocks_to_update.reshape(-1, n_blocks, n_kv_heads, remote_block_size, head_size)
+        .permute(0, 1, 3, 2, 4)
+        .flatten(1, 2)
+    )
+    cache.index_copy_(0, indices, permuted_blocks)
+
+
+def yield_req_data(
+    scheduler_output,
+) -> Iterator[tuple[str, tuple[list[int], ...], bool]]:
+    """
+    Yields:
+        (req_id, new_block_id_groups, preempted)
+    """
+    # new requests
+    for req_data in scheduler_output.scheduled_new_reqs:
+        yield req_data.req_id, req_data.block_ids, False
+
+    # cached requests
+    cached_reqs = scheduler_output.scheduled_cached_reqs
+    yield from zip(
+        cached_reqs.req_ids,
+        cached_reqs.new_block_ids,
+        (req_id in cached_reqs.resumed_req_ids for req_id in cached_reqs.req_ids),
+    )
+
+
 @dataclass
 class TpKVTopology:
     """
@@ -209,12 +310,12 @@ class TpKVTopology:
     """
 
     tp_rank: int
-    remote_tp_size: dict[str, int]
+    remote_tp_size: dict[EngineId, int]
     is_mla: bool
     total_num_kv_heads: int
     attn_backend: type[AttentionBackend]
-    engine_id: str
-    remote_block_size: dict[str, int]
+    engine_id: EngineId
+    remote_block_size: dict[EngineId, int]
 
     def __post_init__(self):
         # Figure out whether the first dimension of the cache is K/V
@@ -228,9 +329,6 @@ class TpKVTopology:
             len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1
         )
 
-        attn_backend = AttentionBackendEnum[self.attn_backend.get_name()]
-        self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS
-
     @property
     def is_kv_layout_blocks_first(self) -> bool:
         return self._is_kv_layout_blocks_first
@@ -238,7 +336,7 @@ class TpKVTopology:
     @property
     def split_k_and_v(self) -> bool:
         # Whether to register regions for K and V separately (when present).
-        return not (self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first)
+        return not (self.is_mla or self.is_kv_layout_blocks_first)
 
     @property
     def tp_size(self) -> int:
@@ -256,18 +354,28 @@ class TpKVTopology:
         Calculate the tensor parallel ratio between local and remote TP.
         We can think of it as the number of local TP workers-per-remote TP
         workers. Local workers will read from the same remote TP worker in
-        groups of size `tp_ratio`.
+        groups of size `tp_ratio`.If remote tp_size > local tp_size, the
+        ratio is flipped (remote_size/local_size) and the returned value is
+        negative.
         """
-        assert self.tp_size % remote_tp_size == 0, (
-            f"Local tensor parallel size {self.tp_size} is not divisible "
-            f"by remote tensor parallel size {remote_tp_size}."
+        if self.tp_size >= remote_tp_size:
+            assert self.tp_size % remote_tp_size == 0, (
+                f"Local tensor parallel size {self.tp_size} is not divisible "
+                f"by remote tensor parallel size {remote_tp_size}."
+            )
+            return self.tp_size // remote_tp_size
+
+        assert remote_tp_size % self.tp_size == 0, (
+            f"Remote tensor parallel size {remote_tp_size} is not divisible "
+            f"by local tensor parallel size {self.tp_size}."
         )
-        return self.tp_size // remote_tp_size
+        # P TP > D TP case, return the ratio as negative
+        return -remote_tp_size // self.tp_size
 
     def block_size_ratio(
         self,
         remote_block_size: int,
-    ) -> float:
+    ) -> int:
         """
         Calculate the block size ratio between local and remote TP.
         """
@@ -279,19 +387,19 @@ class TpKVTopology:
 
     def tp_ratio_from_engine_id(
         self,
-        remote_engine_id: str,
+        remote_engine_id: EngineId,
     ) -> int:
         remote_tp_size = self.remote_tp_size[remote_engine_id]
         return self.tp_ratio(remote_tp_size)
 
     def block_size_ratio_from_engine_id(
         self,
-        remote_engine_id: str,
-    ) -> float:
+        remote_engine_id: EngineId,
+    ) -> int:
         remote_block_size = self.remote_block_size[remote_engine_id]
         return self.block_size_ratio(remote_block_size)
 
-    def is_kv_replicated(self, engine_id: str) -> bool:
+    def is_kv_replicated(self, engine_id: EngineId) -> bool:
         """
         Whether the KV cache is replicated across TP workers due to the
         number of TP workers being greater than the number of KV heads.
@@ -299,24 +407,53 @@ class TpKVTopology:
         tp_size = self.remote_tp_size[engine_id]
         return tp_size // self.total_num_kv_heads >= 1
 
-    def replicates_kv_cache(self, remote_engine_id: str) -> bool:
+    def replicates_kv_cache(self, remote_engine_id: EngineId) -> bool:
         # MLA is always replicated as the hidden dim can't be split.
         return self.is_mla or self.is_kv_replicated(remote_engine_id)
 
-    def get_target_remote_rank(
+    def get_target_remote_ranks(
         self,
         remote_tp_size: int,
-    ) -> int:
+    ) -> list[int]:
         """
         Get the remote TP rank (on P) that the current local TP rank
-        (on D) will read from.
+        (on D) will read from. When remote tp_size > local tp_size, we
+        read from multiple remote ranks.
         """
         tp_ratio = self.tp_ratio(remote_tp_size)
-        return self.tp_rank // tp_ratio
+        if tp_ratio > 0:
+            return [self.tp_rank // tp_ratio]
+
+        # P TP > D TP case, D reads from |tp_ratio| remote workers.
+        tp_ratio = -tp_ratio
+        return [self.tp_rank * tp_ratio + i for i in range(tp_ratio)]
 
-    def get_target_remote_rank_from_engine_id(
+    def get_target_remote_ranks_from_engine_id(
         self,
-        remote_engine_id: str,
-    ) -> int:
+        remote_engine_id: EngineId,
+    ) -> list[int]:
         remote_tp_size = self.remote_tp_size[remote_engine_id]
-        return self.get_target_remote_rank(remote_tp_size)
+        return self.get_target_remote_ranks(remote_tp_size)
+
+
+def get_current_attn_backend(vllm_config: VllmConfig):
+    layer_type = cast(type[Any], AttentionLayerBase)
+    layers = get_layers_from_vllm_config(vllm_config, layer_type, None)
+    if layers:
+        backend = next(iter(layers.values())).get_attn_backend()
+    else:
+        # Fallback for tests, when static_forward_context is empty.
+        logger.debug(
+            "No layers found in the vLLM config. "
+            "Falling back to default attention backend."
+        )
+        from vllm.v1.attention.selector import get_attn_backend
+
+        backend = get_attn_backend(
+            head_size=vllm_config.model_config.get_head_size(),
+            dtype=vllm_config.model_config.dtype,
+            kv_cache_dtype=vllm_config.cache_config.cache_dtype,
+            block_size=vllm_config.cache_config.block_size,
+            use_mla=vllm_config.model_config.use_mla,
+        )
+    return backend
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index c05e5485a835e290113413b77404659d6ff4d250..01b606b28dff95abf15af239ef8631a59d10094e 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -25,6 +25,9 @@ The class provides the following primitives:
 
     Worker-side: runs in each worker, loads/saves KV cache to/from
     the Connector based on the metadata.
+        handle_preemptions() - called if there are preempted requests,
+            before their blocks are overwritten
+
         start_load_kv() - starts loading all KVs (maybe async)
         wait_for_layer_load() - blocks until layer i load is done
 
@@ -38,12 +41,12 @@ The class provides the following primitives:
 import enum
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Iterable
-from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal, Optional
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.outputs import KVConnectorOutput
 
@@ -144,15 +147,15 @@ class KVConnectorMetadata(ABC):  # noqa: B024
 class KVConnectorBase_V1(ABC):
     """
     Base class for KV connectors.
-
-    Attributes:
-        prefer_cross_layer_blocks (bool): Indicates whether this connector
-            prefers KV blocks that hold KV data for all layers (for speeding
-            up KV data transfers).
-            Defaults to False.
     """
 
-    prefer_cross_layer_blocks: ClassVar[bool] = False
+    @property
+    def prefer_cross_layer_blocks(self) -> bool:
+        """
+        Indicates whether this connector prefers KV blocks that hold KV data for all
+        layers, which can speed up KV data transfers. Defaults to False.
+        """
+        return False
 
     def __init__(
         self,
@@ -262,6 +265,13 @@ class KVConnectorBase_V1(ABC):
         """
         return
 
+    def handle_preemptions(self, preempted_req_ids: set[str]):
+        """
+        Handle preempted requests BEFORE their blocks are overwritten.
+        Needed for connectors which use async saves (e.g., OffloadingConnector)
+        """
+        return
+
     @abstractmethod
     def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
         """
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
index e9b2bd392b0efe19325399fa7ede4a402f5c6115..525061fc0087a6bdef42ab73a65917e178135297 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
@@ -36,7 +36,6 @@ from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
     KVConnectorBase_V1,
     KVConnectorRole,
@@ -44,6 +43,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import (
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import AttentionMetadata
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
index 41243fc866b5970dd6ffdb0c488779b3310d9a55..ca2647194cec493a5ec8be7bd30a12aebb89d79a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
@@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any, Optional
 import safetensors
 import torch
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
@@ -16,6 +15,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
 )
 from vllm.logger import init_logger
 from vllm.utils.hashing import safe_hash
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mla.common import MLACommonMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index 17d468fe6c30514520658dbe5542365675f31455..376215e06660b11648e06df3c0a542f9c4225cfa 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Any
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_events import (
     BlockStored,
@@ -19,6 +18,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorRole,
 )
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.outputs import KVConnectorOutput
 
@@ -107,6 +107,22 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
     # ==============================
     # Worker-side methods
     # ==============================
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        """
+        Initialize with the KV caches. Useful for pre-registering the
+        KV Caches in the KVConnector (e.g. for NIXL).
+
+        Args:
+            kv_caches: dictionary of layer names, kv cache
+        """
+        if hasattr(self._lmcache_engine, "register_kv_caches"):
+            self._lmcache_engine.register_kv_caches(kv_caches)
+        else:
+            logger.warning(
+                "LMCache engine does not support register_kv_caches, "
+                "please check and use the latest version"
+            )
+
     def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
         """
         Start loading the KV cache from the connector to vLLM's paged
@@ -218,6 +234,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
                 lora_id=e.lora_id,
                 block_size=e.block_size,
                 medium=e.medium,
+                lora_name=getattr(e, "lora_name", None),
             )
             for e in events
         ]
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py
index 6acfb73997f250896896b48180c7924984209e8f..d865f70bdd877046c2715cb9d37a2d1ae53f17d3 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py
@@ -95,6 +95,10 @@ class LMCacheMPSchedulerAdapter:
             kv_rank: The kv rank used for LMCache keys
             vllm_block_size: The block size used in vLLM
         """
+        logger.warning(
+            "Importing LMCacheMPSchedulerAdapter is deprecated. "
+            "Please update your LMCache to the latest version."
+        )
         self.mq_client = MessageQueueClient(server_url, context)
 
         # Request futures
@@ -147,6 +151,14 @@ class LMCacheMPSchedulerAdapter:
         """
         return self.blocks_in_chunk
 
+    def cleanup_lookup_result(self, request_id: str) -> None:
+        """
+        Clean up lookup future for a finished request to prevent memory leak.
+        Args:
+            request_id: The ID of the finished request.
+        """
+        self.lookup_futures.pop(request_id, None)
+
     # Helper functions
     def _create_key(self, block_hash: bytes) -> IPCCacheEngineKey:
         """Convert a block hash to an IPC cache engine key"""
@@ -168,6 +180,10 @@ class LMCacheMPWorkerAdapter:
         kv_rank: int,
         vllm_block_size: int,
     ):
+        logger.warning(
+            "Importing LMCacheMPWorkerAdapter is deprecated. "
+            "Please update your LMCache to the latest version."
+        )
         self.mq_client = MessageQueueClient(server_url, context)
 
         # Instance id for GPU worker
@@ -262,6 +278,7 @@ class LMCacheMPWorkerAdapter:
     ):
         keys = []
         block_ids = []
+
         for op in ops:
             keys.extend(self._block_hashes_to_keys(op.block_hashes))
             block_ids.extend(op.block_ids)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py
index 0e87dea59d232e024a82ab122b941684546a6c26..1383fc09eb0ad692d918a06f47252eb7242835cd 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py
@@ -6,7 +6,6 @@ import threading
 from typing import TYPE_CHECKING, Union
 
 import torch
-from lmcache.config import LMCacheEngineConfig as Config
 from lmcache.logging import init_logger
 from lmcache.v1.config import LMCacheEngineConfig as V1Config
 
@@ -20,7 +19,7 @@ logger = init_logger(__name__)
 ENGINE_NAME = "vllm-instance"
 
 # Thread-safe singleton storage
-_config_instance: Config | V1Config | None = None
+_config_instance: V1Config | None = None
 _config_lock = threading.Lock()
 
 
@@ -29,7 +28,7 @@ def is_false(value: str) -> bool:
     return value.lower() in ("false", "0", "no", "n", "off")
 
 
-def lmcache_get_or_create_config() -> Config | V1Config:
+def lmcache_get_or_create_config() -> V1Config:
     """Get the LMCache configuration from the environment variable
     `LMCACHE_CONFIG_FILE`. If the environment variable is not set, this
     function will return the default configuration.
@@ -43,16 +42,7 @@ def lmcache_get_or_create_config() -> Config | V1Config:
     if _config_instance is None:
         with _config_lock:
             if _config_instance is None:  # Check again within lock
-                if is_false(os.getenv("LMCACHE_USE_EXPERIMENTAL", "True")):
-                    logger.warning(
-                        "Detected LMCACHE_USE_EXPERIMENTAL is set to False. "
-                        "Using legacy configuration is deprecated and will "
-                        "be remove soon! Please set LMCACHE_USE_EXPERIMENTAL "
-                        "to True."
-                    )
-                    LMCacheEngineConfig = Config  # type: ignore[assignment]
-                else:
-                    LMCacheEngineConfig = V1Config  # type: ignore[assignment]
+                LMCacheEngineConfig = V1Config  # type: ignore[assignment]
 
                 if "LMCACHE_CONFIG_FILE" not in os.environ:
                     logger.warning(
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
index 09af128f3ed7445ee392ddeeb3b8b4854fd37a92..8159832cc342d31103412f56e25da263b2d85669 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@@ -36,7 +36,6 @@ except ImportError:
         PluginLauncher as RuntimePluginLauncher,
     )
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
@@ -54,6 +53,7 @@ from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_
 from vllm.sampling_params import SamplingParams
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import get_kv_cache_torch_dtype
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.version import __version__ as VLLM_VERSION
 
@@ -233,7 +233,10 @@ class RequestTracker:
         elif isinstance(new_block_ids, list):
             pass
         else:
-            raise ValueError(f"Unsupported new_block_ids type {type(new_block_ids)}")
+            raise ValueError(
+                f"Unsupported new_block_ids type {type(new_block_ids)}: "
+                f"should be None[list[int], ...], tuple or list[int]."
+            )
         self.allocated_block_ids.extend(new_block_ids)
 
         # When a request is scheduled again, and the number of new tokens
@@ -782,6 +785,16 @@ class LMCacheConnectorV1Impl:
     ####################
     # Worker side APIs
     ####################
+    @_lmcache_nvtx_annotate
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        logger.info("Registering KV caches")
+        # TODO(chunxiaozheng): `_init_kv_caches_from_forward_context` is
+        #  not called, we should consider removing it.
+        assert len(self.kv_caches) == 0 and len(kv_caches) > 0
+        self.kv_caches = kv_caches
+        if self.lmcache_engine is not None:
+            kvcaches = list(self.kv_caches.values())
+            self.lmcache_engine.post_init(kvcaches=kvcaches)
 
     @_lmcache_nvtx_annotate
     def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
index 78256a6552c22aa978e538fed225ca5387ee1786..629170615dd8bf3c885f0399ad67b7d7a27f19e6 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
@@ -10,22 +10,31 @@ import zmq
 from lmcache.integration.vllm.utils import mla_enabled
 from lmcache.utils import init_logger as lmcache_init_logger
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
     KVConnectorMetadata,
     KVConnectorRole,
 )
-from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration import (
-    LMCacheMPSchedulerAdapter,
-    LMCacheMPWorkerAdapter,
-    LoadStoreOp,
-)
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.outputs import KVConnectorOutput
+from vllm.v1.request import RequestStatus
 from vllm.v1.utils import ConstantList
 
+try:
+    from lmcache.integration.vllm.vllm_multi_process_adapter import (
+        LMCacheMPSchedulerAdapter,
+        LMCacheMPWorkerAdapter,
+        LoadStoreOp,
+    )
+except ImportError:
+    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration import (
+        LMCacheMPSchedulerAdapter,
+        LMCacheMPWorkerAdapter,
+        LoadStoreOp,
+    )
+
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.distributed.kv_events import KVCacheEvent
@@ -211,7 +220,7 @@ class LMCacheMPRequestTracker:
         """
         self.num_stored_blocks += num_new_blocks
 
-    def update_block_ids(
+    def append_block_ids(
         self,
         new_block_ids: list[int],
     ):
@@ -455,10 +464,6 @@ class LMCacheMPConnector(KVConnectorBase_V1):
         metadata = self._get_connector_metadata()
         assert isinstance(metadata, LMCacheMPConnectorMetadata)
 
-        with torch.cuda.stream(torch.cuda.current_stream()):
-            event = torch.cuda.Event(interprocess=True)
-            event.record()
-
         request_ids = []
         ops = []
 
@@ -468,10 +473,14 @@ class LMCacheMPConnector(KVConnectorBase_V1):
             request_ids.append(meta.request_id)
             ops.append(meta.op)
 
-        if len(request_ids) > 0:
-            self.worker_adapter.batched_submit_retrieve_requests(
-                request_ids, ops, event
-            )
+        if len(request_ids) == 0:
+            return
+
+        with torch.cuda.stream(torch.cuda.current_stream()):
+            event = torch.cuda.Event(interprocess=True)
+            event.record()
+
+        self.worker_adapter.batched_submit_retrieve_requests(request_ids, ops, event)
 
     def wait_for_layer_load(self, layer_name: str) -> None:
         """
@@ -518,10 +527,6 @@ class LMCacheMPConnector(KVConnectorBase_V1):
         metadata = self._get_connector_metadata()
         assert isinstance(metadata, LMCacheMPConnectorMetadata)
 
-        with torch.cuda.stream(torch.cuda.current_stream()):
-            event = torch.cuda.Event(interprocess=True)
-            event.record()
-
         request_ids = []
         ops = []
         for meta in metadata.requests:
@@ -530,8 +535,14 @@ class LMCacheMPConnector(KVConnectorBase_V1):
             request_ids.append(meta.request_id)
             ops.append(meta.op)
 
-        if len(request_ids) > 0:
-            self.worker_adapter.batched_submit_store_requests(request_ids, ops, event)
+        if len(request_ids) == 0:
+            return
+
+        with torch.cuda.stream(torch.cuda.current_stream()):
+            event = torch.cuda.Event(interprocess=True)
+            event.record()
+
+        self.worker_adapter.batched_submit_store_requests(request_ids, ops, event)
 
     def get_finished(
         self, finished_req_ids: set[str]
@@ -627,6 +638,9 @@ class LMCacheMPConnector(KVConnectorBase_V1):
             into account.
         """
         tracker = self._get_or_create_request_tracker(request)
+        # TODO: support loading KV for preempted requests in the future
+        if request.status == RequestStatus.PREEMPTED:
+            return 0, False
 
         self.scheduler_adapter.maybe_submit_lookup_request(
             request.request_id, convert_block_hashes_to_bytes(request.block_hashes)
@@ -683,7 +697,7 @@ class LMCacheMPConnector(KVConnectorBase_V1):
 
         # No matter we need to retrieve or not, we need to update
         # the block ids into the tracker
-        tracker.update_block_ids(block_ids)
+        tracker.append_block_ids(block_ids)
 
         # Update the state of the tracker
         condition = tracker.needs_retrieve()
@@ -695,6 +709,8 @@ class LMCacheMPConnector(KVConnectorBase_V1):
                 if condition
                 else LMCacheMPRequestState.READY
             )
+            # Clean up lookup future in scheduler adapter
+            self.scheduler_adapter.cleanup_lookup_result(request.request_id)
 
     def build_connector_meta(
         self, scheduler_output: SchedulerOutput
@@ -748,6 +764,8 @@ class LMCacheMPConnector(KVConnectorBase_V1):
             Optional KVTransferParams to be included in the request outputs
             returned by the engine.
         """
+        # Clean up request tracker to prevent memory leak
+        self._cleanup_request_tracker(request.request_id)
         return True, None
 
     def take_events(self) -> Iterable["KVCacheEvent"]:
@@ -866,7 +884,8 @@ class LMCacheMPConnector(KVConnectorBase_V1):
 
             # Update block ids
             new_block_ids = reformat_block_ids(cached_reqs.new_block_ids[idx])
-            request_tracker.update_block_ids(new_block_ids)
+            if request_id not in cached_reqs.resumed_req_ids:
+                request_tracker.append_block_ids(new_block_ids)
 
             # Update new scheduled tokens
             num_new_tokens = cached_reqs.num_computed_tokens[idx]
@@ -889,7 +908,34 @@ class LMCacheMPConnector(KVConnectorBase_V1):
         self, request: "Request"
     ) -> LMCacheMPRequestTracker:
         request_id = request.request_id
+        # Remove the old trackers that is created before the preemption
+        if (
+            request.status == RequestStatus.PREEMPTED
+            and request_id in self.request_trackers
+        ):
+            tracker = self.request_trackers[request_id]
+
+            # NOTE: since this function may be called multiple times
+            # for a single request (because get_num_new_matched_tokens
+            # may be called multiple times) for the same request, we
+            # will only do the remove if the tracker is not in the "fresh"
+            # state, i.e., PREFETCHING
+            if tracker.state != LMCacheMPRequestState.PREFETCHING:
+                self.request_trackers.pop(request_id)
+
         if request_id not in self.request_trackers:
             new_tracker = LMCacheMPRequestTracker(request)
             self.request_trackers[request_id] = new_tracker
         return self.request_trackers[request_id]
+
+    def _cleanup_request_tracker(self, request_id: str) -> None:
+        """
+        Clean up request tracker and associated lookup future for a request.
+        This should be called when a request is finished to prevent memory leak.
+        """
+        # Clean up request tracker
+        if self.request_trackers.pop(request_id, None):
+            logger.debug(
+                "[KVConnector] Cleaned up request_tracker for request %s",
+                request_id,
+            )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
index 705960aebe2da2c4cc0d59bee394eb8f0ad9c06c..ef0268b9aba0ed2365cffb359307551d73c6b68f 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
@@ -3,7 +3,6 @@
 import asyncio
 import threading
 import time
-import uuid
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -16,10 +15,11 @@ import zmq
 import zmq.asyncio
 
 from vllm import envs
-from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.attention.selector import get_attn_backend
 from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer.kv_connector.utils import TpKVTopology
+from vllm.distributed.kv_transfer.kv_connector.utils import (
+    TpKVTopology,
+    get_current_attn_backend,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
     KVConnectorMetadata,
@@ -33,6 +33,7 @@ from vllm.distributed.parallel_state import (
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
 from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.utils import get_kv_cache_layout
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
@@ -83,28 +84,10 @@ class RecvReqMeta:
 @dataclass
 class SendBlockMeta:
     local_block_ids: list[int]
-    ready: threading.Event
+    ready: asyncio.Event
     expire_time: float = float("inf")
 
 
-@dataclass
-class SendReqMeta:
-    reqs: dict[ReqId, SendBlockMeta]
-    lock: threading.Lock
-
-
-@dataclass
-class FinishedSendReqSet:
-    set: set[ReqId]
-    lock: threading.Lock
-
-
-@dataclass
-class FinishedReceiveReqSet:
-    set: set[ReqId]
-    lock: asyncio.Lock
-
-
 class MooncakeConnectorMetadata(KVConnectorMetadata):
     def __init__(self):
         self.reqs_to_recv: dict[ReqId, RecvReqMeta] = {}
@@ -408,7 +391,13 @@ class MooncakeConnectorWorker:
 
         self.engine = TransferEngine()
         self.hostname = get_ip()
-        ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", "rdma", "")
+        protocol = self.vllm_config.kv_transfer_config.kv_connector_extra_config.get(  # type: ignore[union-attr]
+            "mooncake_protocol", "rdma"
+        )
+        logger.info(
+            "The Mooncake Transfer Engine is using %s as its protocol.", protocol
+        )
+        ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", protocol, "")
         if ret_value != 0:
             raise RuntimeError("Mooncake Transfer Engine initialization failed.")
 
@@ -431,52 +420,59 @@ class MooncakeConnectorWorker:
 
         assert vllm_config.kv_transfer_config
         self.kv_role = vllm_config.kv_transfer_config.kv_role
-        self.num_workers = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
-            "num_workers", 10
+        self.num_sender_workers = (
+            vllm_config.kv_transfer_config.kv_connector_extra_config.get(
+                "num_workers", 10
+            )
         )
+        # Create more tasks than workers to keep the thread pool saturated.
+        # Tasks can await async events, so a surplus (2x is a robust heuristic)
+        # prevents workers from idling.
+        self.num_sender_tasks = self.num_sender_workers * 2
 
         self.kv_caches_base_addr: list[int] = []
         self.device_kv_caches: dict[str, torch.Tensor] = {}
-        self.reqs_need_send: SendReqMeta = SendReqMeta(reqs={}, lock=threading.Lock())
+        self.reqs_need_send: dict[ReqId, SendBlockMeta] = {}
 
         # For kv_both, we will act both prefiller and decoder.
         if self.kv_role != "kv_consumer":
-            # Background thread for sending kvcaches to D.
-            self._mooncake_sender_t: threading.Thread | None = None
-            # Background thread for processing new sending requests.
+            # Background threads for sending kvcaches to D.
             self._sender_executor = ThreadPoolExecutor(
-                max_workers=self.num_workers, thread_name_prefix="vllm-mooncake-sender"
+                max_workers=self.num_sender_workers,
+                thread_name_prefix="vllm-mooncake-sender",
             )
             logger.debug(
-                "Mooncake Prefiller: use %d workers to send kvcaches", self.num_workers
+                "Mooncake Prefiller: use %d workers to send kvcaches",
+                self.num_sender_workers,
             )
+            # An asyncio queue to buffer incoming requests for the sender
+            self.sender_worker_queue = asyncio.Queue[tuple[bytes, bytes]]()
+            self.sender_loop = asyncio.new_event_loop()
+            # Background thread for processing new sending requests.
+            self._sender_listener_t = threading.Thread(
+                target=_async_loop, args=(self.sender_loop,), daemon=True
+            )
+            self._sender_listener_t.start()
+
         if self.kv_role != "kv_producer":
             self.receiver_loop = asyncio.new_event_loop()
             self._mooncake_receiver_t = threading.Thread(
-                target=self._receiver_loop, args=(self.receiver_loop,), daemon=True
+                target=_async_loop, args=(self.receiver_loop,), daemon=True
             )
             self._mooncake_receiver_t.start()
             logger.debug("Mooncake Decoder: start receiver thread")
 
-        self.finished_sending_reqs: FinishedSendReqSet = FinishedSendReqSet(
-            set(), threading.Lock()
-        )
-        self.finished_recving_reqs: FinishedReceiveReqSet = FinishedReceiveReqSet(
-            set(), asyncio.Lock()
-        )
+        self.finished_sending_reqs: set[ReqId] = set()
+        self.finished_recving_reqs: set[ReqId] = set()
 
         self.block_size = vllm_config.cache_config.block_size
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.use_mla = self.model_config.use_mla
 
-        backend = get_attn_backend(
-            self.model_config.get_head_size(),
-            self.model_config.dtype,
-            self.cache_config.cache_dtype,
-            self.block_size,
-            use_mla=self.use_mla,
-        )
+        # Get the attention backend from the first layer
+        # NOTE (NickLucche) models with multiple backends are not supported yet
+        backend = get_current_attn_backend(vllm_config)
         self.backend_name = backend.get_name()
         self.kv_cache_layout = get_kv_cache_layout()
         logger.debug("Detected attention backend %s", self.backend_name)
@@ -493,9 +489,7 @@ class MooncakeConnectorWorker:
             total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
             attn_backend=backend,
         )
-        self._use_pallas = self.kv_topo._use_pallas
 
-        self.zmq_ctx = zmq.Context()
         self.async_zmq_ctx = zmq.asyncio.Context()
         self._encoder = msgspec.msgpack.Encoder()
         self._decoder = msgspec.msgpack.Decoder(MooncakeAgentMetadata)
@@ -505,21 +499,17 @@ class MooncakeConnectorWorker:
 
     def shutdown(self):
         """Cleanup background threads on destruction."""
-        self.zmq_ctx.term()
         self.async_zmq_ctx.term()
         if self.kv_role != "kv_consumer":
             self._sender_executor.shutdown(wait=False)
-            if self._mooncake_sender_t:
-                self._mooncake_sender_t.join()
+            if self.sender_loop.is_running():
+                self.sender_loop.call_soon_threadsafe(self.sender_loop.stop)
+                self._sender_listener_t.join()
         if self.kv_role != "kv_producer" and self.receiver_loop.is_running():
             self.receiver_loop.call_soon_threadsafe(self.receiver_loop.stop)
             self._mooncake_receiver_t.join()
 
-    def _receiver_loop(self, loop: asyncio.AbstractEventLoop):
-        asyncio.set_event_loop(loop)
-        loop.run_forever()
-
-    def _mooncake_sender(
+    async def _mooncake_sender_listener(
         self, ready_event: threading.Event, base_port: int, tp_rank: int
     ):
         """
@@ -527,93 +517,86 @@ class MooncakeConnectorWorker:
         to a thread pool, and sends acknowledgments upon completion.
         """
 
-        frontend_path = make_zmq_path("tcp", self.hostname, base_port + tp_rank)
-        frontend = make_zmq_socket(self.zmq_ctx, frontend_path, zmq.ROUTER)
-        logger.debug("Mooncake sender starting listening on path: %s", frontend_path)
-
-        backend_path = make_zmq_path("inproc", str(uuid.uuid4()))
-        backend = make_zmq_socket(self.zmq_ctx, backend_path, zmq.PULL)
+        path = make_zmq_path("tcp", self.hostname, base_port + tp_rank)
+        sock = make_zmq_socket(self.async_zmq_ctx, path, zmq.ROUTER)
+        logger.debug("Mooncake sender starting listening on path: %s", path)
 
-        poller = zmq.Poller()
-        poller.register(frontend, zmq.POLLIN)
-        poller.register(backend, zmq.POLLIN)
+        # Create async worker tasks that process items from the queue
+        sender_tasks = [
+            asyncio.create_task(self._sender_worker(sock))
+            for _ in range(self.num_sender_tasks)
+        ]
 
         ready_event.set()
 
         try:
             while True:
-                sockets = dict(poller.poll())
-
-                if frontend in sockets:
-                    identity, _, metadata_bytes = frontend.recv_multipart()
-                    self._sender_executor.submit(
-                        self._sender_worker,
-                        identity,
-                        metadata_bytes,
-                        backend_path,
-                    )
-
-                if backend in sockets:
-                    identity, status = backend.recv_multipart()
-                    frontend.send_multipart((identity, b"", status))
-
+                identity, _, metadata_bytes = await sock.recv_multipart()
+                await self.sender_worker_queue.put((identity, metadata_bytes))
         except zmq.ContextTerminated:
             logger.debug("ZMQ context terminated, exiting Mooncake sender thread.")
         except Exception as e:
             logger.error("Error in Mooncake sender thread: %s. Exiting thread.", str(e))
         finally:
-            frontend.close()
-            backend.close()
-
-    def _sender_worker(
-        self, identity: bytes, metadata_bytes: bytes, worker_channel_path: str
-    ):
-        status = TRANS_ERROR
+            # Clean up worker tasks
+            for task in sender_tasks:
+                task.cancel()
+            await asyncio.gather(*sender_tasks, return_exceptions=True)
+            sock.close()
 
-        try:
-            metadata = self._decoder.decode(metadata_bytes)
-            self.send_kv_to_decode(metadata)
-            status = TRANS_DONE
-        except Exception as e:
-            logger.error("Error processing Mooncake handshake: %s", e)
-        finally:
-            pusher = make_zmq_socket(self.zmq_ctx, worker_channel_path, zmq.PUSH)
+    async def _sender_worker(self, sock: zmq.asyncio.Socket):
+        while True:
             try:
-                pusher.send_multipart((identity, status))
-            except zmq.ZMQError as e:
-                logger.warning(
-                    "Internal error, maybe the server is shutting down. Error: %s",
-                    e,
-                )
-            finally:
-                pusher.close()
-
-    def send_kv_to_decode(self, meta: MooncakeAgentMetadata):
+                identity, metadata_bytes = await self.sender_worker_queue.get()
+                try:
+                    metadata = self._decoder.decode(metadata_bytes)
+                    await self.send_kv_to_decode(metadata)
+                    await sock.send_multipart((identity, b"", TRANS_DONE))
+                except Exception as e:
+                    logger.error("Error processing Mooncake xfer request: %s", e)
+                    await sock.send_multipart((identity, b"", TRANS_ERROR))
+                finally:
+                    self.sender_worker_queue.task_done()
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error("Error in _sender_worker: %s", e)
+
+    async def send_kv_to_decode(self, meta: MooncakeAgentMetadata):
         send_reqs: list[tuple[ReqId, SendBlockMeta]] = []
-        with self.reqs_need_send.lock:
-            for req_id in meta.request_ids:
-                send_meta = self.reqs_need_send.reqs.get(req_id)
-                if send_meta is None:
-                    logger.warning("Request %s not found in reqs_need_send", req_id)
-                    return
-                # Mark it as not expired. We will send it now.
-                send_meta.expire_time = float("inf")
-                send_reqs.append((req_id, send_meta))
+        for req_id in meta.request_ids:
+            send_meta = self.reqs_need_send.get(req_id)
+            if send_meta is None:
+                logger.warning("Request %s not found in reqs_need_send", req_id)
+                return
+            # Mark it as not expired. We will send it now.
+            send_meta.expire_time = float("inf")
+            send_reqs.append((req_id, send_meta))
+
+        src_ptrs, dst_ptrs, lengths = await self._build_transfer_params(send_reqs, meta)
+        remote_session = f"{meta.remote_hostname}:{meta.remote_port}"
+        ret_value = await self.sender_loop.run_in_executor(
+            self._sender_executor,
+            self._send_blocks,
+            remote_session,
+            src_ptrs,
+            dst_ptrs,
+            lengths,
+        )
 
-        self._send_blocks(send_reqs, meta)
+        if ret_value != 0:
+            raise RuntimeError(f"Error in batch_transfer_sync_write: {ret_value}")
 
-        with self.reqs_need_send.lock:
-            for req_id in meta.request_ids:
-                del self.reqs_need_send.reqs[req_id]
+        for req_id in meta.request_ids:
+            del self.reqs_need_send[req_id]
 
-        with self.finished_sending_reqs.lock:
-            self.finished_sending_reqs.set.update(meta.request_ids)
+        self.finished_sending_reqs.update(meta.request_ids)
 
-    def _send_blocks(
+    async def _build_transfer_params(
         self,
         send_reqs: list[tuple[ReqId, SendBlockMeta]],
         agent_meta: MooncakeAgentMetadata,
-    ):
+    ) -> tuple[list[int], list[int], list[int]]:
         src_ptrs = []
         dst_ptrs = []
         lengths = []
@@ -626,7 +609,7 @@ class MooncakeConnectorWorker:
         for (req_id, send_meta), remote_block_ids in zip(
             send_reqs, agent_meta.block_ids
         ):
-            send_meta.ready.wait()
+            await send_meta.ready.wait()
 
             num_remote_blocks = len(remote_block_ids)
             if num_remote_blocks == 0:
@@ -665,18 +648,26 @@ class MooncakeConnectorWorker:
                 remote_session,
             )
 
+        return src_ptrs, dst_ptrs, lengths
+
+    def _send_blocks(
+        self,
+        remote_session: str,
+        src_ptrs: list[int],
+        dst_ptrs: list[int],
+        lengths: list[int],
+    ) -> int:
         start_time = time.perf_counter()
         ret_value = self.engine.batch_transfer_sync_write(
             remote_session, src_ptrs, dst_ptrs, lengths
         )
-        if ret_value != 0:
-            raise RuntimeError(f"Error in batch_transfer_sync_write: {ret_value}")
-
-        logger.debug(
-            "Sending to %s done, took %s",
-            remote_session,
-            time.perf_counter() - start_time,
-        )
+        if ret_value == 0:
+            logger.debug(
+                "Sending to %s done, took %s",
+                remote_session,
+                time.perf_counter() - start_time,
+            )
+        return ret_value
 
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         """Register the KV Cache data in mooncake."""
@@ -735,41 +726,63 @@ class MooncakeConnectorWorker:
             return
 
         ready_event = threading.Event()
-        self._mooncake_sender_t = threading.Thread(
-            target=self._mooncake_sender,
-            args=(ready_event, self.side_channel_port, self.tp_rank),
-            daemon=True,
-            name="mooncake_sender",
+        asyncio.run_coroutine_threadsafe(
+            self._mooncake_sender_listener(
+                ready_event, self.side_channel_port, self.tp_rank
+            ),
+            self.sender_loop,
         )
-        self._mooncake_sender_t.start()
         ready_event.wait()  # Wait for listener ZMQ socket to be ready.
 
     async def fetch_finished_recving_reqs(self) -> set[ReqId]:
-        async with self.finished_recving_reqs.lock:
-            finished_recving_reqs = self.finished_recving_reqs.set
-            self.finished_recving_reqs.set = set()
+        finished_recving_reqs = self.finished_recving_reqs
+        self.finished_recving_reqs = set()
         return finished_recving_reqs
 
+    async def fetch_finished_sending_reqs(self) -> set[ReqId]:
+        finished_sending_reqs = self.finished_sending_reqs
+        self.finished_sending_reqs = set()
+
+        # Handle timeout to avoid stranding blocks on remote.
+        now = time.perf_counter()
+        expired_reqs = [
+            req_id
+            for req_id, send_meta in self.reqs_need_send.items()
+            if send_meta.expire_time < now
+        ]
+        for req_id in expired_reqs:
+            logger.warning(
+                "Request %s timed out after %d seconds without "
+                "being sent. Freeing its blocks on the producer side.",
+                req_id,
+                envs.VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT,
+            )
+            del self.reqs_need_send[req_id]
+        if expired_reqs:
+            finished_sending_reqs.update(expired_reqs)
+
+        return finished_sending_reqs
+
     def get_finished(self) -> tuple[set[str] | None, set[str] | None]:
         """
         Get requests that are done sending or recving on this specific worker.
         The scheduler process (via the MultiprocExecutor) will use this output
         to track which workers are done.
         """
-        fut = None
+        recv_fut = None
+        send_fut = None
         if self.kv_role != "kv_producer":
-            fut = asyncio.run_coroutine_threadsafe(
+            recv_fut = asyncio.run_coroutine_threadsafe(
                 self.fetch_finished_recving_reqs(), self.receiver_loop
             )
 
         if self.kv_role != "kv_consumer":
-            with self.finished_sending_reqs.lock:
-                finished_sending_reqs = self.finished_sending_reqs.set
-                self.finished_sending_reqs.set = set()
-        else:
-            finished_sending_reqs = set()
+            send_fut = asyncio.run_coroutine_threadsafe(
+                self.fetch_finished_sending_reqs(), self.sender_loop
+            )
 
-        finished_recving_reqs = fut.result() if fut else set()
+        finished_recving_reqs = recv_fut.result() if recv_fut else set()
+        finished_sending_reqs = send_fut.result() if send_fut else set()
 
         if finished_sending_reqs or finished_recving_reqs:
             logger.debug(
@@ -780,25 +793,6 @@ class MooncakeConnectorWorker:
                 len(finished_recving_reqs),
             )
 
-        # Handle timeout to avoid stranding blocks on remote.
-        now = time.perf_counter()
-        with self.reqs_need_send.lock:
-            expired_reqs = [
-                req_id
-                for req_id, send_meta in self.reqs_need_send.reqs.items()
-                if send_meta.expire_time < now
-            ]
-            for req_id in expired_reqs:
-                logger.warning(
-                    "Request %s timed out after %d seconds without "
-                    "being sent. Freeing its blocks on the producer side.",
-                    req_id,
-                    envs.VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT,
-                )
-                del self.reqs_need_send.reqs[req_id]
-            if expired_reqs:
-                finished_sending_reqs.update(expired_reqs)
-
         return finished_sending_reqs or None, finished_recving_reqs or None
 
     async def receive_kv(self, path: str, req_blocks: list[tuple[str, list[int]]]):
@@ -839,8 +833,7 @@ class MooncakeConnectorWorker:
         finally:
             sock.close()
 
-        async with self.finished_recving_reqs.lock:
-            self.finished_recving_reqs.set.update(req_ids)
+        self.finished_recving_reqs.update(req_ids)
 
         logger.debug("pulling kv_caches for %s finished", req_ids)
 
@@ -860,6 +853,24 @@ class MooncakeConnectorWorker:
 
         return kv_pulls
 
+    async def record_send_reqs(self, metadata: MooncakeConnectorMetadata):
+        for req_id, block_ids in metadata.reqs_to_send.items():
+            if block_ids:
+                # Already gone through request_finished()
+                send_meta = self.reqs_need_send[req_id]
+                send_meta.local_block_ids = block_ids
+                send_meta.expire_time = (
+                    time.perf_counter() + envs.VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT
+                )
+                send_meta.ready.set()
+            else:
+                # From update_state_after_alloc(),
+                # but not reach request_finished() yet
+                self.reqs_need_send[req_id] = SendBlockMeta(
+                    local_block_ids=[],
+                    ready=asyncio.Event(),
+                )
+
     def start_load_kv(self, metadata: MooncakeConnectorMetadata):
         if self.kv_role != "kv_producer":
             kv_pulls = self.group_kv_pull(metadata)
@@ -869,23 +880,9 @@ class MooncakeConnectorWorker:
                 )
 
         if self.kv_role != "kv_consumer":
-            with self.reqs_need_send.lock:
-                for req_id, block_ids in metadata.reqs_to_send.items():
-                    if block_ids:
-                        # Already gone through request_finished()
-                        send_meta = self.reqs_need_send.reqs[req_id]
-                        send_meta.local_block_ids = block_ids
-                        send_meta.ready.set()
-                        send_meta.expire_time = (
-                            time.perf_counter()
-                            + envs.VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT
-                        )
-                    else:
-                        # From update_state_after_alloc(),
-                        # but not reach request_finished() yet
-                        self.reqs_need_send.reqs[req_id] = SendBlockMeta(
-                            local_block_ids=[], ready=threading.Event()
-                        )
+            asyncio.run_coroutine_threadsafe(
+                self.record_send_reqs(metadata), self.sender_loop
+            )
 
 
 def group_concurrent_contiguous(
@@ -909,6 +906,11 @@ def get_mooncake_side_channel_port(vllm_config: VllmConfig) -> int:
     # This logic is now centralized
     return (
         envs.VLLM_MOONCAKE_BOOTSTRAP_PORT
-        + vllm_config.parallel_config.data_parallel_rank
+        + vllm_config.parallel_config.data_parallel_index
         * vllm_config.parallel_config.tensor_parallel_size
     )
+
+
+def _async_loop(loop: asyncio.AbstractEventLoop):
+    asyncio.set_event_loop(loop)
+    loop.run_forever()
diff --git a/vllm/attention/layers/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/__init__.py
similarity index 100%
rename from vllm/attention/layers/__init__.py
rename to vllm/distributed/kv_transfer/kv_connector/v1/moriio/__init__.py
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..026e7faf5761658c5482c86aae0e3085bc6ebbaf
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
@@ -0,0 +1,321 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
+import threading
+import time
+from collections.abc import Iterator
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+import msgspec
+import torch
+import zmq
+
+from vllm import envs
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorMetadata,
+)
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.logger import init_logger
+from vllm.utils.network_utils import (
+    get_ip,
+    get_open_port,
+    make_zmq_socket,
+)
+
+if TYPE_CHECKING:
+    pass
+
+from dataclasses import field
+from enum import Enum
+
+logger = init_logger(__name__)
+
+
+Transfer = tuple[int, float]
+EngineId = str
+ReqId = str
+
+
+@dataclass
+class WriteTask:
+    request_id: str
+    dst_engine_id: str
+    local_block_ids: list[int]
+    remote_block_ids_hint: list[int] | None
+    layer_name: str
+    event: torch.cuda.Event
+    remote_notify_port: int
+    remote_ip: str
+    enqueue_time: float = field(default_factory=time.perf_counter)
+    retried: int = 0
+
+
+@dataclass
+class LayerTransferPlan:
+    """Plan for transferring a single layer."""
+
+    request_id: str
+    layer_name: str
+    sess_idx: int
+    transfer_local_offsets: list[int]
+    transfer_remote_offsets: list[int]
+    transfer_sizes: list[int]
+    use_batch: bool = True
+
+
+@dataclass
+class RemoteAllocInfo:
+    """Information about remote block allocation."""
+
+    block_ids: list[int]
+    writes_done: int = 0
+    decode_dp_rank: int = 0
+    transfer_offset: tuple[list[int], list[int], list[int]] | None = None
+
+
+class ROLE(Enum):
+    PRODUCER = "producer"
+    CONSUMER = "consumer"
+    NOTINIT = "notinit"
+
+
+class MoRIIOAgentMetadata(
+    msgspec.Struct,
+    omit_defaults=True,  # type: ignore[call-arg]
+    # required for @cached_property.d
+    dict=True,
+):
+    engine_id: str
+    agent_metadata: bytes
+    kv_caches_base_addr: list[int]
+    num_blocks: int
+    block_len: int
+    attn_backend_name: str
+
+
+class RoleManager:
+    """Manages role state across the connector."""
+
+    _instance: Optional["RoleManager"] = None
+    _lock = threading.Lock()
+
+    def __init__(self) -> None:
+        self._role: ROLE = ROLE.NOTINIT
+
+    @classmethod
+    def get_instance(cls) -> "RoleManager":
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = cls()
+        return cls._instance
+
+    def set_role(self, role: ROLE) -> None:
+        """Set the current role."""
+        with self._lock:
+            self._role = role
+
+    def get_role(self) -> ROLE:
+        """Get the current role."""
+        return self._role
+
+
+def set_role(role: ROLE):
+    """Set the global role."""
+    RoleManager.get_instance().set_role(role)
+
+
+def get_role() -> ROLE:
+    """Get the global role."""
+    return RoleManager.get_instance().get_role()
+
+
+class MoRIIOMode(Enum):
+    READ = "read"
+    WRITE = "write"
+
+
+class MoRIIOError(Exception):
+    """Base exception for MoRIIO operations."""
+
+    pass
+
+
+class HandshakeError(MoRIIOError):
+    """Exception raised when handshake fails."""
+
+    pass
+
+
+class TransferError(MoRIIOError):
+    """Exception raised when transfer fails."""
+
+    pass
+
+
+def get_moriio_mode() -> MoRIIOMode:
+    read_mode = envs.VLLM_MORIIO_CONNECTOR_READ_MODE
+    logger.debug("MoRIIO Connector read_mode: %s", read_mode)
+    if read_mode:
+        return MoRIIOMode.READ
+    else:
+        return MoRIIOMode.WRITE
+
+
+def get_port_offset(dp_rank: int, tp_rank: int, tp_size: int = 1) -> int:
+    return (dp_rank) * tp_size + tp_rank
+
+
+@dataclass
+class MoRIIOConfig:
+    local_ip: str
+    local_kv_port: int
+    proxy_ip: str
+    local_ping_port: int
+    proxy_ping_port: int
+    http_port: int
+    handshake_port: int
+    notify_port: int
+    tp_rank: int
+    dp_rank: int
+    dp_size: int
+    tp_size: int
+
+    @classmethod
+    def from_vllm_config(cls, vllm_config: VllmConfig) -> "MoRIIOConfig":
+        # Port Configuration:
+        # local_ping_port   -> Outgoing heartbeat to proxy
+        # proxy_ping_port   -> Remote proxy's heartbeat ingress port
+        # http_port         -> Instance's HTTP service endpoint
+        # local_kv_port     -> service port for mori engine
+        # notify_port       -> For synchronizing stages between prefill and decode
+        # handshake_port    -> For initial handshake between mori engine
+
+        # TODO : merge notify_port and handshake_port to simplify port management
+        #        supports non-contiguous ports
+        assert vllm_config.kv_transfer_config is not None, (
+            "kv_transfer_config must be set for MoRIIOConnector"
+        )
+        kv_transfer_config = vllm_config.kv_transfer_config
+        extra_config = kv_transfer_config.kv_connector_extra_config
+        tp_rank = get_tensor_model_parallel_rank()
+        dp_rank = vllm_config.parallel_config.data_parallel_rank
+        base_notify_port = int(extra_config["notify_port"])
+        dp_size = vllm_config.parallel_config.data_parallel_size
+        tp_size = get_tensor_model_parallel_world_size()
+        port_offset = get_port_offset(dp_rank, tp_rank)
+
+        return cls(
+            local_ip=get_ip(),
+            local_kv_port=get_open_port(),
+            proxy_ip=extra_config["proxy_ip"],
+            local_ping_port=get_open_port(),
+            proxy_ping_port=int(extra_config["proxy_ping_port"]),
+            http_port=int(extra_config["http_port"]),
+            handshake_port=int(extra_config["handshake_port"]),
+            notify_port=base_notify_port + port_offset,
+            tp_rank=tp_rank,
+            dp_rank=dp_rank,
+            dp_size=dp_size,
+            tp_size=tp_size,
+        )
+
+
+class MoRIIOConstants:
+    """Constants for MoRIIO connector."""
+
+    # ZMQ message types
+    GET_META_MSG = b"get_meta_msg"
+    POP_DONE_RECV = b"pop_done_recv"
+    OVER = b"OVER"
+    COMPLETION_PREFIX = "cmpl"
+
+    PING_INTERVAL = 5
+    MAX_PING_RETRIES = 100
+    DEFAULT_HANDSHAKE_PORT = "6301"
+    DEFAULT_NOTIFY_PORT = "61005"
+
+    VLLM_MORI_READ_ABORT_REQUEST_TIMEOUT = 3600
+
+
+@dataclass
+class ReqMeta:
+    """Metadata for a single request."""
+
+    local_block_ids: list[int]
+    remote_block_ids: list[int]
+    remote_host: str
+    remote_port: int
+    remote_handshake_port: int
+    remote_notify_port: int
+    remote_engine_id: str
+    tp_size: int
+    remote_dp_size: int
+
+
+class MoRIIOConnectorMetadata(KVConnectorMetadata):
+    def __init__(self):
+        self.reqs_to_recv: dict[ReqId, ReqMeta] = {}
+        self.reqs_to_save: dict[ReqId, ReqMeta] = {}
+        self.reqs_to_send: dict[ReqId, float] = {}
+
+    def __repr__(self):
+        return_str = ""
+        for req_id, req_meta in self.reqs_to_recv.items():
+            return_str += (
+                f"{req_id = },{req_meta.local_block_ids = },"
+                f"{req_meta.remote_host = },{req_meta.remote_port = }"
+                f"{req_meta.remote_engine_id = },{req_meta.tp_size = }"
+            )
+        return_str = f"MoRIIOConnectorMetadata:reqs_to_recv:{return_str},"
+
+        for req_id, expiry in self.reqs_to_send.items():
+            return_str += f"{req_id = },{expiry = }"
+        return_str = f"MoRIIOConnectorMetadata:reqs_to_send:{return_str},"
+        return return_str
+
+    def add_new_req(
+        self,
+        request_id: ReqId,
+        local_block_ids: list[int],
+        kv_transfer_params: dict[str, Any],
+        write_mode=False,
+    ):
+        _req = ReqMeta(
+            local_block_ids=local_block_ids,
+            remote_block_ids=kv_transfer_params["remote_block_ids"],
+            remote_engine_id=kv_transfer_params["remote_engine_id"],
+            remote_host=kv_transfer_params["remote_host"],
+            remote_port=kv_transfer_params["remote_port"],
+            remote_handshake_port=kv_transfer_params["remote_handshake_port"],
+            remote_notify_port=kv_transfer_params["remote_notify_port"],
+            tp_size=kv_transfer_params.get("tp_size", 1),
+            remote_dp_size=kv_transfer_params.get("remote_dp_size", 1),
+        )
+        if write_mode:
+            self.reqs_to_save[request_id] = _req
+        else:
+            self.reqs_to_recv[request_id] = _req
+
+
+@contextlib.contextmanager
+def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
+    """Context manager for a ZMQ socket"""
+
+    if socket_type not in (zmq.ROUTER, zmq.REQ, zmq.DEALER):
+        raise ValueError(f"Unexpected socket type: {socket_type}")
+
+    ctx: zmq.Context | None = None
+    try:
+        ctx = zmq.Context()  # type: ignore[attr-defined]
+        yield make_zmq_socket(
+            ctx=ctx, path=addr, socket_type=socket_type, bind=socket_type == zmq.ROUTER
+        )
+    finally:
+        if ctx is not None:
+            ctx.destroy(linger=0)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..abdbeb9e416e1db5d18b09cf91e21115fc47d15e
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
@@ -0,0 +1,1515 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+import math
+import queue
+import threading
+import time
+from collections import defaultdict
+from concurrent.futures import Future, ThreadPoolExecutor
+from typing import TYPE_CHECKING, Any, Optional
+
+import msgpack
+import msgspec
+import numpy as np
+import torch
+import zmq
+
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_common import (
+    ROLE,
+    EngineId,
+    HandshakeError,
+    MoRIIOAgentMetadata,
+    MoRIIOConfig,
+    MoRIIOConnectorMetadata,
+    MoRIIOConstants,
+    MoRIIOMode,
+    ReqId,
+    ReqMeta,
+    WriteTask,
+    get_moriio_mode,
+    get_port_offset,
+    get_role,
+    set_role,
+    zmq_ctx,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine import (
+    MoRIIOWrapper,
+    MoRIIOWriter,
+)
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    get_world_group,
+)
+from vllm.forward_context import ForwardContext
+from vllm.logger import init_logger
+from vllm.utils.network_utils import (
+    get_ip,
+    make_zmq_path,
+    make_zmq_socket,
+)
+from vllm.v1.attention.selector import get_attn_backend
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.request import RequestStatus
+
+if TYPE_CHECKING:
+    from vllm.v1.attention.backend import AttentionMetadata
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+try:
+    from mori.io import (
+        BackendType,
+        IOEngine,
+        IOEngineConfig,
+    )
+
+    logger.info("MoRIIO is available")
+    MoRIIO_enabled = True
+except ImportError:
+    logger.error("MoRIIO is not available")
+    MoRIIO_enabled = False
+
+
+def is_moriio_available() -> bool:
+    return MoRIIO_enabled
+
+
+class MoRIIOConnector(KVConnectorBase_V1):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(vllm_config, role)
+        assert vllm_config.kv_transfer_config is not None, (
+            "kv_transfer_config must be set for MoRIIOConnector"
+        )
+
+        self.kv_transfer_config = vllm_config.kv_transfer_config
+        self._set_port_defaults(vllm_config)
+
+        self.engine_id = (
+            str(get_ip())
+            + ":"
+            + str(self.kv_transfer_config.kv_connector_extra_config["handshake_port"])
+        )
+        self.mode = get_moriio_mode()
+        if role == KVConnectorRole.SCHEDULER:
+            self.connector_scheduler: MoRIIOConnectorScheduler | None = (
+                MoRIIOConnectorScheduler(vllm_config, self.engine_id)
+            )
+            self.connector_worker: MoRIIOConnectorWorker | None = None
+        elif role == KVConnectorRole.WORKER:
+            self.connector_scheduler = None
+            self.connector_worker = MoRIIOConnectorWorker(vllm_config, self.engine_id)
+        logger.info(
+            "Initialized MoRIIO Connector,engine_id:%s,role: %s",
+            self.engine_id,
+            role.value,
+        )
+
+    ############################################################
+    # Scheduler Side Methods
+    ############################################################
+
+    def _set_port_defaults(self, vllm_config: VllmConfig):
+        assert vllm_config.kv_transfer_config is not None, (
+            "kv_transfer_config must be set for MoRIIOConnector"
+        )
+        kv_transfer_config = vllm_config.kv_transfer_config
+        extra_config = kv_transfer_config.kv_connector_extra_config
+
+        if "handshake_port" not in extra_config or not extra_config["handshake_port"]:
+            extra_config["handshake_port"] = MoRIIOConstants.DEFAULT_HANDSHAKE_PORT
+
+        if "notify_port" not in extra_config or not extra_config["notify_port"]:
+            extra_config["notify_port"] = MoRIIOConstants.DEFAULT_NOTIFY_PORT
+
+    def get_num_new_matched_tokens(
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int, bool]:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.get_num_new_matched_tokens(
+            request, num_computed_tokens
+        )
+
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.update_state_after_alloc(
+            request, blocks, num_external_tokens, self.connector_worker
+        )
+
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> KVConnectorMetadata:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.build_connector_meta(scheduler_output)
+
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.request_finished(request, block_ids)
+
+    ############################################################
+    # Worker Side Methods
+    ############################################################
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        assert self.connector_worker is not None
+        self.connector_worker.register_kv_caches(kv_caches)
+
+    def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
+        """Get the finished recving and sending requests."""
+        assert self.connector_worker is not None
+        return self.connector_worker.get_finished()
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
+        assert self.connector_worker is not None
+        if self.mode == MoRIIOMode.WRITE and get_role() == ROLE.CONSUMER:
+            self.connector_worker.moriio_wrapper.async_wait_reqid()
+
+        assert isinstance(self._connector_metadata, MoRIIOConnectorMetadata)
+        self.connector_worker.start_load_kv(self._connector_metadata)
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        pass
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
+        # Only producer/prefill saves KV Cache
+        if get_role() == ROLE.CONSUMER:
+            return
+        assert self.connector_worker is not None, (
+            "save_kv_layer called on scheduler role"
+        )
+
+        assert isinstance(self._connector_metadata, MoRIIOConnectorMetadata), (
+            "Connector metadata not initialized yet"
+        )
+        self.connector_worker.save_kv_layer(
+            self._connector_metadata, layer_name, kv_layer, attn_metadata, **kwargs
+        )
+
+        return None
+
+    def wait_for_save(self):
+        pass
+
+    def shutdown(self):
+        if self.connector_worker is not None:
+            self.connector_worker.shutdown()
+        if self.connector_scheduler is not None:
+            self.connector_scheduler.shutdown()
+
+    def has_connector_metadata(self) -> bool:
+        """Check whether the connector metadata is currently set.
+
+        Returns:
+            bool: True if connector metadata exists, False otherwise.
+        """
+        try:
+            return self._connector_metadata is not None
+        except AttributeError:
+            return False
+
+
+class MoRIIOConnectorScheduler:
+    """Implementation of Scheduler side methods"""
+
+    def __init__(self, vllm_config: VllmConfig, engine_id: str):
+        self.vllm_config = vllm_config
+
+        assert vllm_config.kv_transfer_config is not None, (
+            "kv_transfer_config must be set for MoRIIOConnector"
+        )
+        self.kv_transfer_config = vllm_config.kv_transfer_config
+        self.block_size = vllm_config.cache_config.block_size
+        self.engine_id: EngineId = engine_id
+        self.mode = get_moriio_mode()
+        self.host_ip = get_ip()
+        self.handshake_port = self.kv_transfer_config.kv_connector_extra_config[
+            "handshake_port"
+        ]
+        logger.info("Initializing MoRIIO Scheduler engine_id = %s", engine_id)
+
+        self.side_notify_port = self.kv_transfer_config.kv_connector_extra_config[
+            "notify_port"
+        ]
+        self.tp_size = self.vllm_config.parallel_config.tensor_parallel_size
+        self.dp_rank = self.vllm_config.parallel_config.data_parallel_rank
+        self.is_producer = self.kv_transfer_config.kv_role == "kv_producer"
+        # Requests that need to start recv/send.
+        # New requests are added by update_state_after_alloc in
+        # the scheduler. Used to make metadata passed to Worker.
+        self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}
+        self._reqs_need_save: dict[ReqId, tuple[Request, list[int]]] = {}
+
+        # For chunked prefill, we perform layer-wise access within the final chunk.
+        # TODO: Perform transfer at end chunk.
+        self._reqs_need_pending_save: dict[ReqId, tuple[Request, list[int]]] = {}
+
+        if self.is_producer:
+            set_role(ROLE.PRODUCER)
+        else:
+            set_role(ROLE.CONSUMER)
+        # Reqs to send and their expiration time
+        self._reqs_need_send: dict[ReqId, float] = {}
+        self.paths: dict[str, zmq.Socket] = {}
+
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> tuple[int, bool]:
+        """
+        For remote prefill, pull all prompt blocks from remote
+        asynchronously relative to engine execution.
+
+        Args:
+            request (Request): the request object.
+            num_computed_tokens (int): the number of locally
+                computed tokens for this request
+        Returns:
+            * the number of tokens that can be loaded from the
+              external KV cache beyond what is already computed.
+            * true if the external KV cache tokens will be loaded
+              asynchronously (between scheduler steps).
+        """
+        if self.is_producer:
+            return 0, False
+
+        token_ids = request.prompt_token_ids or []
+        if self.mode == MoRIIOMode.WRITE:
+            # MoriiO in write mode, no remote prefill
+
+            return len(token_ids) - num_computed_tokens, True
+
+        return len(token_ids) - 1 - num_computed_tokens, False
+
+    def send_notify_block(
+        self, req_id: str, block_notify_list: list[int], host=None, port=None
+    ):
+        path = make_zmq_path("tcp", host, port)
+        if path not in self.paths:
+            ctx = zmq.Context.instance()
+            sock = make_zmq_socket(
+                ctx=ctx, path=path, socket_type=zmq.DEALER, bind=False
+            )
+            self.paths[path] = sock
+
+        data = {
+            "req_id": req_id,
+            "block_notify_list": block_notify_list or [],
+            "decode_rank": self.dp_rank,
+            "type": "remote_blocks",
+        }
+        serialized_data = msgpack.dumps(data)
+        self.paths[path].send(serialized_data)
+
+    def update_state_after_alloc(
+        self,
+        request: "Request",
+        blocks: "KVCacheBlocks",
+        num_external_tokens: int,
+        connector_worker: Optional["MoRIIOConnectorWorker"] = None,
+    ):
+        params = request.kv_transfer_params
+        if not params:
+            return
+        if params.get("do_remote_decode"):
+            local_block_ids = blocks.get_block_ids()[0]
+            self._reqs_need_save[request.request_id] = (request, local_block_ids)
+
+        if params is not None and params.get("do_remote_prefill"):
+            if self.mode == MoRIIOMode.READ:
+                if remote_block_ids := params.get("remote_block_ids"):
+                    if all(
+                        p in params
+                        for p in ("remote_engine_id", "remote_host", "remote_port")
+                    ):
+                        # If remote_blocks and num_external_tokens = 0, we
+                        # a full prefix cache hit on the D worker. We need to call
+                        # send_notif in _read_blocks to free the memory on the P.
+
+                        # Get unhashed blocks to pull from remote.
+                        local_block_ids = blocks.get_block_ids()[0]
+                        assert len(local_block_ids) <= len(remote_block_ids)
+                        if len(local_block_ids) == len(remote_block_ids):
+                            pass
+                        else:
+                            local_block_ids = remote_block_ids[-len(local_block_ids) :]
+
+                        self._reqs_need_recv[request.request_id] = (
+                            request,
+                            local_block_ids,
+                        )
+                    else:
+                        logger.warning(
+                            "Got invalid KVTransferParams: %s. This "
+                            "request will not utilize KVTransfer",
+                            params,
+                        )
+
+            else:
+                assert request.kv_transfer_params is not None, (
+                    "kv_transfer_params should not be None"
+                )
+
+                remote_dp_rank = request.kv_transfer_params.get("remote_dp_rank", 0)
+
+                for tp_index in range(self.tp_size):
+                    target_port = request.kv_transfer_params[
+                        "remote_notify_port"
+                    ] + get_port_offset(remote_dp_rank, tp_index)
+
+                    self.send_notify_block(
+                        req_id=request.request_id,
+                        block_notify_list=blocks.get_block_ids()[0],
+                        host=params.get("remote_host"),
+                        port=target_port,
+                    )
+
+            # Only trigger 1 KV transfer per request.
+
+            params["do_remote_prefill"] = False
+
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> KVConnectorMetadata:
+        meta = MoRIIOConnectorMetadata()
+
+        if self.mode == MoRIIOMode.WRITE:
+            # when async_load_kv finished,
+            # new reqs will be added to scheduler_output.scheduled_new_reqs
+
+            if get_role() == ROLE.CONSUMER:
+                for new_req in scheduler_output.scheduled_new_reqs:
+                    red_id = new_req.req_id
+                    local_block_ids = list(new_req.block_ids)[0]
+                    assert new_req.sampling_params is not None, (
+                        f"sampling_params is None for req {new_req.req_id}"
+                    )
+                    assert hasattr(new_req.sampling_params, "extra_args"), (
+                        f"sampling_params missing extra_args for req {new_req.req_id}"
+                    )
+                    kv_transfer_params = (
+                        new_req.sampling_params.extra_args.get("kv_transfer_params", {})
+                        if new_req.sampling_params.extra_args
+                        else {}
+                    )
+                    meta.add_new_req(
+                        red_id,
+                        local_block_ids,
+                        kv_transfer_params,
+                    )
+            if get_role() == ROLE.PRODUCER:
+                # This is the logic for checking against chunked prefill.
+                # When the last chunk is identified,
+                # It places the request metadata into the saving queue.
+
+                for i, req_id in enumerate(
+                    scheduler_output.scheduled_cached_reqs.req_ids
+                ):
+                    new_block_ids = (
+                        scheduler_output.scheduled_cached_reqs.new_block_ids[i]
+                    )
+
+                    if new_block_ids is not None:
+                        block_ids = new_block_ids[0]
+                        # TODO : hybrid attn, etc
+                        req, existing_blocks = self._reqs_need_pending_save[req_id]
+                        updated_blocks = list(existing_blocks) + (block_ids)
+                        self._reqs_need_pending_save[req_id] = (req, updated_blocks)
+                        if (
+                            len(self._reqs_need_pending_save[req_id][1])
+                            * self.block_size
+                            >= req.num_prompt_tokens
+                        ):
+                            meta.add_new_req(
+                                request_id=req_id,
+                                local_block_ids=self._reqs_need_pending_save[req_id][1],
+                                kv_transfer_params=req.kv_transfer_params or {},
+                                write_mode=True,
+                            )
+                            del self._reqs_need_pending_save[req_id]
+
+        # Loop through scheduled reqs and convert to ReqMeta.
+        for req_id, (req, block_ids) in self._reqs_need_recv.items():
+            assert req.kv_transfer_params is not None
+            meta.add_new_req(
+                request_id=req_id,
+                local_block_ids=block_ids,
+                kv_transfer_params=req.kv_transfer_params,
+            )
+
+        for req_id, (req, block_ids) in self._reqs_need_save.items():
+            assert req.kv_transfer_params is not None
+            if req.num_prompt_tokens > len(block_ids) * self.block_size:
+                # not last chunk prefill
+                self._reqs_need_pending_save[req_id] = (req, block_ids)
+                continue
+            meta.add_new_req(
+                request_id=req_id,
+                local_block_ids=block_ids,
+                kv_transfer_params=req.kv_transfer_params,
+                write_mode=True,
+            )
+        # Clear the list once workers start the transfers
+
+        meta.reqs_to_send = self._reqs_need_send
+
+        self._reqs_need_recv.clear()
+        self._reqs_need_save.clear()
+        self._reqs_need_send = {}
+
+        return meta
+
+    def shutdown(self):
+        for path, sock in self.paths.items():
+            try:
+                sock.close(linger=0)
+                logger.debug("Closed ZMQ socket for path: %s", path)
+            except Exception as e:
+                logger.warning("Error closing ZMQ socket for path %s: %s", path, e)
+        self.paths.clear()
+
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Once a request is finished, determine whether request blocks
+        should be freed now or will be sent asynchronously and freed later.
+        """
+
+        params = request.kv_transfer_params
+        logger.debug(
+            "MoriioConnector request_finished, request_status=%s, "
+            "kv_transfer_params=%s",
+            request.status,
+            params,
+        )
+        if not params:
+            return False, None
+
+        if params.get("do_remote_prefill"):
+            # If do_remote_prefill is still True when the request is finished,
+            # update_state_after_alloc must not have been called (the request
+            # must have been aborted before it was scheduled).
+            # To avoid stranding the prefill blocks in the prefill instance,
+            # we must add empty block_ids to _reqs_need_recv so that our
+            # worker side will notify and free blocks in the prefill instance.
+            self._reqs_need_recv[request.request_id] = (request, [])
+            params["do_remote_prefill"] = False
+            return False, None
+
+        if (
+            not params.get("do_remote_decode")
+            or request.status != RequestStatus.FINISHED_LENGTH_CAPPED
+        ):
+            return False, None
+
+        # computed_block_ids = block_ids if all_full else block_ids[:-1]
+        computed_block_ids = block_ids
+        # If prompt < block_size, no xfer so free blocks immediately.
+        delay_free_blocks = len(computed_block_ids) > 0
+
+        if delay_free_blocks:
+            # Prefill request on remote. It will be read from D upon completion
+            self._reqs_need_send[request.request_id] = (
+                time.perf_counter()
+                + MoRIIOConstants.VLLM_MORI_READ_ABORT_REQUEST_TIMEOUT
+            )
+
+        # If we execute in P-D serial mode, no notification port is needed.
+        return delay_free_blocks, dict(
+            do_remote_prefill=True,
+            do_remote_decode=False,
+            remote_block_ids=computed_block_ids,
+            remote_engine_id=self.engine_id,
+            remote_host=self.host_ip,
+            remote_port=self.handshake_port,
+            tp_size=self.vllm_config.parallel_config.tensor_parallel_size,
+        )
+
+
+class MoRIIOConnectorWorker:
+    """Implementation of Worker side methods"""
+
+    def __init__(self, vllm_config: VllmConfig, engine_id: str):
+        if not is_moriio_available():
+            raise RuntimeError(
+                "MoRIIO is not available. Please ensure the 'mori' package "
+                "is installed and properly configured."
+            )
+
+        self.moriio_config = MoRIIOConfig.from_vllm_config(vllm_config)
+        self.mode = get_moriio_mode()
+
+        logger.info("Initializing MoRIIO worker %s", engine_id)
+
+        logging.getLogger("aiter").disabled = True
+
+        # Config.
+        self.vllm_config = vllm_config
+        assert vllm_config.kv_transfer_config is not None, (
+            "kv_transfer_config must be set for MoRIIOConnector"
+        )
+        self.kv_transfer_config = vllm_config.kv_transfer_config
+        self.is_producer = self.kv_transfer_config.is_kv_producer
+
+        if self.is_producer:
+            set_role(ROLE.PRODUCER)
+        else:
+            set_role(ROLE.CONSUMER)
+        # mori engine
+        self._rank = get_world_group().rank
+        self._local_rank = get_world_group().local_rank
+        self.tp_rank = self.moriio_config.tp_rank
+        self.dp_rank = self.moriio_config.dp_rank
+
+        self.local_ip = self.moriio_config.local_ip
+        self.local_kv_port = self.moriio_config.local_kv_port
+        self.proxy_ip = self.moriio_config.proxy_ip
+        self.local_ping_port = self.moriio_config.local_ping_port
+        self.proxy_ping_port = self.moriio_config.proxy_ping_port
+        self.http_port = self.moriio_config.http_port
+        self.handshake_port = self.moriio_config.handshake_port
+        self.notify_port = self.moriio_config.notify_port
+
+        self.zmq_context = zmq.Context()
+        self.metadata_address = (
+            f"{self.moriio_config.local_ip}:{self.moriio_config.local_ping_port}"
+        )
+        self.request_address = (
+            f"{self.moriio_config.local_ip}:{self.moriio_config.http_port}"
+        )
+
+        self.moriio_engine = None
+        self._handle_request_thread = None
+        self._ping_thread = None
+        self._writer = MoRIIOWriter(self)
+
+        role = "producer" if self.is_producer else "consumer"
+        engine_suffix = (
+            f"{self.moriio_config.local_ip}:{self.moriio_config.handshake_port}:"
+            f"tp{self.tp_rank}:dp{self.dp_rank}"
+        )
+        self.moriio_engine = IOEngine(
+            f"{role}:{engine_suffix}",
+            IOEngineConfig(
+                self.moriio_config.local_ip, self.moriio_config.local_kv_port
+            ),
+        )
+        logger.debug(
+            "build MORI IOEngine %s (ip=%s port=%s)",
+            f"{role}:{engine_suffix}",
+            self.moriio_config.local_ip,
+            self.moriio_config.local_kv_port,
+        )
+
+        if self._rank == 0 and self.moriio_config.proxy_ip:
+            self._ping_thread = threading.Thread(
+                target=self._ping, args=(self.zmq_context,), daemon=True
+            )
+            self._ping_thread.start()
+
+        logger.info(
+            "Initializing MoRIIO Engine, engine = %s, role = %s",
+            self.moriio_engine,
+            "producer" if self.is_producer else "consumer",
+        )
+
+        # Agent.
+        self.moriio_wrapper = MoRIIOWrapper(tp_rank=self.tp_rank, dp_rank=self.dp_rank)
+        self.moriio_wrapper.set_moriio_engine(self.moriio_engine)
+        self.moriio_wrapper.set_backend_type(BackendType.RDMA)
+        self.moriio_wrapper.notify_port = self.moriio_config.notify_port
+        self.local_kv_cache_metadata: list[bytes] = []
+        self.local_kv_cache_size: list[int] = []
+        self.layer_name_to_local_kv_cache_metadata: dict[str, list[bytes]] = {}
+
+        self.remote_kv_cache_metadata: list[bytes] = []
+        self.remote_kv_cache_size: list[int] = []
+        self.layer_name_to_remote_kv_cache_metadata: dict[str, dict[str, list[Any]]] = (
+            dict()
+        )
+        self.remote_moriio_metadata: dict[EngineId, MoRIIOAgentMetadata] = {}
+        self.slot_size_bytes = 0
+
+        self.load_ready_flag: dict[str, bool] = {}
+        self.write_ready_flags: dict[str, bool] = {}
+        self.kv_cache_shape = None
+        self.block_shape = None
+        self.kv_element_size = 0
+
+        # Map of engine_id -> {agent_name0, agent_name1..}.
+        self._remote_agents: dict[EngineId, set[str]] = {}
+
+        self.side_channel_port: int = (
+            self.moriio_config.handshake_port
+            + get_port_offset(self.dp_rank, self.tp_rank)
+        )
+        self.engine_id: EngineId = engine_id
+
+        self.world_size = get_tensor_model_parallel_world_size()
+        self.tp_group = get_tp_group()
+
+        # KV Caches and moriio tracking data.
+        self.kv_caches: dict[str, torch.Tensor] = {}
+
+        # Map of engine_id -> kv_caches_base_addr. For TP case, each local
+        # rank will still only pull from a single remote TP worker.
+        self.kv_caches_base_addr: dict[EngineId, list[int]] = {}
+
+        # Number of MoRIIO regions. Currently one region per cache
+        # (so 1 per layer for MLA, otherwise 2 per layer)
+        self.num_regions = 0
+        self.num_layers = 0
+
+        # Map of engine_id -> num_blocks. All ranks in the same deployment will
+        # have the same number of blocks.
+        self.dst_num_blocks: dict[EngineId, int] = {}
+        # In progress transfers.
+        self._recving_transfers: defaultdict[ReqId, list] = defaultdict(list)
+        self._recving_transfers_callback_addr: dict[ReqId, tuple[str, str]] = {}
+
+        # Track the expiration time of requests that are waiting to be sent.
+        self._reqs_to_send: dict[ReqId, float] = {}
+
+        # Background thread for handling new handshake requests.
+        self._moriio_handshake_listener_t: threading.Thread | None = None
+        # Background thread for initializing new MoRIIO handshakes.
+        self._handshake_initiation_executor = ThreadPoolExecutor(
+            # MoRIIO is not guaranteed to be thread-safe, limit 1 worker.
+            max_workers=1,
+            thread_name_prefix="vllm-moriio-handshake-initiator",
+        )
+        self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]()
+        self._handshake_futures: dict[EngineId, Future[set[str]]] = {}
+        # Protects _handshake_futures and _remote_agents.
+        self._handshake_lock = threading.RLock()
+
+        self.block_size = vllm_config.cache_config.block_size
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+
+        self.block_window_per_layer: list[int | None] = []
+        self.use_mla = self.model_config.use_mla
+        self.built_session = False
+        self.built_write_session: defaultdict[str, list] = defaultdict(list)
+        backend = get_attn_backend(
+            self.model_config.get_head_size(),
+            self.model_config.dtype,
+            self.cache_config.cache_dtype,
+            self.block_size,
+            use_mla=self.use_mla,
+        )
+
+        # TODO: consider the integration of flashinfer or other backends.
+        self.backend_name = backend.get_name()
+        logger.debug("Detected attention backend %s", self.backend_name)
+
+    def schedule_write_blocks(
+        self,
+        request_id: str,
+        dst_engine_id: str,
+        local_block_ids: list[int],
+        remote_block_ids: list[int] | None,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        remote_notify_port: int,
+        remote_ip: str,
+    ) -> None:
+        """Schedule a block write operation.
+
+        Args:
+            request_id: Unique identifier for the request
+            dst_engine_id: Destination engine ID
+            local_block_ids: Local block IDs to transfer
+            remote_block_ids: Hint for remote block IDs
+            layer_name: Name of the layer
+            kv_layer: KV cache tensor
+            remote_notify_port: Port for completion notification
+            remote_ip: IP address of remote node
+        """
+
+        # synchronization to prevent dirty reads between
+        # transfer and attention operations
+        # we can consider removing this synchronization after ibgda is enabled.
+        # when mori-io supports ibgda functionality
+
+        stream = torch.cuda.current_stream()
+        event = torch.cuda.Event()
+        event.record(stream)
+
+        task = WriteTask(
+            request_id=request_id,
+            dst_engine_id=dst_engine_id,
+            local_block_ids=local_block_ids,
+            remote_block_ids_hint=remote_block_ids,
+            layer_name=layer_name,
+            event=event,
+            remote_notify_port=remote_notify_port,
+            remote_ip=remote_ip,
+        )
+        self._writer.schedule_write(task)
+
+    def _get_built_session(self, remote_engine_id):
+        if remote_engine_id not in self.built_write_session:
+            cur_remote_engine_sessions = []
+            for ln, local_meta in self.layer_name_to_local_kv_cache_metadata.items():
+                unpacked_local_memory_meta = (
+                    self.moriio_wrapper.get_unpack_memory_metadata(local_meta[0])
+                )
+                unpacked_remote_memory_meta = (
+                    self.moriio_wrapper.get_unpack_memory_metadata(
+                        self.layer_name_to_remote_kv_cache_metadata[remote_engine_id][
+                            ln
+                        ][0]
+                    )
+                )
+                cur_remote_engine_sessions.append(
+                    self.moriio_wrapper.build_session(
+                        unpacked_local_memory_meta, unpacked_remote_memory_meta
+                    )
+                )
+            self.built_write_session[remote_engine_id] = cur_remote_engine_sessions
+        return self.built_write_session[remote_engine_id], self.remote_moriio_metadata[
+            remote_engine_id
+        ]
+
+    def _ping(self, zmq_context):
+        http_request_address = f"http://{self.request_address}/v1/completions"
+        role = "P" if self.is_producer else "D"
+
+        retry_count = 0
+        index = 1
+        with zmq_context.socket(zmq.DEALER) as sock:
+            sock.connect(f"tcp://{self.proxy_ip}:{self.proxy_ping_port}")
+
+            while True:
+                try:
+                    data = {
+                        "type": "register",
+                        "role": role,
+                        "index": str(index),
+                        "request_address": http_request_address,
+                        "handshake_port": self.handshake_port,
+                        "notify_port": self.notify_port,
+                        "dp_size": self.moriio_config.dp_size,
+                        "tp_size": self.moriio_config.tp_size,
+                        "transfer_mode": self.mode.name,
+                    }
+
+                    sock.send(msgpack.dumps(data))
+                    # logger.debug(f"Successfully sent ping message #{index}")
+                    retry_count = 0
+
+                except ConnectionRefusedError:
+                    logger.info(
+                        "Connection refused: %s:%s -> %s:%s",
+                        self.local_ip,
+                        self.local_ping_port,
+                        self.proxy_ip,
+                        self.proxy_ping_port,
+                    )
+                    retry_count += 1
+
+                except OSError as e:
+                    logger.info("OS error when sending ping: %s", e)
+                    retry_count += 1
+
+                except Exception as e:
+                    logger.info("Unexpected error when sending ping: %s", e)
+                    retry_count += 1
+                    if retry_count >= MoRIIOConstants.MAX_PING_RETRIES:
+                        logger.error(
+                            "Max retries (%s) exceeded. Stopping ping loop.",
+                            MoRIIOConstants.MAX_PING_RETRIES,
+                        )
+                        raise RuntimeError(
+                            f"Ping failed after {retry_count} retries"
+                        ) from e
+
+                finally:
+                    time.sleep(MoRIIOConstants.PING_INTERVAL)
+                    index += 1
+
+    def shutdown(self):
+        if hasattr(self, "moriio_wrapper") and self.moriio_wrapper:
+            self.moriio_wrapper.shutdown()
+
+        if hasattr(self, "_handshake_initiation_executor"):
+            self._handshake_initiation_executor.shutdown(wait=False)
+
+        if (
+            hasattr(self, "_moriio_handshake_listener_t")
+            and self._moriio_handshake_listener_t
+        ):
+            self._moriio_handshake_listener_t.join(timeout=0)
+
+        if hasattr(self, "zmq_context") and self.zmq_context:
+            self.zmq_context.destroy(linger=0)
+            self.zmq_context = None
+
+    def __del__(self):
+        self.shutdown()
+
+    @staticmethod
+    def _moriio_handshake_listener(
+        metadata: MoRIIOAgentMetadata,
+        ready_event: threading.Event,
+        base_port: int,
+        tp_rank: int,
+        dp_rank: int,
+        layer_name_to_local_kv_cache_metadata: dict,
+    ):
+        """Background thread for getting new MoRIIO handshakes."""
+
+        encoder = msgspec.msgpack.Encoder()
+        encoded_data = encoder.encode(metadata)
+        size_in_bytes = len(encoded_data)
+        logger.debug(
+            "Size of encoded MoRIIOAgentMetadata: %s bytes", str(size_in_bytes)
+        )
+
+        # Listen for new requests for metadata.
+        host = "*"
+
+        path = make_zmq_path("tcp", host, base_port)
+        logger.debug("mori handshake starting listening on path: %s", path)
+
+        with zmq_ctx(zmq.ROUTER, path) as sock:
+            ready_event.set()
+            while True:
+                identity, msg = sock.recv_multipart()
+                if (
+                    msg != MoRIIOConstants.GET_META_MSG
+                    and msg != MoRIIOConstants.POP_DONE_RECV
+                ):
+                    logger.error("Connection listener got unexpected message")
+                    raise HandshakeError("handshake failed, unexpected msg type")
+                elif msg == MoRIIOConstants.GET_META_MSG:
+                    sock.send_multipart(
+                        (identity, b"", encoded_data)
+                    )  # send local mori io engine meta data
+                    logger.debug("MoRIIO handshake listener sent metadata")
+                    # now we send tensor meta data for each block
+                    buf = msgpack.dumps(layer_name_to_local_kv_cache_metadata)
+                    sock.send_multipart((identity, b"", buf))
+                elif msg == MoRIIOConstants.POP_DONE_RECV:
+                    _, req_id = sock.recv_multipart()
+                    logger.debug(
+                        "MoRIIO handshake listener received done recv for req",
+                        req_id.decode(),
+                    )
+
+    def _moriio_handshake(
+        self,
+        host: str,
+        port: int,
+        remote_tp_size: int,
+        expected_engine_id: str,
+        remote_dp_rank: int = 0,
+    ) -> set[str]:
+        """Do a MoRIIO handshake with a remote instance."""
+
+        start_time = time.perf_counter()
+
+        # NOTE(rob): we need each rank to have a unique port. This is
+        # a hack to keep us moving. We will switch when moving to etcd
+        # or where we have a single ZMQ socket in the scheduler.
+
+        port_offset = get_port_offset(remote_dp_rank, self.tp_rank)
+        path = make_zmq_path("tcp", host, port + port_offset)
+        logger.debug("handshake Querying metadata on path: %s", path)
+
+        # Send query for the request.
+        with zmq_ctx(zmq.DEALER, path) as sock:
+            logger.debug("prepare send msg INSTAZNCE: %s", path)
+            sock.send(MoRIIOConstants.GET_META_MSG)
+            received_frame = sock.recv_multipart()
+            if len(received_frame) != 2 or received_frame[0] != b"":
+                raise HandshakeError(f"Unexpected frame! {received_frame = }")
+
+            metadata_bytes = received_frame[1]
+            decoder = msgspec.msgpack.Decoder(MoRIIOAgentMetadata)
+            metadata = decoder.decode(metadata_bytes)
+            got_metadata_time = time.perf_counter()
+            logger.info(
+                "MoRIIO handshake: get metadata took: %s",
+                got_metadata_time - start_time,
+            )
+
+            self.moriio_wrapper.remote_engine_ip = host
+            remote_agent_name = self.moriio_wrapper.register_remote_engine(
+                metadata.agent_metadata
+            )
+
+            logger.debug(
+                "MoRIIO handshake: registered"
+                "remote agent %s for engine ID %s, path = %s",
+                remote_agent_name,
+                expected_engine_id,
+                path,
+            )
+
+            if len(self.local_kv_cache_metadata) > 0:
+                logger.warning(
+                    "len(self.local_kv_cache_metadata) = %s,"
+                    "maybe you didnt clear this buffer correctly",
+                    len(self.local_kv_cache_metadata),
+                )
+                self.local_kv_cache_metadata = []
+            if len(self.remote_kv_cache_metadata) > 0:
+                logger.warning(
+                    "len(self.remote_kv_cache_metadata) = %s,"
+                    "maybe you didnt clear this buffer correctly",
+                    len(self.remote_kv_cache_metadata),
+                )
+                self.remote_kv_cache_metadata = []
+
+            received_frame = sock.recv_multipart()
+            if len(received_frame) != 2 or received_frame[0] != b"":
+                raise HandshakeError(f"unexpected frame! {received_frame = }")
+            buf = received_frame[1]
+            self.layer_name_to_remote_kv_cache_metadata[expected_engine_id] = (
+                msgpack.loads(buf)
+            )
+            self.remote_moriio_metadata[expected_engine_id] = metadata
+            setup_agent_time = time.perf_counter()
+            logger.debug(
+                "MoRIIO handshake: add agent took: %s",
+                setup_agent_time - got_metadata_time,
+            )
+
+        return {remote_agent_name}
+
+    def _background_moriio_handshake(
+        self, req_id: str, remote_engine_id: EngineId, meta: ReqMeta
+    ):
+        # Do MoRIIO handshake in background and add to _ready_requests when done.
+        fut = None
+        if remote_engine_id is not None:
+            fut = self._handshake_futures.get(remote_engine_id)
+        if fut is None:
+            host = meta.remote_host
+            port = int(meta.remote_handshake_port)
+            tp_size = int(meta.tp_size)
+            remote_dp_size = int(meta.remote_dp_size)
+
+        def request_ready(_f: Future[Any], entry=(req_id, meta)):
+            logger.info("MoRIIO handshake done for request %s", req_id)
+            self._ready_requests.put(entry)
+            self.load_ready_flag[remote_engine_id] = True
+            self.write_ready_flags[remote_engine_id] = True
+
+        fut_list = []
+
+        # In dp(prefill)<->dp(decode) communication, we require an all-to-all handshake.
+
+        for cur_dp_rank in range(remote_dp_size):
+            dp_engine_id = self.get_engine_name_with_dp(remote_engine_id, cur_dp_rank)
+            future = self._handshake_initiation_executor.submit(
+                self._moriio_handshake, host, port, tp_size, dp_engine_id, cur_dp_rank
+            )
+            fut_list.append(future)
+
+            def done_callback(f: Future[set[str]], eid=dp_engine_id):
+                with self._handshake_lock:
+                    self._handshake_futures.pop(eid, None)
+                    try:
+                        self._remote_agents[eid] = f.result()
+                    except Exception:
+                        logger.exception("Handshake with %s failed", eid)
+
+            future.add_done_callback(done_callback)
+            self._handshake_futures[dp_engine_id] = future
+
+        # fut = fut_list
+        def wait_all_dp():
+            for future in fut_list:
+                future.result()
+            return True
+
+        all_done_future = self._handshake_initiation_executor.submit(wait_all_dp)
+        all_done_future.add_done_callback(request_ready)
+
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        """Register the KV Cache data in moriio."""
+
+        _, first_kv_cache = next(iter(kv_caches.items()))
+        kv_elem_size = first_kv_cache.element_size()
+
+        use_mla = len(first_kv_cache.shape) == 3
+        assert use_mla == self.use_mla
+
+        if use_mla:
+            # MLA case.
+            self.num_blocks = first_kv_cache.shape[0]
+            block_rank = 2  # [block_size, latent_dim]
+            block_shape = first_kv_cache.shape[-block_rank:]
+            block_size, kv_latent_dim = block_shape
+            self.slot_size_bytes = kv_elem_size * kv_latent_dim
+        else:
+            # [2 (k and v), num_blocks, ...]
+            self.num_blocks = first_kv_cache.shape[1]
+            block_rank = 3  # [block_size, kv_heads, head_dim]
+            block_shape = first_kv_cache.shape[-block_rank:]
+            block_size, n_kv_heads, head_dim = block_shape[-3:]
+            # head size in bytes.
+            self.slot_size_bytes = (
+                kv_elem_size * n_kv_heads * head_dim
+            )  # 1 token 1 layer size , slot size
+        assert block_size == self.block_size
+        # TODO(tms): self.block_len needs to be per-layer for sliding window,
+        # hybrid attn, etc
+        # block size in bytes
+        self.block_len = kv_elem_size * math.prod(block_shape)
+        self.kv_cache_shape = first_kv_cache.shape
+        self.block_shape = block_shape
+        self.kv_element_size = kv_elem_size
+
+        self.dst_num_blocks[self.engine_id] = self.num_blocks
+        self.kv_caches = kv_caches  # layer name to kv cache
+        kv_caches_base_addr = []
+        caches_data = []
+
+        for cache_or_caches in kv_caches.values():
+            cache_list = [cache_or_caches] if use_mla else cache_or_caches
+            for cache in cache_list:
+                base_addr = cache.data_ptr()
+                region_len = self.num_blocks * self.block_len
+                caches_data.append((base_addr, region_len, cache.device.index, ""))
+                kv_caches_base_addr.append(base_addr)
+
+        for layer_name, kv_cache in kv_caches.items():
+            if layer_name not in self.layer_name_to_local_kv_cache_metadata:
+                self.layer_name_to_local_kv_cache_metadata[layer_name] = []
+
+            moriio_mem_metadata = self.moriio_wrapper.register_local_tensor(kv_cache)
+            self.layer_name_to_local_kv_cache_metadata[layer_name].append(
+                moriio_mem_metadata
+            )
+
+            self.local_kv_cache_size.append(cache.nelement() * cache.element_size())
+
+        self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr
+        self.num_regions = len(caches_data)
+        self.num_layers = len(self.kv_caches.keys())
+
+        # Optimization for models with local attention (Llama 4)
+        if self.vllm_config.model_config.hf_config.model_type == "llama4":
+            from transformers import Llama4TextConfig
+
+            assert isinstance(
+                self.vllm_config.model_config.hf_text_config, Llama4TextConfig
+            )
+            llama4_config = self.vllm_config.model_config.hf_text_config
+            no_rope_layers = llama4_config.no_rope_layers
+            chunk_size = llama4_config.attention_chunk_size
+            chunk_block_size = math.ceil(chunk_size / self.block_size)
+            for layer_idx in range(self.num_layers):
+                # no_rope_layers[layer_idx] == 0 means NoPE (global)
+                # Any other value means RoPE (local chunked)
+                is_local_attention = no_rope_layers[layer_idx] != 0
+                block_window = chunk_block_size if is_local_attention else None
+                self.block_window_per_layer.append(block_window)
+            logger.debug(
+                "Llama 4 block window per layer mapping: %s",
+                self.block_window_per_layer,
+            )
+            assert len(self.block_window_per_layer) == self.num_layers
+
+        metadata = MoRIIOAgentMetadata(
+            engine_id=self.engine_id,
+            agent_metadata=self.moriio_wrapper.get_agent_metadata(),
+            kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
+            num_blocks=self.num_blocks,
+            block_len=self.block_len,
+            attn_backend_name=self.backend_name,
+        )
+        ready_event = threading.Event()
+        self._moriio_handshake_listener_t = threading.Thread(
+            target=self._moriio_handshake_listener,
+            args=(
+                metadata,
+                ready_event,
+                self.side_channel_port,
+                self.tp_rank,
+                self.dp_rank,
+                self.layer_name_to_local_kv_cache_metadata,
+            ),
+            daemon=True,
+            name="moriio_handshake_listener",
+        )
+        self._moriio_handshake_listener_t.start()
+        ready_event.wait()  # Wait for listener ZMQ socket to be ready.
+        self.moriio_wrapper.async_wait_reqid()
+
+    def get_finished(self) -> tuple[set[str], set[str]]:
+        """
+        Get requests that are done sending or recving on this specific worker.
+        The scheduler process (via the MultiprocExecutor) will use this output
+        to track which workers are done.
+        """
+
+        done_sending, done_recving = set(), set()
+
+        if self.is_producer:
+            done_sending = self.moriio_wrapper.pop_finished_req_ids()
+
+        else:
+            if self.mode == MoRIIOMode.WRITE:
+                done_recving = self.moriio_wrapper.pop_finished_write_req_ids()
+            else:
+                done_recving = self._pop_done_transfers()
+
+        return done_sending, done_recving
+
+    def _pop_done_transfers(self) -> set[str]:
+        done_req_ids: set[str] = set()
+        with self.moriio_wrapper.lock:
+            to_remove = []
+            for req_id, status_list in self._recving_transfers.items():
+                if status_list[-1].Succeeded():
+                    done_req_ids.add(req_id)
+
+                    self.moriio_wrapper.send_notify(
+                        req_id,
+                        self._recving_transfers_callback_addr[req_id][0],
+                        self._recving_transfers_callback_addr[req_id][1],
+                    )
+                    to_remove.append(req_id)
+            for req_id in to_remove:
+                del self._recving_transfers[req_id]
+                del self._recving_transfers_callback_addr[req_id]
+
+            return done_req_ids
+
+    def save_kv_layer(
+        self,
+        metadata: MoRIIOConnectorMetadata,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ):
+        if not self.is_producer:
+            return
+        if self.mode == MoRIIOMode.READ:
+            return
+        remote_engine_id = None
+
+        for req_id, meta in metadata.reqs_to_save.items():
+            # we only need to check if dp0 in rank
+            remote_engine_id = (
+                str(meta.remote_host) + ":" + str(meta.remote_handshake_port)
+            )
+
+            meta.remote_engine_id = remote_engine_id
+
+            dp0_remote_engine_id = self.get_engine_name_with_dp(remote_engine_id, 0)
+            if dp0_remote_engine_id not in self._remote_agents:
+                # Initiate handshake with remote engine to exchange metadata.
+                with self._handshake_lock:
+                    if remote_engine_id not in self._remote_agents:
+                        self._background_moriio_handshake(
+                            req_id, remote_engine_id, meta
+                        )
+
+                        continue
+            self._write_blocks_for_req(req_id, meta, layer_name, kv_layer)
+
+        while True:
+            if (
+                self._ready_requests.empty()
+                and remote_engine_id not in self.write_ready_flags
+            ):
+                continue
+            elif not self._ready_requests.empty() and (
+                remote_engine_id in self.write_ready_flags
+            ):
+                self._write_blocks_for_req(
+                    *self._ready_requests.get_nowait(), layer_name, kv_layer
+                )
+                break
+            else:
+                break
+
+    def get_engine_name_with_dp(self, engine_name, dp_rank):
+        return f"{engine_name}_dp{dp_rank}"
+
+    def start_load_kv(self, metadata: MoRIIOConnectorMetadata):
+        """
+        Start loading by triggering non-blocking moriio_xfer.
+        We check for these trnxs to complete in each step().
+        """
+        if self.is_producer:
+            self.moriio_wrapper.async_wait_reqid()
+            return
+        if self.mode == MoRIIOMode.WRITE:
+            return
+
+        wait_handshake_readd_req = False
+        remote_engine_id = None
+
+        for req_id, meta in metadata.reqs_to_recv.items():
+            remote_engine_id = (
+                str(meta.remote_host) + ":" + str(meta.remote_handshake_port)
+            )
+            meta.remote_engine_id = remote_engine_id
+            dp0_remote_engine_id = self.get_engine_name_with_dp(remote_engine_id, 0)
+            if dp0_remote_engine_id not in self._remote_agents:
+                # Initiate handshake with remote engine to exchange metadata.
+                with self._handshake_lock:
+                    if remote_engine_id not in self._remote_agents:
+                        self._background_moriio_handshake(
+                            req_id, remote_engine_id, meta
+                        )
+                        wait_handshake_readd_req = True
+
+                        continue
+
+            # Handshake already completed, start async read xfer.
+            self._read_blocks_for_req(req_id, meta)
+        # Start transfers for requests whose handshakes have now finished.
+
+        while True:
+            if (
+                self._ready_requests.empty()
+                and remote_engine_id not in self.load_ready_flag
+                and wait_handshake_readd_req
+            ):
+                continue
+            elif (
+                not self._ready_requests.empty()
+                and remote_engine_id in self.load_ready_flag
+            ):
+                self._read_blocks_for_req(*self._ready_requests.get_nowait())
+                break
+            else:
+                break
+
+        self._reqs_to_send.update(metadata.reqs_to_send)
+
+    def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
+        logger.debug(
+            "Remote agent %s available, calling _read_blocks for req %s",
+            meta.remote_engine_id,
+            req_id,
+        )
+        self._read_blocks(
+            request_id=req_id,
+            dst_engine_id=meta.remote_engine_id,
+            local_block_ids=meta.local_block_ids,
+            remote_block_ids=meta.remote_block_ids,
+            remote_host=meta.remote_host,
+            remote_notify_port=meta.remote_notify_port,
+        )
+
+    def _write_blocks_for_req(self, req_id: str, meta: ReqMeta, layer_name, kv_layer):
+        self.schedule_write_blocks(
+            request_id=req_id,
+            dst_engine_id=meta.remote_engine_id,
+            local_block_ids=meta.local_block_ids,
+            remote_block_ids=meta.remote_block_ids,
+            layer_name=layer_name,
+            kv_layer=kv_layer,
+            remote_notify_port=meta.remote_notify_port,
+            remote_ip=meta.remote_host,
+        )
+
+    def _is_last_layer(self, layer_name):
+        return layer_name == list(self.kv_caches.keys())[-1]
+
+    def merge_contiguous_blocks(
+        self,
+        offsets_local: list[int],
+        offsets_remote: list[int],
+        sizes: list[int],
+        assume_sorted: bool = False,
+    ) -> tuple[list[int], list[int], list[int]]:
+        n = len(offsets_local)
+        if n == 0:
+            return [], [], []
+        if not (n == len(offsets_remote) == len(sizes)):
+            raise ValueError("Input list lengths mismatch")
+        local_arr = np.fromiter(offsets_local, dtype=np.int64, count=n)
+        remote_arr = np.fromiter(offsets_remote, dtype=np.int64, count=n)
+        sizes_arr = np.fromiter(sizes, dtype=np.int64, count=n)
+
+        if assume_sorted:
+            local_sorted = local_arr
+            remote_sorted = remote_arr
+            sizes_sorted = sizes_arr
+        else:
+            if np.all(local_arr[:-1] <= local_arr[1:]):
+                local_sorted = local_arr
+                remote_sorted = remote_arr
+                sizes_sorted = sizes_arr
+            else:
+                sort_idx = np.argsort(local_arr, kind="stable")
+                local_sorted = local_arr[sort_idx]
+                remote_sorted = remote_arr[sort_idx]
+                sizes_sorted = sizes_arr[sort_idx]
+
+        if n == 1:
+            return (
+                [int(local_sorted[0])],
+                [int(remote_sorted[0])],
+                [int(sizes_sorted[0])],
+            )
+
+        diff_local = local_sorted[1:] - local_sorted[:-1]
+        diff_remote = remote_sorted[1:] - remote_sorted[:-1]
+        prev_size = sizes_sorted[:-1]
+
+        contiguous = (diff_local == prev_size) & (diff_remote == prev_size)
+
+        if not contiguous.any():
+            return local_sorted.tolist(), remote_sorted.tolist(), sizes_sorted.tolist()
+
+        if contiguous.all():
+            total_size = int(sizes_sorted.sum())
+            return [int(local_sorted[0])], [int(remote_sorted[0])], [total_size]
+
+        break_positions = np.flatnonzero(~contiguous) + 1
+        segment_starts = np.concatenate(([0], break_positions))
+        segment_ends = np.concatenate((break_positions, [n]))
+
+        seg_count = len(segment_starts)
+        merged_local = [0] * seg_count
+        merged_remote = [0] * seg_count
+        merged_sizes = [0] * seg_count
+
+        for si in range(seg_count):
+            s = segment_starts[si]
+            e = segment_ends[si]
+            merged_local[si] = int(local_sorted[s])
+            merged_remote[si] = int(remote_sorted[s])
+
+            merged_sizes[si] = int(
+                local_sorted[e - 1] + sizes_sorted[e - 1] - local_sorted[s]
+            )
+
+        return merged_local, merged_remote, merged_sizes
+
+    def _compute_block_transfer_offsets(
+        self,
+        layer_name: str,
+        local_block_ids: list[int],
+        remote_block_ids: list[int],
+        remote_moriio_meta: MoRIIOAgentMetadata,
+    ) -> tuple[list[int], list[int], list[int]]:
+        """Compute transfer offsets for block data.
+
+        Args:
+            layer_name: Name of the layer to transfer
+            local_block_ids: IDs of local blocks
+            remote_block_ids: IDs of remote blocks
+            remote_moriio_meta: Metadata of the remote MoRIIO agent
+        Returns:
+            Tuple of (local_offsets, remote_offsets, transfer_sizes)
+        """
+        assert self.kv_cache_shape is not None, "KV caches shape not initialized"
+        is_mla = len(self.kv_cache_shape) == 3
+        stride = self.kv_caches[layer_name].stride()
+        sz = self.kv_caches[layer_name].element_size()
+        if is_mla:
+            blknum, blksize, hs = self.kv_cache_shape
+            hn = 1
+            block_stride = stride[0]
+        else:
+            _, blknum, blksize, hn, hs = self.kv_cache_shape
+            local_ktov_stride = stride[0]
+            block_stride = stride[1]
+            remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks
+
+        transfer_size_byte = blksize * hn * hs * sz
+        per_block = 1 if is_mla else 2
+        total = len(local_block_ids) * per_block
+        offset_local = [0] * total
+        offset_remote = [0] * total
+        sizes = [transfer_size_byte] * total
+
+        w = 0
+        for i, lb in enumerate(local_block_ids):
+            rb = remote_block_ids[i]
+            # K
+            offset_local[w] = sz * (lb * block_stride)
+            offset_remote[w] = sz * (rb * block_stride)
+            w += 1
+            if not is_mla:
+                # V
+                # Handle num_block variations originating from PD (different kv strides)
+                # TODO: address block_sz differences in heterogeneous TP scenarios
+                # In MLA, we don't need to consider these two cases.
+                offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride)
+                offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride)
+                w += 1
+
+        merged_l, merged_r, merged_s = self.merge_contiguous_blocks(
+            offset_local, offset_remote, sizes, assume_sorted=False
+        )
+        return merged_l, merged_r, merged_s
+
+    def _read_blocks(
+        self,
+        local_block_ids: list[int],
+        remote_block_ids: list[int],
+        dst_engine_id: str,
+        request_id: str,
+        remote_host: str,
+        remote_notify_port: int,
+    ) -> None:
+        if self.mode == MoRIIOMode.WRITE:
+            return
+
+        dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0)
+        sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id)
+
+        first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0]
+        offs = self._compute_block_transfer_offsets(
+            first_layer, local_block_ids, remote_block_ids, remote_moriio_meta
+        )
+
+        for layer_name in self.layer_name_to_local_kv_cache_metadata:
+            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
+                layer_name
+            )
+            # TODO : apply multi-session batch-read when moriio support it
+            transfer_status = self.moriio_wrapper.read_remote_data(
+                offs[2], offs[0], offs[1], sessions[sess_idx]
+            )
+            with self.moriio_wrapper.lock:
+                self._recving_transfers[request_id].append(transfer_status)
+                self._recving_transfers_callback_addr[request_id] = (
+                    remote_host,
+                    str(remote_notify_port + self.tp_rank),
+                )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a35c22622b89f96b92defb4a26ca6e74bd9471c
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
@@ -0,0 +1,609 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import threading
+from typing import TYPE_CHECKING, Any, Optional
+from weakref import ref as weakref_ref
+
+import msgpack
+import torch
+import zmq
+
+from vllm import envs
+from vllm.logger import init_logger
+from vllm.utils.network_utils import (
+    make_zmq_path,
+    make_zmq_socket,
+)
+
+if TYPE_CHECKING:
+    pass
+
+from queue import Empty, Queue
+
+from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_common import (
+    ROLE,
+    HandshakeError,
+    LayerTransferPlan,
+    MoRIIOAgentMetadata,
+    MoRIIOConstants,
+    MoRIIOError,
+    RemoteAllocInfo,
+    TransferError,
+    WriteTask,
+    get_port_offset,
+    get_role,
+    zmq_ctx,
+)
+
+if TYPE_CHECKING:
+    from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector import (
+        MoRIIOConnectorWorker,
+    )
+
+logger = init_logger(__name__)
+try:
+    from mori.io import (
+        EngineDesc,
+        IOEngine,
+        MemoryDesc,
+        PollCqMode,
+        RdmaBackendConfig,
+    )
+
+    logger.info("MoRIIO is available")
+except ImportError:
+    logger.error("MoRIIO is not available")
+
+
+"""Write task execution logic for MoRIIO connector."""
+
+
+class MoRIIOWriter:
+    """Handles write operations for KV cache transfers.
+    Implements distributed KV cache transfer using the MoRIIO library
+    for RDMA-based communication between prefill and decode instances."""
+
+    def __init__(self, worker: "MoRIIOConnectorWorker"):
+        """Initialize the writer.
+
+        Args:
+            worker: Reference to the parent worker
+        """
+        self._worker_ref: weakref_ref[MoRIIOConnectorWorker] = weakref_ref(worker)
+        self._write_task_q: Queue[WriteTask] = Queue()
+        self._write_worker_started = False
+        self._write_worker_lock = threading.Lock()
+        self._deferred_tasks: list[WriteTask] = []
+
+    @property
+    def worker(self) -> "MoRIIOConnectorWorker":
+        """Get the worker instance.
+
+        Returns:
+            The parent worker instance
+
+        Raises:
+            RuntimeError: If worker has been garbage collected
+        """
+        worker = self._worker_ref()
+        if worker is None:
+            raise RuntimeError("Parent worker has been garbage collected")
+        return worker
+
+    def ensure_worker_started(self) -> None:
+        """Ensure the background write worker is running."""
+        if self._write_worker_started:
+            return
+        self._write_worker_started = True
+        with self._write_worker_lock:
+            thread = threading.Thread(
+                target=self._write_worker_loop, daemon=True, name="moriio-write-worker"
+            )
+            thread.start()
+            logger.info("Started MoRIIO write worker thread")
+
+    def schedule_write(self, task: WriteTask) -> None:
+        """Schedule a write task.
+
+        Args:
+            task: The write task to schedule
+        """
+        self.ensure_worker_started()
+        self._write_task_q.put(task)
+
+    def _write_worker_loop(self) -> None:
+        """Main loop for the write worker thread."""
+
+        while True:
+            # Process deferred tasks first
+            self._process_deferred_tasks()
+
+            # Get new task
+            try:
+                task = self._write_task_q.get(timeout=0.01)
+            except Empty:
+                continue
+
+            # Check if remote blocks are ready
+            if not self._is_remote_ready(task):
+                # task.retry_count += 1
+                self._deferred_tasks.append(task)
+                # logger.debug(
+                #     "Deferred task for request %s (retry %d)",
+                #     task.request_id, task.retry_count
+                # )
+                continue
+
+            # Execute the task
+
+            self._execute_write_task(task)
+
+    def _process_deferred_tasks(self) -> None:
+        """Process tasks that were previously deferred."""
+        if not self._deferred_tasks:
+            return
+
+        still_deferred: list[WriteTask] = []
+        for task in self._deferred_tasks:
+            if self._is_remote_ready(task):
+                self._execute_write_task(task)
+            else:
+                still_deferred.append(task)
+
+        self._deferred_tasks = still_deferred
+
+    def _is_remote_ready(self, task: WriteTask) -> bool:
+        """Check if remote blocks are allocated for this task.
+
+        Args:
+            task: The write task
+
+        Returns:
+            True if remote blocks are ready
+        """
+        return (
+            task.request_id in self.worker.moriio_wrapper.done_remote_allocate_req_dict
+        )
+
+    def _get_remote_alloc_info(self, request_id: str) -> RemoteAllocInfo:
+        """Get remote allocation info for a request.
+
+        Args:
+            request_id: The request ID
+
+        Returns:
+            Remote allocation information
+
+        Raises:
+            KeyError: If allocation info is missing
+        """
+        try:
+            return self.worker.moriio_wrapper.done_remote_allocate_req_dict[request_id]
+        except KeyError as e:
+            raise KeyError(
+                f"Remote allocation info missing for request {request_id}"
+            ) from e
+
+    def _execute_write_task(self, task: WriteTask) -> None:
+        """Execute a single write task.
+
+        Args:
+            task: The write task to execute
+
+        """
+        # Get remote allocation info
+        request_info = self._get_remote_alloc_info(task.request_id)
+
+        if request_info.block_ids is None:
+            logger.debug("Request %s remote block IDs not ready", task.request_id)
+            return
+
+        # Wait for CUDA event
+        # The attention computation of the current layer cannot
+        # overlap with the kv transfer task,
+        # otherwise it will cause precision issues.
+        # This event is used to synchronize the kv transfer and computation tasks.
+        task.event.synchronize()
+
+        # Update engine ID with DP rank
+        task.dst_engine_id = self.worker.get_engine_name_with_dp(
+            task.dst_engine_id, request_info.decode_dp_rank
+        )
+
+        # Get or create sessions
+        sessions, remote_moriio_meta = self.worker._get_built_session(
+            task.dst_engine_id
+        )
+
+        # Prepare transfer plan
+        plan = self._prepare_transfer_plan(task, request_info, remote_moriio_meta)
+
+        # Execute transfer
+        self._do_layer_write(plan, sessions)
+
+        # Finalize if all layers complete
+        self._finalize_if_complete(task, request_info)
+
+    def _prepare_transfer_plan(
+        self,
+        task: WriteTask,
+        request_info: RemoteAllocInfo,
+        remote_moriio_meta: MoRIIOAgentMetadata,
+    ) -> LayerTransferPlan:
+        """Prepare the transfer plan for a layer.
+
+        Args:
+            task: The write task
+            request_info: Remote allocation information
+
+        Returns:
+            The transfer plan
+        """
+        # Compute offsets if not cached
+        if request_info.transfer_offset is None:
+            offsets = self.worker._compute_block_transfer_offsets(
+                task.layer_name,
+                task.local_block_ids,
+                request_info.block_ids,
+                remote_moriio_meta,
+            )
+            request_info.transfer_offset = offsets
+
+        # Get session index
+        layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys())
+        sess_idx = layer_names.index(task.layer_name)
+
+        local_off, remote_off, sizes = request_info.transfer_offset
+
+        return LayerTransferPlan(
+            request_id=task.request_id,
+            layer_name=task.layer_name,
+            sess_idx=sess_idx,
+            transfer_local_offsets=local_off,
+            transfer_remote_offsets=remote_off,
+            transfer_sizes=sizes,
+            use_batch=True,
+        )
+
+    def _do_layer_write(self, plan: LayerTransferPlan, sessions: list) -> None:
+        """Perform the actual layer write.
+
+        Args:
+            plan: The transfer plan
+            sessions: List of transfer sessions
+        """
+        if plan.use_batch:
+            self.worker.moriio_wrapper.write_remote_data(
+                plan.transfer_sizes,
+                plan.transfer_local_offsets,
+                plan.transfer_remote_offsets,
+                sessions[plan.sess_idx],
+            )
+        else:
+            for i in range(len(plan.transfer_local_offsets)):
+                self.worker.moriio_wrapper.write_remote_data_single(
+                    plan.transfer_sizes[i],
+                    plan.transfer_local_offsets[i],
+                    plan.transfer_remote_offsets[i],
+                    plan.sess_idx,
+                )
+
+    def _finalize_if_complete(
+        self, task: WriteTask, request_info: RemoteAllocInfo
+    ) -> None:
+        """Finalize transfer if all layers are complete.
+
+        Args:
+            task: The write task
+            request_info: Remote allocation information
+        """
+        request_info.writes_done += 1
+
+        if request_info.writes_done >= self.worker.num_layers:
+            # Wait for transfer to complete
+            self.worker.moriio_wrapper.waiting_for_transfer_complete()
+
+            remote_port = task.remote_notify_port + get_port_offset(
+                request_info.decode_dp_rank, self.worker.tp_rank
+            )
+            # Consider using RDMA immediate data in decode side
+            # to eliminate the need for this notification.
+            # Consider including the first gen token from prefill in the notification
+
+            # Send completion notification
+            self.worker.moriio_wrapper.send_notify(
+                task.request_id, task.remote_ip, remote_port
+            )
+            # mark request as done, then we can free the blocks
+            with self.worker.moriio_wrapper.lock:
+                self.worker.moriio_wrapper.done_req_ids.append(task.request_id)
+            del self.worker.moriio_wrapper.done_remote_allocate_req_dict[
+                task.request_id
+            ]
+            logger.debug(
+                "Completed transfer for request %s, notified port %d",
+                task.request_id,
+                remote_port,
+            )
+
+
+class MoRIIOWrapper:
+    """Wrapper for MoRIIO engine operations.
+
+    Handles both producer and consumer roles for KV cache transfers.
+
+    Args:
+        moriio_engine:  MoRIIO engine instance
+        tp_rank: Tensor parallel rank
+        dp_rank: Data parallel rank
+    """
+
+    def __init__(
+        self,
+        moriio_engine: Optional["IOEngine"] = None,
+        tp_rank: int = 0,
+        dp_rank: int = 0,
+    ):
+        self.tp_rank = tp_rank
+        self.dp_rank = dp_rank
+        self.moriio_engine = moriio_engine
+        self.remote_memory_metadata = None
+        self.local_memory_registered = False
+        self.local_memory_metadata = None
+        self.transfer_status: list[Any] = []
+        self.remote_engine_ip: str | None = None
+        self.notify_port: int | None = None
+        self.lock = threading.Lock()
+        self.done_req_ids: list[str] = []
+        self.done_remote_allocate_req_dict: dict[str, RemoteAllocInfo] = {}
+        self.done_write_cache_req_ids: list[str] = []
+        self.notify_thread: threading.Thread | None = None
+        self.sessions: list[IOEngine.Session] = []
+        self.paths: dict[str, zmq.Socket] = {}
+
+    def set_moriio_engine(self, moriio_engine):
+        assert moriio_engine is not None, (
+            "You Cannot pass None engine to MoRIIOWrapper!"
+        )
+        self.moriio_engine = moriio_engine
+
+    def set_backend_type(self, backend_type):
+        assert self.moriio_engine is not None, "MoRIIO engine must be set first"
+        qp_per_transfer = envs.VLLM_MORIIO_QP_PER_TRANSFER
+        post_batch_size = envs.VLLM_MORIIO_POST_BATCH_SIZE
+        num_worker_threads = envs.VLLM_MORIIO_NUM_WORKERS
+        poll_mode = PollCqMode.POLLING
+        rdma_cfg = RdmaBackendConfig(
+            qp_per_transfer,
+            post_batch_size,
+            num_worker_threads,
+            poll_mode,
+        )
+        self.moriio_engine.create_backend(backend_type, rdma_cfg)
+
+    def get_agent_metadata(self):
+        assert self.moriio_engine is not None, "MoRIIO engine must be set first"
+        engine_metadata = self.moriio_engine.get_engine_desc()
+        engine_metadata_packed = engine_metadata.pack()
+        return engine_metadata_packed
+
+    def register_remote_engine(self, remote_packed_engine_metadata):
+        assert self.moriio_engine is not None, "MoRIIO engine must be set first"
+        consumer_engine_metadata = EngineDesc.unpack(remote_packed_engine_metadata)
+        self.moriio_engine.register_remote_engine(consumer_engine_metadata)
+        return consumer_engine_metadata.key
+
+    def register_local_tensor(self, tensor: torch.Tensor):
+        assert self.moriio_engine is not None, "MoRIIO engine must be set first"
+        try:
+            self.local_memory_metadata = self.moriio_engine.register_torch_tensor(
+                tensor
+            )
+            assert self.local_memory_metadata is not None, (
+                "register_torch_tensor returned None"
+            )
+            local_memory_metadata_packed = self.local_memory_metadata.pack()
+        except Exception as e:
+            raise MoRIIOError(f"Failed to register local memory: {e}") from e
+        self.local_memory_registered = True
+        return local_memory_metadata_packed
+
+    def get_unpack_memory_metadata(self, packed_memory_metadata):
+        return MemoryDesc.unpack(packed_memory_metadata)
+
+    def build_session(self, local_memory_metadata, remote_memory_metadata):
+        assert self.moriio_engine is not None, "MoRIIO engine must be set first"
+        return self.moriio_engine.create_session(
+            local_memory_metadata, remote_memory_metadata
+        )
+
+    def read_remote_data(
+        self, transfer_size_byte, local_offset=0, remote_offset=0, session=None
+    ):
+        assert self.local_memory_registered, "You have not register local memory data!"
+        assert self.moriio_engine is not None, "MoRIIO engine must be set first"
+        transfer_status = session.batch_read(
+            local_offset,
+            remote_offset,
+            transfer_size_byte,
+            self.moriio_engine.allocate_transfer_uid(),
+        )
+
+        return transfer_status
+
+    def write_remote_data(
+        self, transfer_size_byte, local_offset=0, remote_offset=0, session=None
+    ):
+        assert self.local_memory_registered, "You have not register local memory data!"
+        assert self.moriio_engine is not None, "MoRIIO engine must be set first"
+        write_uid = self.moriio_engine.allocate_transfer_uid()
+
+        transfer_status = session.batch_write(
+            local_offset, remote_offset, transfer_size_byte, write_uid
+        )
+        with self.lock:
+            self.transfer_status.append(transfer_status)
+
+    def write_remote_data_single(
+        self, transfer_size_byte, local_offset=0, remote_offset=0, sess_idx=0
+    ):
+        assert self.local_memory_registered, "You have not register local memory data!"
+        assert self.moriio_engine is not None, "MoRIIO engine must be set first"
+        transfer_status = self.sessions[sess_idx].write(
+            local_offset,
+            remote_offset,
+            transfer_size_byte,
+            self.moriio_engine.allocate_transfer_uid(),
+        )
+        with self.lock:
+            self.transfer_status.append(transfer_status)
+
+    def waiting_for_transfer_complete(self):
+        if not self.transfer_status:
+            return
+
+        transfers_to_wait = []
+        with self.lock:
+            transfers_to_wait = self.transfer_status[:]
+            self.transfer_status.clear()
+
+        for status in transfers_to_wait:
+            try:
+                status.Wait()
+                if not status.Succeeded():
+                    logger.error(
+                        "Transfer failed: %s, Code: %s", status.Message(), status.Code()
+                    )
+                    raise TransferError("MoRIIO transfer failed!")
+            except Exception as e:
+                logger.error("Transfer %s failed: %s", status, e)
+                raise
+
+    def async_wait_reqid(self):
+        assert self.notify_port is not None, "Notify port cannot be None"
+
+        if self.notify_thread is not None:
+            return
+
+        def _async_wait():
+            host = "*"
+            path = make_zmq_path("tcp", host, self.notify_port)
+            logger.info("Node starting to listen notify from path = %s", path)
+
+            with zmq_ctx(zmq.ROUTER, path) as sock:
+                while True:
+                    try:
+                        identity, msg = sock.recv_multipart()
+                        self._handle_message(msg)
+                    except Exception as e:
+                        logger.error("Error processing message: %s", e)
+                        raise HandshakeError(f"Error processing message: {e}") from e
+
+        self.notify_thread = threading.Thread(
+            target=_async_wait, daemon=True, name="moriio-notify-listener"
+        )
+        self.notify_thread.start()
+
+    def _handle_message(self, msg: bytes):
+        """Handles incoming messages from remote nodes."""
+        # Handles incoming remote messages:
+        # Prefill Role:
+        #   [write] mode: receives block information (allocation)
+        #   [read]  mode: receives block release messages from decode side
+        # Decode Role:
+        #   [write] mode: receives KV cache write completion notifications
+        handled = False
+        try:
+            data = msgpack.loads(msg)
+            if isinstance(data, dict) and "req_id" in data:
+                self._handle_structured_message(data)
+
+                return
+        except (msgpack.exceptions.ExtraData, msgpack.exceptions.UnpackException):
+            logger.debug("Failed to decode msgpack message, will try as string")
+            pass
+
+        try:
+            msg_str = msg.decode("UTF-8")
+            if msg_str.startswith(MoRIIOConstants.COMPLETION_PREFIX):
+                self._handle_completion_message(msg_str)
+                handled = True
+        except UnicodeDecodeError:
+            logger.warning("Received non-UTF8 message: %s", msg_str)
+        if not handled:
+            raise MoRIIOError(f"Unhandled message format: {msg_str}")
+
+    def _handle_structured_message(self, data: dict):
+        assert get_role() == ROLE.PRODUCER, "Only prefill can get block messages"
+        req_id = data["req_id"]
+        block_notify_list = data.get("block_notify_list", [])
+        decode_dp_rank = data.get("decode_rank", 0)
+        assert len(block_notify_list) > 0, (
+            "block_notify_list cannot be empty in remote allocate message"
+        )
+
+        with self.lock:
+            self.done_remote_allocate_req_dict[req_id] = RemoteAllocInfo(
+                block_ids=block_notify_list, decode_dp_rank=decode_dp_rank
+            )
+
+    def _handle_completion_message(self, msg: str):
+        with self.lock:
+            if get_role() == ROLE.PRODUCER:
+                self.done_req_ids.append(msg)
+            else:
+                self.done_write_cache_req_ids.append(msg)
+
+    def send_notify(self, req_ids, remote_ip, remote_port):
+        if not remote_ip or not remote_port:
+            logger.warning("Missing remote_ip or remote_port for notification")
+            return
+
+        path = make_zmq_path("tcp", remote_ip, remote_port)
+
+        if path not in self.paths:
+            ctx = zmq.Context.instance()
+            sock = make_zmq_socket(
+                ctx=ctx, path=path, socket_type=zmq.DEALER, bind=False
+            )
+            self.paths[path] = sock
+
+        req_list = req_ids if isinstance(req_ids, list) else [req_ids]
+
+        sock = self.paths[path]
+        try:
+            for req_id in req_list:
+                if not isinstance(req_id, str):
+                    logger.warning(
+                        "Invalid req_id type: %s, expected str", type(req_id)
+                    )
+                    continue
+                sock.send(req_id.encode("utf-8"))
+        except Exception as e:
+            logger.error("Failed to send notification to %s: %s", path, e)
+            self.paths.pop(path, None)
+            raise
+
+    def pop_finished_req_ids(self):
+        # producer invocation: get the set of completed requests at the decode
+        with self.lock:
+            done_send = set(self.done_req_ids)
+            self.done_req_ids = []
+        return done_send
+
+    def pop_finished_write_req_ids(self):
+        # Call the consumer in write mode to get the collection after write completion
+        with self.lock:
+            done_write_cache = set(self.done_write_cache_req_ids)
+            self.done_write_cache_req_ids = []
+        return done_write_cache
+
+    def shutdown(self):
+        logger.debug("Closing MoRIIOWrapper and cleaning up ZMQ sockets")
+        for path, sock in self.paths.items():
+            try:
+                sock.close(linger=0)
+                logger.debug("Closed ZMQ socket for path: %s", path)
+            except Exception as e:
+                logger.warning("Error closing ZMQ socket for path %s: %s", path, e)
+        self.paths.clear()
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 682574537495941694ec1c9d3acf9bb0fc79bf8e..412e2c57133fc91f5041656f1c234a300998e5c1 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
@@ -24,6 +23,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
     PromMetricT,
 )
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.outputs import KVConnectorOutput
 
@@ -138,6 +138,12 @@ class MultiConnector(KVConnectorBase_V1):
         # Propagated from scheduler to worker side via the connector metadata.
         self._extra_async_saves: dict[str, int] = {}
 
+    @property
+    def prefer_cross_layer_blocks(self) -> bool:
+        if not self._connectors:
+            return False
+        return all(c.prefer_cross_layer_blocks for c in self._connectors)
+
     @classmethod
     def _get_connector_classes_and_configs(
         cls, vllm_config: "VllmConfig"
@@ -164,6 +170,13 @@ class MultiConnector(KVConnectorBase_V1):
             )
         return ret
 
+    def register_cross_layers_kv_cache(
+        self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend]
+    ):
+        # Register on all connectors
+        for c in self._connectors:
+            c.register_cross_layers_kv_cache(kv_cache, attn_backend)
+
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         for c in self._connectors:
             c.register_kv_caches(kv_caches)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index c780e2c0f931061f090af0604f833acf50cbf79d..7916d1e025002e50cf00afa6f5670d19d6fed124 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -20,10 +20,16 @@ import torch
 import zmq
 
 from vllm import envs
-from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.attention.selector import get_attn_backend
 from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer.kv_connector.utils import TpKVTopology
+from vllm.distributed.kv_transfer.kv_connector.utils import (
+    EngineId,
+    TpKVTopology,
+    get_current_attn_backend,
+    kv_postprocess_blksize_and_layout_on_receive,
+    kv_postprocess_blksize_on_receive,
+    kv_postprocess_layout_on_receive,
+    yield_req_data,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     CopyBlocksOp,
     KVConnectorBase_V1,
@@ -46,6 +52,7 @@ from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.network_utils import make_zmq_path, make_zmq_socket
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.utils import get_kv_cache_layout
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.worker.block_table import BlockTable
@@ -56,7 +63,6 @@ if TYPE_CHECKING:
     from vllm.v1.request import Request
 
 TransferHandle = int
-EngineId = str
 ReqId = str
 
 #
@@ -81,8 +87,12 @@ logger = init_logger(__name__)
 
 # Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
 try:
-    from nixl._api import nixl_agent as NixlWrapper
-    from nixl._bindings import nixlXferTelemetry
+    if not current_platform.is_rocm():
+        from nixl._api import nixl_agent as NixlWrapper
+        from nixl._bindings import nixlXferTelemetry
+    else:
+        from rixl._api import nixl_agent as NixlWrapper
+        from rixl._bindings import nixlXferTelemetry
 
     logger.info("NIXL is available")
 except ImportError:
@@ -92,7 +102,10 @@ except ImportError:
 
 
 try:
-    from nixl._api import nixl_agent_config
+    if not current_platform.is_rocm():
+        from nixl._api import nixl_agent_config
+    else:
+        from rixl._api import nixl_agent_config
 except ImportError:
     nixl_agent_config = None
     logger.warning("NIXL agent config is not available")
@@ -461,7 +474,7 @@ class NixlConnectorScheduler:
         self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
         self.side_channel_port = (
             envs.VLLM_NIXL_SIDE_CHANNEL_PORT
-            + vllm_config.parallel_config.data_parallel_rank
+            + vllm_config.parallel_config.data_parallel_index
         )
         assert vllm_config.kv_transfer_config is not None
         if current_platform.device_type == "cpu":
@@ -482,7 +495,7 @@ class NixlConnectorScheduler:
         # New requests are added by update_state_after_alloc in
         # the scheduler. Used to make metadata passed to Worker.
         self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}
-        self._reqs_need_save: dict[ReqId, tuple[Request, list[int]]] = {}
+        self._reqs_need_save: dict[ReqId, Request] = {}
         # Reqs to send and their expiration time
         self._reqs_need_send: dict[ReqId, float] = {}
         self._reqs_in_batch: set[ReqId] = set()
@@ -628,16 +641,7 @@ class NixlConnectorScheduler:
         if self.use_host_buffer and params.get("do_remote_decode"):
             # NOTE: when accelerator is not directly supported by Nixl,
             # prefilled blocks need to be saved to host memory before transfer.
-
-            # save all blocks
-            block_ids = blocks.get_block_ids()[0]
-            # TODO: skip the blocks that are already in the host xfer buffer.
-            # Currently, the host xfer buffer block is 1-to-1 mapped to device
-            # kv blocks, so host blocks won't be flushed as long as its device
-            # block is not overwritten; and it will be safe to skip saving them
-            # to host xfer buffer.
-            if block_ids:
-                self._reqs_need_save[request.request_id] = (request, block_ids)
+            self._reqs_need_save[request.request_id] = request
         elif params.get("do_remote_prefill"):
             if params.get("remote_block_ids"):
                 if all(
@@ -689,13 +693,32 @@ class NixlConnectorScheduler:
                 kv_transfer_params=req.kv_transfer_params,
             )
 
-        for req_id, (req, block_ids) in self._reqs_need_save.items():
+        # NOTE: For the prefill side, there might be a chance that an early added
+        # request is a chunked prefill, so we need to check if new blocks are added
+        for req_id, new_block_id_groups, _ in yield_req_data(scheduler_output):
+            req_to_save = self._reqs_need_save.get(req_id)
+            if req_to_save is None or new_block_id_groups is None:
+                continue
+            req = req_to_save
+
             assert req.kv_transfer_params is not None
             meta.add_new_req_to_save(
                 request_id=req_id,
-                local_block_ids=block_ids,
+                local_block_ids=new_block_id_groups[0],
                 kv_transfer_params=req.kv_transfer_params,
             )
+            assert scheduler_output.num_scheduled_tokens is not None
+            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
+            is_partial = (
+                req.num_computed_tokens + num_scheduled_tokens
+            ) < req.num_prompt_tokens
+            if not is_partial:
+                # For non-partial prefills, once new req_meta is scheduled, it
+                # can be removed from _reqs_need_save.
+                # For partial prefill case, we will retain the request in
+                # _reqs_need_save until all blocks are scheduled with req_meta.
+                # Therefore, only pop if `not is_partial`.
+                self._reqs_need_save.pop(req_id)
 
         meta.reqs_to_send = self._reqs_need_send
         meta.reqs_in_batch = self._reqs_in_batch
@@ -703,7 +726,6 @@ class NixlConnectorScheduler:
 
         # Clear the list once workers start the transfers
         self._reqs_need_recv.clear()
-        self._reqs_need_save.clear()
         self._reqs_in_batch = set()
         self._reqs_not_processed = set()
         self._reqs_need_send = {}
@@ -749,6 +771,8 @@ class NixlConnectorScheduler:
             # Also include the case of a P/D Prefill request with immediate
             # block free (eg abort). Stop tracking this request.
             self._reqs_not_processed.add(request.request_id)
+            # Clear _reqs_need_save if a request is aborted as partial prefill.
+            self._reqs_need_save.pop(request.request_id, None)
             return False, None
 
         # TODO: check whether block_ids actually ever be 0. If not we could
@@ -873,9 +897,10 @@ class NixlConnectorWorker:
         self.copy_blocks: CopyBlocksOp | None = None
 
         # Map of engine_id -> kv_caches_base_addr. For TP case, each local
-        # rank will still only pull from a single remote TP worker.
-        self.kv_caches_base_addr: dict[EngineId, list[int]] = {}
         self.device_id: int = 0
+        # Current rank may pull from multiple remote TP workers.
+        # EngineId, dict[int, list[int]] -> engine_id, tp_rank, base_addr_for_layer
+        self.kv_caches_base_addr = defaultdict[EngineId, dict[int, list[int]]](dict)
 
         # Number of NIXL regions. Currently one region per cache
         # (so 1 per layer for MLA, otherwise 2 per layer)
@@ -883,10 +908,12 @@ class NixlConnectorWorker:
         self.num_layers = 0
 
         # nixl_prepped_dlist_handle.
-        self.src_xfer_side_handle: int = 0
-        self.src_xfer_side_handles: dict[int, int] = {}
-        # Map of engine_id -> nixl_prepped_dlist_handle (int)].
-        self.dst_xfer_side_handles: dict[EngineId, int] = {}
+        self.src_xfer_handles_by_block_size: dict[int, int] = {}
+        # Populated dynamically during handshake based on remote configuration.
+        # Keep track of regions at different tp_ratio values. tp_ratio->handles
+        self.src_xfer_handles_by_tp_ratio: dict[int, list[int]] = {}
+        # Map of engine_id -> {tp_rank: nixl_prepped_dlist_handle (int)}.
+        self.dst_xfer_side_handles = defaultdict[EngineId, dict[int, int]](dict)
 
         # Map of engine_id -> num_blocks. All ranks in the same deployment will
         # have the same number of blocks.
@@ -930,13 +957,10 @@ class NixlConnectorWorker:
         self.block_window_per_layer: list[int | None] = []
         self.use_mla = self.model_config.use_mla
 
-        backend = get_attn_backend(
-            self.model_config.get_head_size(),
-            self.model_config.dtype,
-            self.cache_config.cache_dtype,
-            self.block_size,
-            use_mla=self.use_mla,
-        )
+        # Get the attention backend from the first layer
+        # NOTE (NickLucche) models with multiple backends are not supported yet
+        backend = get_current_attn_backend(vllm_config)
+
         self.backend_name = backend.get_name()
         self.kv_cache_layout = get_kv_cache_layout()
         self.host_buffer_kv_cache_layout = self.kv_cache_layout
@@ -966,7 +990,6 @@ class NixlConnectorWorker:
             total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
             attn_backend=backend,
         )
-        self._use_pallas = self.kv_topo._use_pallas
         self._physical_blocks_per_logical_kv_block = 1
 
     def _nixl_handshake(
@@ -977,103 +1000,101 @@ class NixlConnectorWorker:
         expected_engine_id: str,
     ) -> dict[int, str]:
         """Do a NIXL handshake with a remote instance."""
-
-        start_time = time.perf_counter()
-
-        # NOTE(rob): we need each rank to have a unique port. This is
-        # a hack to keep us moving. We will switch when moving to etcd
-        # or where we have a single ZMQ socket in the scheduler.
-
-        # Handshake only with the remote TP rank that current local rank will
-        # pull from. With homogeneous TP it happens to be the same rank_i.
-        p_remote_rank = self.kv_topo.get_target_remote_rank(remote_tp_size)
+        # When target instance TP > local TP, we need to perform multiple
+        # handshakes. Do it in a single background job for simplicity.
+        # Regardless, only handshake with the remote TP rank(s) that current
+        # local rank will read from. Note that With homogeneous TP,
+        # this happens to be the same single rank_i.
+        p_remote_ranks = self.kv_topo.get_target_remote_ranks(remote_tp_size)
+        remote_rank_to_agent_name = {}
         path = make_zmq_path("tcp", host, port)
-        logger.debug(
-            "Querying metadata on path: %s at remote tp rank %s", path, p_remote_rank
-        )
 
-        # Send query for the request.
         with zmq_ctx(zmq.REQ, path) as sock:
-            msg = msgspec.msgpack.encode((GET_META_MSG, p_remote_rank))
-            # Set receive timeout to 5 seconds to avoid hanging on dead server
-            sock.setsockopt(zmq.RCVTIMEO, 5000)  # milliseconds
-            sock.send(msg)
-            handshake_bytes = sock.recv()
-
-            # Decode handshake payload to get compatibility hash
-            handshake_decoder = msgspec.msgpack.Decoder(NixlHandshakePayload)
-            try:
-                handshake_payload = handshake_decoder.decode(handshake_bytes)
-            except (msgspec.DecodeError, msgspec.ValidationError) as e:
-                raise RuntimeError(
-                    f"Failed to decode NixlHandshakePayload. This likely indicates "
-                    f"an incompatibility between connector version. Error: {e}"
-                ) from e
-
-            got_metadata_time = time.perf_counter()
-            logger.debug(
-                "NIXL handshake: get metadata took: %s", got_metadata_time - start_time
-            )
-
-            # Check compatibility hash BEFORE decoding agent metadata
-            if (
-                self.enforce_compat_hash
-                and handshake_payload.compatibility_hash != self.compat_hash
-            ):
-                raise RuntimeError(
-                    f"NIXL compatibility hash mismatch. "
-                    f"Local: {self.compat_hash}, "
-                    f"Remote: {handshake_payload.compatibility_hash}. "
-                    f"Prefill and decode instances have incompatible configurations. "
-                    f"This may be due to: different vLLM versions, models, dtypes, "
-                    f"KV cache layouts, attention backends, etc. "
-                    f"Both instances must use identical configurations."
-                    f"Disable this check using "
-                    f'--kv-transfer-config \'{{"kv_connector_extra_config": '
-                    f'{{"enforce_handshake_compat": false}}}}\''
+            for remote_rank in p_remote_ranks:
+                logger.debug(
+                    "Querying metadata on path: %s at remote tp rank %s",
+                    path,
+                    remote_rank,
                 )
 
-            logger.info(
-                "NIXL compatibility check passed (hash: %s)",
-                handshake_payload.compatibility_hash,
-            )
+                start_time = time.perf_counter()
+                # Send query for the request.
+                msg = msgspec.msgpack.encode((GET_META_MSG, remote_rank))
+                # Set receive timeout to 5 seconds to avoid hanging on dead server
+                sock.setsockopt(zmq.RCVTIMEO, 5000)  # milliseconds
+                sock.send(msg)
+                handshake_bytes = sock.recv()
 
-            # Decode agent metadata
-            metadata_decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
-            try:
-                metadata = metadata_decoder.decode(
-                    handshake_payload.agent_metadata_bytes
+                # Decode handshake payload to get compatibility hash
+                handshake_decoder = msgspec.msgpack.Decoder(NixlHandshakePayload)
+                try:
+                    handshake_payload = handshake_decoder.decode(handshake_bytes)
+                except (msgspec.DecodeError, msgspec.ValidationError) as e:
+                    raise RuntimeError(
+                        f"Failed to decode NixlHandshakePayload. This likely indicates "
+                        f"an incompatibility between connector version. Error: {e}"
+                    ) from e
+
+                got_metadata_time = time.perf_counter()
+                logger.debug(
+                    "NIXL handshake: get metadata took: %s",
+                    got_metadata_time - start_time,
                 )
-            except (msgspec.DecodeError, msgspec.ValidationError) as e:
-                # This should not happen if hash matched
-                raise RuntimeError(
-                    f"Failed to decode NixlAgentMetadata. Error: {e}"
-                ) from e
 
-            # Ensure engine id matches.
-            if metadata.engine_id != expected_engine_id:
-                raise RuntimeError(
-                    f"Remote NIXL agent engine ID mismatch. "
-                    f"Expected {expected_engine_id},"
-                    f"received {metadata.engine_id}."
-                )
+                # Check compatibility hash BEFORE decoding agent metadata
+                if (
+                    self.enforce_compat_hash
+                    and handshake_payload.compatibility_hash != self.compat_hash
+                ):
+                    raise RuntimeError(
+                        f"NIXL compatibility hash mismatch. "
+                        f"Local: {self.compat_hash}, "
+                        f"Remote: {handshake_payload.compatibility_hash}. "
+                        f"Prefill and decode instances have incompatible "
+                        f"configurations. This may be due to: different vLLM versions,"
+                        f" models, dtypes, KV cache layouts, attention backends, etc. "
+                        f"Both instances must use identical configurations."
+                        f"Disable this check using "
+                        f'--kv-transfer-config \'{{"kv_connector_extra_config": '
+                        f'{{"enforce_handshake_compat": false}}}}\''
+                    )
 
-            # Register Remote agent.
-            assert metadata.block_size <= self.block_size, (
-                "nP > nD is not supported yet."
-            )
-            remote_agent_name = self.add_remote_agent(
-                metadata, p_remote_rank, remote_tp_size
-            )
+                logger.info(
+                    "NIXL compatibility check passed (hash: %s)",
+                    handshake_payload.compatibility_hash,
+                )
 
-            setup_agent_time = time.perf_counter()
-            logger.debug(
-                "NIXL handshake: add agent took: %s",
-                setup_agent_time - got_metadata_time,
-            )
+                # Decode agent metadata
+                metadata_decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
+                try:
+                    metadata = metadata_decoder.decode(
+                        handshake_payload.agent_metadata_bytes
+                    )
+                except (msgspec.DecodeError, msgspec.ValidationError) as e:
+                    # This should not happen if hash matched
+                    raise RuntimeError(
+                        f"Failed to decode NixlAgentMetadata. Error: {e}"
+                    ) from e
+
+                # Ensure engine id matches.
+                if metadata.engine_id != expected_engine_id:
+                    raise RuntimeError(
+                        f"Remote NIXL agent engine ID mismatch. "
+                        f"Expected {expected_engine_id},"
+                        f"received {metadata.engine_id}."
+                    )
+                setup_agent_time = time.perf_counter()
 
-        # Remote rank -> agent name.
-        return {p_remote_rank: remote_agent_name}
+                # Register Remote agent.
+                remote_agent_name = self.add_remote_agent(
+                    metadata, remote_rank, remote_tp_size
+                )
+                logger.debug(
+                    "NIXL handshake: add agent took: %s",
+                    setup_agent_time - got_metadata_time,
+                )
+                remote_rank_to_agent_name[remote_rank] = remote_agent_name
+        return remote_rank_to_agent_name
 
     def initialize_host_xfer_buffer(self, kv_caches: dict[str, torch.Tensor]) -> None:
         """
@@ -1131,6 +1152,50 @@ class NixlConnectorWorker:
         assert self.use_host_buffer
         self.copy_blocks = copy_operation
 
+    def _log_failure(
+        self,
+        failure_type: str,
+        req_id: str | None,
+        msg: str = "",
+        error: Exception | None = None,
+        meta: ReqMeta | None = None,
+        **extra_context,
+    ):
+        """Log transfer failure with structured context for easier debugging."""
+        context: dict[str, Any] = {
+            "failure_type": failure_type,
+            "request_id": req_id,
+            "engine_id": self.engine_id,
+        }
+        if meta is None and req_id is not None:
+            # Try to get metadata from in progress transfers when not provided
+            meta = self._recving_metadata.get(req_id)
+
+        if meta and meta.remote:
+            context.update(
+                {
+                    "remote_engine_id": meta.remote.engine_id,
+                    "remote_request_id": meta.remote.request_id,
+                    "remote_host": meta.remote.host,
+                    "remote_port": meta.remote.port,
+                    "num_local_blocks": len(meta.local_block_ids),
+                    "num_remote_blocks": len(meta.remote.block_ids),
+                    "local_block_ids_sample": meta.local_block_ids[:10],
+                }
+            )
+
+        context.update(extra_context)
+        if msg:
+            failure_type = f"{failure_type}. {msg}"
+
+        logger.error(
+            "NIXL transfer failure: %s | Context: %s",
+            failure_type,
+            context,
+            exc_info=error is not None,
+            stacklevel=2,
+        )
+
     def _background_nixl_handshake(
         self, req_id: str, remote_engine_id: EngineId, meta: ReqMeta
     ):
@@ -1152,8 +1217,13 @@ class NixlConnectorWorker:
                     del self._handshake_futures[eid]
                     try:
                         self._remote_agents[eid] = f.result()
-                    except Exception:
-                        logger.exception("Handshake with %s failed", eid)
+                    except Exception as e:
+                        self._log_failure(
+                            failure_type="handshake_setup_failed",
+                            req_id=None,
+                            error=e,
+                            remote_engine_id=eid,
+                        )
 
             fut.add_done_callback(done_callback)
 
@@ -1163,10 +1233,13 @@ class NixlConnectorWorker:
                 # check if handshake succeeded
                 f.result()
                 self._ready_requests.put(entry)
-            except Exception:
+            except Exception as e:
                 # handshake failed - mark blocks as invalid
-                logger.exception(
-                    "Handshake failed for request %s, marking blocks as invalid", req_id
+                self._log_failure(
+                    failure_type="handshake_failed",
+                    req_id=req_id,
+                    error=e,
+                    meta=meta,
                 )
                 if req_meta := self._recving_metadata.get(req_id):
                     self._invalid_block_ids.update(req_meta.local_block_ids)
@@ -1283,7 +1356,7 @@ class NixlConnectorWorker:
         assert len(self.block_len_per_layer) == len(seen_base_addresses)
         assert self.num_blocks != 0
 
-        self.kv_caches_base_addr[self.engine_id] = seen_base_addresses
+        self.kv_caches_base_addr[self.engine_id][self.tp_rank] = seen_base_addresses
         self.num_regions = len(caches_data)
         self.num_layers = len(xfer_buffers.keys())
 
@@ -1310,8 +1383,9 @@ class NixlConnectorWorker:
 
         # Register local/src descr for NIXL xfer.
         self.seen_base_addresses = seen_base_addresses
-        self.src_xfer_side_handle = self.register_local_xfer_handler(self.block_size)
-        self.src_xfer_side_handles[self.block_size] = self.src_xfer_side_handle
+        self.src_xfer_handles_by_block_size[self.block_size], self.src_blocks_data = (
+            self.register_local_xfer_handler(self.block_size)
+        )
 
         # TODO(mgoin): Hybrid memory allocator is currently disabled for
         # models with local attention (Llama 4). Can remove this once enabled.
@@ -1339,8 +1413,8 @@ class NixlConnectorWorker:
         agent_metadata = NixlAgentMetadata(
             engine_id=self.engine_id,
             agent_metadata=self.nixl_wrapper.get_agent_metadata(),
-            kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
             device_id=self.device_id,
+            kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id][self.tp_rank],
             num_blocks=self.num_blocks,
             block_lens=self.block_len_per_layer,
             kv_cache_layout=self.kv_cache_layout
@@ -1358,7 +1432,7 @@ class NixlConnectorWorker:
     def register_local_xfer_handler(
         self,
         block_size: int,
-    ) -> int:
+    ) -> tuple[int, list[tuple[int, int, int]]]:
         """
         Function used for register local xfer handler with local block_size or
         Remote block_size.
@@ -1406,7 +1480,7 @@ class NixlConnectorWorker:
 
         descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
         # NIXL_INIT_AGENT to be used for preparations of local descs.
-        return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs)
+        return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs), blocks_data
 
     def add_remote_agent(
         self,
@@ -1420,10 +1494,12 @@ class NixlConnectorWorker:
 
         In particular, handle both homogeneous and heterogeneous TP. The former
         requires local rank_i to read from remote rank_i.
-        The latter, assuming D.world_size > P.world_size, requires that two or
-        more local TP worker share the xfer from a single TP worker.
+        The latter, in the case of D.world_size < P.world_size, requires that a
+        local (D) TP worker reads from multiple remote (P) TP workers.
+        Conversely, assuming D.world_size > P.world_size, two or more local TP
+        workers will read from a single remote TP worker.
 
-        Here's an example (non-MLA case):
+        Here's an example for the last case described above (non-MLA):
 
         rank_offset     p_remote_tp_rank
         (kv split no)
@@ -1473,9 +1549,6 @@ class NixlConnectorWorker:
             nixl_agent_meta.agent_metadata
         )
 
-        # Handle tp_size>num_kv_heads: replicate KV cache.
-        replicates_kv_cache = self.kv_topo.replicates_kv_cache(engine_id)
-
         # Create dst descs and xfer side handles. TP workers have same #blocks
         # so we only register once per engine_id.
         # Example:
@@ -1489,14 +1562,52 @@ class NixlConnectorWorker:
             self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
 
         # Keep track of remote agent kv caches base addresses.
-        self.kv_caches_base_addr[engine_id] = nixl_agent_meta.kv_caches_base_addr
-
+        self.kv_caches_base_addr[engine_id][remote_tp_rank] = (
+            nixl_agent_meta.kv_caches_base_addr
+        )
         self._validate_remote_agent_handshake(nixl_agent_meta, remote_tp_size)
 
-        # Number of D TP workers reading from a single P TP worker. This is
-        # 1 when P and D `--tensor-parallel-size` match.
+        # This is 1 when P and D `--tensor-parallel-size` match. Otherwise,
+        # this is the ratio between the two sizes.
         tp_ratio = self.kv_topo.tp_ratio_from_engine_id(engine_id)
 
+        # Handle tp_size>num_kv_heads: replicate KV cache.
+        indexes_into_remote = (
+            not self.kv_topo.replicates_kv_cache(engine_id) and tp_ratio > 0
+        )
+
+        logger.debug(
+            "Registering remote agent (%s, rank %s) memory regions with tp_ratio %s",
+            engine_id,
+            remote_tp_rank,
+            tp_ratio,
+        )
+
+        ### (Optional) Register local agent memory regions. MLA is not split.
+        if (
+            tp_ratio < 0
+            and not self.use_mla
+            and tp_ratio not in self.src_xfer_handles_by_tp_ratio
+        ):
+            # Remote tp_size > local tp_size: read from multiple remote ranks.
+            # Logically "split" own regions into |tp_ratio| chunks. Mind that
+            # we only do this once per remote tp_size (replica-friendly).
+            self.src_xfer_handles_by_tp_ratio[tp_ratio] = []
+            for i in range(-tp_ratio):
+                blocks_data = []
+                for memory_region in self.src_blocks_data:
+                    addr, local_block_len, own_tp_rank = memory_region
+                    # Computing block len layer by layer allows for different
+                    # block sizes to be used.
+                    remote_block_len = local_block_len // (-tp_ratio)
+                    addr = addr + i * remote_block_len
+                    blocks_data.append((addr, remote_block_len, own_tp_rank))
+                descs = self.nixl_wrapper.get_xfer_descs(
+                    blocks_data, self.nixl_memory_type
+                )
+                handle = self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs)
+                self.src_xfer_handles_by_tp_ratio[tp_ratio].append(handle)
+
         ### Register remote agent memory regions
         blocks_data = []
         # With homogeneous TP, D pulls the whole kv cache from corresponding
@@ -1506,14 +1617,19 @@ class NixlConnectorWorker:
 
         # Register all remote blocks, but only the corresponding kv heads.
         for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr):
-            kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i)
-            remote_kv_block_len = kv_block_len // block_size_ratio
+            # Read our whole local region size from remote.
+            local_block_len = self.get_backend_aware_kv_block_len(layer_idx=i)
+            remote_kv_block_len = local_block_len // block_size_ratio
             if block_size_ratio > 1:
                 # using remote kv_block_len as transfer unit
-                kv_block_len = remote_kv_block_len
+                local_block_len = remote_kv_block_len
+
+            if tp_ratio < 0 and not self.use_mla:
+                # Remote tp is bigger: read a chunk of local region from remote
+                local_block_len = local_block_len // (-tp_ratio)
             rank_offset = (
                 self.tp_rank % tp_ratio * remote_kv_block_len
-                if not replicates_kv_cache
+                if indexes_into_remote
                 else 0
             )
             for block_id in range(nixl_agent_meta.num_blocks):
@@ -1523,7 +1639,7 @@ class NixlConnectorWorker:
                 # self.block_len == remote_block_len//tp_ratio bytes.
                 addr = base_addr + block_offset + rank_offset
                 # (addr, len, device id)
-                blocks_data.append((addr, kv_block_len, nixl_agent_meta.device_id))
+                blocks_data.append((addr, local_block_len, nixl_agent_meta.device_id))
 
             if self.kv_topo.is_kv_layout_blocks_first:
                 # With FlashInfer index V separately to allow head splitting.
@@ -1532,7 +1648,7 @@ class NixlConnectorWorker:
                     addr = base_addr + block_offset + rank_offset
                     v_addr = addr + nixl_agent_meta.block_lens[i] // 2
                     blocks_data.append(
-                        (v_addr, kv_block_len, nixl_agent_meta.device_id)
+                        (v_addr, local_block_len, nixl_agent_meta.device_id)
                     )
 
         logger.debug(
@@ -1545,15 +1661,15 @@ class NixlConnectorWorker:
 
         # Register with NIXL.
         descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
-        self.dst_xfer_side_handles[engine_id] = self.nixl_wrapper.prep_xfer_dlist(
-            remote_agent_name, descs
+        self.dst_xfer_side_handles[engine_id][remote_tp_rank] = (
+            self.nixl_wrapper.prep_xfer_dlist(remote_agent_name, descs)
         )
 
         if block_size_ratio > 1:
             # when prefill with smaller block_size, we need to init a
             # new handler with same block_len to match
-            self.src_xfer_side_handles[nixl_agent_meta.block_size] = (
-                self.register_local_xfer_handler(nixl_agent_meta.block_size)
+            self.src_xfer_handles_by_block_size[nixl_agent_meta.block_size] = (
+                self.register_local_xfer_handler(nixl_agent_meta.block_size)[0]
             )
 
         return remote_agent_name
@@ -1573,10 +1689,9 @@ class NixlConnectorWorker:
         block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
             remote_engine_id
         )
-        assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"
-        assert not self._use_pallas or tp_ratio == 1, (
-            "TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
-        )
+        # Num kv_heads > tp_size and P TP > D TP case, not supported
+        assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id))
+
         kv_cache_layout = (
             self.kv_cache_layout
             if not self.use_host_buffer
@@ -1615,17 +1730,29 @@ class NixlConnectorWorker:
                     "All remote layers must have the same block size"
                 )
 
-            assert (
-                remote_block_len
-                == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio
-            ), (
-                "Remote P worker KV layer cache must be of shape [2, N, "
-                "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
-            )
+            if tp_ratio > 0:
+                # Remote tp is smaller: remote block_len size is bigger
+                assert (
+                    remote_block_len
+                    == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio
+                ), (
+                    "Remote P worker KV layer cache must be of shape [2, N, "
+                    "local_kv_heads*tp_ratio, page_size, head_dim] and same dtype."
+                )  # noqa: E501
+            else:
+                assert block_size_ratio == 1, (
+                    "Different local/remote block sizes are not supported when"
+                    " P TP > D TP."
+                )
+                # Remote tp is bigger: remote block_len size is smaller
+                assert remote_block_len == self.block_len_per_layer[0] // (-tp_ratio), (
+                    "Remote P worker KV layer cache must be of shape [2, N, "
+                    "local_kv_heads/tp_ratio, page_size, head_dim] and same dtype."
+                )  # noqa: E501
 
-        # TP workers have same #blocks.
+        # TP workers that handhshake with same remote have same #blocks.
         assert self.dst_num_blocks[remote_engine_id] == nixl_agent_meta.num_blocks
-
+        # Same number of regions/~layers.
         assert len(nixl_agent_meta.kv_caches_base_addr) == len(self.block_len_per_layer)
 
     def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta):
@@ -1674,90 +1801,62 @@ class NixlConnectorWorker:
                 "d2h",
             )
 
-    def permute_device_kv(self, block_ids: list[int]):
-        """Transforms the layout of received KV cache blocks to the local format.
-
-        This method corrects layout mismatches from direct memory copies by
-        permuting the tensor dimensions.
-
-        - **Source Layout:** `[num_blocks, n_kv_head, block_size, head_dim]`
-        - **Target Layout:** `[num_blocks, block_size, n_kv_head, head_dim]`
-
-        Args:
-            block_ids: A list of block IDs to update and permute.
+    def post_process_device_kv_on_receive(
+        self,
+        block_size_ratio: int,
+        block_ids_list: list[list[int]],
+    ):
+        """
+        Post process device kv cache after receiving from remote.
 
-        Implementation:
-        - x = blocks_to_update.reshape(src_shape) # view local kv with sender layout
-        - permuted_blocks = x.permute(*inv_order) # transpose n_kv_heads, block_size
-        - cache.index_copy_(0, indices, permuted_blocks) # copy permuted kv back
+        3 types of post processing supported:
+            * kv_cache_postprocess_layout => convert from HND to NHD
+            * kv_cache_postprocess_blksize => convert from small block size
+              to large block size
+            * kv_cache_postprocess_blksize_and_layout => convert from small
+              block size to large block size and convert from HND to NHD
 
         """
-        split_k_and_v = self.kv_topo.split_k_and_v
-        inv_order = [0, 2, 1, 3]
-        sample_cache = list(self.device_kv_caches.values())[0][0]
-        target_shape = list(sample_cache.shape)
-        target_shape[0] = -1
-        src_shape = tuple(target_shape[i] for i in inv_order)
-        indices = torch.tensor(block_ids, device=sample_cache.device)
-
-        for _, cache_or_caches in self.device_kv_caches.items():
-            cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
-            for cache in cache_list:
-                blocks_to_update = cache.index_select(0, indices)
-                permuted_blocks = blocks_to_update.reshape(src_shape).permute(
-                    *inv_order
-                )
-                cache.index_copy_(0, indices, permuted_blocks)
-
-    def blocksize_post_process(self, block_ids_per_ratio: dict[float, list[list[int]]]):
-        def _process_local_gt_remote(blocks_to_update, block_size_ratio):
-            n_kv_heads, block_size, head_size = blocks_to_update.shape[1:]
-            remote_block_size = block_size // block_size_ratio
-            n_blocks = block_size_ratio
-            # actual permute is to convert
-            # for local blocksize > remote blocksize
-            # ex: local blocksize = 16 tokens, remote blocksize = 4 tokens
-            # local block[0] = remote block[0, 1, 2, 3]
-            # remote is |h0-b0|h1-b0|h2-b0|h3-b0|h0-b1|h1-b1|h2-b1|h3-b1|...
-            # local is  |h0-b0..................|h1-b0..................|...
-            # permute is to:
-            # 1. view => view remote as n_blocks * remote_shape(H,remoteN,D)
-            # 2. permute => (H, nblocks, remoteN, D)
-            # 3. flatten => (H, localN, D)
-            permuted_blocks = (
-                blocks_to_update.reshape(
-                    -1, n_blocks, n_kv_heads, remote_block_size, head_size
-                )
-                .permute(0, 2, 1, 3, 4)
-                .flatten(2, 3)
-            )
-            return permuted_blocks
-
         if len(self.device_kv_caches) == 0:
             return
-        split_k_and_v = not (
-            self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first
-        )
-        sample_cache = list(self.device_kv_caches.values())[0][0]
-        for block_size_ratio, block_ids_list in block_ids_per_ratio.items():
-            assert block_size_ratio > 1, "Only nP < nD supported currently."
-            block_ids_list = [[item for sublist in block_ids_list for item in sublist]]
-
-            for block_ids in block_ids_list:
-                indices = torch.tensor(block_ids, device=sample_cache.device)
-
-                for _, cache_or_caches in self.device_kv_caches.items():
-                    cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
-                    for cache in cache_list:
-                        blocks_to_update = cache.index_select(0, indices)
-                        # because kv_cache is always using original layout NHD as
-                        # virtual shape while stride can be either HND / NHD at
-                        # initialization.
-                        # we need to firstly get physical view of the tensor
-                        permuted_blocks = _process_local_gt_remote(
-                            blocks_to_update.permute(0, 2, 1, 3), block_size_ratio
-                        ).permute(0, 2, 1, 3)
-                        cache.index_copy_(0, indices, permuted_blocks)
+        assert block_size_ratio >= 1, "Only nP < nD supported currently."
+        if self.enable_permute_local_kv and block_size_ratio > 1:
+            logger.debug(
+                "Post-processing device kv cache on receive by converting "
+                "block_size with %sx bigger and permuting layout from HND"
+                " to NHD.",
+                block_size_ratio,
+            )
+        elif self.enable_permute_local_kv:
+            logger.debug(
+                "Post-processing device kv cache on receive by permuting layout"
+                "from HND to NHD."
+            )
+        else:
+            logger.debug(
+                "Post-processing device kv cache on receive by converting "
+                "block_size with %sx bigger.",
+                block_size_ratio,
+            )
+
+        split_k_and_v = not (self.use_mla or self.kv_topo.is_kv_layout_blocks_first)
+
+        for block_ids in block_ids_list:
+            indices = torch.tensor(block_ids, device=self.device_type, dtype=torch.long)
+
+            for _, cache_or_caches in self.device_kv_caches.items():
+                cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
+                for cache in cache_list:
+                    if self.enable_permute_local_kv and block_size_ratio > 1:
+                        kv_postprocess_blksize_and_layout_on_receive(
+                            cache, indices, block_size_ratio
+                        )
+                    elif self.enable_permute_local_kv:
+                        kv_postprocess_layout_on_receive(cache, indices)
+                    else:
+                        kv_postprocess_blksize_on_receive(
+                            cache, indices, block_size_ratio
+                        )
 
     def get_finished(self) -> tuple[set[str], set[str]]:
         """
@@ -1781,7 +1880,6 @@ class NixlConnectorWorker:
                 len(done_recving),
             )
 
-        block_ids_to_permute = []
         block_ids_for_blocksize_post_process = defaultdict(list)
         for req_id in done_recving:
             # clean up metadata for completed requests
@@ -1790,24 +1888,22 @@ class NixlConnectorWorker:
             assert meta.remote is not None
             if self.use_host_buffer:
                 self.sync_recved_kv_to_device(req_id, meta)
-            if self.enable_permute_local_kv:
-                block_ids_to_permute += meta.local_physical_block_ids
 
             # post processing for heteroblocksize
             block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
                 meta.remote.engine_id
             )
-            if (
-                not self.use_mla
-                and block_size_ratio > 1
-                and self.kv_cache_layout == "HND"
+            if not self.use_mla and (
+                block_size_ratio > 1 or self.enable_permute_local_kv
             ):
                 block_ids_for_blocksize_post_process[block_size_ratio].append(
-                    meta.local_block_ids
+                    meta.local_physical_block_ids
                 )
-        self.blocksize_post_process(block_ids_for_blocksize_post_process)
-        if len(block_ids_to_permute) > 0:
-            self.permute_device_kv(block_ids_to_permute)
+        for (
+            block_size_ratio,
+            block_ids_list,
+        ) in block_ids_for_blocksize_post_process.items():
+            self.post_process_device_kv_on_receive(block_size_ratio, block_ids_list)
 
         # Handle timeout to avoid stranding blocks on remote.
         now = time.perf_counter()
@@ -1839,7 +1935,7 @@ class NixlConnectorWorker:
         notified_req_ids: set[str] = set()
         for notifs in self.nixl_wrapper.get_new_notifs().values():
             for notif in notifs:
-                req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
+                req_id, tp_size = notif.decode("utf-8").rsplit(":", 1)
                 if (
                     req_id not in self._reqs_to_send
                     and req_id not in self._reqs_to_process
@@ -1852,9 +1948,22 @@ class NixlConnectorWorker:
                     )
                     continue
 
+                # NOTE: `tp_ratio` is the opposite when swapping local<>remote
+                n_consumers = int(tp_size)
+                tp_ratio = self.kv_topo.tp_ratio(n_consumers)
+
+                # Number of reads *per producer* to wait for.
+                # When remote D TP > local P TP we expect `tp_ratio` reads.
+                consumers_per_producer = (
+                    -tp_ratio if n_consumers > self.world_size else 1
+                )
+
                 self.consumer_notification_counts_by_req[req_id] += 1
                 # Wait all consumers (D) to be done reading before freeing.
-                if self.consumer_notification_counts_by_req[req_id] == int(tp_ratio):
+                if (
+                    self.consumer_notification_counts_by_req[req_id]
+                    == consumers_per_producer
+                ):
                     notified_req_ids.add(req_id)
                     del self.consumer_notification_counts_by_req[req_id]
                     self._reqs_to_process.remove(req_id)
@@ -1871,7 +1980,7 @@ class NixlConnectorWorker:
         """
         done_req_ids: set[str] = set()
         for req_id, handles in list(transfers.items()):
-            in_progress = False
+            in_progress = []
             for handle in handles:
                 try:
                     xfer_state = self.nixl_wrapper.check_xfer_state(handle)
@@ -1881,29 +1990,31 @@ class NixlConnectorWorker:
                         self.xfer_stats.record_transfer(res)
                         self.nixl_wrapper.release_xfer_handle(handle)
                     elif xfer_state == "PROC":
-                        in_progress = True
+                        in_progress.append(handle)
                         continue
                     else:
-                        logger.error(
-                            "NIXL transfer failed for request %s with state "
-                            "%s. Marking blocks as invalid.",
-                            req_id,
-                            xfer_state,
+                        self._log_failure(
+                            failure_type="transfer_failed",
+                            msg="Marking blocks as invalid",
+                            req_id=req_id,
+                            xfer_state=xfer_state,
                         )
                         self._handle_failed_transfer(req_id, handle)
-                        in_progress = False
-                except Exception:
-                    logger.exception(
-                        "NIXL transfer exception for request %s. "
-                        "Marking blocks as invalid.",
-                        req_id,
+                except Exception as e:
+                    self._log_failure(
+                        failure_type="transfer_exception",
+                        msg="Marking blocks as invalid",
+                        req_id=req_id,
+                        error=e,
                     )
                     self._handle_failed_transfer(req_id, handle)
-                    in_progress = False
 
             if not in_progress:
+                # Only report request as completed when all transfers are done.
                 done_req_ids.add(req_id)
                 del transfers[req_id]
+            else:
+                transfers[req_id] = in_progress
         return done_req_ids
 
     def _handle_failed_transfer(self, req_id: str, handle: int):
@@ -1915,9 +2026,9 @@ class NixlConnectorWorker:
             req_id: The request ID.
             handle: The transfer handle.
         """
-        if meta := self._recving_metadata.pop(req_id, None):
+        # Use .get() here as the metadata cleanup is handled by get_finished()
+        if meta := self._recving_metadata.get(req_id):
             self._invalid_block_ids.update(meta.local_block_ids)
-        self._recving_metadata.pop(req_id, None)
         self.nixl_wrapper.release_xfer_handle(handle)
         self.xfer_stats.record_failed_transfer()
 
@@ -1981,18 +2092,62 @@ class NixlConnectorWorker:
 
     def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
         assert meta.remote is not None
-        logger.debug(
-            "Remote agent %s available, calling _read_blocks for req %s",
-            meta.remote.engine_id,
-            req_id,
-        )
-        self._read_blocks(
-            request_id=req_id,
-            dst_engine_id=meta.remote.engine_id,
-            remote_request_id=meta.remote.request_id,
-            local_block_ids=meta.local_physical_block_ids,
-            remote_block_ids=meta.remote.block_ids,
+        remote_ranks = self.kv_topo.get_target_remote_ranks_from_engine_id(
+            meta.remote.engine_id
         )
+        tp_ratio = self.kv_topo.tp_ratio_from_engine_id(meta.remote.engine_id)
+        # D may have to perform multiple reads from different remote ranks.
+        for i, remote_rank in enumerate(remote_ranks):
+            if self.use_mla and tp_ratio < 0 and i > 0:
+                # MLA opt: when P TP > D TP, only a single read is executed for
+                # the first remote rank (cache is duplicated)..
+                break
+
+            remote_block_size = self.kv_topo.remote_block_size[meta.remote.engine_id]
+            logger.debug(
+                "Remote agent %s available, calling _read_blocks"
+                " on remote rank %s with remote block size %s for req %s",
+                meta.remote.engine_id,
+                remote_rank,
+                remote_block_size,
+                req_id,
+            )
+            # Get side handles.
+            if tp_ratio < 0 and not self.use_mla:
+                assert remote_block_size == self.block_size
+                # Remote tp_size > local tp_size: we must perform multiple
+                # reads. Get the memory chunk onto which we will write to.
+                local_xfer_side_handle = self.src_xfer_handles_by_tp_ratio[tp_ratio][i]
+            else:
+                # Single read from remote, we write to the whole memory region.
+                # Also handle remote block size different from local block size.
+                local_xfer_side_handle = self.src_xfer_handles_by_block_size[
+                    remote_block_size
+                ]
+
+            # Destination handle: remote_engine_id -> remote_rank -> handle.
+            remote_xfer_side_handle = self.dst_xfer_side_handles[meta.remote.engine_id][
+                remote_rank
+            ]
+            self._read_blocks(
+                request_id=req_id,
+                dst_engine_id=meta.remote.engine_id,
+                remote_request_id=meta.remote.request_id,
+                local_block_ids=meta.local_physical_block_ids,
+                remote_block_ids=meta.remote.block_ids,
+                remote_rank=remote_rank,
+                local_xfer_side_handle=local_xfer_side_handle,
+                remote_xfer_side_handle=remote_xfer_side_handle,
+            )
+
+            if self.use_mla and tp_ratio < 0:
+                # ..but we still need to notify the other remote ranks that we
+                # have the blocks we need so they can update the request state.
+                notif_id = f"{req_id}:{self.world_size}".encode()
+                remote_agents = self._remote_agents[meta.remote.engine_id]
+                for rank_to_notify, agent in remote_agents.items():
+                    if rank_to_notify != remote_rank:
+                        self.nixl_wrapper.send_notif(agent, notif_msg=notif_id)
 
     def _read_blocks(
         self,
@@ -2001,7 +2156,14 @@ class NixlConnectorWorker:
         dst_engine_id: str,
         request_id: str,
         remote_request_id: str,
+        remote_rank: int,
+        local_xfer_side_handle: int,
+        remote_xfer_side_handle: int,
     ):
+        """
+        Post a READ point-to-point xfer request from a single local worker to
+        a single remote worker.
+        """
         block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id)
         if block_size_ratio > 1:
             local_block_ids = self.get_mapped_blocks(
@@ -2030,27 +2192,27 @@ class NixlConnectorWorker:
         # saturate IB with heterogeneous TP sizes. We should remove the staging
         # blocks until we are ready.
 
-        # Number of D TP workers that will read from dst P. Propagate tp_ratio
+        # Number of D TP workers that will read from dst P. Propagate info
         # on notification so that dst worker can wait before freeing blocks.
-        tp_ratio = self.kv_topo.tp_ratio_from_engine_id(dst_engine_id)
-        notif_id = f"{remote_request_id}:{tp_ratio}".encode()
+        notif_id = f"{remote_request_id}:{self.world_size}".encode()
 
         # Full prefix cache hit: do not need to read remote blocks,
         # just notify P worker that we have the blocks we need.
         num_local_blocks = len(local_block_ids)
         if num_local_blocks == 0:
-            remote_rank = self.kv_topo.get_target_remote_rank_from_engine_id(
-                dst_engine_id
-            )
             agent_name = self._remote_agents[dst_engine_id][remote_rank]
             try:
                 self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
-            except Exception:
-                logger.exception(
-                    "NIXL send_notif failed for request %s: "
-                    "P worker blocks will be freed after timeout. "
+            except Exception as e:
+                self._log_failure(
+                    failure_type="notification_failed",
+                    msg="P worker blocks will be freed after timeout. "
                     "This may indicate network issues.",
-                    request_id,
+                    req_id=request_id,
+                    error=e,
+                    dst_engine_id=dst_engine_id,
+                    remote_rank=remote_rank,
+                    remote_agent_name=agent_name,
                 )
                 self.xfer_stats.record_failed_notification()
             return
@@ -2061,13 +2223,6 @@ class NixlConnectorWorker:
         if num_local_blocks < num_remote_blocks:
             remote_block_ids = remote_block_ids[-num_local_blocks:]
 
-        # Get side handles.
-        remote_block_size = self.kv_topo.remote_block_size[dst_engine_id]
-        local_xfer_side_handle = self.src_xfer_side_handles.get(
-            remote_block_size, self.src_xfer_side_handle
-        )
-        remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id]
-
         # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
         # corresponding rank. With heterogeneous TP, fixing D>P, the D tp
         # workers will issue xfers to parts of the P worker remote kv caches.
@@ -2142,13 +2297,16 @@ class NixlConnectorWorker:
 
             # Use handle to check completion in future step().
             self._recving_transfers[request_id].append(handle)
-        except Exception:
-            logger.exception(
-                "NIXL transfer setup/initiation failed for request %s. "
-                "Marking blocks as invalid.",
-                request_id,
-            )
+        except Exception as e:
             # mark all (logical) blocks for this request as invalid
+            self._log_failure(
+                failure_type="transfer_setup_failed",
+                req_id=request_id,
+                msg="Marking blocks as invalid",
+                error=e,
+                dst_engine_id=dst_engine_id,
+                remote_rank=remote_rank,
+            )
             if meta := self._recving_metadata.get(request_id):
                 self._invalid_block_ids.update(meta.local_block_ids)
             self.xfer_stats.record_failed_transfer()
@@ -2229,7 +2387,7 @@ class NixlConnectorWorker:
             block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange
         ).tolist()
 
-    def get_backend_aware_kv_block_len(self, layer_idx: int):
+    def get_backend_aware_kv_block_len(self, layer_idx: int) -> int:
         """
         Get the block length for one K/V element (K and V have the same size).
 
@@ -2275,11 +2433,16 @@ class NixlConnectorWorker:
             for handle in handles:
                 self.nixl_wrapper.release_xfer_handle(handle)
         self._recving_transfers.clear()
-        if self.src_xfer_side_handle:
-            self.nixl_wrapper.release_dlist_handle(self.src_xfer_side_handle)
-            self.src_xfer_side_handle = 0
-        for dst_xfer_side_handle in self.dst_xfer_side_handles.values():
-            self.nixl_wrapper.release_dlist_handle(dst_xfer_side_handle)
+        for handle in self.src_xfer_handles_by_block_size.values():
+            self.nixl_wrapper.release_dlist_handle(handle)
+        self.src_xfer_handles_by_block_size.clear()
+        for handles in self.src_xfer_handles_by_tp_ratio.values():
+            for handle in handles:
+                self.nixl_wrapper.release_dlist_handle(handle)
+        self.src_xfer_handles_by_tp_ratio.clear()
+        for dst_xfer_side_handles in self.dst_xfer_side_handles.values():
+            for dst_xfer_side_handle in dst_xfer_side_handles.values():
+                self.nixl_wrapper.release_dlist_handle(dst_xfer_side_handle)
         self.dst_xfer_side_handles.clear()
         for remote_agents in self._remote_agents.values():
             for agent_name in remote_agents.values():
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index 0ad9d4ae1b39fcfd0bc6e24040c5ce28ae562a13..707dce4d26f4c6ff5c3ed3c63804e0d56ab6c4bc 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -1,17 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import defaultdict
-from collections.abc import Iterable, Iterator
+from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import islice
-from typing import Any, ClassVar
+from typing import Any
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
+from vllm.distributed.kv_transfer.kv_connector.utils import yield_req_data
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
     KVConnectorBase_V1,
     KVConnectorRole,
@@ -19,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import (
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import BlockHash
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -43,7 +44,9 @@ class OffloadingConnectorMetadata(KVConnectorMetadata):
 
 
 class OffloadingConnector(KVConnectorBase_V1):
-    prefer_cross_layer_blocks: ClassVar[bool] = True
+    @property
+    def prefer_cross_layer_blocks(self) -> bool:
+        return True
 
     def __init__(
         self,
@@ -53,7 +56,7 @@ class OffloadingConnector(KVConnectorBase_V1):
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
-        spec = OffloadingSpecFactory.create_spec(vllm_config)
+        spec = OffloadingSpecFactory.create_spec(vllm_config, kv_cache_config)
 
         self.connector_scheduler: OffloadingConnectorScheduler | None = None
         self.connector_worker: OffloadingConnectorWorker | None = None
@@ -72,10 +75,14 @@ class OffloadingConnector(KVConnectorBase_V1):
         assert self.connector_worker is not None
         self.connector_worker.register_cross_layers_kv_cache(kv_cache, attn_backend)
 
+    def handle_preemptions(self, preempted_req_ids: set[str]):
+        assert self.connector_worker is not None
+        self.connector_worker.handle_preemptions(preempted_req_ids)
+
     def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
         assert self.connector_worker is not None
         assert isinstance(self._connector_metadata, OffloadingConnectorMetadata)
-        self.connector_worker.start_load_kv(self._connector_metadata)
+        self.connector_worker.start_kv_transfers(self._connector_metadata)
 
     def wait_for_layer_load(self, layer_name: str) -> None:
         pass
@@ -92,7 +99,7 @@ class OffloadingConnector(KVConnectorBase_V1):
     def wait_for_save(self):
         assert self.connector_worker is not None
         assert isinstance(self._connector_metadata, OffloadingConnectorMetadata)
-        self.connector_worker.start_store_kv(self._connector_metadata)
+        self.connector_worker.prepare_store_kv(self._connector_metadata)
 
     def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         assert self.connector_worker is not None
@@ -345,6 +352,15 @@ class OffloadingConnectorScheduler:
             reqs_to_store=self._get_reqs_to_store(scheduler_output),
         )
         self._reqs_to_load = {}
+
+        # NOTE (orozery): we should move this logic to update_connector_output
+        # once KVConnectorOutput allows us to report completed transfers
+        for req_id in scheduler_output.preempted_req_ids or ():
+            block_hashes = self._reqs_being_stored.get(req_id)
+            if block_hashes:
+                self.manager.complete_store(block_hashes)
+                block_hashes.clear()
+
         return meta
 
     def update_connector_output(self, connector_output: KVConnectorOutput):
@@ -405,6 +421,7 @@ class OffloadingConnectorScheduler:
                     lora_id=None,
                     block_size=event.block_size,
                     medium=event.medium,
+                    lora_name=None,
                 )
 
 
@@ -423,6 +440,8 @@ class OffloadingConnectorWorker:
         self._load_job: dict[ReqId, int] = {}
         # req_id -> set(active job IDs)
         self._store_jobs = defaultdict[ReqId, set[int]](set)
+        # list of store jobs pending submission (job_id, transfer_spec)
+        self._unsubmitted_store_jobs: list[tuple[int, TransferSpec]] = []
 
         self._finished_reqs_waiting_for_store: set[ReqId] = set()
 
@@ -460,20 +479,40 @@ class OffloadingConnectorWorker:
         attn_backends = {cross_layer_name: attn_backend}
         self._register_handlers(kv_caches, attn_backends)
 
-    def start_load_kv(self, metadata: OffloadingConnectorMetadata):
+    def handle_preemptions(self, preempted_req_ids: set[str]):
+        for job_id, transfer_spec in self._unsubmitted_store_jobs:
+            success = self.worker.transfer_async(job_id, transfer_spec)
+            assert success
+        self._unsubmitted_store_jobs.clear()
+
+        for req_id in preempted_req_ids:
+            job_ids = self._store_jobs.get(req_id)
+            if job_ids:
+                self.worker.wait(job_ids)
+
+    def start_kv_transfers(self, metadata: OffloadingConnectorMetadata):
+        for job_id, transfer_spec in self._unsubmitted_store_jobs:
+            success = self.worker.transfer_async(job_id, transfer_spec)
+            assert success
+        self._unsubmitted_store_jobs.clear()
+
         for req_id, transfer_spec in metadata.reqs_to_load.items():
             job_id = self._generate_job_id()
             self._jobs[job_id] = (req_id, False)
             assert req_id not in self._load_job
             self._load_job[req_id] = job_id
-            assert self.worker.transfer_async(job_id, transfer_spec)
+            success = self.worker.transfer_async(job_id, transfer_spec)
+            assert success
 
-    def start_store_kv(self, metadata: OffloadingConnectorMetadata):
+    def prepare_store_kv(self, metadata: OffloadingConnectorMetadata):
         for req_id, transfer_spec in metadata.reqs_to_store.items():
             job_id = self._generate_job_id()
             self._jobs[job_id] = (req_id, True)
             self._store_jobs[req_id].add(job_id)
-            assert self.worker.transfer_async(job_id, transfer_spec)
+            # NOTE(orozery): defer the store to the beginning of the next engine step,
+            # so that offloading starts AFTER transfers related to token sampling,
+            # thereby avoiding delays to token generation due to offloading.
+            self._unsubmitted_store_jobs.append((job_id, transfer_spec))
 
     def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         """
@@ -516,23 +555,3 @@ class OffloadingConnectorWorker:
                 del self._store_jobs[req_id]
 
         return finished_sending, finished_recving
-
-
-def yield_req_data(
-    scheduler_output,
-) -> Iterator[tuple[str, tuple[list[int], ...], bool]]:
-    """
-    Yields:
-        (req_id, new_block_id_groups, preempted)
-    """
-    # new requests
-    for req_data in scheduler_output.scheduled_new_reqs:
-        yield req_data.req_id, req_data.block_ids, False
-
-    # cached requests
-    cached_reqs = scheduler_output.scheduled_cached_reqs
-    yield from zip(
-        cached_reqs.req_ids,
-        cached_reqs.new_block_ids,
-        (req_id in cached_reqs.resumed_req_ids for req_id in cached_reqs.req_ids),
-    )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
index d00b86e14232dfb5a7909954af45af2ca1f852c1..5a0ca192e63cf9e0394be0490945fffe87370838 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -8,9 +8,6 @@ import os
 import regex as re
 import torch
 
-from vllm import envs
-
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
@@ -22,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import (
 )
 from vllm.distributed.parallel_state import get_world_group
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mla.common import MLACommonMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5555e9aea539ae3e2996a137b2bee5ed9ce5e93e..09e7b370a737e8ebf34f277293b55d71e71ba16e 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1010,10 +1010,17 @@ class GroupCoordinator:
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        extra_tensors: list[torch.Tensor] | None = None,
+    ) -> (
+        tuple[torch.Tensor, torch.Tensor]
+        | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]
+    ):
         if self.device_communicator is not None:
-            return self.device_communicator.dispatch(
-                hidden_states, router_logits, is_sequence_parallel
+            return self.device_communicator.dispatch(  # type: ignore[call-arg]
+                hidden_states,
+                router_logits,
+                is_sequence_parallel,
+                extra_tensors,
             )
         else:
             return hidden_states, router_logits
@@ -1111,7 +1118,11 @@ _EP: GroupCoordinator | None = None
 
 
 def get_ep_group() -> GroupCoordinator:
-    assert _EP is not None, "expert parallel group is not initialized"
+    assert _EP is not None, (
+        "expert parallel group is not initialized. "
+        "EP group is only created for MoE models with num_experts > 0. "
+        "This function should only be called for MoE models."
+    )
     return _EP
 
 
@@ -1169,9 +1180,9 @@ def init_distributed_environment(
         distributed_init_method,
         backend,
     )
-    from vllm.config import get_current_vllm_config
+    from vllm.config import get_current_vllm_config_or_none
 
-    config = get_current_vllm_config()
+    config = get_current_vllm_config_or_none()
     if (
         config is not None
         and config.parallel_config.distributed_executor_backend != "external_launcher"
@@ -1244,7 +1255,7 @@ def init_distributed_environment(
     if _WORLD is None:
         ranks = list(range(torch.distributed.get_world_size()))
         _WORLD = init_world_group(ranks, local_rank, backend)
-        if config.parallel_config.nnodes > 1:
+        if config is not None and config.parallel_config.nnodes > 1:
             _NODE_COUNT = config.parallel_config.nnodes
         else:
             _NODE_COUNT = _node_count(_WORLD.cpu_group)
@@ -1253,7 +1264,7 @@ def init_distributed_environment(
         assert _WORLD.world_size == torch.distributed.get_world_size(), (
             "world group already initialized with a different world size"
         )
-    if config.parallel_config.nnodes_within_dp > 1:
+    if config is not None and config.parallel_config.nnodes_within_dp > 1:
         if parallel_config.data_parallel_size > 1:
             world_size_inner_dp = parallel_config.world_size
             group_ranks = [
@@ -1309,9 +1320,9 @@ def initialize_model_parallel(
     backend = backend or torch.distributed.get_backend(get_world_group().device_group)
 
     data_parallel_size = 1
-    from vllm.config import get_current_vllm_config
+    from vllm.config import get_current_vllm_config_or_none
 
-    config = get_current_vllm_config()
+    config = get_current_vllm_config_or_none()
     if config is not None:
         data_parallel_size = config.parallel_config.data_parallel_size
 
@@ -1397,20 +1408,23 @@ def initialize_model_parallel(
 
     global _EP
     assert _EP is None, "expert parallel group is already initialized"
-    group_ranks = (
-        all_ranks.transpose(1, 2)
-        .reshape(
-            -1,
-            data_parallel_size
-            * prefill_context_model_parallel_size
-            * tensor_model_parallel_size,
+    # Don't create EP group for dense models.
+    if config is None or config.model_config is None or config.model_config.is_moe:
+        group_ranks = (
+            all_ranks.transpose(1, 2)
+            .reshape(
+                -1,
+                data_parallel_size
+                * prefill_context_model_parallel_size
+                * tensor_model_parallel_size,
+            )
+            .unbind(0)
         )
-        .unbind(0)
-    )
-    group_ranks = [x.tolist() for x in group_ranks]
-    _EP = init_model_parallel_group(
-        group_ranks, get_world_group().local_rank, backend, group_name="ep"
-    )
+        group_ranks = [x.tolist() for x in group_ranks]
+        _EP = init_model_parallel_group(
+            group_ranks, get_world_group().local_rank, backend, group_name="ep"
+        )
+    # If no EP group needed, _EP remains None
 
     logger.info_once(
         "rank %s in world size %s is assigned as "
@@ -1422,7 +1436,7 @@ def initialize_model_parallel(
         _PP.rank_in_group,
         _PCP.rank_in_group,
         _TP.rank_in_group,
-        _EP.rank_in_group,
+        _EP.rank_in_group if _EP is not None else "N/A",
     )
 
 
@@ -1519,22 +1533,22 @@ def patch_tensor_parallel_group(tp_group: GroupCoordinator):
         _TP = old_tp_group
 
 
-def get_tensor_model_parallel_world_size():
+def get_tensor_model_parallel_world_size() -> int:
     """Return world size for the tensor model parallel group."""
     return get_tp_group().world_size
 
 
-def get_tensor_model_parallel_rank():
+def get_tensor_model_parallel_rank() -> int:
     """Return my rank for the tensor model parallel group."""
     return get_tp_group().rank_in_group
 
 
-def get_decode_context_model_parallel_world_size():
+def get_decode_context_model_parallel_world_size() -> int:
     """Return world size for the decode context model parallel group."""
     return get_dcp_group().world_size
 
 
-def get_decode_context_model_parallel_rank():
+def get_decode_context_model_parallel_rank() -> int:
     """Return my rank for the decode context model parallel group."""
     return get_dcp_group().rank_in_group
 
diff --git a/vllm/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py
deleted file mode 100644
index 4ff1f0ce4410a6dd0a1e9540cd6d83c955461c5f..0000000000000000000000000000000000000000
--- a/vllm/distributed/tpu_distributed_utils.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections import OrderedDict
-from typing import Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch_xla.distributed.spmd as xs
-from torch.nn.parameter import Parameter
-
-from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-
-logger = init_logger(__name__)
-
-
-class XlaQKVParallelLinear(nn.Module):
-    def __init__(self, qkv_linear: nn.Module, mesh: Optional["xs.Mesh"] = None):
-        super().__init__()
-        assert isinstance(qkv_linear, QKVParallelLinear)
-        self.skip_bias_add = qkv_linear.skip_bias_add
-        self.return_bias = qkv_linear.return_bias
-        assert qkv_linear.tp_size == 1, "TP > 1 is only supported under SPMD."
-
-        self.q_weight: Parameter
-        self.k_weight: Parameter
-        self.v_weight: Parameter
-        self.q_bias: Parameter | None
-        self.k_bias: Parameter | None
-        self.v_bias: Parameter | None
-        self._load_weights_from_qkv_linear(qkv_linear)
-        if mesh is not None:
-            self._shard_weight(mesh)
-
-    def _shard_weight(self, mesh: "xs.Mesh"):
-        self.q_weight = Parameter(self.q_weight.to("xla"), requires_grad=False)
-        self.k_weight = Parameter(self.k_weight.to("xla"), requires_grad=False)
-        self.v_weight = Parameter(self.v_weight.to("xla"), requires_grad=False)
-        xs.mark_sharding(self.q_weight, mesh, ("x", None))
-        xs.mark_sharding(self.k_weight, mesh, ("x", None))
-        xs.mark_sharding(self.v_weight, mesh, ("x", None))
-        if self.q_bias is not None:
-            assert self.k_bias is not None and self.v_bias is not None, (
-                "QKVParallelLinear should have q, k, and v biases together."
-            )
-            self.q_bias = Parameter(self.q_bias.to("xla"), requires_grad=False)
-            xs.mark_sharding(self.q_bias, mesh, ("x",))
-            self.k_bias = Parameter(self.k_bias.to("xla"), requires_grad=False)
-            xs.mark_sharding(self.k_bias, mesh, ("x",))
-            self.v_bias = Parameter(self.v_bias.to("xla"), requires_grad=False)
-            xs.mark_sharding(self.v_bias, mesh, ("x",))
-
-    def _load_weights_from_qkv_linear(self, qkv_linear: nn.Module):
-        q_proj_size, k_proj_size, _ = qkv_linear.output_sizes
-        # The weight of qkv linear is a concatenation of q, k, and v weights
-        # along the output dimension.
-        qkv_weight = qkv_linear.weight.data.cpu()
-        q_weight = Parameter(qkv_weight[:q_proj_size], requires_grad=False)
-        k_weight = Parameter(
-            qkv_weight[q_proj_size : q_proj_size + k_proj_size], requires_grad=False
-        )
-        v_weight = Parameter(
-            qkv_weight[q_proj_size + k_proj_size :], requires_grad=False
-        )
-        self.register_parameter("q_weight", q_weight)
-        self.register_parameter("k_weight", k_weight)
-        self.register_parameter("v_weight", v_weight)
-
-        if qkv_linear.bias is not None:
-            q_bias = Parameter(qkv_linear.bias[:q_proj_size], requires_grad=False)
-            k_bias = Parameter(
-                qkv_linear.bias[q_proj_size : q_proj_size + k_proj_size],
-                requires_grad=False,
-            )
-            v_bias = Parameter(
-                qkv_linear.bias[q_proj_size + k_proj_size :], requires_grad=False
-            )
-            self.register_parameter("q_bias", q_bias)
-            self.register_parameter("k_bias", k_bias)
-            self.register_parameter("v_bias", v_bias)
-        else:
-            self.register_parameter("q_bias", None)
-            self.register_parameter("k_bias", None)
-            self.register_parameter("v_bias", None)
-
-    def forward(self, input):
-        # Same forward functionality as QKVParallelLinear, but doing qkv porj
-        # separately.
-        q_bias = self.q_bias if not self.skip_bias_add else None
-        k_bias = self.k_bias if not self.skip_bias_add else None
-        v_bias = self.v_bias if not self.skip_bias_add else None
-        q_proj = F.linear(input, self.q_weight, q_bias)
-        k_proj = F.linear(input, self.k_weight, k_bias)
-        v_proj = F.linear(input, self.v_weight, v_bias)
-        # The q/k/v projections will be split outside of the QKVParallelLinear.
-        # Because we are replacing XlaQKVParallelLinear with the
-        # QKVParallelLinear, we need to concatenate q, k, and v projections to
-        # match the output shape of the QKVParallelLinear implementation even if
-        # it seems to be redundant.
-        # The concat and the following split will be noop, and should be
-        # optimized away by the compiler.
-        qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=-1)
-        output_bias = (
-            torch.cat([q_bias, k_bias, v_bias], dim=-1) if self.skip_bias_add else None
-        )
-        if not self.return_bias:
-            return qkv_proj
-        return qkv_proj, output_bias
-
-
-def partition_column_parallel_linear(
-    layer: torch.nn.Module, mesh: xs.Mesh
-) -> torch.nn.Module:
-    assert isinstance(layer, ColumnParallelLinear)
-    xs.mark_sharding(layer.weight, mesh, ("x", None))
-    logger.debug("Applied column-parallel sharding to %s", layer)
-    return layer
-
-
-def partition_row_parallel_linear(
-    layer: torch.nn.Module, mesh: xs.Mesh
-) -> torch.nn.Module:
-    assert isinstance(layer, RowParallelLinear)
-    xs.mark_sharding(layer.weight, mesh, (None, "x"))
-    logger.debug("Applied row-parallel sharding to %s", layer)
-    return layer
-
-
-def partition_qkv_parallel_linear(
-    layer: torch.nn.Module, mesh: xs.Mesh
-) -> torch.nn.Module:
-    assert isinstance(layer, QKVParallelLinear)
-    xla_layer = XlaQKVParallelLinear(layer, mesh)
-    logger.debug("Applied qkv parallel sharding to %s", layer)
-    return xla_layer
-
-
-MODULE_TYPE_TO_WRAPPING_FUNC = OrderedDict(
-    [
-        ("QKVParallelLinear", partition_qkv_parallel_linear),
-        ("ColumnParallelLinear", partition_column_parallel_linear),
-        ("RowParallelLinear", partition_row_parallel_linear),
-    ]
-)
-
-
-def get_fqn(module):
-    # Get the fully qualified name of the module
-    return module.__class__.__qualname__
-
-
-def shard_model(model: torch.nn.Module, mesh: "xs.Mesh") -> None:
-    """
-    Recursively check a PyTorch model and apply appropriate sharding based on
-    the MODULE_TYPE_TO_WRAPPING_FUNC mapping.
-
-    Args:
-        model: torch.nn.Module to process
-        mesh: An XLA SPMD mesh object used for sharding
-    """
-
-    def _process_module(module, name=None, parent=None):
-        for module_type, wrapping_func in MODULE_TYPE_TO_WRAPPING_FUNC.items():
-            if get_fqn(module) == module_type:
-                wrapped_module = wrapping_func(module, mesh)
-
-                assert parent is not None and name is not None, (
-                    "Top Level module is not expected to be wrapped."
-                )
-                if wrapped_module is not module:
-                    # Wrapped module and module are different py object.
-                    # The original module should be replaced by the
-                    # wrapped_module.
-                    logger.debug("replace %s with %s", module, wrapped_module)
-                    setattr(parent, name, wrapped_module)
-
-                module = wrapped_module
-                break
-
-        for child_name, child_module in list(module.named_children()):
-            _process_module(child_module, child_name, module)
-
-    _process_module(model)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ff8175ff8abe0c70ad70fd52a3842f0deecb0034..7ece97181b8c246ded9bc71f292a26d7df4d889f 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -34,7 +34,6 @@ from pydantic.fields import FieldInfo
 from typing_extensions import TypeIs
 
 import vllm.envs as envs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     AttentionConfig,
     CacheConfig,
@@ -95,6 +94,8 @@ from vllm.transformers_utils.utils import is_cloud_storage
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.network_utils import get_ip
+from vllm.utils.torch_utils import resolve_kv_cache_dtype_string
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.sample.logits_processor import LogitsProcessor
 
 if TYPE_CHECKING:
@@ -108,6 +109,7 @@ else:
     LoadFormats = Any
     UsageContext = Any
 
+
 logger = init_logger(__name__)
 
 # object is used to allow for special typing forms
@@ -297,16 +299,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
         elif contains_type(type_hints, set):
             kwargs[name].update(collection_to_kwargs(type_hints, set))
         elif contains_type(type_hints, int):
-            kwargs[name]["type"] = int
-            # Special case for large integers
-            human_readable_ints = {
-                "max_model_len",
-                "max_num_batched_tokens",
-                "kv_cache_memory_bytes",
-            }
-            if name in human_readable_ints:
+            if name == "max_model_len":
+                kwargs[name]["type"] = human_readable_int_or_auto
+                kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}"
+            elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"):
                 kwargs[name]["type"] = human_readable_int
                 kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
+            else:
+                kwargs[name]["type"] = int
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
         elif contains_type(type_hints, dict) and (
@@ -365,6 +365,8 @@ class EngineArgs:
     """Arguments for vLLM engine."""
 
     model: str = ModelConfig.model
+    enable_return_routed_experts: bool = ModelConfig.enable_return_routed_experts
+    model_weights: str = ModelConfig.model_weights
     served_model_name: str | list[str] | None = ModelConfig.served_model_name
     tokenizer: str | None = ModelConfig.tokenizer
     hf_config_path: str | None = ModelConfig.hf_config_path
@@ -417,11 +419,12 @@ class EngineArgs:
     data_parallel_external_lb: bool = False
     data_parallel_backend: str = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
-    all2all_backend: str | None = ParallelConfig.all2all_backend
+    all2all_backend: str = ParallelConfig.all2all_backend
     enable_dbo: bool = ParallelConfig.enable_dbo
+    ubatch_size: int = ParallelConfig.ubatch_size
     dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
     dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
-    disable_nccl_for_dp_synchronization: bool = (
+    disable_nccl_for_dp_synchronization: bool | None = (
         ParallelConfig.disable_nccl_for_dp_synchronization
     )
     eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
@@ -460,6 +463,7 @@ class EngineArgs:
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
     tokenizer_revision: str | None = ModelConfig.tokenizer_revision
     quantization: QuantizationMethods | None = ModelConfig.quantization
+    allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
     enforce_eager: bool = ModelConfig.enforce_eager
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
     limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
@@ -493,6 +497,7 @@ class EngineArgs:
     fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
     max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
     lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
+    enable_tower_connector_lora: bool = LoRAConfig.enable_tower_connector_lora
 
     ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
     num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
@@ -531,6 +536,11 @@ class EngineArgs:
     enable_layerwise_nvtx_tracing: bool = (
         ObservabilityConfig.enable_layerwise_nvtx_tracing
     )
+    enable_mfu_metrics: bool = ObservabilityConfig.enable_mfu_metrics
+    enable_logging_iteration_details: bool = (
+        ObservabilityConfig.enable_logging_iteration_details
+    )
+    enable_mm_processor_stats: bool = ObservabilityConfig.enable_mm_processor_stats
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
     scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
 
@@ -579,9 +589,7 @@ class EngineArgs:
     optimization_level: OptimizationLevel = VllmConfig.optimization_level
 
     kv_offloading_size: float | None = CacheConfig.kv_offloading_size
-    kv_offloading_backend: KVOffloadingBackend | None = (
-        CacheConfig.kv_offloading_backend
-    )
+    kv_offloading_backend: KVOffloadingBackend = CacheConfig.kv_offloading_backend
     tokens_only: bool = False
 
     def __post_init__(self):
@@ -654,7 +662,15 @@ class EngineArgs:
         )
         model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
         model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
+        model_group.add_argument(
+            "--allow-deprecated-quantization",
+            **model_kwargs["allow_deprecated_quantization"],
+        )
         model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
+        model_group.add_argument(
+            "--enable-return-routed-experts",
+            **model_kwargs["enable_return_routed_experts"],
+        )
         model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"])
         model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"])
         model_group.add_argument(
@@ -846,12 +862,18 @@ class EngineArgs:
             **parallel_kwargs["data_parallel_external_lb"],
         )
         parallel_group.add_argument(
-            "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"]
+            "--enable-expert-parallel",
+            "-ep",
+            **parallel_kwargs["enable_expert_parallel"],
         )
         parallel_group.add_argument(
             "--all2all-backend", **parallel_kwargs["all2all_backend"]
         )
         parallel_group.add_argument("--enable-dbo", **parallel_kwargs["enable_dbo"])
+        parallel_group.add_argument(
+            "--ubatch-size",
+            **parallel_kwargs["ubatch_size"],
+        )
         parallel_group.add_argument(
             "--dbo-decode-token-threshold",
             **parallel_kwargs["dbo_decode_token_threshold"],
@@ -1001,6 +1023,10 @@ class EngineArgs:
             "--lora-dtype",
             **lora_kwargs["lora_dtype"],
         )
+        lora_group.add_argument(
+            "--enable-tower-connector-lora",
+            **lora_kwargs["enable_tower_connector_lora"],
+        )
         lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"])
         lora_group.add_argument(
             "--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"]
@@ -1047,6 +1073,14 @@ class EngineArgs:
             "--enable-layerwise-nvtx-tracing",
             **observability_kwargs["enable_layerwise_nvtx_tracing"],
         )
+        observability_group.add_argument(
+            "--enable-mfu-metrics",
+            **observability_kwargs["enable_mfu_metrics"],
+        )
+        observability_group.add_argument(
+            "--enable-logging-iteration-details",
+            **observability_kwargs["enable_logging_iteration_details"],
+        )
 
         # Scheduler arguments
         scheduler_kwargs = get_kwargs(SchedulerConfig)
@@ -1200,6 +1234,7 @@ class EngineArgs:
 
         return ModelConfig(
             model=self.model,
+            model_weights=self.model_weights,
             hf_config_path=self.hf_config_path,
             runner=self.runner,
             convert=self.convert,
@@ -1217,7 +1252,9 @@ class EngineArgs:
             tokenizer_revision=self.tokenizer_revision,
             max_model_len=self.max_model_len,
             quantization=self.quantization,
+            allow_deprecated_quantization=self.allow_deprecated_quantization,
             enforce_eager=self.enforce_eager,
+            enable_return_routed_experts=self.enable_return_routed_experts,
             max_logprobs=self.max_logprobs,
             logprobs_mode=self.logprobs_mode,
             disable_sliding_window=self.disable_sliding_window,
@@ -1344,6 +1381,7 @@ class EngineArgs:
 
         model_config = self.create_model_config()
         self.model = model_config.model
+        self.model_weights = model_config.model_weights
         self.tokenizer = model_config.tokenizer
 
         self._check_feature_supported(model_config)
@@ -1369,12 +1407,17 @@ class EngineArgs:
             f"dcp_size={self.decode_context_parallel_size}."
         )
 
+        # Resolve "auto" kv_cache_dtype to actual value from model config
+        resolved_cache_dtype = resolve_kv_cache_dtype_string(
+            self.kv_cache_dtype, model_config
+        )
+
         cache_config = CacheConfig(
             block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
             kv_cache_memory_bytes=self.kv_cache_memory_bytes,
             swap_space=self.swap_space,
-            cache_dtype=self.kv_cache_dtype,
+            cache_dtype=resolved_cache_dtype,
             is_attention_free=model_config.is_attention_free,
             num_gpu_blocks_override=self.num_gpu_blocks_override,
             sliding_window=sliding_window,
@@ -1567,9 +1610,11 @@ class EngineArgs:
             data_parallel_rpc_port=data_parallel_rpc_port,
             data_parallel_backend=self.data_parallel_backend,
             data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
+            is_moe_model=model_config.is_moe,
             enable_expert_parallel=self.enable_expert_parallel,
             all2all_backend=self.all2all_backend,
             enable_dbo=self.enable_dbo,
+            ubatch_size=self.ubatch_size,
             dbo_decode_token_threshold=self.dbo_decode_token_threshold,
             dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
             disable_nccl_for_dp_synchronization=self.disable_nccl_for_dp_synchronization,
@@ -1630,6 +1675,7 @@ class EngineArgs:
                 default_mm_loras=self.default_mm_loras,
                 fully_sharded_loras=self.fully_sharded_loras,
                 lora_dtype=self.lora_dtype,
+                enable_tower_connector_lora=self.enable_tower_connector_lora,
                 max_cpu_loras=self.max_cpu_loras
                 if self.max_cpu_loras and self.max_cpu_loras > 0
                 else None,
@@ -1691,6 +1737,9 @@ class EngineArgs:
             kv_cache_metrics_sample=self.kv_cache_metrics_sample,
             cudagraph_metrics=self.cudagraph_metrics,
             enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
+            enable_mfu_metrics=self.enable_mfu_metrics,
+            enable_mm_processor_stats=self.enable_mm_processor_stats,
+            enable_logging_iteration_details=self.enable_logging_iteration_details,
         )
 
         # Compilation config overrides
@@ -2039,8 +2088,7 @@ def _raise_unsupported_error(feature_name: str):
     raise NotImplementedError(msg)
 
 
-
-def human_readable_int(value):
+def human_readable_int(value: str) -> int:
     """Parse human-readable integers like '1k', '2M', etc.
     Including decimal values with decimal multipliers.
 
@@ -2050,6 +2098,7 @@ def human_readable_int(value):
     - '25.6k' -> 25,600
     """
     value = value.strip()
+
     match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
     if match:
         decimal_multiplier = {
@@ -2083,3 +2132,22 @@ def human_readable_int(value):
 
     # Regular plain number.
     return int(value)
+
+
+def human_readable_int_or_auto(value: str) -> int:
+    """Parse human-readable integers like '1k', '2M', etc.
+    Including decimal values with decimal multipliers.
+    Also accepts -1 or 'auto' as a special value for auto-detection.
+
+    Examples:
+    - '1k' -> 1,000
+    - '1K' -> 1,024
+    - '25.6k' -> 25,600
+    - '-1' or 'auto' -> -1 (special value for auto-detection)
+    """
+    value = value.strip()
+
+    if value == "-1" or value.lower() == "auto":
+        return -1
+
+    return human_readable_int(value)
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index d94951a0cffc85777a02b4f68c5c047f49de12aa..bf656cf23de65195e6e6a3cf774cbe010188e3ab 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -71,7 +71,11 @@ class EngineClient(ABC):
         truncate_prompt_tokens: int | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model."""
+        """Generate outputs for a request from a pooling model.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove this argument in v0.15.
+        """
         ...
 
     @abstractmethod
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index b59f7120551e012b41fc828707fdb973a09b4223..7512723515e0cd9e6018dd2fc41cb2c549c0dec7 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -60,7 +60,8 @@ async def generate(request: Request) -> Response:
 async def _generate(request_dict: dict, raw_request: Request) -> Response:
     prompt = request_dict.pop("prompt")
     stream = request_dict.pop("stream", False)
-    sampling_params = SamplingParams(**request_dict)
+    # Since SamplingParams is created fresh per request, safe to skip clone
+    sampling_params = SamplingParams(**request_dict, skip_clone=True)
     request_id = random_uuid()
 
     assert engine is not None
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index b46a4d2dead18488850e044b2469a4ed163f6db5..08fcb12aa340d71e68a710807fde6301a844d397 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -67,6 +67,15 @@ else:
 
 logger = init_logger(__name__)
 
+
+class ChatTemplateResolutionError(ValueError):
+    """Raised when chat template resolution fails.
+
+    This is a subclass of ValueError for backward compatibility with
+    existing exception handlers.
+    """
+
+
 MODALITY_PLACEHOLDERS_MAP = {
     "image": "<##IMAGE##>",
     "audio": "<##AUDIO##>",
@@ -1820,7 +1829,7 @@ def apply_hf_chat_template(
             prompt = encode_messages(conversation, **encode_config)
             return tokenizer.encode(prompt)
         else:
-            raise ValueError(
+            raise ChatTemplateResolutionError(
                 "As of transformers v4.44, default chat template is no longer "
                 "allowed, so you must provide a chat template if the tokenizer "
                 "does not define one."
diff --git a/vllm/entrypoints/cli/__init__.py b/vllm/entrypoints/cli/__init__.py
index dc02ac563406a87ef8bd8c3c6d0b7fcce42991b2..704d94d36f70122f4ad0ebe994fa743c47154b59 100644
--- a/vllm/entrypoints/cli/__init__.py
+++ b/vllm/entrypoints/cli/__init__.py
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
+from vllm.entrypoints.cli.benchmark.mm_processor import (
+    BenchmarkMMProcessorSubcommand,
+)
 from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
 from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
 from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
@@ -8,6 +11,7 @@ from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcomm
 
 __all__: list[str] = [
     "BenchmarkLatencySubcommand",
+    "BenchmarkMMProcessorSubcommand",
     "BenchmarkServingSubcommand",
     "BenchmarkStartupSubcommand",
     "BenchmarkSweepSubcommand",
diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py
index 2ff98577c3634887f8e4c2c752de437a4a972a30..48f34fce1d44cd6ce1ca249549e577c1e2cd2260 100644
--- a/vllm/entrypoints/cli/benchmark/main.py
+++ b/vllm/entrypoints/cli/benchmark/main.py
@@ -32,6 +32,7 @@ class BenchmarkSubcommand(CLISubcommand):
     ) -> FlexibleArgumentParser:
         bench_parser = subparsers.add_parser(
             self.name,
+            help=self.help,
             description=self.help,
             usage=f"vllm {self.name} <bench_type> [options]",
         )
diff --git a/vllm/entrypoints/cli/benchmark/mm_processor.py b/vllm/entrypoints/cli/benchmark/mm_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f1799af12e59ca4d5fc5f224ebf7284ba8f7291
--- /dev/null
+++ b/vllm/entrypoints/cli/benchmark/mm_processor.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+
+from vllm.benchmarks.mm_processor import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+
+
+class BenchmarkMMProcessorSubcommand(BenchmarkSubcommandBase):
+    """The `mm-processor` subcommand for `vllm bench`."""
+
+    name = "mm-processor"
+    help = "Benchmark multimodal processor latency across different configurations."
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        add_cli_args(parser)
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        main(args)
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 96608f360e17b3e108ad9e973425b4e25325b7d3..77c7253aef06ea1f1c1986bc7a43b1386965a292 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -66,7 +66,11 @@ class ServeSubcommand(CLISubcommand):
         self, subparsers: argparse._SubParsersAction
     ) -> FlexibleArgumentParser:
         serve_parser = subparsers.add_parser(
-            self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]"
+            self.name,
+            help="Launch a local OpenAI-compatible API server to serve LLM "
+            "completions via HTTP.",
+            description=DESCRIPTION,
+            usage="vllm serve [model_tag] [options]",
         )
 
         serve_parser = make_arg_parser(serve_parser)
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index a22ab02229cd81350d1bc5ba4014b6a6cabffd0b..c9bece08f18875d78feb1d23fe204cf715cb2589 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -2,11 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import contextlib
+import copy
 import json
 import logging
 from abc import ABC, abstractmethod
 from collections.abc import Callable
 from contextlib import AsyncExitStack
+from dataclasses import replace
 from typing import TYPE_CHECKING, Union
 
 from openai.types.responses.response_function_tool_call_output_item import (
@@ -39,9 +41,8 @@ from vllm.entrypoints.tool import Tool
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.outputs import RequestOutput
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
-from vllm.tokenizers.protocol import TokenizerLike
+from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.abstract_tool_parser import ToolParser
-from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
 
 if TYPE_CHECKING:
@@ -164,6 +165,12 @@ class SimpleContext(ConversationContext):
 
     def __init__(self):
         self.last_output = None
+
+        # Accumulated final output for streaming mode
+        self._accumulated_text: str = ""
+        self._accumulated_token_ids: list[int] = []
+        self._accumulated_logprobs: list = []
+
         self.num_prompt_tokens = 0
         self.num_output_tokens = 0
         self.num_cached_tokens = 0
@@ -183,6 +190,13 @@ class SimpleContext(ConversationContext):
         self.num_cached_tokens = output.num_cached_tokens or 0
         self.num_output_tokens += len(output.outputs[0].token_ids or [])
 
+        # Accumulate text, token_ids, and logprobs for streaming mode
+        delta_output = output.outputs[0]
+        self._accumulated_text += delta_output.text
+        self._accumulated_token_ids.extend(delta_output.token_ids)
+        if delta_output.logprobs is not None:
+            self._accumulated_logprobs.extend(delta_output.logprobs)
+
         if len(self.input_messages) == 0:
             output_prompt = output.prompt or ""
             output_prompt_token_ids = output.prompt_token_ids or []
@@ -194,11 +208,26 @@ class SimpleContext(ConversationContext):
             )
         self.output_messages.append(
             ResponseRawMessageAndToken(
-                message=output.outputs[0].text,
-                tokens=output.outputs[0].token_ids,
+                message=delta_output.text,
+                tokens=delta_output.token_ids,
             )
         )
 
+    @property
+    def final_output(self) -> RequestOutput | None:
+        """Return the final output, with complete text/token_ids/logprobs."""
+        if self.last_output is not None and self.last_output.outputs:
+            assert isinstance(self.last_output, RequestOutput)
+            final_output = copy.copy(self.last_output)
+            # copy inner item to avoid modify last_output
+            final_output.outputs = [replace(item) for item in self.last_output.outputs]
+            final_output.outputs[0].text = self._accumulated_text
+            final_output.outputs[0].token_ids = tuple(self._accumulated_token_ids)
+            if self._accumulated_logprobs:
+                final_output.outputs[0].logprobs = self._accumulated_logprobs
+            return final_output
+        return self.last_output
+
     def append_tool_output(self, output) -> None:
         raise NotImplementedError("Should not be called.")
 
@@ -229,8 +258,8 @@ class ParsableContext(ConversationContext):
         self,
         *,
         response_messages: list[ResponseInputOutputItem],
-        tokenizer: AnyTokenizer,
-        reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser] | None,
+        tokenizer: TokenizerLike,
+        reasoning_parser_cls: Callable[[TokenizerLike], ReasoningParser] | None,
         request: ResponsesRequest,
         available_tools: list[str] | None,
         tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
@@ -267,12 +296,40 @@ class ParsableContext(ConversationContext):
         self.chat_template = chat_template
         self.chat_template_content_format = chat_template_content_format
 
+        self.input_messages: list[ResponseRawMessageAndToken] = []
+        self.output_messages: list[ResponseRawMessageAndToken] = []
+
     def append_output(self, output: RequestOutput) -> None:
         self.num_prompt_tokens = len(output.prompt_token_ids or [])
         self.num_cached_tokens = output.num_cached_tokens or 0
         self.num_output_tokens += len(output.outputs[0].token_ids or [])
         self.parser.process(output.outputs[0])
 
+        # only store if enable_response_messages is True, save memory
+        if self.request.enable_response_messages:
+            output_prompt = output.prompt or ""
+            output_prompt_token_ids = output.prompt_token_ids or []
+            if len(self.input_messages) == 0:
+                self.input_messages.append(
+                    ResponseRawMessageAndToken(
+                        message=output_prompt,
+                        tokens=output_prompt_token_ids,
+                    )
+                )
+            else:
+                self.output_messages.append(
+                    ResponseRawMessageAndToken(
+                        message=output_prompt,
+                        tokens=output_prompt_token_ids,
+                    )
+                )
+            self.output_messages.append(
+                ResponseRawMessageAndToken(
+                    message=output.outputs[0].text,
+                    tokens=output.outputs[0].token_ids,
+                )
+            )
+
     def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None:
         self.parser.response_messages.extend(output)
 
@@ -767,6 +824,7 @@ class StreamingHarmonyContext(HarmonyContext):
         self.encoding = get_encoding()
         self.last_tok = None
         self.first_tok_of_message = True
+        self.last_content_delta = None
 
     @property
     def messages(self) -> list:
@@ -775,6 +833,7 @@ class StreamingHarmonyContext(HarmonyContext):
     def append_output(self, output: RequestOutput) -> None:
         # append_output is called for each output token in streaming case,
         # so we only want to add the prompt tokens once for each message.
+        self.last_content_delta = None
         if self.first_tok_of_message:
             self._update_prefill_token_usage(output)
         # Reset self.first_tok_of_message if needed:
@@ -782,8 +841,12 @@ class StreamingHarmonyContext(HarmonyContext):
         # (finished=True), then the next token processed will mark the
         # beginning of a new message
         self.first_tok_of_message = output.finished
+        last_delta_text = ""
         for tok in output.outputs[0].token_ids:
             self.parser.process(tok)
+            last_delta_text += self.parser.last_content_delta or ""
+        if last_delta_text:
+            self.last_content_delta = last_delta_text
         self._update_decode_token_usage(output)
 
         # For streaming, update previous turn when message is complete
diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py
new file mode 100755
index 0000000000000000000000000000000000000000..2778385c99980edd6d21b5b1222ec73615077371
--- /dev/null
+++ b/vllm/entrypoints/grpc_server.py
@@ -0,0 +1,531 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# mypy: ignore-errors
+"""
+vLLM gRPC Server
+
+Starts a gRPC server for vLLM using the VllmEngine protocol.
+
+Usage:
+    python -m vllm.entrypoints.grpc_server --model <model_path>
+
+Example:
+    python -m vllm.entrypoints.grpc_server \
+        --model meta-llama/Llama-2-7b-hf \
+        --host 0.0.0.0 \
+        --port 50051
+"""
+
+import argparse
+import asyncio
+import signal
+import sys
+import time
+from collections.abc import AsyncGenerator
+
+import grpc
+import uvloop
+from grpc_reflection.v1alpha import reflection
+
+from vllm import SamplingParams, TextPrompt, TokensPrompt
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import RequestOutputKind, StructuredOutputsParams
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.version import __version__ as VLLM_VERSION
+
+logger = init_logger(__name__)
+
+
+class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
+    """
+    gRPC servicer implementing the VllmEngine service.
+
+    Handles 6 RPCs:
+    - Generate: Streaming text generation
+    - Embed: Embeddings (TODO)
+    - HealthCheck: Health probe
+    - Abort: Cancel requests out-of-band
+    - GetModelInfo: Model metadata
+    - GetServerInfo: Server state
+    """
+
+    def __init__(self, async_llm: AsyncLLM, start_time: float):
+        """
+        Initialize the servicer.
+
+        Args:
+            async_llm: The AsyncLLM instance
+            start_time: The server start time, in seconds since epoch
+        """
+        self.async_llm = async_llm
+        self.start_time = start_time
+        logger.info("VllmEngineServicer initialized")
+
+    async def Generate(
+        self,
+        request: vllm_engine_pb2.GenerateRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> AsyncGenerator[vllm_engine_pb2.GenerateResponse, None]:
+        """
+        Handle streaming generation requests.
+
+        Args:
+            request: The GenerateRequest protobuf
+            context: gRPC context
+
+        Yields:
+            GenerateResponse protobuf messages (streaming)
+        """
+        request_id = request.request_id
+        logger.debug("Generate request %s received.", request_id)
+
+        try:
+            # Extract tokenized input
+            if request.WhichOneof("input") == "tokenized":
+                prompt: TokensPrompt = {
+                    "prompt_token_ids": list(request.tokenized.input_ids)
+                }
+                if request.tokenized.original_text:
+                    prompt["prompt"] = request.tokenized.original_text
+            else:
+                prompt: TextPrompt = {"prompt": request.text}
+
+            # Build sampling params with detokenize=False
+            sampling_params = self._sampling_params_from_proto(
+                request.sampling_params, stream=request.stream
+            )
+
+            async for output in self.async_llm.generate(
+                prompt=prompt,
+                sampling_params=sampling_params,
+                request_id=request_id,
+            ):
+                # Convert vLLM output to protobuf
+                # For streaming, always send chunks
+                if request.stream:
+                    yield self._chunk_response(output)
+
+                # Send complete response when finished
+                if output.finished:
+                    yield self._complete_response(output)
+
+        except ValueError as e:
+            # Invalid request error (equiv to 400).
+            await context.abort(grpc.StatusCode.INVALID_ARGUMENT, str(e))
+        except Exception as e:
+            logger.exception("Error in Generate for request %s", request_id)
+            await context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    async def Embed(
+        self,
+        request: vllm_engine_pb2.EmbedRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> vllm_engine_pb2.EmbedResponse:
+        """
+        Handle embedding requests.
+
+        TODO: Implement in Phase 4
+
+        Args:
+            request: The EmbedRequest protobuf
+            context: gRPC context
+
+        Returns:
+            EmbedResponse protobuf
+        """
+        logger.warning("Embed RPC not yet implemented")
+        await context.abort(
+            grpc.StatusCode.UNIMPLEMENTED, "Embed RPC not yet implemented"
+        )
+
+    async def HealthCheck(
+        self,
+        request: vllm_engine_pb2.HealthCheckRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> vllm_engine_pb2.HealthCheckResponse:
+        """
+        Handle health check requests.
+
+        Args:
+            request: The HealthCheckRequest protobuf
+            context: gRPC context
+
+        Returns:
+            HealthCheckResponse protobuf
+        """
+        is_healthy = not self.async_llm.errored
+        message = "Health" if is_healthy else "Engine is not alive"
+
+        logger.debug("HealthCheck request: healthy=%s, message=%s", is_healthy, message)
+
+        return vllm_engine_pb2.HealthCheckResponse(healthy=is_healthy, message=message)
+
+    async def Abort(
+        self,
+        request: vllm_engine_pb2.AbortRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> vllm_engine_pb2.AbortResponse:
+        """
+        Out-of-band abort requests.
+
+        Args:
+            request: The AbortRequest protobuf
+            context: gRPC context
+
+        Returns:
+            AbortResponse protobuf
+        """
+        request_ids = request.request_ids
+        logger.debug("Abort requests: %s", request_ids)
+
+        await self.async_llm.abort(request_ids)
+        return vllm_engine_pb2.AbortResponse()
+
+    async def GetModelInfo(
+        self,
+        request: vllm_engine_pb2.GetModelInfoRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> vllm_engine_pb2.GetModelInfoResponse:
+        """
+        Handle model info requests.
+
+        Args:
+            request: The GetModelInfoRequest protobuf
+            context: gRPC context
+
+        Returns:
+            GetModelInfoResponse protobuf
+        """
+        model_config = self.async_llm.model_config
+
+        return vllm_engine_pb2.GetModelInfoResponse(
+            model_path=model_config.model,
+            is_generation=model_config.runner_type == "generate",
+            max_context_length=model_config.max_model_len,
+            vocab_size=model_config.get_vocab_size(),
+            supports_vision=model_config.is_multimodal_model,
+        )
+
+    async def GetServerInfo(
+        self,
+        request: vllm_engine_pb2.GetServerInfoRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> vllm_engine_pb2.GetServerInfoResponse:
+        """
+        Handle server info requests.
+
+        Args:
+            request: The GetServerInfoRequest protobuf
+            context: gRPC context
+
+        Returns:
+            GetServerInfoResponse protobuf
+        """
+        num_requests = self.async_llm.output_processor.get_num_unfinished_requests()
+
+        return vllm_engine_pb2.GetServerInfoResponse(
+            active_requests=num_requests,
+            is_paused=False,  # TODO
+            last_receive_timestamp=time.time(),  # TODO looks wrong?
+            uptime_seconds=time.time() - self.start_time,
+            server_type="vllm-grpc",
+        )
+
+    # ========== Helper methods ==========
+
+    @staticmethod
+    def _sampling_params_from_proto(
+        params: vllm_engine_pb2.SamplingParams, stream: bool = True
+    ) -> SamplingParams:
+        """
+        Convert protobuf SamplingParams to vLLM SamplingParams.
+
+        Args:
+            params: Protobuf SamplingParams message
+            stream: Whether streaming is enabled
+
+        Returns:
+            vLLM SamplingParams with detokenize=False and structured_outputs
+        """
+        # Build stop sequences
+        stop = list(params.stop) if params.stop else None
+        stop_token_ids = list(params.stop_token_ids) if params.stop_token_ids else None
+
+        # Handle structured outputs constraints
+        structured_outputs = None
+        constraint_field = params.WhichOneof("constraint")
+        if constraint_field:
+            if constraint_field == "json_schema":
+                structured_outputs = StructuredOutputsParams(json=params.json_schema)
+            elif constraint_field == "regex":
+                structured_outputs = StructuredOutputsParams(regex=params.regex)
+            elif constraint_field == "grammar":
+                structured_outputs = StructuredOutputsParams(grammar=params.grammar)
+            elif constraint_field == "structural_tag":
+                structured_outputs = StructuredOutputsParams(
+                    structural_tag=params.structural_tag
+                )
+            elif constraint_field == "json_object":
+                structured_outputs = StructuredOutputsParams(
+                    json_object=params.json_object
+                )
+            elif constraint_field == "choice":
+                structured_outputs = StructuredOutputsParams(
+                    choice=list(params.choice.choices)
+                )
+
+        # Create SamplingParams
+        # output_kind=DELTA: Return only new tokens in each chunk (for streaming)
+        return SamplingParams(
+            temperature=params.temperature if params.HasField("temperature") else 1.0,
+            top_p=params.top_p if params.top_p != 0.0 else 1.0,
+            top_k=params.top_k,
+            min_p=params.min_p,
+            frequency_penalty=params.frequency_penalty,
+            presence_penalty=params.presence_penalty,
+            repetition_penalty=params.repetition_penalty
+            if params.repetition_penalty != 0.0
+            else 1.0,
+            max_tokens=params.max_tokens if params.HasField("max_tokens") else None,
+            min_tokens=params.min_tokens,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            skip_special_tokens=params.skip_special_tokens,
+            spaces_between_special_tokens=params.spaces_between_special_tokens,
+            ignore_eos=params.ignore_eos,
+            n=params.n if params.n > 0 else 1,
+            logprobs=params.logprobs if params.HasField("logprobs") else None,
+            prompt_logprobs=params.prompt_logprobs
+            if params.HasField("prompt_logprobs")
+            else None,
+            seed=params.seed if params.HasField("seed") else None,
+            include_stop_str_in_output=params.include_stop_str_in_output,
+            logit_bias=dict(params.logit_bias) if params.logit_bias else None,
+            truncate_prompt_tokens=params.truncate_prompt_tokens
+            if params.HasField("truncate_prompt_tokens")
+            else None,
+            structured_outputs=structured_outputs,
+            # detokenize must be True if stop strings are used
+            detokenize=bool(stop),
+            output_kind=RequestOutputKind.DELTA
+            if stream
+            else RequestOutputKind.FINAL_ONLY,
+        )
+
+    @staticmethod
+    def _chunk_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse:
+        """
+        Build a streaming chunk response from vLLM output.
+        When output_kind=DELTA, vLLM returns only new tokens automatically.
+
+        Args:
+            output: vLLM RequestOutput (with delta tokens when output_kind=DELTA)
+
+        Returns:
+            GenerateResponse with chunk field set
+        """
+        # Get the completion output (first one if n > 1)
+        completion = output.outputs[0] if output.outputs else None
+
+        if completion is None:
+            # Empty chunk
+            return vllm_engine_pb2.GenerateResponse(
+                chunk=vllm_engine_pb2.GenerateStreamChunk(
+                    token_ids=[],
+                    prompt_tokens=0,
+                    completion_tokens=0,
+                    cached_tokens=0,
+                ),
+            )
+
+        # When output_kind=DELTA, completion.token_ids contains only new tokens
+        # vLLM handles the delta logic internally
+        # completion_tokens = delta count (client will accumulate)
+        return vllm_engine_pb2.GenerateResponse(
+            chunk=vllm_engine_pb2.GenerateStreamChunk(
+                token_ids=completion.token_ids,
+                prompt_tokens=len(output.prompt_token_ids)
+                if output.prompt_token_ids
+                else 0,
+                completion_tokens=len(completion.token_ids),  # Delta count
+                cached_tokens=output.num_cached_tokens,
+            ),
+        )
+
+    @staticmethod
+    def _complete_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse:
+        """
+        Build a final completion response from vLLM output.
+
+        Args:
+            output: vLLM RequestOutput (finished=True)
+
+        Returns:
+            GenerateResponse with complete field set
+        """
+        # Get the completion output (first one if n > 1)
+        completion = output.outputs[0] if output.outputs else None
+
+        if completion is None:
+            # Empty completion
+            return vllm_engine_pb2.GenerateResponse(
+                complete=vllm_engine_pb2.GenerateComplete(
+                    output_ids=[],
+                    finish_reason="error",
+                    prompt_tokens=0,
+                    completion_tokens=0,
+                    cached_tokens=0,
+                ),
+            )
+
+        # Build complete response
+        # When streaming (DELTA mode): completion.token_ids will be empty/last delta
+        # When non-streaming (FINAL_ONLY mode): completion.token_ids has all tokens
+        # Client will accumulate token counts for streaming
+        return vllm_engine_pb2.GenerateResponse(
+            complete=vllm_engine_pb2.GenerateComplete(
+                output_ids=completion.token_ids,
+                finish_reason=completion.finish_reason or "stop",
+                prompt_tokens=len(output.prompt_token_ids)
+                if output.prompt_token_ids
+                else 0,
+                completion_tokens=len(completion.token_ids),
+                cached_tokens=output.num_cached_tokens,
+            ),
+        )
+
+
+async def serve_grpc(args: argparse.Namespace):
+    """
+    Main serving function.
+
+    Args:
+        args: Parsed command line arguments
+    """
+    logger.info("vLLM gRPC server version %s", VLLM_VERSION)
+    logger.info("args: %s", args)
+
+    start_time = time.time()
+
+    # Create engine args
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+
+    # Build vLLM config
+    vllm_config = engine_args.create_engine_config(
+        usage_context=UsageContext.OPENAI_API_SERVER
+    )
+
+    # Create AsyncLLM
+    async_llm = AsyncLLM.from_vllm_config(
+        vllm_config=vllm_config,
+        usage_context=UsageContext.OPENAI_API_SERVER,
+        enable_log_requests=args.enable_log_requests,
+        disable_log_stats=args.disable_log_stats_server,
+    )
+
+    # Create servicer
+    servicer = VllmEngineServicer(async_llm, start_time)
+
+    # Create gRPC server
+    server = grpc.aio.server(
+        options=[
+            ("grpc.max_send_message_length", -1),
+            ("grpc.max_receive_message_length", -1),
+        ],
+    )
+
+    # Add servicer to server
+    vllm_engine_pb2_grpc.add_VllmEngineServicer_to_server(servicer, server)
+
+    # Enable reflection for grpcurl and other tools
+    service_names = (
+        vllm_engine_pb2.DESCRIPTOR.services_by_name["VllmEngine"].full_name,
+        reflection.SERVICE_NAME,
+    )
+    reflection.enable_server_reflection(service_names, server)
+
+    # Bind to address
+    address = f"{args.host}:{args.port}"
+    server.add_insecure_port(address)
+
+    # Start server
+    await server.start()
+    logger.info("vLLM gRPC server started on %s", address)
+    logger.info("Server is ready to accept requests")
+
+    # Handle shutdown signals
+    loop = asyncio.get_running_loop()
+    stop_event = asyncio.Event()
+
+    def signal_handler():
+        logger.info("Received shutdown signal")
+        stop_event.set()
+
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, signal_handler)
+
+    # Serve until shutdown signal
+    try:
+        await stop_event.wait()
+    except KeyboardInterrupt:
+        logger.info("Interrupted by user")
+    finally:
+        logger.info("Shutting down vLLM gRPC server...")
+
+        # Stop gRPC server
+        await server.stop(grace=5.0)
+        logger.info("gRPC server stopped")
+
+        # Shutdown AsyncLLM
+        async_llm.shutdown()
+        logger.info("AsyncLLM engine stopped")
+
+        logger.info("Shutdown complete")
+
+
+def main():
+    """Main entry point."""
+    parser = FlexibleArgumentParser(
+        description="vLLM gRPC Server",
+    )
+
+    # Server args
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="0.0.0.0",
+        help="Host to bind gRPC server to",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=50051,
+        help="Port to bind gRPC server to",
+    )
+    parser.add_argument(
+        "--disable-log-stats-server",
+        action="store_true",
+        help="Disable stats logging on server side",
+    )
+
+    # Add vLLM engine args
+    parser = AsyncEngineArgs.add_cli_args(parser)
+
+    args = parser.parse_args()
+
+    # Run server
+    try:
+        uvloop.run(serve_grpc(args))
+    except Exception as e:
+        logger.exception("Server failed: %s", e)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 32a9c6f57e61471380735e2b6a76ce4bf5900517..90b74d9d7bc8d155a056a0f232796a3f68b12f7d 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -18,6 +18,7 @@ from vllm.beam_search import (
     create_sort_beams_key_function,
 )
 from vllm.config import (
+    AttentionConfig,
     CompilationConfig,
     PoolerConfig,
     ProfilerConfig,
@@ -158,6 +159,7 @@ class LLM:
         enforce_eager: Whether to enforce eager execution. If True, we will
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
+        enable_return_routed_experts: Whether to return routed experts.
         disable_custom_all_reduce: See
             [ParallelConfig][vllm.config.ParallelConfig].
         hf_token: The token to use as HTTP bearer authorization for remote files
@@ -172,10 +174,14 @@ class LLM:
             The available overrides depend on the model that is being run.
             For example, for Phi-3-Vision: `{"num_crops": 4}`.
         pooler_config: Initialize non-default pooling config for the pooling
-            model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
+            model. e.g. `PoolerConfig(seq_pooling_type="MEAN", normalize=False)`.
         compilation_config: Either an integer or a dictionary. If it is an
             integer, it is used as the mode of compilation optimization. If it
             is a dictionary, it can specify the full compilation configuration.
+        attention_config: Configuration for attention mechanisms. Can be a
+            dictionary or an AttentionConfig instance. If a dictionary, it will
+            be converted to an AttentionConfig. Allows specifying the attention
+            backend and other attention-related settings.
         **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].
 
     Note:
@@ -205,6 +211,7 @@ class LLM:
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
         enforce_eager: bool = False,
+        enable_return_routed_experts: bool = False,
         disable_custom_all_reduce: bool = False,
         hf_token: bool | str | None = None,
         hf_overrides: HfOverrides | None = None,
@@ -214,6 +221,7 @@ class LLM:
         | StructuredOutputsConfig
         | None = None,
         profiler_config: dict[str, Any] | ProfilerConfig | None = None,
+        attention_config: dict[str, Any] | AttentionConfig | None = None,
         kv_cache_memory_bytes: int | None = None,
         compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
         logits_processors: list[str | type[LogitsProcessor]] | None = None,
@@ -253,51 +261,28 @@ class LLM:
         if hf_overrides is None:
             hf_overrides = {}
 
-        if compilation_config is not None:
-            if isinstance(compilation_config, int):
-                compilation_config_instance = CompilationConfig(
-                    mode=CompilationMode(compilation_config)
-                )
-            elif isinstance(compilation_config, dict):
-                compilation_config_instance = CompilationConfig(
-                    **{
-                        k: v
-                        for k, v in compilation_config.items()
-                        if is_init_field(CompilationConfig, k)
-                    }
-                )
-            else:
-                compilation_config_instance = compilation_config
-        else:
-            compilation_config_instance = CompilationConfig()
-
-        if structured_outputs_config is not None:
-            if isinstance(structured_outputs_config, dict):
-                structured_outputs_instance = StructuredOutputsConfig(
-                    **{
-                        k: v
-                        for k, v in structured_outputs_config.items()
-                        if is_init_field(StructuredOutputsConfig, k)
-                    }
-                )
-            else:
-                structured_outputs_instance = structured_outputs_config
-        else:
-            structured_outputs_instance = StructuredOutputsConfig()
-
-        if profiler_config is not None:
-            if isinstance(profiler_config, dict):
-                profiler_config_instance = ProfilerConfig(
-                    **{
-                        k: v
-                        for k, v in profiler_config.items()
-                        if is_init_field(ProfilerConfig, k)
-                    }
-                )
-            else:
-                profiler_config_instance = profiler_config
+        def _make_config(value: Any, cls: type[_R]) -> _R:
+            """Convert dict/None/instance to a config instance."""
+            if value is None:
+                return cls()
+            if isinstance(value, dict):
+                return cls(**{k: v for k, v in value.items() if is_init_field(cls, k)})  # type: ignore[arg-type]
+            return value
+
+        if isinstance(compilation_config, int):
+            compilation_config_instance = CompilationConfig(
+                mode=CompilationMode(compilation_config)
+            )
         else:
-            profiler_config_instance = ProfilerConfig()
+            compilation_config_instance = _make_config(
+                compilation_config, CompilationConfig
+            )
+
+        structured_outputs_instance = _make_config(
+            structured_outputs_config, StructuredOutputsConfig
+        )
+        profiler_config_instance = _make_config(profiler_config, ProfilerConfig)
+        attention_config_instance = _make_config(attention_config, AttentionConfig)
 
         # warn about single-process data parallel usage.
         _dp_size = int(kwargs.get("data_parallel_size", 1))
@@ -335,6 +320,7 @@ class LLM:
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
+            enable_return_routed_experts=enable_return_routed_experts,
             disable_custom_all_reduce=disable_custom_all_reduce,
             hf_token=hf_token,
             hf_overrides=hf_overrides,
@@ -342,6 +328,7 @@ class LLM:
             pooler_config=pooler_config,
             structured_outputs_config=structured_outputs_instance,
             profiler_config=profiler_config_instance,
+            attention_config=attention_config_instance,
             compilation_config=compilation_config_instance,
             logits_processors=logits_processors,
             **kwargs,
@@ -365,6 +352,9 @@ class LLM:
         self.input_processor = self.llm_engine.input_processor
         self.io_processor = self.llm_engine.io_processor
 
+        # Cache for __repr__ to avoid repeated collective_rpc calls
+        self._cached_repr: str | None = None
+
     def get_tokenizer(self) -> TokenizerLike:
         return self.llm_engine.get_tokenizer()
 
@@ -659,7 +649,10 @@ class LLM:
         # following the huggingface transformers implementation
         # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
         beam_search_params = SamplingParams(
-            logprobs=2 * beam_width, max_tokens=1, temperature=temperature
+            logprobs=2 * beam_width,
+            max_tokens=1,
+            temperature=temperature,
+            skip_clone=True,  # Internal beam search, safe to skip clone
         )
         instances: list[BeamSearchInstance] = []
 
@@ -1297,6 +1290,7 @@ class LLM:
         pooling_params: PoolingParams | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
+        score_template: str | None = None,
     ) -> list[ScoringRequestOutput]:
         model_config = self.model_config
 
@@ -1330,6 +1324,7 @@ class LLM:
                 data_2=d,
                 tokenizer=tokenizer,
                 tokenization_kwargs=tokenization_kwargs,
+                score_template=score_template,
             )
 
             if token_type_ids := engine_prompt.pop("token_type_ids", None):
@@ -1364,6 +1359,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        chat_template: str | None = None,
     ) -> list[ScoringRequestOutput]:
         """Generate similarity scores for all pairs `<text,text_pair>` or
           `<multi-modal data, multi-modal data pair>`.
@@ -1396,6 +1392,8 @@ class LLM:
             lora_request: LoRA request to use for generation, if any.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
+            chat_template: The chat template to use for the scoring. If None, we
+                use the model's default chat template.
         Returns:
             A list of `ScoringRequestOutput` objects containing the
             generated scores in the same order as the input prompts.
@@ -1423,6 +1421,11 @@ class LLM:
         ):
             raise ValueError("Score API is only enabled for num_labels == 1.")
 
+        if not model_config.is_cross_encoder and chat_template is not None:
+            raise ValueError(
+                "chat_template is only supported for cross-encoder models."
+            )
+
         # the tokenizer for models such as
         # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
         # lists of tokens to the `text` and `text_pair` kwargs
@@ -1492,6 +1495,7 @@ class LLM:
                 use_tqdm,
                 pooling_params,
                 lora_request,
+                score_template=chat_template,
             )
         else:
             return self._embedding_score(
@@ -1627,7 +1631,7 @@ class LLM:
                 added_request_ids.append(request_id)
         except Exception as e:
             if added_request_ids:
-                self.llm_engine.abort_request(added_request_ids)
+                self.llm_engine.abort_request(added_request_ids, internal=True)
             raise e
 
     def _validate_mm_data_and_uuids(
@@ -1737,7 +1741,7 @@ class LLM:
             priority=priority,
             prompt_text=prompt_text,
         )
-        return request_id
+        return engine_request.request_id
 
     def _run_engine(
         self, *, use_tqdm: bool | Callable[..., tqdm] = True
@@ -1790,3 +1794,16 @@ class LLM:
         # This is necessary because some requests may be finished earlier than
         # its previous requests.
         return sorted(outputs, key=lambda x: int(x.request_id))
+
+    def __repr__(self) -> str:
+        """Return a transformers-style hierarchical view of the model."""
+        # Cache the result to avoid repeated collective_rpc calls
+        if self._cached_repr is None:
+            results = self.llm_engine.collective_rpc("get_model_inspection")
+            # In distributed settings, we get results from all workers
+            # Just return the first one (they should all be the same)
+            if results:
+                self._cached_repr = results[0]
+            else:
+                self._cached_repr = f"LLM(model={self.model_config.model!r})"
+        return self._cached_repr
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index 678a7b3a60b558c032793cf33367d7842cba98f0..c9e809353b59ccee8df563e1b1b2c84448c8c03f 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import logging
 from collections.abc import Sequence
 
 import torch
@@ -26,23 +27,24 @@ class RequestLogger:
         params: SamplingParams | PoolingParams | BeamSearchParams | None,
         lora_request: LoRARequest | None,
     ) -> None:
-        max_log_len = self.max_log_len
-        if max_log_len is not None:
-            if prompt is not None:
-                prompt = prompt[:max_log_len]
+        if logger.isEnabledFor(logging.DEBUG):
+            max_log_len = self.max_log_len
+            if max_log_len is not None:
+                if prompt is not None:
+                    prompt = prompt[:max_log_len]
 
-            if prompt_token_ids is not None:
-                prompt_token_ids = prompt_token_ids[:max_log_len]
+                if prompt_token_ids is not None:
+                    prompt_token_ids = prompt_token_ids[:max_log_len]
 
-        logger.debug(
-            "Request %s details: prompt: %r, "
-            "prompt_token_ids: %s, "
-            "prompt_embeds shape: %s.",
-            request_id,
-            prompt,
-            prompt_token_ids,
-            prompt_embeds.shape if prompt_embeds is not None else None,
-        )
+            logger.debug(
+                "Request %s details: prompt: %r, "
+                "prompt_token_ids: %s, "
+                "prompt_embeds shape: %s.",
+                request_id,
+                prompt,
+                prompt_token_ids,
+                prompt_embeds.shape if prompt_embeds is not None else None,
+            )
 
         logger.info(
             "Received request %s: params: %s, lora_request: %s.",
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 8d2f1b095c98a2704eb2c5585d88b8958db59ebb..9321e01c4c2b3ad71c94f596f09f7ce5db344458 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -18,21 +18,20 @@ from argparse import Namespace
 from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import Annotated, Any, Literal
+from typing import Annotated, Any
 
 import model_hosting_container_standards.sagemaker as sagemaker_standards
 import pydantic
 import uvloop
-from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request
+from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, Response, StreamingResponse
+from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.concurrency import iterate_in_threadpool
 from starlette.datastructures import URL, Headers, MutableHeaders, State
 from starlette.types import ASGIApp, Message, Receive, Scope, Send
 
 import vllm.envs as envs
-from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.anthropic.protocol import (
@@ -90,8 +89,10 @@ from vllm.entrypoints.utils import (
     log_non_default_args,
     process_chat_template,
     process_lora_modules,
+    sanitize_message,
     with_cancellation,
 )
+from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.tasks import POOLING_TASKS
@@ -540,14 +541,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 
     try:
         generator = await handler.create_completion(request, raw_request)
-    except OverflowError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
-        ) from e
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
@@ -640,97 +635,6 @@ async def create_translations(
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-if envs.VLLM_SERVER_DEV_MODE:
-    logger.warning(
-        "SECURITY WARNING: Development endpoints are enabled! "
-        "This should NOT be used in production!"
-    )
-
-    PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
-
-    @router.get("/server_info")
-    async def show_server_info(
-        raw_request: Request,
-        config_format: Annotated[Literal["text", "json"], Query()] = "text",
-    ):
-        vllm_config: VllmConfig = raw_request.app.state.vllm_config
-        server_info = {
-            "vllm_config": str(vllm_config)
-            if config_format == "text"
-            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
-            # fallback=str is needed to handle e.g. torch.dtype
-        }
-        return JSONResponse(content=server_info)
-
-    @router.post("/reset_prefix_cache")
-    async def reset_prefix_cache(
-        raw_request: Request,
-        reset_running_requests: bool = Query(default=False),
-        reset_external: bool = Query(default=False),
-    ):
-        """
-        Reset the local prefix cache.
-
-        Optionally, if the query parameter `reset_external=true`
-        also resets the external (connector-managed) prefix cache.
-
-        Note that we currently do not check if the prefix cache
-        is successfully reset in the API server.
-
-        Example:
-            POST /reset_prefix_cache?reset_external=true
-        """
-        logger.info("Resetting prefix cache...")
-
-        await engine_client(raw_request).reset_prefix_cache(
-            reset_running_requests, reset_external
-        )
-        return Response(status_code=200)
-
-    @router.post("/reset_mm_cache")
-    async def reset_mm_cache(raw_request: Request):
-        """
-        Reset the multi-modal cache. Note that we currently do not check if the
-        multi-modal cache is successfully reset in the API server.
-        """
-        logger.info("Resetting multi-modal cache...")
-        await engine_client(raw_request).reset_mm_cache()
-        return Response(status_code=200)
-
-    @router.post("/collective_rpc")
-    async def collective_rpc(raw_request: Request):
-        try:
-            body = await raw_request.json()
-        except json.JSONDecodeError as e:
-            raise HTTPException(
-                status_code=HTTPStatus.BAD_REQUEST.value,
-                detail=f"JSON decode error: {e}",
-            ) from e
-        method = body.get("method")
-        if method is None:
-            raise HTTPException(
-                status_code=HTTPStatus.BAD_REQUEST.value,
-                detail="Missing 'method' in request body",
-            )
-        # For security reason, only serialized string args/kwargs are passed.
-        # User-defined `method` is responsible for deserialization if needed.
-        args: list[str] = body.get("args", [])
-        kwargs: dict[str, str] = body.get("kwargs", {})
-        timeout: float | None = body.get("timeout")
-        results = await engine_client(raw_request).collective_rpc(
-            method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
-        )
-        if results is None:
-            return Response(status_code=200)
-        response: list[Any] = []
-        for result in results:
-            if result is None or isinstance(result, dict | list):
-                response.append(result)
-            else:
-                response.append(str(result))
-        return JSONResponse(content={"results": response})
-
-
 def load_log_config(log_config_file: str | None) -> dict | None:
     if not log_config_file:
         return None
@@ -963,6 +867,8 @@ def build_app(args: Namespace) -> FastAPI:
         app = FastAPI(
             openapi_url=None, docs_url=None, redoc_url=None, lifespan=lifespan
         )
+    elif args.enable_offline_docs:
+        app = FastAPI(docs_url=None, redoc_url=None, lifespan=lifespan)
     else:
         app = FastAPI(lifespan=lifespan)
     app.state.args = args
@@ -993,7 +899,7 @@ def build_app(args: Namespace) -> FastAPI:
     async def http_exception_handler(_: Request, exc: HTTPException):
         err = ErrorResponse(
             error=ErrorInfo(
-                message=exc.detail,
+                message=sanitize_message(exc.detail),
                 type=HTTPStatus(exc.status_code).phrase,
                 code=exc.status_code,
             )
@@ -1002,6 +908,14 @@ def build_app(args: Namespace) -> FastAPI:
 
     @app.exception_handler(RequestValidationError)
     async def validation_exception_handler(_: Request, exc: RequestValidationError):
+        param = None
+        for error in exc.errors():
+            if "ctx" in error and "error" in error["ctx"]:
+                ctx_error = error["ctx"]["error"]
+                if isinstance(ctx_error, VLLMValidationError):
+                    param = ctx_error.parameter
+                    break
+
         exc_str = str(exc)
         errors_str = str(exc.errors())
 
@@ -1012,9 +926,10 @@ def build_app(args: Namespace) -> FastAPI:
 
         err = ErrorResponse(
             error=ErrorInfo(
-                message=message,
+                message=sanitize_message(message),
                 type=HTTPStatus.BAD_REQUEST.phrase,
                 code=HTTPStatus.BAD_REQUEST,
+                param=param,
             )
         )
         return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
@@ -1161,6 +1076,7 @@ async def init_app_state(
             request_logger=request_logger,
             chat_template=resolved_chat_template,
             chat_template_content_format=args.chat_template_content_format,
+            default_chat_template_kwargs=args.default_chat_template_kwargs,
             trust_request_chat_template=args.trust_request_chat_template,
             return_tokens_as_token_ids=args.return_tokens_as_token_ids,
             enable_auto_tools=args.enable_auto_tool_choice,
@@ -1170,11 +1086,15 @@ async def init_app_state(
             enable_prompt_tokens_details=args.enable_prompt_tokens_details,
             enable_force_include_usage=args.enable_force_include_usage,
             enable_log_outputs=args.enable_log_outputs,
+            enable_log_deltas=args.enable_log_deltas,
             log_error_stack=args.log_error_stack,
         )
         if "generate" in supported_tasks
         else None
     )
+    # Warm up chat template processing to avoid first-request latency
+    if state.openai_serving_chat is not None:
+        await state.openai_serving_chat.warmup()
     state.openai_serving_completion = (
         OpenAIServingCompletion(
             engine_client,
@@ -1235,6 +1155,7 @@ async def init_app_state(
             engine_client,
             state.openai_serving_models,
             request_logger=request_logger,
+            score_template=resolved_chat_template,
             log_error_stack=args.log_error_stack,
         )
         if ("embed" in supported_tasks or "score" in supported_tasks)
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index a8eef76cd8ae4f5e8d0f139fabc6908f272ca3fa..594130a1a25145d284384e14a681d92b5d7668bf 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -11,7 +11,7 @@ import json
 import ssl
 from collections.abc import Sequence
 from dataclasses import field
-from typing import Literal
+from typing import Any, Literal
 
 from pydantic.dataclasses import dataclass
 
@@ -80,7 +80,7 @@ class FrontendArgs:
     uds: str | None = None
     """Unix domain socket path. If set, host and port arguments are ignored."""
     uvicorn_log_level: Literal[
-        "debug", "info", "warning", "error", "critical", "trace"
+        "critical", "error", "warning", "info", "debug", "trace"
     ] = "info"
     """Log level for uvicorn."""
     disable_uvicorn_access_log: bool = False
@@ -114,6 +114,12 @@ class FrontendArgs:
     """Whether to trust the chat template provided in the request. If False,
     the server will always use the chat template specified by `--chat-template`
     or the ones from tokenizer."""
+    default_chat_template_kwargs: dict[str, Any] | None = None
+    """Default keyword arguments to pass to the chat template renderer.
+    These will be merged with request-level chat_template_kwargs,
+    with request values taking precedence. Useful for setting default
+    behavior for reasoning models. Example: '{"enable_thinking": false}'
+    to disable thinking mode by default for Qwen3/DeepSeek models."""
     response_role: str = "assistant"
     """The role name to return if `request.add_generation_prompt=true`."""
     ssl_keyfile: str | None = None
@@ -179,8 +185,12 @@ class FrontendArgs:
     """Enable the `/tokenizer_info` endpoint. May expose chat
     templates and other tokenizer configuration."""
     enable_log_outputs: bool = False
-    """If True, log model outputs (generations).
+    """If set to True, log model outputs (generations).
     Requires --enable-log-requests."""
+    enable_log_deltas: bool = True
+    """If set to False, output deltas will not be logged. Relevant only if 
+    --enable-log-outputs is set.
+    """
     h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
     """Maximum size (bytes) of an incomplete HTTP event (header or body) for
     h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
@@ -194,6 +204,11 @@ class FrontendArgs:
     If set to True, only enable the Tokens In<>Out endpoint. 
     This is intended for use in a Disaggregated Everything setup.
     """
+    enable_offline_docs: bool = False
+    """
+    Enable offline FastAPI documentation for air-gapped environments.
+    Uses vendored static assets bundled with vLLM.
+    """
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -211,6 +226,9 @@ class FrontendArgs:
         del frontend_kwargs["allowed_methods"]["nargs"]
         del frontend_kwargs["allowed_headers"]["nargs"]
 
+        # Special case: default_chat_template_kwargs needs json.loads type
+        frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads
+
         # Special case: LoRA modules need custom parser action and
         # optional_type(str)
         frontend_kwargs["lora_modules"]["type"] = optional_type(str)
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
index 376d97a03964e1ad681fe96920d2f7f4b7b04ff9..533286c5906fe82d800fa01af19cf2d7b54f4a54 100644
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -187,14 +187,9 @@ def parse_response_input(
     if "type" not in response_msg or response_msg["type"] == "message":
         role = response_msg["role"]
         content = response_msg["content"]
-        if role == "system":
-            # User is trying to set a system message. Change it to:
-            # <|start|>developer<|message|># Instructions
-            # {instructions}<|end|>
-            role = "developer"
-            text_prefix = "Instructions:\n"
-        else:
-            text_prefix = ""
+        # Add prefix for developer messages.
+        # <|start|>developer<|message|># Instructions {instructions}<|end|>
+        text_prefix = "Instructions:\n" if role == "developer" else ""
         if isinstance(content, str):
             msg = Message.from_role_and_content(role, text_prefix + content)
         else:
diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py
index 4fa6b4d906db00d4ab2dce0482378f1bbaff1853..14a6f5cb73e1acdb74ef7cdc89aeb8a4dc57aeaf 100644
--- a/vllm/entrypoints/openai/parser/responses_parser.py
+++ b/vllm/entrypoints/openai/parser/responses_parser.py
@@ -3,7 +3,11 @@
 import logging
 from collections.abc import Callable
 
-from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
+from openai.types.responses.response_function_tool_call_output_item import (
+    ResponseFunctionToolCallOutputItem,
+)
+from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_output_message import ResponseOutputMessage
 from openai.types.responses.response_output_text import ResponseOutputText
 from openai.types.responses.response_reasoning_item import (
@@ -11,12 +15,12 @@ from openai.types.responses.response_reasoning_item import (
     ResponseReasoningItem,
 )
 
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
 from vllm.outputs import CompletionOutput
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
-from vllm.tokenizers.protocol import TokenizerLike
+from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.abstract_tool_parser import ToolParser
-from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
 
 logger = logging.getLogger(__name__)
@@ -28,8 +32,8 @@ class ResponsesParser:
     def __init__(
         self,
         *,
-        tokenizer: AnyTokenizer,
-        reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
+        tokenizer: TokenizerLike,
+        reasoning_parser_cls: Callable[[TokenizerLike], ReasoningParser],
         response_messages: list[ResponseInputOutputItem],
         request: ResponsesRequest,
         tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
@@ -47,7 +51,13 @@ class ResponsesParser:
         if tool_parser_cls is not None:
             self.tool_parser_instance = tool_parser_cls(tokenizer)
 
+        # Store the last finish_reason to determine response status
+        self.finish_reason: str | None = None
+
     def process(self, output: CompletionOutput) -> "ResponsesParser":
+        # Store the finish_reason from the output
+        self.finish_reason = output.finish_reason
+
         reasoning_content, content = self.reasoning_parser_instance.extract_reasoning(
             output.text, request=self.request
         )
@@ -111,11 +121,42 @@ class ResponsesParser:
 
         return self
 
+    def make_response_output_items_from_parsable_context(
+        self,
+    ) -> list[ResponseOutputItem]:
+        """Given a list of sentences, construct ResponseOutput Items."""
+        response_messages = self.response_messages[self.num_init_messages :]
+        output_messages: list[ResponseOutputItem] = []
+        for message in response_messages:
+            if not isinstance(message, ResponseFunctionToolCallOutputItem):
+                output_messages.append(message)
+            else:
+                if len(output_messages) == 0:
+                    raise ValueError(
+                        "Cannot have a FunctionToolCallOutput before FunctionToolCall."
+                    )
+                if isinstance(output_messages[-1], ResponseFunctionToolCall):
+                    mcp_message = McpCall(
+                        id=f"{MCP_PREFIX}{random_uuid()}",
+                        arguments=output_messages[-1].arguments,
+                        name=output_messages[-1].name,
+                        server_label=output_messages[
+                            -1
+                        ].name,  # TODO: store the server label
+                        type="mcp_call",
+                        status="completed",
+                        output=message.output,
+                        # TODO: support error output
+                    )
+                    output_messages[-1] = mcp_message
+
+        return output_messages
+
 
 def get_responses_parser_for_simple_context(
     *,
-    tokenizer: AnyTokenizer,
-    reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
+    tokenizer: TokenizerLike,
+    reasoning_parser_cls: Callable[[TokenizerLike], ReasoningParser],
     response_messages: list[ResponseInputOutputItem],
     request: ResponsesRequest,
     tool_parser_cls,
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index a7c4980cd36746690c245874008fa159b445a090..845dae7c1bf1d5dd0501fea3a76655a95c56ab45 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -72,6 +72,7 @@ from pydantic import (
 )
 
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id
+from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.sampling_params import (
@@ -444,6 +445,7 @@ class ResponsesRequest(OpenAIBaseModel):
             ),
             structured_outputs=structured_outputs,
             logit_bias=self.logit_bias,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     def is_include_output_logprobs(self) -> bool:
@@ -466,7 +468,9 @@ class ResponsesRequest(OpenAIBaseModel):
     @model_validator(mode="before")
     def validate_prompt(cls, data):
         if data.get("prompt") is not None:
-            raise ValueError("prompt template is not supported")
+            raise VLLMValidationError(
+                "prompt template is not supported", parameter="prompt"
+            )
         return data
 
     @model_validator(mode="before")
@@ -573,7 +577,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
+        None
+    )
     prompt_logprobs: int | None = None
     allowed_token_ids: list[int] | None = None
     bad_words: list[str] = Field(default_factory=list)
@@ -844,13 +850,17 @@ class ChatCompletionRequest(OpenAIBaseModel):
             bad_words=self.bad_words,
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     @model_validator(mode="before")
     @classmethod
     def validate_stream_options(cls, data):
         if data.get("stream_options") and not data.get("stream"):
-            raise ValueError("Stream options can only be defined when `stream=True`.")
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter="stream_options",
+            )
 
         return data
 
@@ -859,19 +869,29 @@ class ChatCompletionRequest(OpenAIBaseModel):
     def check_logprobs(cls, data):
         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
             if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
-                raise ValueError(
-                    "`prompt_logprobs` are not available when `stream=True`."
+                raise VLLMValidationError(
+                    "`prompt_logprobs` are not available when `stream=True`.",
+                    parameter="prompt_logprobs",
                 )
 
             if prompt_logprobs < 0 and prompt_logprobs != -1:
-                raise ValueError("`prompt_logprobs` must be a positive value or -1.")
+                raise VLLMValidationError(
+                    "`prompt_logprobs` must be a positive value or -1.",
+                    parameter="prompt_logprobs",
+                    value=prompt_logprobs,
+                )
         if (top_logprobs := data.get("top_logprobs")) is not None:
             if top_logprobs < 0 and top_logprobs != -1:
-                raise ValueError("`top_logprobs` must be a positive value or -1.")
+                raise VLLMValidationError(
+                    "`top_logprobs` must be a positive value or -1.",
+                    parameter="top_logprobs",
+                    value=top_logprobs,
+                )
 
             if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
-                raise ValueError(
-                    "when using `top_logprobs`, `logprobs` must be set to true."
+                raise VLLMValidationError(
+                    "when using `top_logprobs`, `logprobs` must be set to true.",
+                    parameter="top_logprobs",
                 )
 
         return data
@@ -1040,7 +1060,9 @@ class CompletionRequest(OpenAIBaseModel):
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
+        None
+    )
     allowed_token_ids: list[int] | None = None
     prompt_logprobs: int | None = None
     # --8<-- [end:completion-sampling-params]
@@ -1271,6 +1293,7 @@ class CompletionRequest(OpenAIBaseModel):
             logit_bias=self.logit_bias,
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     @model_validator(mode="before")
@@ -1285,9 +1308,10 @@ class CompletionRequest(OpenAIBaseModel):
             for k in ("json", "regex", "choice")
         )
         if count > 1:
-            raise ValueError(
+            raise VLLMValidationError(
                 "You can only use one kind of constraints for structured "
-                "outputs ('json', 'regex' or 'choice')."
+                "outputs ('json', 'regex' or 'choice').",
+                parameter="structured_outputs",
             )
         return data
 
@@ -1296,14 +1320,23 @@ class CompletionRequest(OpenAIBaseModel):
     def check_logprobs(cls, data):
         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
             if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
-                raise ValueError(
-                    "`prompt_logprobs` are not available when `stream=True`."
+                raise VLLMValidationError(
+                    "`prompt_logprobs` are not available when `stream=True`.",
+                    parameter="prompt_logprobs",
                 )
 
             if prompt_logprobs < 0 and prompt_logprobs != -1:
-                raise ValueError("`prompt_logprobs` must be a positive value or -1.")
+                raise VLLMValidationError(
+                    "`prompt_logprobs` must be a positive value or -1.",
+                    parameter="prompt_logprobs",
+                    value=prompt_logprobs,
+                )
         if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
-            raise ValueError("`logprobs` must be a positive value.")
+            raise VLLMValidationError(
+                "`logprobs` must be a positive value.",
+                parameter="logprobs",
+                value=logprobs,
+            )
 
         return data
 
@@ -1311,7 +1344,10 @@ class CompletionRequest(OpenAIBaseModel):
     @classmethod
     def validate_stream_options(cls, data):
         if data.get("stream_options") and not data.get("stream"):
-            raise ValueError("Stream options can only be defined when `stream=True`.")
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter="stream_options",
+            )
 
         return data
 
@@ -1654,13 +1690,23 @@ class ResponsesResponse(OpenAIBaseModel):
     usage: ResponseUsage | None = None
     user: str | None = None
 
-    # --8<-- [start:responses-extra-params]
+    # --8<-- [start:responses-response-extra-params]
     # These are populated when enable_response_messages is set to True
     # NOTE: custom serialization is needed
     # see serialize_input_messages and serialize_output_messages
-    input_messages: ResponseInputOutputMessage | None = None
-    output_messages: ResponseInputOutputMessage | None = None
-    # --8<-- [end:responses-extra-params]
+    input_messages: ResponseInputOutputMessage | None = Field(
+        default=None,
+        description=(
+            "If enable_response_messages, we can show raw token input to model."
+        ),
+    )
+    output_messages: ResponseInputOutputMessage | None = Field(
+        default=None,
+        description=(
+            "If enable_response_messages, we can show raw token output of model."
+        ),
+    )
+    # --8<-- [end:responses-response-extra-params]
 
     # NOTE: openAI harmony doesn't serialize TextContent properly,
     # TODO: this fixes for TextContent, but need to verify for tools etc
@@ -2054,6 +2100,9 @@ class TranscriptionRequest(OpenAIBaseModel):
 
     presence_penalty: float | None = 0.0
     """The presence penalty to use for sampling."""
+
+    max_completion_tokens: int | None = None
+    """The maximum number of tokens to generate."""
     # --8<-- [end:transcription-sampling-params]
 
     # Default sampling parameters for transcription requests.
@@ -2111,6 +2160,7 @@ class TranscriptionRequest(OpenAIBaseModel):
             if self.stream
             else RequestOutputKind.FINAL_ONLY,
             extra_args=self.vllm_xargs,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     @model_validator(mode="before")
@@ -2125,7 +2175,15 @@ class TranscriptionRequest(OpenAIBaseModel):
         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
         stream = data.get("stream", False)
         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
-            raise ValueError("Stream options can only be defined when `stream=True`.")
+            # Find which specific stream option was set
+            invalid_param = next(
+                (so for so in stream_opts if data.get(so, False)),
+                "stream_include_usage",
+            )
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter=invalid_param,
+            )
 
         return data
 
@@ -2300,6 +2358,9 @@ class TranslationRequest(OpenAIBaseModel):
     # Flattened stream option to simplify form data.
     stream_include_usage: bool | None = False
     stream_continuous_usage_stats: bool | None = False
+
+    max_completion_tokens: int | None = None
+    """The maximum number of tokens to generate."""
     # --8<-- [end:translation-extra-params]
 
     # Default sampling parameters for translation requests.
@@ -2327,6 +2388,7 @@ class TranslationRequest(OpenAIBaseModel):
             output_kind=RequestOutputKind.DELTA
             if self.stream
             else RequestOutputKind.FINAL_ONLY,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     @model_validator(mode="before")
@@ -2335,7 +2397,15 @@ class TranslationRequest(OpenAIBaseModel):
         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
         stream = data.get("stream", False)
         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
-            raise ValueError("Stream options can only be defined when `stream=True`.")
+            # Find which specific stream option was set
+            invalid_param = next(
+                (so for so in stream_opts if data.get(so, False)),
+                "stream_include_usage",
+            )
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter=invalid_param,
+            )
 
         return data
 
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 837e742e6be49481b8af3289341c910bb671e961..6bb6d0f3f97bfae22fd41a500c1c8d3d0c5c0cb5 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -468,6 +468,9 @@ async def run_batch(
             reasoning_parser=args.structured_outputs_config.reasoning_parser,
             enable_prompt_tokens_details=args.enable_prompt_tokens_details,
             enable_force_include_usage=args.enable_force_include_usage,
+            default_chat_template_kwargs=getattr(
+                args, "default_chat_template_kwargs", None
+            ),
         )
         if "generate" in supported_tasks
         else None
@@ -495,6 +498,7 @@ async def run_batch(
             engine_client,
             openai_serving_models,
             request_logger=request_logger,
+            score_template=None,
         )
         if ("embed" in supported_tasks or enable_serving_reranking)
         else None
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 98fc7810faf96d9325cf18077aaa8f854e32f959..e65dba2b893be62d2b8aa9e43278138586dbbfa8 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -6,13 +6,14 @@ import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Final
+from typing import Any, Final
 
 import jinja2
 import partial_json_parser
 import regex as re
 from fastapi import Request
 from openai_harmony import Message as OpenAIMessage
+from partial_json_parser.core.options import Allow
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (
@@ -51,6 +52,9 @@ from vllm.entrypoints.openai.protocol import (
     ToolCall,
     UsageInfo,
 )
+from vllm.entrypoints.openai.serving_chat_stream_harmony import (
+    extract_harmony_streaming_delta,
+)
 from vllm.entrypoints.openai.serving_engine import (
     GenerationError,
     OpenAIServing,
@@ -73,6 +77,7 @@ from vllm.tokenizers.mistral import (
 )
 from vllm.tool_parsers import ToolParser
 from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
+from vllm.tool_parsers.utils import partial_json_loads
 from vllm.utils.collection_utils import as_list
 from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
 
@@ -98,7 +103,9 @@ class OpenAIServingChat(OpenAIServing):
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
         enable_log_outputs: bool = False,
+        enable_log_deltas: bool = True,
         log_error_stack: bool = False,
+        default_chat_template_kwargs: dict[str, Any] | None = None,
     ) -> None:
         super().__init__(
             engine_client=engine_client,
@@ -112,7 +119,9 @@ class OpenAIServingChat(OpenAIServing):
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
         self.trust_request_chat_template = trust_request_chat_template
+        self.default_chat_template_kwargs = default_chat_template_kwargs or {}
         self.enable_log_outputs = enable_log_outputs
+        self.enable_log_deltas = enable_log_deltas
 
         # set up logits processors
         self.logits_processors = self.model_config.logits_processors
@@ -162,6 +171,56 @@ class OpenAIServingChat(OpenAIServing):
         self.supports_code_interpreter = False
         self.python_tool = None
 
+    async def warmup(self) -> None:
+        """
+        Warm up the chat template processing to avoid first-request latency.
+
+        This method triggers Jinja2 template compilation and content format
+        detection that would otherwise happen on the first real request,
+        causing increased latency on the first request.
+        """
+        logger.info("Warming up chat template processing...")
+        start_time = time.perf_counter()
+
+        try:
+            # Get the tokenizer from the engine
+            tokenizer = await self.engine_client.get_tokenizer()
+
+            # Create a minimal dummy request
+            dummy_request = ChatCompletionRequest(
+                messages=[{"role": "user", "content": "warmup"}],
+                model=None,
+                max_completion_tokens=1,
+            )
+
+            # Call _preprocess_chat to trigger template compilation
+            # This forces:
+            # 1. Chat template content format detection
+            # 2. Jinja2 template compilation
+            # 3. Tokenizer initialization for chat
+            await self._preprocess_chat(
+                dummy_request,
+                tokenizer,
+                dummy_request.messages,
+                chat_template=self.chat_template,
+                chat_template_content_format=self.chat_template_content_format,
+                add_generation_prompt=True,
+                continue_final_message=False,
+                tool_dicts=None,
+                documents=None,
+                chat_template_kwargs=None,
+                default_chat_template_kwargs=self.default_chat_template_kwargs,
+                tool_parser=None,
+                add_special_tokens=False,
+            )
+
+            elapsed = (time.perf_counter() - start_time) * 1000
+            logger.info("Chat template warmup completed in %.1fms", elapsed)
+
+        except Exception:
+            # Log but don't fail server startup if warmup fails
+            logger.exception("Chat template warmup failed")
+
     async def create_chat_completion(
         self,
         request: ChatCompletionRequest,
@@ -204,18 +263,31 @@ class OpenAIServingChat(OpenAIServing):
                 truncate_tool_call_ids(request)
                 validate_request_params(request)
 
-            if (
-                request.tool_choice == "auto"
-                and not (self.enable_auto_tools and tool_parser is not None)
+            # Check if tool parsing is unavailable (common condition)
+            tool_parsing_unavailable = (
+                tool_parser is None
                 and not isinstance(tokenizer, MistralTokenizer)
                 and not self.use_harmony
+            )
+
+            # Validate tool_choice when tool parsing is required but unavailable
+            if tool_parsing_unavailable and request.tool_choice not in (
+                None,
+                "none",
             ):
-                # for hf tokenizers, "auto" tools requires
-                # --enable-auto-tool-choice and --tool-call-parser
-                return self.create_error_response(
-                    '"auto" tool choice requires '
-                    "--enable-auto-tool-choice and --tool-call-parser to be set"
-                )
+                if request.tool_choice == "auto" and not self.enable_auto_tools:
+                    # for hf tokenizers, "auto" tools requires
+                    # --enable-auto-tool-choice and --tool-call-parser
+                    return self.create_error_response(
+                        '"auto" tool choice requires '
+                        "--enable-auto-tool-choice and --tool-call-parser to be set"
+                    )
+                elif request.tool_choice != "auto":
+                    # "required" or named tool requires tool parser
+                    return self.create_error_response(
+                        f'tool_choice="{request.tool_choice}" requires '
+                        "--tool-call-parser to be set"
+                    )
 
             if request.tools is None or (
                 request.tool_choice == "none"
@@ -234,6 +306,10 @@ class OpenAIServingChat(OpenAIServing):
                 )
                 if error_check_ret is not None:
                     return error_check_ret
+
+                chat_template_kwargs = request.chat_template_kwargs or {}
+                chat_template_kwargs.update(reasoning_effort=request.reasoning_effort)
+
                 conversation, engine_prompts = await self._preprocess_chat(
                     request,
                     tokenizer,
@@ -244,13 +320,17 @@ class OpenAIServingChat(OpenAIServing):
                     continue_final_message=request.continue_final_message,
                     tool_dicts=tool_dicts,
                     documents=request.documents,
-                    chat_template_kwargs=request.chat_template_kwargs,
+                    chat_template_kwargs=chat_template_kwargs,
+                    default_chat_template_kwargs=self.default_chat_template_kwargs,
                     tool_parser=tool_parser,
                     add_special_tokens=request.add_special_tokens,
                 )
             else:
                 # For GPT-OSS.
-                conversation, engine_prompts = self._make_request_with_harmony(request)
+                should_include_tools = tool_dicts is not None
+                conversation, engine_prompts = self._make_request_with_harmony(
+                    request, should_include_tools
+                )
         except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")
@@ -332,6 +412,7 @@ class OpenAIServingChat(OpenAIServing):
                         lora_request=lora_request,
                         trace_headers=trace_headers,
                         priority=request.priority,
+                        data_parallel_rank=data_parallel_rank,
                     )
 
                     generator = self.engine_client.generate(
@@ -348,8 +429,7 @@ class OpenAIServingChat(OpenAIServing):
 
                 generators.append(generator)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
         assert len(generators) == 1
         (result_generator,) = generators
@@ -379,8 +459,7 @@ class OpenAIServingChat(OpenAIServing):
         except GenerationError as e:
             return self._convert_generation_error_to_response(e)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
     def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
         if request.add_generation_prompt:
@@ -438,8 +517,12 @@ class OpenAIServingChat(OpenAIServing):
             # if the current text is empty, we cannot parse it
             return None, function_name_returned
         try:
-            obj = partial_json_parser.loads(current_text)
-        except partial_json_parser.core.exceptions.MalformedJSON:
+            flags = Allow.ALL
+            obj, _ = partial_json_loads(current_text, flags)
+        except (
+            partial_json_parser.core.exceptions.MalformedJSON,
+            json.JSONDecodeError,
+        ):
             logger.debug("not enough tokens to parse into JSON yet")
             obj = None
 
@@ -588,9 +671,14 @@ class OpenAIServingChat(OpenAIServing):
                         "Tokenizer not available when `skip_tokenizer_init=True`"
                     )
 
+                # Pass the same chat template kwargs as used in tokenization
+                chat_template_kwargs = self._prepare_extra_chat_template_kwargs(
+                    request.chat_template_kwargs,
+                    self.default_chat_template_kwargs,
+                )
                 reasoning_parser = self.reasoning_parser(
                     tokenizer,
-                    chat_template_kwargs=request.chat_template_kwargs,  # type: ignore
+                    chat_template_kwargs=chat_template_kwargs,  # type: ignore[call-arg]
                 )
         except RuntimeError as e:
             logger.exception("Error in reasoning parser creation.")
@@ -613,7 +701,7 @@ class OpenAIServingChat(OpenAIServing):
                 tool_parsers = [None] * num_choices
         except Exception as e:
             logger.exception("Error in tool parser creation.")
-            data = self.create_streaming_error_response(str(e))
+            data = self.create_streaming_error_response(e)
             yield f"data: {data}\n\n"
             yield "data: [DONE]\n\n"
             return
@@ -742,6 +830,11 @@ class OpenAIServingChat(OpenAIServing):
                             delta_text += harmony_parser.last_content_delta or ""
                         cur_channel = harmony_parser.current_channel
                         cur_recipient = harmony_parser.current_recipient
+                        # handle the case where several tokens where generated at once
+                        # including the final token, leading to a delta in the text
+                        # but the current channel to be empty (start state)
+                        if not cur_channel and delta_text:
+                            cur_channel = "final"
                     else:
                         delta_text = output.text
 
@@ -771,64 +864,17 @@ class OpenAIServingChat(OpenAIServing):
                             current_token_ids = as_list(output.token_ids)
 
                     if self.use_harmony:
-                        if cur_channel == "final":
-                            delta_message = DeltaMessage(content=delta_text)
-                        elif cur_channel == "analysis":
-                            if request.include_reasoning:
-                                delta_message = DeltaMessage(reasoning=delta_text)
-                            else:
-                                delta_message = None
-                        elif (
-                            cur_channel == "commentary"
-                            and cur_recipient
-                            and cur_recipient.startswith("functions.")
-                        ):
-                            # Count completed tool calls to determine index
-                            base_index = 0
-                            for msg in harmony_parser.messages:
-                                if (
-                                    msg.channel == "commentary"
-                                    and msg.recipient
-                                    and msg.recipient.startswith("functions.")
-                                ):
-                                    base_index += 1
-
-                            if prev_recipient != cur_recipient:
-                                tool_name = cur_recipient.split("functions.", 1)[1]
-                                delta_message = DeltaMessage(
-                                    tool_calls=[
-                                        DeltaToolCall(
-                                            id=make_tool_call_id(),
-                                            type="function",
-                                            function=DeltaFunctionCall(
-                                                name=tool_name,
-                                                arguments="",
-                                            ),
-                                            index=base_index,
-                                        )
-                                    ]
-                                )
-                            elif delta_text:
-                                delta_message = DeltaMessage(
-                                    tool_calls=[
-                                        DeltaToolCall(
-                                            index=base_index,
-                                            function=DeltaFunctionCall(
-                                                arguments=delta_text
-                                            ),
-                                        )
-                                    ]
-                                )
-                            else:
-                                delta_message = None
-
-                            if delta_message is not None:
-                                harmony_tools_streamed[i] = True
-                        elif cur_channel == "commentary":
-                            # Tool call preambles meant to be shown to the user
-                            delta_message = DeltaMessage(content=delta_text)
-                        else:
-                            delta_message = None
+                        delta_message, tools_streamed_flag = (
+                            extract_harmony_streaming_delta(
+                                harmony_parser=harmony_parser,
+                                cur_channel=cur_channel,
+                                cur_recipient=cur_recipient,
+                                prev_recipient=prev_recipient,
+                                delta_text=delta_text,
+                                include_reasoning=request.include_reasoning,
+                            )
+                        )
+                        harmony_tools_streamed[i] |= tools_streamed_flag
                     # handle streaming deltas for tools with named tool_choice
                     elif tool_choice_function_name:
                         if (
@@ -1101,7 +1147,7 @@ class OpenAIServingChat(OpenAIServing):
                                 if tc.function and tc.function.arguments
                             )
 
-                        if delta_content:
+                        if delta_content and self.enable_log_deltas:
                             self.request_logger.log_outputs(
                                 request_id=request_id,
                                 outputs=delta_content,
@@ -1183,15 +1229,8 @@ class OpenAIServingChat(OpenAIServing):
                             # check to see if there's anything left to stream
                             remaining_call = expected_call.replace(actual_call, "", 1)
                             # set that as a delta message
-                            delta_message = DeltaMessage(
-                                tool_calls=[
-                                    DeltaToolCall(
-                                        index=index,
-                                        function=DeltaFunctionCall(
-                                            arguments=remaining_call
-                                        ).model_dump(exclude_none=True),
-                                    )
-                                ]
+                            delta_message = self._create_remaining_args_delta(
+                                delta_message, remaining_call, index
                             )
 
                         # Send the finish response for each request.n only once
@@ -1301,9 +1340,8 @@ class OpenAIServingChat(OpenAIServing):
         except GenerationError as e:
             yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
         except Exception as e:
-            # TODO: Use a vllm-specific Validation Error
             logger.exception("Error in chat completion stream generator.")
-            data = self.create_streaming_error_response(str(e))
+            data = self.create_streaming_error_response(e)
             yield f"data: {data}\n\n"
         # Send the final done message after all response.n are finished
         yield "data: [DONE]\n\n"
@@ -1327,8 +1365,7 @@ class OpenAIServingChat(OpenAIServing):
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
         assert final_res is not None
 
@@ -1417,9 +1454,14 @@ class OpenAIServingChat(OpenAIServing):
                             "Tokenizer not available when `skip_tokenizer_init=True`"
                         )
 
+                    # Pass the same chat template kwargs as used in tokenization
+                    chat_template_kwargs = self._prepare_extra_chat_template_kwargs(
+                        request.chat_template_kwargs,
+                        self.default_chat_template_kwargs,
+                    )
                     reasoning_parser = self.reasoning_parser(
                         tokenizer,
-                        chat_template_kwargs=request.chat_template_kwargs,  # type: ignore
+                        chat_template_kwargs=chat_template_kwargs,  # type: ignore[call-arg]
                     )
                 except RuntimeError as e:
                     logger.exception("Error in reasoning parser creation.")
@@ -1780,9 +1822,39 @@ class OpenAIServingChat(OpenAIServing):
             and delta_message.tool_calls[0].function.arguments is not None
         )
 
+    @staticmethod
+    def _create_remaining_args_delta(
+        delta_message: DeltaMessage,
+        remaining_call: str,
+        index: int,
+    ) -> DeltaMessage:
+        """
+        Create a delta message for remaining tool arguments, preserving
+        id/type/name from the original delta.
+        """
+        original_tc = next(
+            (tc for tc in delta_message.tool_calls if tc.index == index),
+            None,
+        )
+        original_fn = original_tc.function if original_tc else None
+        return DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=index,
+                    id=original_tc.id if original_tc else None,
+                    type=original_tc.type if original_tc else None,
+                    function=DeltaFunctionCall(
+                        name=original_fn.name if original_fn else None,
+                        arguments=remaining_call,
+                    ),
+                )
+            ]
+        )
+
     def _make_request_with_harmony(
         self,
         request: ChatCompletionRequest,
+        should_include_tools: bool = True,
     ):
         messages: list[OpenAIMessage] = []
 
@@ -1800,13 +1872,16 @@ class OpenAIServingChat(OpenAIServing):
             reasoning_effort=request.reasoning_effort,
             browser_description=None,
             python_description=None,
-            with_custom_tools=request.tools is not None,
+            with_custom_tools=should_include_tools,
         )
         messages.append(sys_msg)
 
         # Add developer message.
-        dev_msg = get_developer_message(tools=request.tools)
-        messages.append(dev_msg)
+        if request.tools:
+            dev_msg = get_developer_message(
+                tools=request.tools if should_include_tools else None
+            )
+            messages.append(dev_msg)
 
         # Add user message.
         messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))
diff --git a/vllm/entrypoints/openai/serving_chat_stream_harmony.py b/vllm/entrypoints/openai/serving_chat_stream_harmony.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b5ae620651c6195172fcdc3a43d061d5a1fc5f1
--- /dev/null
+++ b/vllm/entrypoints/openai/serving_chat_stream_harmony.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Harmony-specific streaming delta extraction for chat completions.
+
+This module handles the extraction of DeltaMessage objects from
+harmony parser state during streaming chat completions.
+"""
+
+from openai_harmony import StreamableParser
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+)
+
+
+def extract_harmony_streaming_delta(
+    harmony_parser: StreamableParser,
+    cur_channel: str | None,
+    cur_recipient: str | None,
+    prev_recipient: str | None,
+    delta_text: str,
+    include_reasoning: bool,
+) -> tuple[DeltaMessage | None, bool]:
+    """
+    Extract a DeltaMessage from harmony parser state during streaming.
+
+    Args:
+        harmony_parser: The StreamableParser instance tracking parse state
+        cur_channel: Current channel ("final", "analysis", "commentary", etc.)
+        cur_recipient: Current recipient (e.g., "functions.my_func")
+        prev_recipient: Previous recipient for detecting tool call transitions
+        delta_text: The text delta to include in the message
+        include_reasoning: Whether to include reasoning content
+
+    Returns:
+        A tuple of (DeltaMessage or None, tools_streamed_flag)
+    """
+    tools_streamed = False
+
+    if cur_channel == "final":
+        delta_message = DeltaMessage(content=delta_text)
+    elif (
+        (cur_channel == "commentary" or cur_channel == "analysis")
+        and cur_recipient
+        and cur_recipient.startswith("functions.")
+    ):
+        # Count completed tool calls to determine index
+        base_index = 0
+        for msg in harmony_parser.messages:
+            if (
+                (msg.channel == "commentary" or msg.channel == "analysis")
+                and msg.recipient
+                and msg.recipient.startswith("functions.")
+            ):
+                base_index += 1
+
+        if prev_recipient != cur_recipient:
+            tool_name = cur_recipient.split("functions.", 1)[1]
+            delta_message = DeltaMessage(
+                tool_calls=[
+                    DeltaToolCall(
+                        id=make_tool_call_id(),
+                        type="function",
+                        function=DeltaFunctionCall(
+                            name=tool_name,
+                            arguments="",
+                        ),
+                        index=base_index,
+                    )
+                ]
+            )
+        elif delta_text:
+            delta_message = DeltaMessage(
+                tool_calls=[
+                    DeltaToolCall(
+                        index=base_index,
+                        function=DeltaFunctionCall(arguments=delta_text),
+                    )
+                ]
+            )
+        else:
+            delta_message = None
+
+        if delta_message is not None:
+            tools_streamed = True
+    elif cur_channel == "commentary":
+        # Tool call preambles meant to be shown to the user
+        delta_message = DeltaMessage(content=delta_text)
+    elif cur_channel == "analysis":
+        if include_reasoning:
+            delta_message = DeltaMessage(reasoning=delta_text)
+        else:
+            delta_message = None
+    else:
+        delta_message = None
+
+    return delta_message, tools_streamed
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 1be0afc8c74e54fee96182eaf9dbcac33e7b4e6f..6ef5ae3ef01c9d7f1f46691b389016cab6caf917 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -23,6 +23,7 @@ from vllm.entrypoints.openai.protocol import (
     PromptTokenUsageInfo,
     RequestResponseMetadata,
     UsageInfo,
+    VLLMValidationError,
 )
 from vllm.entrypoints.openai.serving_engine import (
     GenerationError,
@@ -139,16 +140,16 @@ class OpenAIServingCompletion(OpenAIServing):
             )
         except ValueError as e:
             logger.exception("Error in preprocessing prompt inputs")
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
         except TypeError as e:
             logger.exception("Error in preprocessing prompt inputs")
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
         except RuntimeError as e:
             logger.exception("Error in preprocessing prompt inputs")
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
         except jinja2.TemplateError as e:
             logger.exception("Error in preprocessing prompt inputs")
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
         # Extract data_parallel_rank from header (router can inject it)
         data_parallel_rank = self._get_data_parallel_rank(raw_request)
@@ -230,6 +231,7 @@ class OpenAIServingCompletion(OpenAIServing):
                         lora_request=lora_request,
                         trace_headers=trace_headers,
                         priority=request.priority,
+                        data_parallel_rank=data_parallel_rank,
                     )
 
                     generator = self.engine_client.generate(
@@ -246,8 +248,7 @@ class OpenAIServingCompletion(OpenAIServing):
 
                 generators.append(generator)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
         result_generator = merge_async_iterators(*generators)
 
@@ -307,8 +308,7 @@ class OpenAIServingCompletion(OpenAIServing):
         except GenerationError as e:
             return self._convert_generation_error_to_response(e)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
         # When user requests streaming but we don't stream, we still need to
         # return a streaming response with a single event.
@@ -509,9 +509,8 @@ class OpenAIServingCompletion(OpenAIServing):
         except GenerationError as e:
             yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
         except Exception as e:
-            # TODO: Use a vllm-specific Validation Error
             logger.exception("Error in completion stream generator.")
-            data = self.create_streaming_error_response(str(e))
+            data = self.create_streaming_error_response(e)
             yield f"data: {data}\n\n"
         yield "data: [DONE]\n\n"
 
@@ -659,8 +658,11 @@ class OpenAIServingCompletion(OpenAIServing):
                     token = f"token_id:{token_id}"
                 else:
                     if tokenizer is None:
-                        raise ValueError(
-                            "Unable to get tokenizer because `skip_tokenizer_init=True`"
+                        raise VLLMValidationError(
+                            "Unable to get tokenizer because "
+                            "`skip_tokenizer_init=True`",
+                            parameter="skip_tokenizer_init",
+                            value=True,
                         )
 
                     token = tokenizer.decode(token_id)
@@ -719,6 +721,15 @@ class OpenAIServingCompletion(OpenAIServing):
         request: CompletionRequest,
         max_input_length: int | None = None,
     ) -> RenderConfig:
+        # Validate max_tokens before using it
+        if request.max_tokens is not None and request.max_tokens > self.max_model_len:
+            raise VLLMValidationError(
+                f"'max_tokens' ({request.max_tokens}) cannot be greater than "
+                f"the model's maximum context length ({self.max_model_len}).",
+                parameter="max_tokens",
+                value=request.max_tokens,
+            )
+
         max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
         return RenderConfig(
             max_length=max_input_tokens_len,
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index e4c34d3dcd1df802f19343e4d3d97ce66a4f6b19..6dad72cdba7daf4da83a0ee5b939a27943895d47 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -57,6 +57,7 @@ from vllm.entrypoints.openai.protocol import (
     TranscriptionRequest,
     TranscriptionResponse,
     TranslationRequest,
+    VLLMValidationError,
 )
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.pooling.classify.protocol import (
@@ -85,7 +86,7 @@ from vllm.entrypoints.responses_utils import (
     construct_input_messages,
 )
 from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
-from vllm.entrypoints.utils import _validate_truncation_size
+from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import (
     PromptComponents,
@@ -322,8 +323,10 @@ class OpenAIServing:
         input_processor = self.input_processor
         tokenizer = input_processor.tokenizer
         if tokenizer is None:
-            raise ValueError(
-                "You cannot use beam search when `skip_tokenizer_init=True`"
+            raise VLLMValidationError(
+                "You cannot use beam search when `skip_tokenizer_init=True`",
+                parameter="skip_tokenizer_init",
+                value=True,
             )
 
         eos_token_id: int = tokenizer.eos_token_id  # type: ignore
@@ -706,8 +709,7 @@ class OpenAIServing:
             return None
 
         except Exception as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
     async def _collect_batch(
         self,
@@ -738,33 +740,76 @@ class OpenAIServing:
             return None
 
         except Exception as e:
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
     def create_error_response(
         self,
-        message: str,
+        message: str | Exception,
         err_type: str = "BadRequestError",
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+        param: str | None = None,
     ) -> ErrorResponse:
+        exc: Exception | None = None
+
+        if isinstance(message, Exception):
+            exc = message
+
+            from vllm.exceptions import VLLMValidationError
+
+            if isinstance(exc, VLLMValidationError):
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = exc.parameter
+            elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
+                # Common validation errors from user input
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = None
+            elif isinstance(exc, NotImplementedError):
+                err_type = "NotImplementedError"
+                status_code = HTTPStatus.NOT_IMPLEMENTED
+                param = None
+            elif exc.__class__.__name__ == "TemplateError":
+                # jinja2.TemplateError (avoid importing jinja2)
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = None
+            else:
+                err_type = "InternalServerError"
+                status_code = HTTPStatus.INTERNAL_SERVER_ERROR
+                param = None
+
+            message = str(exc)
+
         if self.log_error_stack:
             exc_type, _, _ = sys.exc_info()
             if exc_type is not None:
                 traceback.print_exc()
             else:
                 traceback.print_stack()
+
         return ErrorResponse(
-            error=ErrorInfo(message=message, type=err_type, code=status_code.value)
+            error=ErrorInfo(
+                message=sanitize_message(message),
+                type=err_type,
+                code=status_code.value,
+                param=param,
+            )
         )
 
     def create_streaming_error_response(
         self,
-        message: str,
+        message: str | Exception,
         err_type: str = "BadRequestError",
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+        param: str | None = None,
     ) -> str:
         json_str = json.dumps(
             self.create_error_response(
-                message=message, err_type=err_type, status_code=status_code
+                message=message,
+                err_type=err_type,
+                status_code=status_code,
+                param=param,
             ).model_dump()
         )
         return json_str
@@ -825,6 +870,7 @@ class OpenAIServing:
             message=f"The model `{request.model}` does not exist.",
             err_type="NotFoundError",
             status_code=HTTPStatus.NOT_FOUND,
+            param="model",
         )
 
     def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None:
@@ -992,11 +1038,13 @@ class OpenAIServing:
                     ClassificationChatRequest: "classification",
                 }
                 operation = operations.get(type(request), "embedding generation")
-                raise ValueError(
+                raise VLLMValidationError(
                     f"This model's maximum context length is "
                     f"{self.max_model_len} tokens. However, you requested "
                     f"{token_num} tokens in the input for {operation}. "
-                    f"Please reduce the length of the input."
+                    f"Please reduce the length of the input.",
+                    parameter="input_tokens",
+                    value=token_num,
                 )
             return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
 
@@ -1018,20 +1066,24 @@ class OpenAIServing:
         # Note: input length can be up to model context length - 1 for
         # completion-like requests.
         if token_num >= self.max_model_len:
-            raise ValueError(
+            raise VLLMValidationError(
                 f"This model's maximum context length is "
                 f"{self.max_model_len} tokens. However, your request has "
                 f"{token_num} input tokens. Please reduce the length of "
-                "the input messages."
+                "the input messages.",
+                parameter="input_tokens",
+                value=token_num,
             )
 
         if max_tokens is not None and token_num + max_tokens > self.max_model_len:
-            raise ValueError(
+            raise VLLMValidationError(
                 "'max_tokens' or 'max_completion_tokens' is too large: "
                 f"{max_tokens}. This model's maximum context length is "
                 f"{self.max_model_len} tokens and your request has "
                 f"{token_num} input tokens ({max_tokens} > {self.max_model_len}"
-                f" - {token_num})."
+                f" - {token_num}).",
+                parameter="max_tokens",
+                value=max_tokens,
             )
 
         return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
@@ -1100,6 +1152,18 @@ class OpenAIServing:
             )
         return None
 
+    @staticmethod
+    def _prepare_extra_chat_template_kwargs(
+        request_chat_template_kwargs: dict[str, Any] | None = None,
+        default_chat_template_kwargs: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        """Helper to merge server-default and request-specific chat template kwargs."""
+        request_chat_template_kwargs = request_chat_template_kwargs or {}
+        if default_chat_template_kwargs is None:
+            return request_chat_template_kwargs
+        # Apply server defaults first, then request kwargs override.
+        return default_chat_template_kwargs | request_chat_template_kwargs
+
     async def _preprocess_chat(
         self,
         request: ChatLikeRequest | ResponsesRequest,
@@ -1112,6 +1176,7 @@ class OpenAIServing:
         tool_dicts: list[dict[str, Any]] | None = None,
         documents: list[dict[str, str]] | None = None,
         chat_template_kwargs: dict[str, Any] | None = None,
+        default_chat_template_kwargs: dict[str, Any] | None = None,
         tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
         add_special_tokens: bool = False,
     ) -> tuple[list[ConversationMessage], list[TokensPrompt]]:
@@ -1137,7 +1202,10 @@ class OpenAIServing:
             tools=tool_dicts,
             documents=documents,
         )
-        _chat_template_kwargs.update(chat_template_kwargs or {})
+        _chat_template_kwargs |= self._prepare_extra_chat_template_kwargs(
+            chat_template_kwargs,
+            default_chat_template_kwargs,
+        )
 
         request_prompt: str | list[int]
 
@@ -1232,6 +1300,7 @@ class OpenAIServing:
         lora_request: LoRARequest | None,
         trace_headers: Mapping[str, str] | None,
         priority: int,
+        data_parallel_rank: int | None = None,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for AsyncLLM."""
         tokenization_kwargs: dict[str, Any] = {}
@@ -1247,6 +1316,7 @@ class OpenAIServing:
             tokenization_kwargs=tokenization_kwargs,
             trace_headers=trace_headers,
             priority=priority,
+            data_parallel_rank=data_parallel_rank,
         )
         return engine_request, tokenization_kwargs
 
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 953398a9a72aec976687780be703c0e60ad8777a..df3d974e5a0e90b5f95f1673a8efb7524a730106 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -16,6 +16,7 @@ from vllm.entrypoints.openai.protocol import (
     ModelPermission,
     UnloadLoRAAdapterRequest,
 )
+from vllm.entrypoints.utils import sanitize_message
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
@@ -119,7 +120,7 @@ class OpenAIServingModels:
         lora_cards = [
             ModelCard(
                 id=lora.lora_name,
-                root=lora.local_path,
+                root=lora.path,
                 parent=lora.base_model_name
                 if lora.base_model_name
                 else self.base_model_paths[0].name,
@@ -300,5 +301,9 @@ def create_error_response(
     status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
 ) -> ErrorResponse:
     return ErrorResponse(
-        error=ErrorInfo(message=message, type=err_type, code=status_code.value)
+        error=ErrorInfo(
+            message=sanitize_message(message),
+            type=err_type,
+            code=status_code.value,
+        )
     )
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index fb2a6440daf09c9d089dcdbbcf783da19a055b58..f79dad8d9e5e6fde519d0e47c3cebbcd5818ec47 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -9,6 +9,7 @@ from collections import deque
 from collections.abc import AsyncGenerator, AsyncIterator, Callable, Sequence
 from contextlib import AsyncExitStack
 from copy import copy
+from dataclasses import dataclass
 from http import HTTPStatus
 from typing import Final
 
@@ -27,6 +28,10 @@ from openai.types.responses import (
     ResponseFunctionCallArgumentsDoneEvent,
     ResponseFunctionToolCall,
     ResponseFunctionWebSearch,
+    ResponseMcpCallArgumentsDeltaEvent,
+    ResponseMcpCallArgumentsDoneEvent,
+    ResponseMcpCallCompletedEvent,
+    ResponseMcpCallInProgressEvent,
     ResponseOutputItem,
     ResponseOutputItemAddedEvent,
     ResponseOutputItemDoneEvent,
@@ -44,6 +49,7 @@ from openai.types.responses import (
     response_function_web_search,
     response_text_delta_event,
 )
+from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob
 from openai.types.responses.response_reasoning_item import (
     Content as ResponseReasoningTextContent,
@@ -94,6 +100,7 @@ from vllm.entrypoints.openai.protocol import (
     ResponsesResponse,
     ResponseUsage,
     StreamingResponsesResponse,
+    VLLMValidationError,
 )
 from vllm.entrypoints.openai.serving_engine import (
     GenerationError,
@@ -104,7 +111,6 @@ from vllm.entrypoints.responses_utils import (
     construct_input_messages,
     construct_tool_dicts,
     extract_tool_types,
-    make_response_output_items_from_parsable_context,
 )
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt
@@ -119,6 +125,23 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)
 
 
+@dataclass
+class HarmonyStreamingState:
+    """Mutable state for harmony streaming event processing."""
+
+    current_content_index: int = -1
+    current_output_index: int = 0
+    current_item_id: str = ""
+    sent_output_item_added: bool = False
+    is_first_function_call_delta: bool = False
+
+    def reset_for_new_item(self) -> None:
+        """Reset state when expecting a new output item."""
+        self.current_output_index += 1
+        self.sent_output_item_added = False
+        self.is_first_function_call_delta = False
+
+
 def _extract_allowed_tools_from_mcp_requests(
     tools: list[Tool],
 ) -> dict[str, list[str] | None]:
@@ -272,6 +295,7 @@ class OpenAIServingResponses(OpenAIServing):
                 err_type="invalid_request_error",
                 message=error_message,
                 status_code=HTTPStatus.BAD_REQUEST,
+                param="input",
             )
         return None
 
@@ -283,6 +307,7 @@ class OpenAIServingResponses(OpenAIServing):
                 err_type="invalid_request_error",
                 message="logprobs are not supported with gpt-oss models",
                 status_code=HTTPStatus.BAD_REQUEST,
+                param="logprobs",
             )
         if request.store and not self.enable_store and request.background:
             return self.create_error_response(
@@ -295,6 +320,7 @@ class OpenAIServingResponses(OpenAIServing):
                     "the vLLM server."
                 ),
                 status_code=HTTPStatus.BAD_REQUEST,
+                param="background",
             )
         if request.previous_input_messages and request.previous_response_id:
             return self.create_error_response(
@@ -302,6 +328,7 @@ class OpenAIServingResponses(OpenAIServing):
                 message="Only one of `previous_input_messages` and "
                 "`previous_response_id` can be set.",
                 status_code=HTTPStatus.BAD_REQUEST,
+                param="previous_response_id",
             )
         return None
 
@@ -369,7 +396,7 @@ class OpenAIServingResponses(OpenAIServing):
             NotImplementedError,
         ) as e:
             logger.exception("Error in preprocessing prompt inputs")
-            return self.create_error_response(f"{e} {e.__cause__}")
+            return self.create_error_response(e)
 
         request_metadata = RequestResponseMetadata(request_id=request.request_id)
         if raw_request:
@@ -458,8 +485,7 @@ class OpenAIServingResponses(OpenAIServing):
                 )
                 generators.append(generator)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
         assert len(generators) == 1
         (result_generator,) = generators
@@ -547,7 +573,7 @@ class OpenAIServingResponses(OpenAIServing):
         except GenerationError as e:
             return self._convert_generation_error_to_response(e)
         except Exception as e:
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
     async def _make_request(
         self,
@@ -563,6 +589,13 @@ class OpenAIServingResponses(OpenAIServing):
             prev_msg=self.msg_store.get(prev_response.id) if prev_response else None,
             prev_response_output=prev_response.output if prev_response else None,
         )
+
+        chat_template_kwargs = dict(
+            reasoning_effort=None
+            if request.reasoning is None
+            else request.reasoning.effort
+        )
+
         _, engine_prompts = await self._preprocess_chat(
             request,
             tokenizer,
@@ -571,6 +604,7 @@ class OpenAIServingResponses(OpenAIServing):
             tool_parser=self.tool_parser,
             chat_template=self.chat_template,
             chat_template_content_format=self.chat_template_content_format,
+            chat_template_kwargs=chat_template_kwargs,
         )
         return messages, engine_prompts
 
@@ -631,8 +665,7 @@ class OpenAIServingResponses(OpenAIServing):
             except asyncio.CancelledError:
                 return self.create_error_response("Client disconnected")
             except ValueError as e:
-                # TODO: Use a vllm-specific Validation Error
-                return self.create_error_response(str(e))
+                return self.create_error_response(e)
 
         # NOTE: Implementation of stauts is still WIP, but for now
         # we guarantee that if the status is not "completed", it is accurate.
@@ -658,24 +691,23 @@ class OpenAIServingResponses(OpenAIServing):
             else:
                 status = "incomplete"
         elif isinstance(context, ParsableContext):
-            response_messages = context.parser.response_messages[
-                context.parser.num_init_messages :
-            ]
-            output = make_response_output_items_from_parsable_context(response_messages)
+            output = context.parser.make_response_output_items_from_parsable_context()
 
-            # TODO: context for non-gptoss models doesn't use messages
-            # so we can't get them out yet
             if request.enable_response_messages:
-                raise NotImplementedError(
-                    "enable_response_messages is currently only supported for gpt-oss"
-                )
+                input_messages = context.input_messages
+                output_messages = context.output_messages
 
             # TODO: Calculate usage.
             # assert final_res.prompt_token_ids is not None
             num_tool_output_tokens = 0
+
+            # Check finish reason from the parser
+            if context.parser.finish_reason == "length":
+                status = "incomplete"
         else:
             assert isinstance(context, SimpleContext)
-            final_res = context.last_output
+            # Use final_output which has accumulated text/token_ids/logprobs
+            final_res = context.final_output
             assert final_res is not None
             assert len(final_res.outputs) == 1
             final_output = final_res.outputs[0]
@@ -683,6 +715,10 @@ class OpenAIServingResponses(OpenAIServing):
             # finish_reason='error' indicates retryable internal error
             self._raise_if_error(final_output.finish_reason, request.request_id)
 
+            # Check if generation was stopped due to max_tokens
+            if final_output.finish_reason == "length":
+                status = "incomplete"
+
             output = self._make_response_output_items(request, final_output, tokenizer)
 
             if request.enable_response_messages:
@@ -743,6 +779,26 @@ class OpenAIServingResponses(OpenAIServing):
                     self.response_store[response.id] = response
         return response
 
+    def _is_mcp_tool_by_namespace(self, recipient: str | None) -> bool:
+        """
+        Determine if a tool call is an MCP tool based on recipient prefix.
+
+        - Tools starting with "functions." are function calls
+        - Everything else is an MCP tool
+        """
+        if recipient is None:
+            return False
+
+        # Function calls have "functions." prefix
+        # Everything else is an MCP tool
+        return not recipient.startswith("functions.")
+
+    _TOOL_NAME_TO_MCP_SERVER_LABEL: Final[dict[str, str]] = {
+        "python": "code_interpreter",
+        "container": "container",
+        "browser": "web_search_preview",
+    }
+
     def _topk_logprobs(
         self,
         logprobs: dict[int, SampleLogprob],
@@ -946,9 +1002,23 @@ class OpenAIServingResponses(OpenAIServing):
             output_items.extend(last_items)
         return output_items
 
+    def _extract_system_message_from_request(self, request) -> str | None:
+        system_msg = None
+        if not isinstance(request.input, str):
+            for response_msg in request.input:
+                if (
+                    isinstance(response_msg, dict)
+                    and response_msg.get("role") == "system"
+                ):
+                    system_msg = response_msg.get("content")
+                    break
+        return system_msg
+
     def _construct_harmony_system_input_message(
         self, request: ResponsesRequest, with_custom_tools: bool, tool_types: set[str]
     ) -> OpenAIHarmonyMessage:
+        model_identity = self._extract_system_message_from_request(request)
+
         reasoning_effort = request.reasoning.effort if request.reasoning else None
 
         # Extract allowed_tools from MCP tool requests
@@ -985,6 +1055,7 @@ class OpenAIServingResponses(OpenAIServing):
         )
 
         sys_msg = get_system_message(
+            model_identity=model_identity,
             reasoning_effort=reasoning_effort,
             browser_description=browser_description,
             python_description=python_description,
@@ -1039,8 +1110,7 @@ class OpenAIServingResponses(OpenAIServing):
                     del prev_msgs[prev_final_msg_idx + 1 :]
                     for msg in recent_turn_msgs:
                         assert isinstance(msg, OpenAIHarmonyMessage)
-                        if msg.channel != "analysis":
-                            prev_msgs.append(msg)
+                        prev_msgs.append(msg)
             messages.extend(prev_msgs)
         # Append the new input.
         # Responses API supports simple text inputs without chat format.
@@ -1052,7 +1122,10 @@ class OpenAIServingResponses(OpenAIServing):
             else:
                 prev_outputs = []
             for response_msg in request.input:
-                messages.append(parse_response_input(response_msg, prev_outputs))
+                new_msg = parse_response_input(response_msg, prev_outputs)
+                if new_msg.author.role != "system":
+                    messages.append(new_msg)
+
                 # User passes in a tool call request and its output. We need
                 # to add the tool call request to prev_outputs so that the
                 # parse_response_input can find the tool call request when
@@ -1080,7 +1153,7 @@ class OpenAIServingResponses(OpenAIServing):
             response = self._convert_generation_error_to_response(e)
         except Exception as e:
             logger.exception("Background request failed for %s", request.request_id)
-            response = self.create_error_response(str(e))
+            response = self.create_error_response(e)
         finally:
             new_event_signal.set()
 
@@ -1105,7 +1178,7 @@ class OpenAIServingResponses(OpenAIServing):
             response = self._convert_generation_error_to_response(e)
         except Exception as e:
             logger.exception("Background request failed for %s", request.request_id)
-            response = self.create_error_response(str(e))
+            response = self.create_error_response(e)
 
         if isinstance(response, ErrorResponse):
             # If the request has failed, update the status to "failed".
@@ -1122,7 +1195,11 @@ class OpenAIServingResponses(OpenAIServing):
         starting_after: int | None = None,
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
         if response_id not in self.event_store:
-            raise ValueError(f"Unknown response_id: {response_id}")
+            raise VLLMValidationError(
+                f"Unknown response_id: {response_id}",
+                parameter="response_id",
+                value=response_id,
+            )
 
         event_deque, new_event_signal = self.event_store[response_id]
         start_index = 0 if starting_after is None else starting_after + 1
@@ -1178,6 +1255,7 @@ class OpenAIServingResponses(OpenAIServing):
                 return self.create_error_response(
                     err_type="invalid_request_error",
                     message="Cannot cancel a synchronous response.",
+                    param="response_id",
                 )
 
             # Update the status to "cancelled".
@@ -1197,6 +1275,7 @@ class OpenAIServingResponses(OpenAIServing):
             err_type="invalid_request_error",
             message=f"Response with id '{response_id}' not found.",
             status_code=HTTPStatus.NOT_FOUND,
+            param="response_id",
         )
 
     def _make_store_not_supported_error(self) -> ErrorResponse:
@@ -1209,6 +1288,7 @@ class OpenAIServingResponses(OpenAIServing):
                 "starting the vLLM server."
             ),
             status_code=HTTPStatus.BAD_REQUEST,
+            param="store",
         )
 
     async def _process_simple_streaming_events(
@@ -1516,6 +1596,816 @@ class OpenAIServingResponses(OpenAIServing):
                     )
                 )
 
+    def _emit_function_call_done_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events when a function call completes."""
+        function_name = previous_item.recipient[len("functions.") :]
+        events = []
+        events.append(
+            ResponseFunctionCallArgumentsDoneEvent(
+                type="response.function_call_arguments.done",
+                arguments=previous_item.content[0].text,
+                name=function_name,
+                item_id=state.current_item_id,
+                output_index=state.current_output_index,
+                sequence_number=-1,
+            )
+        )
+        function_call_item = ResponseFunctionToolCall(
+            type="function_call",
+            arguments=previous_item.content[0].text,
+            name=function_name,
+            item_id=state.current_item_id,
+            output_index=state.current_output_index,
+            sequence_number=-1,
+            call_id=f"fc_{random_uuid()}",
+            status="completed",
+        )
+        events.append(
+            ResponseOutputItemDoneEvent(
+                type="response.output_item.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=function_call_item,
+            )
+        )
+        return events
+
+    def _emit_mcp_call_done_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events when an MCP tool call completes."""
+        server_label = self._TOOL_NAME_TO_MCP_SERVER_LABEL.get(
+            previous_item.recipient, previous_item.recipient
+        )
+        events = []
+        events.append(
+            ResponseMcpCallArgumentsDoneEvent(
+                type="response.mcp_call_arguments.done",
+                arguments=previous_item.content[0].text,
+                name=previous_item.recipient,
+                item_id=state.current_item_id,
+                output_index=state.current_output_index,
+                sequence_number=-1,
+            )
+        )
+        events.append(
+            ResponseMcpCallCompletedEvent(
+                type="response.mcp_call.completed",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+            )
+        )
+        events.append(
+            ResponseOutputItemDoneEvent(
+                type="response.output_item.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=McpCall(
+                    type="mcp_call",
+                    arguments=previous_item.content[0].text,
+                    name=previous_item.recipient,
+                    id=state.current_item_id,
+                    server_label=server_label,
+                    status="completed",
+                ),
+            )
+        )
+        return events
+
+    def _emit_reasoning_done_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events when a reasoning (analysis) item completes."""
+        content = ResponseReasoningTextContent(
+            text=previous_item.content[0].text,
+            type="reasoning_text",
+        )
+        reasoning_item = ResponseReasoningItem(
+            type="reasoning",
+            content=[content],
+            status="completed",
+            id=state.current_item_id,
+            summary=[],
+        )
+        events = []
+        events.append(
+            ResponseReasoningTextDoneEvent(
+                type="response.reasoning_text.done",
+                item_id=state.current_item_id,
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                content_index=state.current_content_index,
+                text=previous_item.content[0].text,
+            )
+        )
+        events.append(
+            ResponseReasoningPartDoneEvent(
+                type="response.reasoning_part.done",
+                sequence_number=-1,
+                item_id=state.current_item_id,
+                output_index=state.current_output_index,
+                content_index=state.current_content_index,
+                part=content,
+            )
+        )
+        events.append(
+            ResponseOutputItemDoneEvent(
+                type="response.output_item.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=reasoning_item,
+            )
+        )
+        return events
+
+    def _emit_text_output_done_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events when a final text output item completes."""
+        text_content = ResponseOutputText(
+            type="output_text",
+            text=previous_item.content[0].text,
+            annotations=[],
+        )
+        events = []
+        events.append(
+            ResponseTextDoneEvent(
+                type="response.output_text.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                content_index=state.current_content_index,
+                text=previous_item.content[0].text,
+                logprobs=[],
+                item_id=state.current_item_id,
+            )
+        )
+        events.append(
+            ResponseContentPartDoneEvent(
+                type="response.content_part.done",
+                sequence_number=-1,
+                item_id=state.current_item_id,
+                output_index=state.current_output_index,
+                content_index=state.current_content_index,
+                part=text_content,
+            )
+        )
+        events.append(
+            ResponseOutputItemDoneEvent(
+                type="response.output_item.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=ResponseOutputMessage(
+                    id=state.current_item_id,
+                    type="message",
+                    role="assistant",
+                    content=[text_content],
+                    status="completed",
+                ),
+            )
+        )
+        return events
+
+    def _emit_previous_item_done_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit done events for the previous item when expecting a new start."""
+        if previous_item.recipient is not None:
+            # Deal with tool call
+            if previous_item.recipient.startswith("functions."):
+                return self._emit_function_call_done_events(previous_item, state)
+            elif (
+                self._is_mcp_tool_by_namespace(previous_item.recipient)
+                and state.current_item_id is not None
+                and state.current_item_id.startswith("mcp_")
+            ):
+                return self._emit_mcp_call_done_events(previous_item, state)
+        elif previous_item.channel == "analysis":
+            return self._emit_reasoning_done_events(previous_item, state)
+        elif previous_item.channel == "final":
+            return self._emit_text_output_done_events(previous_item, state)
+        return []
+
+    def _emit_final_channel_delta_events(
+        self,
+        ctx: StreamingHarmonyContext,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for final channel text delta streaming."""
+        events = []
+        if not state.sent_output_item_added:
+            state.sent_output_item_added = True
+            state.current_item_id = f"msg_{random_uuid()}"
+            events.append(
+                ResponseOutputItemAddedEvent(
+                    type="response.output_item.added",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item=ResponseOutputMessage(
+                        id=state.current_item_id,
+                        type="message",
+                        role="assistant",
+                        content=[],
+                        status="in_progress",
+                    ),
+                )
+            )
+            state.current_content_index += 1
+            events.append(
+                ResponseContentPartAddedEvent(
+                    type="response.content_part.added",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item_id=state.current_item_id,
+                    content_index=state.current_content_index,
+                    part=ResponseOutputText(
+                        type="output_text",
+                        text="",
+                        annotations=[],
+                        logprobs=[],
+                    ),
+                )
+            )
+        events.append(
+            ResponseTextDeltaEvent(
+                type="response.output_text.delta",
+                sequence_number=-1,
+                content_index=state.current_content_index,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+                delta=ctx.last_content_delta,
+                # TODO, use logprobs from ctx.last_request_output
+                logprobs=[],
+            )
+        )
+        return events
+
+    def _emit_analysis_channel_delta_events(
+        self,
+        ctx: StreamingHarmonyContext,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for analysis channel reasoning delta streaming."""
+        events = []
+        if not state.sent_output_item_added:
+            state.sent_output_item_added = True
+            state.current_item_id = f"msg_{random_uuid()}"
+            events.append(
+                ResponseOutputItemAddedEvent(
+                    type="response.output_item.added",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item=ResponseReasoningItem(
+                        type="reasoning",
+                        id=state.current_item_id,
+                        summary=[],
+                        status="in_progress",
+                    ),
+                )
+            )
+            state.current_content_index += 1
+            events.append(
+                ResponseReasoningPartAddedEvent(
+                    type="response.reasoning_part.added",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item_id=state.current_item_id,
+                    content_index=state.current_content_index,
+                    part=ResponseReasoningTextContent(
+                        text="",
+                        type="reasoning_text",
+                    ),
+                )
+            )
+        events.append(
+            ResponseReasoningTextDeltaEvent(
+                type="response.reasoning_text.delta",
+                item_id=state.current_item_id,
+                output_index=state.current_output_index,
+                content_index=state.current_content_index,
+                delta=ctx.last_content_delta,
+                sequence_number=-1,
+            )
+        )
+        return events
+
+    def _emit_mcp_tool_delta_events(
+        self,
+        ctx: StreamingHarmonyContext,
+        state: HarmonyStreamingState,
+        recipient: str,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for MCP tool delta streaming."""
+        server_label = self._TOOL_NAME_TO_MCP_SERVER_LABEL.get(recipient, recipient)
+        events = []
+        if not state.sent_output_item_added:
+            state.sent_output_item_added = True
+            state.current_item_id = f"mcp_{random_uuid()}"
+            events.append(
+                ResponseOutputItemAddedEvent(
+                    type="response.output_item.added",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item=McpCall(
+                        type="mcp_call",
+                        id=state.current_item_id,
+                        name=recipient,
+                        arguments="",
+                        server_label=server_label,
+                        status="in_progress",
+                    ),
+                )
+            )
+            events.append(
+                ResponseMcpCallInProgressEvent(
+                    type="response.mcp_call.in_progress",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item_id=state.current_item_id,
+                )
+            )
+        events.append(
+            ResponseMcpCallArgumentsDeltaEvent(
+                type="response.mcp_call_arguments.delta",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+                delta=ctx.last_content_delta,
+            )
+        )
+        return events
+
+    def _emit_code_interpreter_delta_events(
+        self,
+        ctx: StreamingHarmonyContext,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for code interpreter delta streaming."""
+        events = []
+        if not state.sent_output_item_added:
+            state.sent_output_item_added = True
+            state.current_item_id = f"tool_{random_uuid()}"
+            events.append(
+                ResponseOutputItemAddedEvent(
+                    type="response.output_item.added",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item=ResponseCodeInterpreterToolCallParam(
+                        type="code_interpreter_call",
+                        id=state.current_item_id,
+                        code=None,
+                        container_id="auto",
+                        outputs=None,
+                        status="in_progress",
+                    ),
+                )
+            )
+            events.append(
+                ResponseCodeInterpreterCallInProgressEvent(
+                    type="response.code_interpreter_call.in_progress",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item_id=state.current_item_id,
+                )
+            )
+        events.append(
+            ResponseCodeInterpreterCallCodeDeltaEvent(
+                type="response.code_interpreter_call_code.delta",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+                delta=ctx.last_content_delta,
+            )
+        )
+        return events
+
+    def _emit_mcp_prefix_delta_events(
+        self,
+        ctx: StreamingHarmonyContext,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for MCP prefix (mcp.*) delta streaming."""
+        events = []
+        if not state.sent_output_item_added:
+            state.sent_output_item_added = True
+            state.current_item_id = f"mcp_{random_uuid()}"
+            mcp_name = ctx.parser.current_recipient[len("mcp.") :]
+
+            events.append(
+                ResponseOutputItemAddedEvent(
+                    type="response.output_item.added",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item=McpCall(
+                        type="mcp_call",
+                        id=state.current_item_id,
+                        name=mcp_name,
+                        arguments="",
+                        server_label=mcp_name,
+                        status="in_progress",
+                    ),
+                )
+            )
+            events.append(
+                ResponseMcpCallInProgressEvent(
+                    type="response.mcp_call.in_progress",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item_id=state.current_item_id,
+                )
+            )
+
+        events.append(
+            ResponseMcpCallArgumentsDeltaEvent(
+                type="response.mcp_call_arguments.delta",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+                delta=ctx.last_content_delta,
+            )
+        )
+        return events
+
+    def _emit_content_delta_events(
+        self,
+        ctx: StreamingHarmonyContext,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for content delta streaming based on channel type."""
+        if not ctx.last_content_delta:
+            return []
+
+        if (
+            ctx.parser.current_channel == "final"
+            and ctx.parser.current_recipient is None
+        ):
+            return self._emit_final_channel_delta_events(ctx, state)
+        elif (
+            ctx.parser.current_channel == "analysis"
+            and ctx.parser.current_recipient is None
+        ):
+            return self._emit_analysis_channel_delta_events(ctx, state)
+        # built-in tools will be triggered on the analysis channel
+        # However, occasionally built-in tools will
+        # still be output to commentary.
+        elif (
+            ctx.parser.current_channel == "commentary"
+            or ctx.parser.current_channel == "analysis"
+        ) and ctx.parser.current_recipient is not None:
+            recipient = ctx.parser.current_recipient
+            # Check for function calls first - they have their own event handling
+            if recipient.startswith("functions."):
+                return self._emit_function_call_delta_events(ctx, state)
+            is_mcp_tool = self._is_mcp_tool_by_namespace(recipient)
+            if is_mcp_tool:
+                return self._emit_mcp_tool_delta_events(ctx, state, recipient)
+            else:
+                return self._emit_code_interpreter_delta_events(ctx, state)
+        elif (
+            (
+                ctx.parser.current_channel == "commentary"
+                or ctx.parser.current_channel == "analysis"
+            )
+            and ctx.parser.current_recipient is not None
+            and ctx.parser.current_recipient.startswith("mcp.")
+        ):
+            return self._emit_mcp_prefix_delta_events(ctx, state)
+
+        return []
+
+    def _emit_browser_tool_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for browser tool calls (web search)."""
+        function_name = previous_item.recipient[len("browser.") :]
+        parsed_args = json.loads(previous_item.content[0].text)
+        action = None
+
+        if function_name == "search":
+            action = response_function_web_search.ActionSearch(
+                type="search",
+                query=parsed_args["query"],
+            )
+        elif function_name == "open":
+            action = response_function_web_search.ActionOpenPage(
+                type="open_page",
+                # TODO: translate to url
+                url=f"cursor:{parsed_args.get('cursor', '')}",
+            )
+        elif function_name == "find":
+            action = response_function_web_search.ActionFind(
+                type="find",
+                pattern=parsed_args["pattern"],
+                # TODO: translate to url
+                url=f"cursor:{parsed_args.get('cursor', '')}",
+            )
+        else:
+            raise ValueError(f"Unknown function name: {function_name}")
+
+        state.current_item_id = f"tool_{random_uuid()}"
+        events = []
+        events.append(
+            ResponseOutputItemAddedEvent(
+                type="response.output_item.added",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=response_function_web_search.ResponseFunctionWebSearch(
+                    # TODO: generate a unique id for web search call
+                    type="web_search_call",
+                    id=state.current_item_id,
+                    action=action,
+                    status="in_progress",
+                ),
+            )
+        )
+        events.append(
+            ResponseWebSearchCallInProgressEvent(
+                type="response.web_search_call.in_progress",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+            )
+        )
+        events.append(
+            ResponseWebSearchCallSearchingEvent(
+                type="response.web_search_call.searching",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+            )
+        )
+        # enqueue
+        events.append(
+            ResponseWebSearchCallCompletedEvent(
+                type="response.web_search_call.completed",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+            )
+        )
+        events.append(
+            ResponseOutputItemDoneEvent(
+                type="response.output_item.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=ResponseFunctionWebSearch(
+                    type="web_search_call",
+                    id=state.current_item_id,
+                    action=action,
+                    status="completed",
+                ),
+            )
+        )
+        return events
+
+    def _emit_mcp_tool_completion_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events when an MCP tool completes during assistant action turn."""
+        recipient = previous_item.recipient
+        server_label = self._TOOL_NAME_TO_MCP_SERVER_LABEL.get(recipient, recipient)
+        events = []
+        events.append(
+            ResponseMcpCallArgumentsDoneEvent(
+                type="response.mcp_call_arguments.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+                arguments=previous_item.content[0].text,
+                name=recipient,
+            )
+        )
+        events.append(
+            ResponseMcpCallCompletedEvent(
+                type="response.mcp_call.completed",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+            )
+        )
+        events.append(
+            ResponseOutputItemDoneEvent(
+                type="response.output_item.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=McpCall(
+                    type="mcp_call",
+                    id=state.current_item_id,
+                    name=recipient,
+                    arguments=previous_item.content[0].text,
+                    server_label=server_label,
+                    status="completed",
+                ),
+            )
+        )
+        return events
+
+    def _emit_code_interpreter_completion_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events when code interpreter completes."""
+        events = []
+        events.append(
+            ResponseCodeInterpreterCallCodeDoneEvent(
+                type="response.code_interpreter_call_code.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+                code=previous_item.content[0].text,
+            )
+        )
+        events.append(
+            ResponseCodeInterpreterCallInterpretingEvent(
+                type="response.code_interpreter_call.interpreting",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+            )
+        )
+        events.append(
+            ResponseCodeInterpreterCallCompletedEvent(
+                type="response.code_interpreter_call.completed",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+            )
+        )
+        events.append(
+            ResponseOutputItemDoneEvent(
+                type="response.output_item.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=ResponseCodeInterpreterToolCallParam(
+                    type="code_interpreter_call",
+                    id=state.current_item_id,
+                    code=previous_item.content[0].text,
+                    container_id="auto",
+                    outputs=[],
+                    status="completed",
+                ),
+            )
+        )
+        return events
+
+    def _emit_mcp_prefix_completion_events(
+        self,
+        previous_item,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events when an MCP prefix tool (mcp.*) completes."""
+        mcp_name = previous_item.recipient[len("mcp.") :]
+        events = []
+        events.append(
+            ResponseMcpCallArgumentsDoneEvent(
+                type="response.mcp_call_arguments.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+                arguments=previous_item.content[0].text,
+                name=mcp_name,
+            )
+        )
+        events.append(
+            ResponseMcpCallCompletedEvent(
+                type="response.mcp_call.completed",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item_id=state.current_item_id,
+            )
+        )
+        events.append(
+            ResponseOutputItemDoneEvent(
+                type="response.output_item.done",
+                sequence_number=-1,
+                output_index=state.current_output_index,
+                item=McpCall(
+                    type="mcp_call",
+                    id=state.current_item_id,
+                    name=mcp_name,
+                    arguments=previous_item.content[0].text,
+                    server_label=mcp_name,
+                    status="completed",
+                ),
+            )
+        )
+        return events
+
+    def _emit_tool_action_events(
+        self,
+        ctx: StreamingHarmonyContext,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for tool action turn."""
+        if not ctx.is_assistant_action_turn() or len(ctx.parser.messages) == 0:
+            return []
+
+        events = []
+        previous_item = ctx.parser.messages[-1]
+
+        # Handle browser tool
+        if (
+            self.tool_server is not None
+            and self.tool_server.has_tool("browser")
+            and previous_item.recipient is not None
+            and previous_item.recipient.startswith("browser.")
+        ):
+            events.extend(self._emit_browser_tool_events(previous_item, state))
+
+        # Handle tool completion
+        if (
+            self.tool_server is not None
+            and previous_item.recipient is not None
+            and state.current_item_id is not None
+            and state.sent_output_item_added
+        ):
+            recipient = previous_item.recipient
+            # Handle MCP prefix tool completion first
+            if recipient.startswith("mcp."):
+                events.extend(
+                    self._emit_mcp_prefix_completion_events(previous_item, state)
+                )
+            else:
+                # Handle other MCP tool and code interpreter completion
+                is_mcp_tool = self._is_mcp_tool_by_namespace(
+                    recipient
+                ) and state.current_item_id.startswith("mcp_")
+                if is_mcp_tool:
+                    events.extend(
+                        self._emit_mcp_tool_completion_events(previous_item, state)
+                    )
+                else:
+                    events.extend(
+                        self._emit_code_interpreter_completion_events(
+                            previous_item, state
+                        )
+                    )
+
+        return events
+
+    def _emit_function_call_delta_events(
+        self,
+        ctx: StreamingHarmonyContext,
+        state: HarmonyStreamingState,
+    ) -> list[StreamingResponsesResponse]:
+        """Emit events for developer function calls on commentary channel."""
+        if not (
+            ctx.parser.current_channel == "commentary"
+            and ctx.parser.current_recipient
+            and ctx.parser.current_recipient.startswith("functions.")
+        ):
+            return []
+
+        events = []
+        if state.is_first_function_call_delta is False:
+            state.is_first_function_call_delta = True
+            fc_name = ctx.parser.current_recipient[len("functions.") :]
+            state.current_item_id = f"fc_{random_uuid()}"
+            tool_call_item = ResponseFunctionToolCall(
+                name=fc_name,
+                type="function_call",
+                id=state.current_item_id,
+                call_id=f"call_{random_uuid()}",
+                arguments="",
+                status="in_progress",
+            )
+            events.append(
+                ResponseOutputItemAddedEvent(
+                    type="response.output_item.added",
+                    sequence_number=-1,
+                    output_index=state.current_output_index,
+                    item=tool_call_item,
+                )
+            )
+        # Always emit the delta (including on first call)
+        events.append(
+            ResponseFunctionCallArgumentsDeltaEvent(
+                item_id=state.current_item_id,
+                delta=ctx.last_content_delta,
+                output_index=state.current_output_index,
+                sequence_number=-1,
+                type="response.function_call_arguments.delta",
+            )
+        )
+        return events
+
     async def _process_harmony_streaming_events(
         self,
         request: ResponsesRequest,
@@ -1530,11 +2420,8 @@ class OpenAIServingResponses(OpenAIServing):
             [StreamingResponsesResponse], StreamingResponsesResponse
         ],
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
-        current_content_index = -1
-        current_output_index = 0
-        current_item_id: str = ""
-        sent_output_item_added = False
-        is_first_function_call_delta = False
+        state = HarmonyStreamingState()
+
         async for ctx in result_generator:
             assert isinstance(ctx, StreamingHarmonyContext)
 
@@ -1542,435 +2429,21 @@ class OpenAIServingResponses(OpenAIServing):
             self._raise_if_error(ctx.finish_reason, request.request_id)
 
             if ctx.is_expecting_start():
-                current_output_index += 1
-                sent_output_item_added = False
-                is_first_function_call_delta = False
                 if len(ctx.parser.messages) > 0:
                     previous_item = ctx.parser.messages[-1]
-                    if previous_item.recipient is not None:
-                        # Deal with tool call
-                        if previous_item.recipient.startswith("functions."):
-                            function_name = previous_item.recipient[len("functions.") :]
-                            yield _increment_sequence_number_and_return(
-                                ResponseFunctionCallArgumentsDoneEvent(
-                                    type="response.function_call_arguments.done",
-                                    arguments=previous_item.content[0].text,
-                                    name=function_name,
-                                    item_id=current_item_id,
-                                    output_index=current_output_index,
-                                    sequence_number=-1,
-                                )
-                            )
-                            function_call_item = ResponseFunctionToolCall(
-                                type="function_call",
-                                arguments=previous_item.content[0].text,
-                                name=function_name,
-                                item_id=current_item_id,
-                                output_index=current_output_index,
-                                sequence_number=-1,
-                                call_id=f"fc_{random_uuid()}",
-                                status="completed",
-                            )
-                            yield _increment_sequence_number_and_return(
-                                ResponseOutputItemDoneEvent(
-                                    type="response.output_item.done",
-                                    sequence_number=-1,
-                                    output_index=current_output_index,
-                                    item=function_call_item,
-                                )
-                            )
-                    elif previous_item.channel == "analysis":
-                        content = ResponseReasoningTextContent(
-                            text=previous_item.content[0].text,
-                            type="reasoning_text",
-                        )
-                        reasoning_item = ResponseReasoningItem(
-                            type="reasoning",
-                            content=[content],
-                            status="completed",
-                            id=current_item_id,
-                            summary=[],
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseReasoningTextDoneEvent(
-                                type="response.reasoning_text.done",
-                                item_id=current_item_id,
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                content_index=current_content_index,
-                                text=previous_item.content[0].text,
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseReasoningPartDoneEvent(
-                                type="response.reasoning_part.done",
-                                sequence_number=-1,
-                                item_id=current_item_id,
-                                output_index=current_output_index,
-                                content_index=current_content_index,
-                                part=content,
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemDoneEvent(
-                                type="response.output_item.done",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=reasoning_item,
-                            )
-                        )
-                    elif previous_item.channel == "final":
-                        text_content = ResponseOutputText(
-                            type="output_text",
-                            text=previous_item.content[0].text,
-                            annotations=[],
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseTextDoneEvent(
-                                type="response.output_text.done",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                content_index=current_content_index,
-                                text=previous_item.content[0].text,
-                                logprobs=[],
-                                item_id=current_item_id,
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseContentPartDoneEvent(
-                                type="response.content_part.done",
-                                sequence_number=-1,
-                                item_id=current_item_id,
-                                output_index=current_output_index,
-                                content_index=current_content_index,
-                                part=text_content,
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemDoneEvent(
-                                type="response.output_item.done",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseOutputMessage(
-                                    id=current_item_id,
-                                    type="message",
-                                    role="assistant",
-                                    content=[text_content],
-                                    status="completed",
-                                ),
-                            )
-                        )
+                    for event in self._emit_previous_item_done_events(
+                        previous_item, state
+                    ):
+                        yield _increment_sequence_number_and_return(event)
+                state.reset_for_new_item()
 
-            # stream the output of a harmony message
-            if ctx.parser.last_content_delta:
-                if (
-                    ctx.parser.current_channel == "final"
-                    and ctx.parser.current_recipient is None
-                ):
-                    if not sent_output_item_added:
-                        sent_output_item_added = True
-                        current_item_id = f"msg_{random_uuid()}"
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemAddedEvent(
-                                type="response.output_item.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseOutputMessage(
-                                    id=current_item_id,
-                                    type="message",
-                                    role="assistant",
-                                    content=[],
-                                    status="in_progress",
-                                ),
-                            )
-                        )
-                        current_content_index += 1
-                        yield _increment_sequence_number_and_return(
-                            ResponseContentPartAddedEvent(
-                                type="response.content_part.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item_id=current_item_id,
-                                content_index=current_content_index,
-                                part=ResponseOutputText(
-                                    type="output_text",
-                                    text="",
-                                    annotations=[],
-                                    logprobs=[],
-                                ),
-                            )
-                        )
-                    yield _increment_sequence_number_and_return(
-                        ResponseTextDeltaEvent(
-                            type="response.output_text.delta",
-                            sequence_number=-1,
-                            content_index=current_content_index,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                            delta=ctx.parser.last_content_delta,
-                            # TODO, use logprobs from ctx.last_request_output
-                            logprobs=[],
-                        )
-                    )
-                elif (
-                    ctx.parser.current_channel == "analysis"
-                    and ctx.parser.current_recipient is None
-                ):
-                    if not sent_output_item_added:
-                        sent_output_item_added = True
-                        current_item_id = f"msg_{random_uuid()}"
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemAddedEvent(
-                                type="response.output_item.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseReasoningItem(
-                                    type="reasoning",
-                                    id=current_item_id,
-                                    summary=[],
-                                    status="in_progress",
-                                ),
-                            )
-                        )
-                        current_content_index += 1
-                        yield _increment_sequence_number_and_return(
-                            ResponseReasoningPartAddedEvent(
-                                type="response.reasoning_part.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item_id=current_item_id,
-                                content_index=current_content_index,
-                                part=ResponseReasoningTextContent(
-                                    text="",
-                                    type="reasoning_text",
-                                ),
-                            )
-                        )
-                    yield _increment_sequence_number_and_return(
-                        ResponseReasoningTextDeltaEvent(
-                            type="response.reasoning_text.delta",
-                            item_id=current_item_id,
-                            output_index=current_output_index,
-                            content_index=current_content_index,
-                            delta=ctx.parser.last_content_delta,
-                            sequence_number=-1,
-                        )
-                    )
-                # built-in tools will be triggered on the analysis channel
-                # However, occasionally built-in tools will
-                # still be output to commentary.
-                elif (
-                    ctx.parser.current_channel == "commentary"
-                    or ctx.parser.current_channel == "analysis"
-                ) and ctx.parser.current_recipient == "python":
-                    if not sent_output_item_added:
-                        sent_output_item_added = True
-                        current_item_id = f"tool_{random_uuid()}"
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemAddedEvent(
-                                type="response.output_item.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseCodeInterpreterToolCallParam(
-                                    type="code_interpreter_call",
-                                    id=current_item_id,
-                                    code=None,
-                                    container_id="auto",
-                                    outputs=None,
-                                    status="in_progress",
-                                ),
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseCodeInterpreterCallInProgressEvent(
-                                type="response.code_interpreter_call.in_progress",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item_id=current_item_id,
-                            )
-                        )
-                    yield _increment_sequence_number_and_return(
-                        ResponseCodeInterpreterCallCodeDeltaEvent(
-                            type="response.code_interpreter_call_code.delta",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                            delta=ctx.parser.last_content_delta,
-                        )
-                    )
+            # Stream the output of a harmony message
+            for event in self._emit_content_delta_events(ctx, state):
+                yield _increment_sequence_number_and_return(event)
 
-            # stream tool call outputs
-            if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0:
-                previous_item = ctx.parser.messages[-1]
-                if (
-                    self.tool_server is not None
-                    and self.tool_server.has_tool("browser")
-                    and previous_item.recipient is not None
-                    and previous_item.recipient.startswith("browser.")
-                ):
-                    function_name = previous_item.recipient[len("browser.") :]
-                    action = None
-                    parsed_args = json.loads(previous_item.content[0].text)
-                    if function_name == "search":
-                        action = response_function_web_search.ActionSearch(
-                            type="search",
-                            query=parsed_args["query"],
-                        )
-                    elif function_name == "open":
-                        action = response_function_web_search.ActionOpenPage(
-                            type="open_page",
-                            # TODO: translate to url
-                            url=f"cursor:{parsed_args.get('cursor', '')}",
-                        )
-                    elif function_name == "find":
-                        action = response_function_web_search.ActionFind(
-                            type="find",
-                            pattern=parsed_args["pattern"],
-                            # TODO: translate to url
-                            url=f"cursor:{parsed_args.get('cursor', '')}",
-                        )
-                    else:
-                        raise ValueError(f"Unknown function name: {function_name}")
-
-                    current_item_id = f"tool_{random_uuid()}"
-                    yield _increment_sequence_number_and_return(
-                        ResponseOutputItemAddedEvent(
-                            type="response.output_item.added",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item=response_function_web_search.ResponseFunctionWebSearch(
-                                # TODO: generate a unique id for web search call
-                                type="web_search_call",
-                                id=current_item_id,
-                                action=action,
-                                status="in_progress",
-                            ),
-                        )
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseWebSearchCallInProgressEvent(
-                            type="response.web_search_call.in_progress",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                        )
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseWebSearchCallSearchingEvent(
-                            type="response.web_search_call.searching",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                        )
-                    )
-
-                    # enqueue
-                    yield _increment_sequence_number_and_return(
-                        ResponseWebSearchCallCompletedEvent(
-                            type="response.web_search_call.completed",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                        )
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseOutputItemDoneEvent(
-                            type="response.output_item.done",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item=ResponseFunctionWebSearch(
-                                type="web_search_call",
-                                id=current_item_id,
-                                action=action,
-                                status="completed",
-                            ),
-                        )
-                    )
-
-                if (
-                    self.tool_server is not None
-                    and self.tool_server.has_tool("python")
-                    and previous_item.recipient is not None
-                    and previous_item.recipient.startswith("python")
-                ):
-                    yield _increment_sequence_number_and_return(
-                        ResponseCodeInterpreterCallCodeDoneEvent(
-                            type="response.code_interpreter_call_code.done",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                            code=previous_item.content[0].text,
-                        )
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseCodeInterpreterCallInterpretingEvent(
-                            type="response.code_interpreter_call.interpreting",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                        )
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseCodeInterpreterCallCompletedEvent(
-                            type="response.code_interpreter_call.completed",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                        )
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseOutputItemDoneEvent(
-                            type="response.output_item.done",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item=ResponseCodeInterpreterToolCallParam(
-                                type="code_interpreter_call",
-                                id=current_item_id,
-                                code=previous_item.content[0].text,
-                                container_id="auto",
-                                # TODO: add outputs here
-                                outputs=[],
-                                status="completed",
-                            ),
-                        )
-                    )
-            # developer tools will be triggered on the commentary channel
-            # and recipient starts with "functions.TOOL_NAME"
-            if (
-                ctx.parser.current_channel == "commentary"
-                and ctx.parser.current_recipient
-                and ctx.parser.current_recipient.startswith("functions.")
-            ):
-                if is_first_function_call_delta is False:
-                    is_first_function_call_delta = True
-                    fc_name = ctx.parser.current_recipient[len("functions.") :]
-                    tool_call_item = ResponseFunctionToolCall(
-                        name=fc_name,
-                        type="function_call",
-                        id=current_item_id,
-                        call_id=f"call_{random_uuid()}",
-                        arguments="",
-                        status="in_progress",
-                    )
-                    current_item_id = f"fc_{random_uuid()}"
-                    yield _increment_sequence_number_and_return(
-                        ResponseOutputItemAddedEvent(
-                            type="response.output_item.added",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item=tool_call_item,
-                        )
-                    )
-                else:
-                    yield _increment_sequence_number_and_return(
-                        ResponseFunctionCallArgumentsDeltaEvent(
-                            item_id=current_item_id,
-                            delta=ctx.parser.last_content_delta,
-                            output_index=current_output_index,
-                            sequence_number=-1,
-                            type="response.function_call_arguments.delta",
-                        )
-                    )
+            # Stream tool call outputs
+            for event in self._emit_tool_action_events(ctx, state):
+                yield _increment_sequence_number_and_return(event)
 
     async def responses_stream_generator(
         self,
@@ -2001,7 +2474,6 @@ class OpenAIServingResponses(OpenAIServing):
             return event
 
         async with AsyncExitStack() as exit_stack:
-            processer = None
             if self.use_harmony:
                 # TODO: in streaming, we noticed this bug:
                 # https://github.com/vllm-project/vllm/issues/25697
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index cea9924ebbaca86468285ed801841cdfc315845e..b6332d1941c1d9b7c1619b79b2290feff284fa93 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -30,12 +30,13 @@ from vllm.entrypoints.openai.protocol import (
     TranslationSegment,
     TranslationStreamResponse,
     UsageInfo,
+    VLLMValidationError,
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
-from vllm.model_executor.models import SupportsTranscription
+from vllm.model_executor.models import SupportsTranscription, supports_transcription
 from vllm.outputs import RequestOutput
 from vllm.tokenizers import get_tokenizer
 from vllm.utils.import_utils import PlaceholderModule
@@ -112,6 +113,132 @@ class OpenAISpeechToText(OpenAIServing):
                 self.default_sampling_params,
             )
 
+        # Warm up audio preprocessing to avoid first-request latency
+        self._warmup_audio_preprocessing()
+        # Warm up input processor with dummy audio
+        self._warmup_input_processor()
+
+    def _warmup_audio_preprocessing(self) -> None:
+        """Warm up audio processing libraries to avoid first-request latency.
+
+        The first call to librosa functions (load, get_duration, mel-spectrogram)
+        triggers JIT compilation and library initialization which can take ~7s.
+        This method warms up these operations during server initialization.
+        """
+        # Skip warmup if librosa is not installed (optional dependency)
+        if isinstance(librosa, PlaceholderModule):
+            return
+
+        # Skip warmup if model doesn't support transcription
+        if not supports_transcription(self.model_cls):
+            return
+
+        try:
+            warmup_start = time.perf_counter()
+            logger.info("Warming up audio preprocessing libraries...")
+
+            # Create a minimal dummy audio (1 second of silence at target sample rate)
+            dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)
+
+            # Warm up librosa.load by using librosa functions on the dummy data
+            # This initializes FFTW, numba JIT, and other audio processing libraries
+            _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)
+
+            # Warm up mel-spectrogram computation with model-specific parameters
+            from vllm.transformers_utils.processor import (
+                cached_processor_from_config,
+            )
+
+            processor = cached_processor_from_config(self.model_config)
+            feature_extractor = None
+            if hasattr(processor, "feature_extractor"):
+                feature_extractor = processor.feature_extractor
+            elif hasattr(processor, "audio_processor"):
+                # For models like GraniteSpeech that use audio_processor
+                audio_proc = processor.audio_processor
+                if hasattr(audio_proc, "feature_extractor"):
+                    feature_extractor = audio_proc.feature_extractor
+                # If audio_processor doesn't have feature_extractor,
+                # skip mel-spectrogram warmup for these models
+
+            if feature_extractor is not None:
+                _ = librosa.feature.melspectrogram(
+                    y=dummy_audio,
+                    sr=self.asr_config.sample_rate,
+                    n_mels=getattr(feature_extractor, "n_mels", 128),
+                    n_fft=getattr(feature_extractor, "n_fft", 400),
+                    hop_length=getattr(feature_extractor, "hop_length", 160),
+                )
+
+            warmup_elapsed = time.perf_counter() - warmup_start
+            logger.info("Audio preprocessing warmup completed in %.2fs", warmup_elapsed)
+        except Exception:
+            # Don't fail initialization if warmup fails - log exception and continue
+            logger.exception(
+                "Audio preprocessing warmup failed (non-fatal): %s. "
+                "First request may experience higher latency.",
+            )
+
+    def _warmup_input_processor(self) -> None:
+        """Warm up input processor with dummy audio to avoid first-request latency.
+
+        The first call to input_processor.process_inputs() with multimodal audio
+        triggers multimodal processing initialization which can take ~2.5s.
+        This method processes a dummy audio request to warm up the pipeline.
+        """
+        # Skip warmup if model doesn't support transcription
+        if not supports_transcription(self.model_cls):
+            return
+
+        # Only warm up if model supports transcription methods
+        if not hasattr(self.model_cls, "get_generation_prompt"):
+            return
+
+        try:
+            from vllm.sampling_params import SamplingParams
+
+            warmup_start = time.perf_counter()
+            logger.info("Warming up multimodal input processor...")
+
+            # Create minimal dummy audio (1 second of silence)
+            dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)
+
+            # Use the same method that _preprocess_speech_to_text uses
+            # to create the prompt
+            dummy_prompt = self.model_cls.get_generation_prompt(
+                audio=dummy_audio,
+                stt_config=self.asr_config,
+                model_config=self.model_config,
+                language="en",
+                task_type=self.task_type,
+                request_prompt="",
+                to_language=None,
+            )
+
+            # Create minimal sampling params
+            dummy_params = SamplingParams(
+                max_tokens=1,
+                temperature=0.0,
+                skip_clone=True,  # Internal warmup, safe to skip clone
+            )
+
+            # Process the dummy input through the input processor
+            # This will trigger all the multimodal processing initialization
+            _ = self.input_processor.process_inputs(
+                request_id="warmup",
+                prompt=dummy_prompt,
+                params=dummy_params,
+            )
+
+            warmup_elapsed = time.perf_counter() - warmup_start
+            logger.info("Input processor warmup completed in %.2fs", warmup_elapsed)
+        except Exception:
+            # Don't fail initialization if warmup fails - log warning and continue
+            logger.exception(
+                "Input processor warmup failed (non-fatal): %s. "
+                "First request may experience higher latency."
+            )
+
     @cached_property
     def model_cls(self) -> type[SupportsTranscription]:
         from vllm.model_executor.model_loader import get_model_cls
@@ -134,7 +261,11 @@ class OpenAISpeechToText(OpenAIServing):
         )
 
         if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
-            raise ValueError("Maximum file size exceeded.")
+            raise VLLMValidationError(
+                "Maximum file size exceeded",
+                parameter="audio_filesize_mb",
+                value=len(audio_data) / 1024**2,
+            )
 
         with io.BytesIO(audio_data) as bytes_:
             # NOTE resample to model SR here for efficiency. This is also a
@@ -162,12 +293,18 @@ class OpenAISpeechToText(OpenAIServing):
             )
             if request.response_format == "verbose_json":
                 if not isinstance(prompt, dict):
-                    raise ValueError(f"Expected prompt to be a dict,got {type(prompt)}")
+                    raise VLLMValidationError(
+                        "Expected prompt to be a dict",
+                        parameter="prompt",
+                        value=type(prompt).__name__,
+                    )
                 prompt_dict = cast(dict, prompt)
                 decoder_prompt = prompt.get("decoder_prompt")
                 if not isinstance(decoder_prompt, str):
-                    raise ValueError(
-                        f"Expected decoder_prompt to bestr, got {type(decoder_prompt)}"
+                    raise VLLMValidationError(
+                        "Expected decoder_prompt to be str",
+                        parameter="decoder_prompt",
+                        value=type(decoder_prompt).__name__,
                     )
                 prompt_dict["decoder_prompt"] = decoder_prompt.replace(
                     "<|notimestamps|>", "<|0.00|>"
@@ -287,14 +424,20 @@ class OpenAISpeechToText(OpenAIServing):
 
         except ValueError as e:
             logger.exception("Error in preprocessing prompt inputs")
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
         list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
         try:
             # Unlike most decoder-only models, whisper generation length is not
             # constrained by the size of the input audio, which is mapped to a
-            # fixed-size log-mel-spectogram.
-            default_max_tokens = self.model_config.max_model_len
+            # fixed-size log-mel-spectogram. Still, allow for fewer tokens to be
+            # generated by respecting the extra completion tokens arg.
+            if request.max_completion_tokens is None:
+                default_max_tokens = self.model_config.max_model_len
+            else:
+                default_max_tokens = min(
+                    self.model_config.max_model_len, request.max_completion_tokens
+                )
             sampling_params = request.to_sampling_params(
                 default_max_tokens, self.default_sampling_params
             )
@@ -317,8 +460,7 @@ class OpenAISpeechToText(OpenAIServing):
                 for i, prompt in enumerate(prompts)
             ]
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
         if request.stream:
             return stream_generator_method(
@@ -335,7 +477,15 @@ class OpenAISpeechToText(OpenAIServing):
             }
             segment_class: type[SpeechToTextSegment] = segments_types[self.task_type]
             text = ""
+            chunk_size_in_s = self.asr_config.max_audio_clip_s
+            if chunk_size_in_s is None:
+                assert len(list_result_generator) == 1, (
+                    "`max_audio_clip_s` is set to None, audio cannot be chunked"
+                )
             for idx, result_generator in enumerate(list_result_generator):
+                start_time = (
+                    float(idx * chunk_size_in_s) if chunk_size_in_s is not None else 0.0
+                )
                 async for op in result_generator:
                     if request.response_format == "verbose_json":
                         segments: list[SpeechToTextSegment] = (
@@ -343,7 +493,7 @@ class OpenAISpeechToText(OpenAIServing):
                                 tokens=tuple(op.outputs[0].token_ids),
                                 segment_class=segment_class,
                                 request=request,
-                                start_time=idx * self.asr_config.max_audio_clip_s,
+                                start_time=start_time,
                             )
                         )
 
@@ -392,8 +542,7 @@ class OpenAISpeechToText(OpenAIServing):
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)
 
     async def _speech_to_text_stream_generator(
         self,
@@ -503,9 +652,8 @@ class OpenAISpeechToText(OpenAIServing):
             )
 
         except Exception as e:
-            # TODO: Use a vllm-specific Validation Error
             logger.exception("Error in %s stream generator.", self.task_type)
-            data = self.create_streaming_error_response(str(e))
+            data = self.create_streaming_error_response(e)
             yield f"data: {data}\n\n"
         # Send the final done message after all response.n are finished
         yield "data: [DONE]\n\n"
@@ -513,6 +661,10 @@ class OpenAISpeechToText(OpenAIServing):
     def _split_audio(
         self, audio_data: np.ndarray, sample_rate: int
     ) -> list[np.ndarray]:
+        assert self.asr_config.max_audio_clip_s is not None, (
+            f"{self.asr_config.max_audio_clip_s=} cannot be None to"
+            " split audio into chunks."
+        )
         chunk_size = sample_rate * self.asr_config.max_audio_clip_s
         overlap_size = sample_rate * self.asr_config.overlap_chunk_second
         chunks = []
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
deleted file mode 100644
index ad1b682a9ef65ba44969b30c9e40d083c216f1de..0000000000000000000000000000000000000000
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-
-def __getattr__(name: str):
-    if name == "ToolParser":
-        from vllm.tool_parsers import ToolParser
-
-        warnings.warn(
-            "`vllm.entrypoints.openai.tool_parsers.ToolParser` has been moved to "
-            "`vllm.tool_parsers.ToolParser`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return ToolParser
-    if name == "ToolParserManager":
-        from vllm.tool_parsers import ToolParserManager
-
-        warnings.warn(
-            "`vllm.entrypoints.openai.tool_parsers.ToolParserManager` "
-            "has been moved to `vllm.tool_parsers.ToolParserManager`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return ToolParserManager
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/vllm/entrypoints/pooling/classify/api_router.py b/vllm/entrypoints/pooling/classify/api_router.py
index d6ced73c88ebc8b64ccc243f7633271729e2ba0b..ffd9b09c401b0be364a20a3d8f83cc698f8293ce 100644
--- a/vllm/entrypoints/pooling/classify/api_router.py
+++ b/vllm/entrypoints/pooling/classify/api_router.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from starlette.responses import JSONResponse
 from typing_extensions import assert_never
 
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
     try:
         generator = await handler.create_classify(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py
index 24b0c8c2b3cf650686a54f3815b5fb4c05fcd327..7f4096b0953e0469becbdc940a307e411748220f 100644
--- a/vllm/entrypoints/pooling/embed/api_router.py
+++ b/vllm/entrypoints/pooling/embed/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never
 
@@ -47,9 +47,7 @@ async def create_embedding(
     try:
         generator = await handler.create_embedding(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
diff --git a/vllm/entrypoints/pooling/embed/conftest.py b/vllm/entrypoints/pooling/embed/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..002b85874049c0861410a8252bf11308d5047884
--- /dev/null
+++ b/vllm/entrypoints/pooling/embed/conftest.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM pooling embed tests."""
+
+import warnings
+
+import torch
+
+from vllm.platforms import current_platform
+
+
+def pytest_collection_modifyitems(config, items):
+    """Configure ROCm-specific settings based on collected tests."""
+    if not current_platform.is_rocm():
+        return
+
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+    warnings.warn(
+        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
+        "to avoid HuggingFace Transformers accuracy issues",
+        UserWarning,
+        stacklevel=1,
+    )
diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py
index 6a8f8c4434e557c55cb6ccc34e868a9617baff88..d8b85ec2b3fd9cf5eadd8fec201050d21cb299b2 100644
--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize,
+            use_activation=self.normalize,
         )
 
 
@@ -97,7 +97,16 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "model."
         ),
     )
-
+    continue_final_message: bool = Field(
+        default=False,
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
+    )
     add_special_tokens: bool = Field(
         default=False,
         description=(
@@ -180,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize,
+            use_activation=self.normalize,
         )
 
 
diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py
index f5a21208ed8029e57eade6e1070dc7ebff62f9a1..e94b80043962085ece269fa9955d861ae9dc5844 100644
--- a/vllm/entrypoints/pooling/embed/serving.py
+++ b/vllm/entrypoints/pooling/embed/serving.py
@@ -89,7 +89,7 @@ class EmbeddingMixin(OpenAIServing):
                     chat_template=ctx.request.chat_template or ctx.chat_template,
                     chat_template_content_format=ctx.chat_template_content_format,
                     add_generation_prompt=ctx.request.add_generation_prompt,
-                    continue_final_message=False,
+                    continue_final_message=ctx.request.continue_final_message,
                     add_special_tokens=ctx.request.add_special_tokens,
                 )
             else:
diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py
index 4baaf8f30f6bb0c8449ebb7d55d7420ab6cf2bb7..b64b7f6b37904c11a0a3c14afb878ec178ccad58 100644
--- a/vllm/entrypoints/pooling/pooling/api_router.py
+++ b/vllm/entrypoints/pooling/pooling/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never
 
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
     try:
         generator = await handler.create_pooling(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py
index 76b361b49b6685b5835be260c22a7c4f020b940a..83dafc2ee02d6c544b96f30986f1787f73c34cba 100644
--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize,
             use_activation=get_use_activation(self),
         )
 
@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize,
             use_activation=get_use_activation(self),
         )
 
diff --git a/vllm/entrypoints/pooling/score/api_router.py b/vllm/entrypoints/pooling/score/api_router.py
index c7481ed9fa9681e91cc007f5221cebd9a17ac48b..5443637b1f7657ad1511d88f373f3a68065e3393 100644
--- a/vllm/entrypoints/pooling/score/api_router.py
+++ b/vllm/entrypoints/pooling/score/api_router.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse
 from typing_extensions import assert_never
 
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
     try:
         generator = await handler.create_score(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
     try:
         generator = await handler.do_rerank(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py
index edbfcd03ac92c7833411be518535296a48b59627..9762b23639853284f2c2a555d531021e2bb054c4 100644
--- a/vllm/entrypoints/pooling/score/serving.py
+++ b/vllm/entrypoints/pooling/score/serving.py
@@ -52,6 +52,7 @@ class ServingScores(OpenAIServing):
         models: OpenAIServingModels,
         *,
         request_logger: RequestLogger | None,
+        score_template: str | None = None,
         log_error_stack: bool = False,
     ) -> None:
         super().__init__(
@@ -60,6 +61,7 @@ class ServingScores(OpenAIServing):
             request_logger=request_logger,
             log_error_stack=log_error_stack,
         )
+        self.score_template = score_template
 
     async def _embedding_score(
         self,
@@ -169,6 +171,7 @@ class ServingScores(OpenAIServing):
             data_2=data_2,
             tokenizer=tokenizer,
             tokenization_kwargs=tokenization_kwargs,
+            score_template=self.score_template,
         )
         self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
         if request.mm_processor_kwargs is not None:
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index 0f89c840be80f7f0fd119ec2d133daf0cfad1463..8a88eff430d9bb391a26801898129c617db8cadf 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -12,6 +12,7 @@ import torch
 from pydantic import Field
 
 from vllm.config import ModelConfig
+from vllm.exceptions import VLLMValidationError
 from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt
 from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
 from vllm.tokenizers import TokenizerLike
@@ -43,11 +44,8 @@ class RenderConfig:
     def verify_truncate_prompt_tokens(self, model_config: ModelConfig) -> int | None:
         """Validate and normalize `truncate_prompt_tokens` parameter."""
         truncate_prompt_tokens = self.truncate_prompt_tokens
-        if truncate_prompt_tokens is None:
-            return None
-
-        if truncate_prompt_tokens == 0:
-            return 0
+        if truncate_prompt_tokens is None or truncate_prompt_tokens == 0:
+            return truncate_prompt_tokens
 
         if truncate_prompt_tokens < 0:
             truncate_prompt_tokens = model_config.max_model_len
@@ -162,8 +160,9 @@ class BaseRenderer(ABC):
     ) -> list[EmbedsPrompt]:
         """Load and validate base64-encoded embeddings into prompt objects."""
         if not self.model_config.enable_prompt_embeds:
-            raise ValueError(
-                "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
+            raise VLLMValidationError(
+                "You must set `--enable-prompt-embeds` to input `prompt_embeds`.",
+                parameter="prompt_embeds",
             )
 
         def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
@@ -396,10 +395,12 @@ class CompletionRenderer(BaseRenderer):
     ) -> TokensPrompt:
         """Create validated TokensPrompt."""
         if max_length is not None and len(token_ids) > max_length:
-            raise ValueError(
+            raise VLLMValidationError(
                 f"This model's maximum context length is {max_length} tokens. "
                 f"However, your request has {len(token_ids)} input tokens. "
-                "Please reduce the length of the input messages."
+                "Please reduce the length of the input messages.",
+                parameter="input_tokens",
+                value=len(token_ids),
             )
 
         tokens_prompt = TokensPrompt(prompt_token_ids=token_ids)
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
index 99080fa43cb8eab6baad99e687d621c6eac6e65f..5fd0cf43e687ad09cb6f04234929cd25895e918d 100644
--- a/vllm/entrypoints/responses_utils.py
+++ b/vllm/entrypoints/responses_utils.py
@@ -16,7 +16,6 @@ from openai.types.responses.response import ToolChoice
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
 )
-from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_output_message import ResponseOutputMessage
 from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from openai.types.responses.tool import Tool
@@ -27,38 +26,6 @@ from vllm.entrypoints.openai.protocol import (
     ChatCompletionMessageParam,
     ResponseInputOutputItem,
 )
-from vllm.utils import random_uuid
-
-
-def make_response_output_items_from_parsable_context(
-    response_messages: list[ResponseInputOutputItem],
-) -> list[ResponseOutputItem]:
-    """Given a list of sentences, construct ResponseOutput Items."""
-    output_messages: list[ResponseOutputItem] = []
-    for message in response_messages:
-        if not isinstance(message, ResponseFunctionToolCallOutputItem):
-            output_messages.append(message)
-        else:
-            if len(output_messages) == 0:
-                raise ValueError(
-                    "Cannot have a FunctionToolCallOutput before FunctionToolCall."
-                )
-            if isinstance(output_messages[-1], ResponseFunctionToolCall):
-                mcp_message = McpCall(
-                    id=f"{MCP_PREFIX}{random_uuid()}",
-                    arguments=output_messages[-1].arguments,
-                    name=output_messages[-1].name,
-                    server_label=output_messages[
-                        -1
-                    ].name,  # TODO: store the server label
-                    type=f"{MCP_PREFIX}call",
-                    status="completed",
-                    output=message.output,
-                    # TODO: support error output
-                )
-                output_messages[-1] = mcp_message
-
-    return output_messages
 
 
 def construct_input_messages(
@@ -111,7 +78,9 @@ def _maybe_combine_reasoning_and_tool_call(
     This function checks if the last message is a reasoning message and
     the current message is a tool call"""
     if not (
-        isinstance(item, ResponseFunctionToolCall) and item.id.startswith(MCP_PREFIX)
+        isinstance(item, ResponseFunctionToolCall)
+        and item.id
+        and item.id.startswith(MCP_PREFIX)
     ):
         return None
     if len(messages) == 0:
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 072ddd4c90b168cea2dadcc9d350442a52edb778..09ef8781b2dac5bc5123ecebe1b9bb36f00e8d22 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -11,9 +11,12 @@ from vllm.entrypoints.chat_utils import (
     ChatCompletionContentPartImageEmbedsParam,
     ChatCompletionContentPartImageParam,
     ChatCompletionContentPartTextParam,
+    ChatCompletionContentPartVideoParam,
+    ChatTemplateResolutionError,
     MultiModalItemTracker,
     _ContentPart,
     _parse_chat_message_content_part,
+    apply_hf_chat_template,
 )
 from vllm.inputs import TokensPrompt
 from vllm.model_executor.models.interfaces import supports_score_template
@@ -22,7 +25,10 @@ from vllm.outputs import PoolingRequestOutput
 from vllm.tokenizers import TokenizerLike
 
 ScoreContentPartParam: TypeAlias = (
-    ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam
+    ChatCompletionContentPartImageParam
+    | ChatCompletionContentPartImageEmbedsParam
+    | ChatCompletionContentPartTextParam
+    | ChatCompletionContentPartVideoParam
 )
 
 
@@ -139,10 +145,8 @@ def _parse_score_content(
     return next(iter(mm_placeholder_storage.values()))[0]
 
 
-def apply_score_template(
-    model_config: ModelConfig,
-    prompt_1: str,
-    prompt_2: str,
+def _apply_model_score_template(
+    model_config: ModelConfig, prompt_1: str, prompt_2: str
 ) -> str:
     # NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. gguf)
     from vllm.model_executor.model_loader import get_model_cls
@@ -181,6 +185,7 @@ def get_score_prompt(
     tokenization_kwargs: dict[str, Any],
     data_1: str | ScoreContentPartParam,
     data_2: str | ScoreContentPartParam,
+    score_template: str | None = None,
 ) -> tuple[str, TokensPrompt]:
     prompt_1, prompt_2, mm_data = parse_score_data(
         data_1,
@@ -190,19 +195,48 @@ def get_score_prompt(
     from vllm.model_executor.model_loader import get_model_cls
 
     model = get_model_cls(model_config)
-    if supports_score_template(model):
-        full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
-        prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
-    elif model_config.use_pad_token:
-        # cross_encoder models defaults to using pad_token.
-        prompt_inputs = tokenizer(
-            text=prompt_1, text_pair=prompt_2, **tokenization_kwargs
-        )
-        full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
+
+    def default_tokenizer_encode():
+        if supports_score_template(model):
+            full_prompt = _apply_model_score_template(model_config, prompt_1, prompt_2)
+            prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
+        else:
+            if model_config.use_sep_token:
+                # cross_encoder models defaults to using separating token.
+                prompt_inputs = tokenizer(
+                    text=prompt_1, text_pair=prompt_2, **tokenization_kwargs
+                )
+                full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
+            else:
+                # `llm as reranker` defaults to not using separating token.
+                full_prompt = prompt_1 + prompt_2
+                prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)
+        return full_prompt, prompt_inputs
+
+    # FIXME: For now, we only apply a template when one is explicitly provided.
+    # We cannot rely on the tokenizer's chat template because many models
+    # inherit junk templates from their base LLM, which breaks both the models
+    # and the tests that use them.
+    if score_template is None:
+        full_prompt, prompt_inputs = default_tokenizer_encode()
     else:
-        # `llm as reranker` models defaults to not using pad_token.
-        full_prompt = prompt_1 + prompt_2
-        prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)
+        # FIXME: Try applying a score template from the CLI arg or tokenizer_config.json
+        # If that fails because there is no such template,
+        # fall back to the default implementation.
+        try:
+            full_prompt = apply_hf_chat_template(
+                tokenizer,
+                [
+                    {"role": "query", "content": prompt_1},
+                    {"role": "document", "content": prompt_2},
+                ],
+                score_template,
+                tools=None,
+                model_config=model_config,
+            )
+            prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
+        except ChatTemplateResolutionError:
+            full_prompt, prompt_inputs = default_tokenizer_encode()
 
     engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"])
 
diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py
index c4fcc92db931f6a7c2bc3ff483ff5142cf4ddab8..f5c80f68240ef431cc13ebd2231ea500bec63c67 100644
--- a/vllm/entrypoints/serve/__init__.py
+++ b/vllm/entrypoints/serve/__init__.py
@@ -1,16 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-
 from fastapi import FastAPI
 
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 
 def register_vllm_serve_api_routers(app: FastAPI):
+    if envs.VLLM_SERVER_DEV_MODE:
+        logger.warning(
+            "SECURITY WARNING: Development endpoints are enabled! "
+            "This should NOT be used in production!"
+        )
+
     from vllm.entrypoints.serve.lora.api_router import (
         attach_router as attach_lora_router,
     )
 
     attach_lora_router(app)
+
     from vllm.entrypoints.serve.elastic_ep.api_router import (
         attach_router as attach_elastic_ep_router,
     )
@@ -29,6 +40,18 @@ def register_vllm_serve_api_routers(app: FastAPI):
 
     attach_sleep_router(app)
 
+    from vllm.entrypoints.serve.rpc.api_router import (
+        attach_router as attach_rpc_router,
+    )
+
+    attach_rpc_router(app)
+
+    from vllm.entrypoints.serve.cache.api_router import (
+        attach_router as attach_cache_router,
+    )
+
+    attach_cache_router(app)
+
     from vllm.entrypoints.serve.tokenize.api_router import (
         attach_router as attach_tokenize_router,
     )
@@ -58,3 +81,14 @@ def register_vllm_serve_api_routers(app: FastAPI):
     )
 
     attach_health_router(app)
+
+    from vllm.entrypoints.serve.instrumentator.offline_docs import (
+        attach_router as attach_offline_docs_router,
+    )
+
+    attach_offline_docs_router(app)
+    from vllm.entrypoints.serve.instrumentator.server_info import (
+        attach_router as attach_server_info_router,
+    )
+
+    attach_server_info_router(app)
diff --git a/vllm/attention/ops/__init__.py b/vllm/entrypoints/serve/cache/__init__.py
similarity index 100%
rename from vllm/attention/ops/__init__.py
rename to vllm/entrypoints/serve/cache/__init__.py
diff --git a/vllm/entrypoints/serve/cache/api_router.py b/vllm/entrypoints/serve/cache/api_router.py
new file mode 100644
index 0000000000000000000000000000000000000000..d659895463273170cf75386db2e7decc3b1ae789
--- /dev/null
+++ b/vllm/entrypoints/serve/cache/api_router.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from fastapi import APIRouter, FastAPI, Query, Request
+from fastapi.responses import Response
+
+import vllm.envs as envs
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+@router.post("/reset_prefix_cache")
+async def reset_prefix_cache(
+    raw_request: Request,
+    reset_running_requests: bool = Query(default=False),
+    reset_external: bool = Query(default=False),
+):
+    """
+    Reset the local prefix cache.
+
+    Optionally, if the query parameter `reset_external=true`
+    also resets the external (connector-managed) prefix cache.
+
+    Note that we currently do not check if the prefix cache
+    is successfully reset in the API server.
+
+    Example:
+       POST /reset_prefix_cache?reset_external=true
+    """
+    logger.info("Resetting prefix cache...")
+
+    await engine_client(raw_request).reset_prefix_cache(
+        reset_running_requests, reset_external
+    )
+    return Response(status_code=200)
+
+
+@router.post("/reset_mm_cache")
+async def reset_mm_cache(raw_request: Request):
+    """
+    Reset the multi-modal cache. Note that we currently do not check if the
+    multi-modal cache is successfully reset in the API server.
+    """
+    logger.info("Resetting multi-modal cache...")
+    await engine_client(raw_request).reset_mm_cache()
+    return Response(status_code=200)
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/disagg/api_router.py b/vllm/entrypoints/serve/disagg/api_router.py
index c38ede30dad1ccdf8fcb43c20c75471385b0e1cc..c66e048d7fe2850bada067925fcc5a2096184b91 100644
--- a/vllm/entrypoints/serve/disagg/api_router.py
+++ b/vllm/entrypoints/serve/disagg/api_router.py
@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
     try:
         generator = await handler.serve_tokens(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
diff --git a/vllm/entrypoints/serve/elastic_ep/api_router.py b/vllm/entrypoints/serve/elastic_ep/api_router.py
index 21d5d2e60778adceaab5a1a5a334715f4defe4ba..e5adb81051ffd790083b8bba383f1ff8a929ec21 100644
--- a/vllm/entrypoints/serve/elastic_ep/api_router.py
+++ b/vllm/entrypoints/serve/elastic_ep/api_router.py
@@ -43,7 +43,7 @@ async def scale_elastic_ep(raw_request: Request):
     try:
         body = await raw_request.json()
     except json.JSONDecodeError as e:
-        raise HTTPException(status_code=400, detail="Invalid JSON format") from e  # noqa: B904
+        raise HTTPException(status_code=400, detail="Invalid JSON format") from e
 
     new_data_parallel_size = body.get("new_data_parallel_size")
     drain_timeout = body.get("drain_timeout", 120)  # Default 2 minutes
diff --git a/vllm/entrypoints/serve/instrumentator/offline_docs.py b/vllm/entrypoints/serve/instrumentator/offline_docs.py
new file mode 100644
index 0000000000000000000000000000000000000000..87395345f8cd86e37ac904bcfd9a261ce8eaa2ac
--- /dev/null
+++ b/vllm/entrypoints/serve/instrumentator/offline_docs.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Offline FastAPI documentation support for air-gapped environments."""
+
+import pathlib
+
+from fastapi import FastAPI
+from fastapi.openapi.docs import (
+    get_swagger_ui_html,
+    get_swagger_ui_oauth2_redirect_html,
+)
+from fastapi.staticfiles import StaticFiles
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def attach_router(app: FastAPI) -> None:
+    """Attach offline docs router if enabled via args."""
+    args = getattr(app.state, "args", None)
+    if args is None or not getattr(args, "enable_offline_docs", False):
+        return
+
+    static_dir = pathlib.Path(__file__).parent / "static"
+
+    if not static_dir.exists():
+        logger.warning(
+            "Static directory not found at %s. Offline docs will not be available.",
+            static_dir,
+        )
+        return
+
+    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
+
+    @app.get("/docs", include_in_schema=False)
+    async def custom_swagger_ui_html():
+        return get_swagger_ui_html(
+            openapi_url=app.openapi_url,
+            title=app.title + " - Swagger UI",
+            oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url,
+            swagger_js_url="/static/swagger-ui-bundle.js",
+            swagger_css_url="/static/swagger-ui.css",
+        )
+
+    @app.get(app.swagger_ui_oauth2_redirect_url, include_in_schema=False)
+    async def swagger_ui_redirect():
+        return get_swagger_ui_oauth2_redirect_html()
+
+    logger.info("Offline documentation enabled with vendored static assets")
diff --git a/vllm/entrypoints/serve/instrumentator/server_info.py b/vllm/entrypoints/serve/instrumentator/server_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ba92d5b99c54692bb16f37b0e8d5e0a0fd723b1
--- /dev/null
+++ b/vllm/entrypoints/serve/instrumentator/server_info.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from typing import Annotated, Literal
+
+import pydantic
+from fastapi import APIRouter, FastAPI, Query, Request
+from fastapi.responses import JSONResponse
+
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+router = APIRouter()
+PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
+
+
+def _get_vllm_env_vars():
+    from vllm.config.utils import normalize_value
+
+    vllm_envs = {}
+    for key in dir(envs):
+        if key.startswith("VLLM_") and "KEY" not in key:
+            value = getattr(envs, key, None)
+            if value is not None:
+                value = normalize_value(value)
+                vllm_envs[key] = value
+    return vllm_envs
+
+
+@router.get("/server_info")
+async def show_server_info(
+    raw_request: Request,
+    config_format: Annotated[Literal["text", "json"], Query()] = "text",
+):
+    vllm_config: VllmConfig = raw_request.app.state.vllm_config
+    server_info = {
+        "vllm_config": (
+            str(vllm_config)
+            if config_format == "text"
+            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
+        ),
+        # fallback=str is needed to handle e.g. torch.dtype
+        "vllm_env": _get_vllm_env_vars(),
+    }
+    return JSONResponse(content=server_info)
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js b/vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js
new file mode 100644
index 0000000000000000000000000000000000000000..2a9ac731cb7b21b6f6c97f2f36b9fb477742d122
--- /dev/null
+++ b/vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js
@@ -0,0 +1,2 @@
+/*! For license information please see swagger-ui-bundle.js.LICENSE.txt */
+!function webpackUniversalModuleDefinition(s,o){"object"==typeof exports&&"object"==typeof module?module.exports=o():"function"==typeof define&&define.amd?define([],o):"object"==typeof exports?exports.SwaggerUIBundle=o():s.SwaggerUIBundle=o()}(this,(()=>(()=>{var s={251:(s,o)=>{o.read=function(s,o,i,a,u){var _,w,x=8*u-a-1,C=(1<<x)-1,j=C>>1,L=-7,B=i?u-1:0,$=i?-1:1,U=s[o+B];for(B+=$,_=U&(1<<-L)-1,U>>=-L,L+=x;L>0;_=256*_+s[o+B],B+=$,L-=8);for(w=_&(1<<-L)-1,_>>=-L,L+=a;L>0;w=256*w+s[o+B],B+=$,L-=8);if(0===_)_=1-j;else{if(_===C)return w?NaN:1/0*(U?-1:1);w+=Math.pow(2,a),_-=j}return(U?-1:1)*w*Math.pow(2,_-a)},o.write=function(s,o,i,a,u,_){var w,x,C,j=8*_-u-1,L=(1<<j)-1,B=L>>1,$=23===u?Math.pow(2,-24)-Math.pow(2,-77):0,U=a?0:_-1,V=a?1:-1,z=o<0||0===o&&1/o<0?1:0;for(o=Math.abs(o),isNaN(o)||o===1/0?(x=isNaN(o)?1:0,w=L):(w=Math.floor(Math.log(o)/Math.LN2),o*(C=Math.pow(2,-w))<1&&(w--,C*=2),(o+=w+B>=1?$/C:$*Math.pow(2,1-B))*C>=2&&(w++,C/=2),w+B>=L?(x=0,w=L):w+B>=1?(x=(o*C-1)*Math.pow(2,u),w+=B):(x=o*Math.pow(2,B-1)*Math.pow(2,u),w=0));u>=8;s[i+U]=255&x,U+=V,x/=256,u-=8);for(w=w<<u|x,j+=u;j>0;s[i+U]=255&w,U+=V,w/=256,j-=8);s[i+U-V]|=128*z}},462:(s,o,i)=>{"use strict";var a=i(40975);s.exports=a},659:(s,o,i)=>{var a=i(51873),u=Object.prototype,_=u.hasOwnProperty,w=u.toString,x=a?a.toStringTag:void 0;s.exports=function getRawTag(s){var o=_.call(s,x),i=s[x];try{s[x]=void 0;var a=!0}catch(s){}var u=w.call(s);return a&&(o?s[x]=i:delete s[x]),u}},694:(s,o,i)=>{"use strict";i(91599);var a=i(37257);i(12560),s.exports=a},953:(s,o,i)=>{"use strict";s.exports=i(53375)},1733:s=>{var o=/[^\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]+/g;s.exports=function asciiWords(s){return s.match(o)||[]}},1882:(s,o,i)=>{var a=i(72552),u=i(23805);s.exports=function isFunction(s){if(!u(s))return!1;var o=a(s);return"[object Function]"==o||"[object GeneratorFunction]"==o||"[object AsyncFunction]"==o||"[object Proxy]"==o}},1907:(s,o,i)=>{"use strict";var a=i(41505),u=Function.prototype,_=u.call,w=a&&u.bind.bind(_,_);s.exports=a?w:function(s){return function(){return _.apply(s,arguments)}}},2205:function(s,o,i){var a;a=void 0!==i.g?i.g:this,s.exports=function(s){if(s.CSS&&s.CSS.escape)return s.CSS.escape;var cssEscape=function(s){if(0==arguments.length)throw new TypeError("`CSS.escape` requires an argument.");for(var o,i=String(s),a=i.length,u=-1,_="",w=i.charCodeAt(0);++u<a;)0!=(o=i.charCodeAt(u))?_+=o>=1&&o<=31||127==o||0==u&&o>=48&&o<=57||1==u&&o>=48&&o<=57&&45==w?"\\"+o.toString(16)+" ":0==u&&1==a&&45==o||!(o>=128||45==o||95==o||o>=48&&o<=57||o>=65&&o<=90||o>=97&&o<=122)?"\\"+i.charAt(u):i.charAt(u):_+="�";return _};return s.CSS||(s.CSS={}),s.CSS.escape=cssEscape,cssEscape}(a)},2209:(s,o,i)=>{"use strict";var a,u=i(9404),_=function productionTypeChecker(){invariant(!1,"ImmutablePropTypes type checking code is stripped in production.")};_.isRequired=_;var w=function getProductionTypeChecker(){return _};function getPropType(s){var o=typeof s;return Array.isArray(s)?"array":s instanceof RegExp?"object":s instanceof u.Iterable?"Immutable."+s.toSource().split(" ")[0]:o}function createChainableTypeChecker(s){function checkType(o,i,a,u,_,w){for(var x=arguments.length,C=Array(x>6?x-6:0),j=6;j<x;j++)C[j-6]=arguments[j];return w=w||a,u=u||"<<anonymous>>",null!=i[a]?s.apply(void 0,[i,a,u,_,w].concat(C)):o?new Error("Required "+_+" `"+w+"` was not specified in `"+u+"`."):void 0}var o=checkType.bind(null,!1);return o.isRequired=checkType.bind(null,!0),o}function createIterableSubclassTypeChecker(s,o){return function createImmutableTypeChecker(s,o){return createChainableTypeChecker((function validate(i,a,u,_,w){var x=i[a];if(!o(x)){var C=getPropType(x);return new Error("Invalid "+_+" `"+w+"` of type `"+C+"` supplied to `"+u+"`, expected `"+s+"`.")}return null}))}("Iterable."+s,(function(s){return u.Iterable.isIterable(s)&&o(s)}))}(a={listOf:w,mapOf:w,orderedMapOf:w,setOf:w,orderedSetOf:w,stackOf:w,iterableOf:w,recordOf:w,shape:w,contains:w,mapContains:w,orderedMapContains:w,list:_,map:_,orderedMap:_,set:_,orderedSet:_,stack:_,seq:_,record:_,iterable:_}).iterable.indexed=createIterableSubclassTypeChecker("Indexed",u.Iterable.isIndexed),a.iterable.keyed=createIterableSubclassTypeChecker("Keyed",u.Iterable.isKeyed),s.exports=a},2404:(s,o,i)=>{var a=i(60270);s.exports=function isEqual(s,o){return a(s,o)}},2523:s=>{s.exports=function baseFindIndex(s,o,i,a){for(var u=s.length,_=i+(a?1:-1);a?_--:++_<u;)if(o(s[_],_,s))return _;return-1}},2532:(s,o,i)=>{"use strict";var a=i(45951),u=Object.defineProperty;s.exports=function(s,o){try{u(a,s,{value:o,configurable:!0,writable:!0})}catch(i){a[s]=o}return o}},2694:(s,o,i)=>{"use strict";var a=i(6925);function emptyFunction(){}function emptyFunctionWithReset(){}emptyFunctionWithReset.resetWarningCache=emptyFunction,s.exports=function(){function shim(s,o,i,u,_,w){if(w!==a){var x=new Error("Calling PropTypes validators directly is not supported by the `prop-types` package. Use PropTypes.checkPropTypes() to call them. Read more at http://fb.me/use-check-prop-types");throw x.name="Invariant Violation",x}}function getShim(){return shim}shim.isRequired=shim;var s={array:shim,bigint:shim,bool:shim,func:shim,number:shim,object:shim,string:shim,symbol:shim,any:shim,arrayOf:getShim,element:shim,elementType:shim,instanceOf:getShim,node:shim,objectOf:getShim,oneOf:getShim,oneOfType:getShim,shape:getShim,exact:getShim,checkPropTypes:emptyFunctionWithReset,resetWarningCache:emptyFunction};return s.PropTypes=s,s}},2874:s=>{s.exports={}},2875:(s,o,i)=>{"use strict";var a=i(23045),u=i(80376);s.exports=Object.keys||function keys(s){return a(s,u)}},2955:(s,o,i)=>{"use strict";var a,u=i(65606);function _defineProperty(s,o,i){return(o=function _toPropertyKey(s){var o=function _toPrimitive(s,o){if("object"!=typeof s||null===s)return s;var i=s[Symbol.toPrimitive];if(void 0!==i){var a=i.call(s,o||"default");if("object"!=typeof a)return a;throw new TypeError("@@toPrimitive must return a primitive value.")}return("string"===o?String:Number)(s)}(s,"string");return"symbol"==typeof o?o:String(o)}(o))in s?Object.defineProperty(s,o,{value:i,enumerable:!0,configurable:!0,writable:!0}):s[o]=i,s}var _=i(86238),w=Symbol("lastResolve"),x=Symbol("lastReject"),C=Symbol("error"),j=Symbol("ended"),L=Symbol("lastPromise"),B=Symbol("handlePromise"),$=Symbol("stream");function createIterResult(s,o){return{value:s,done:o}}function readAndResolve(s){var o=s[w];if(null!==o){var i=s[$].read();null!==i&&(s[L]=null,s[w]=null,s[x]=null,o(createIterResult(i,!1)))}}function onReadable(s){u.nextTick(readAndResolve,s)}var U=Object.getPrototypeOf((function(){})),V=Object.setPrototypeOf((_defineProperty(a={get stream(){return this[$]},next:function next(){var s=this,o=this[C];if(null!==o)return Promise.reject(o);if(this[j])return Promise.resolve(createIterResult(void 0,!0));if(this[$].destroyed)return new Promise((function(o,i){u.nextTick((function(){s[C]?i(s[C]):o(createIterResult(void 0,!0))}))}));var i,a=this[L];if(a)i=new Promise(function wrapForNext(s,o){return function(i,a){s.then((function(){o[j]?i(createIterResult(void 0,!0)):o[B](i,a)}),a)}}(a,this));else{var _=this[$].read();if(null!==_)return Promise.resolve(createIterResult(_,!1));i=new Promise(this[B])}return this[L]=i,i}},Symbol.asyncIterator,(function(){return this})),_defineProperty(a,"return",(function _return(){var s=this;return new Promise((function(o,i){s[$].destroy(null,(function(s){s?i(s):o(createIterResult(void 0,!0))}))}))})),a),U);s.exports=function createReadableStreamAsyncIterator(s){var o,i=Object.create(V,(_defineProperty(o={},$,{value:s,writable:!0}),_defineProperty(o,w,{value:null,writable:!0}),_defineProperty(o,x,{value:null,writable:!0}),_defineProperty(o,C,{value:null,writable:!0}),_defineProperty(o,j,{value:s._readableState.endEmitted,writable:!0}),_defineProperty(o,B,{value:function value(s,o){var a=i[$].read();a?(i[L]=null,i[w]=null,i[x]=null,s(createIterResult(a,!1))):(i[w]=s,i[x]=o)},writable:!0}),o));return i[L]=null,_(s,(function(s){if(s&&"ERR_STREAM_PREMATURE_CLOSE"!==s.code){var o=i[x];return null!==o&&(i[L]=null,i[w]=null,i[x]=null,o(s)),void(i[C]=s)}var a=i[w];null!==a&&(i[L]=null,i[w]=null,i[x]=null,a(createIterResult(void 0,!0))),i[j]=!0})),s.on("readable",onReadable.bind(null,i)),i}},3110:(s,o,i)=>{const a=i(5187),u=i(85015),_=i(98023),w=i(53812),x=i(23805),C=i(85105),j=i(86804);class Namespace{constructor(s){this.elementMap={},this.elementDetection=[],this.Element=j.Element,this.KeyValuePair=j.KeyValuePair,s&&s.noDefault||this.useDefault(),this._attributeElementKeys=[],this._attributeElementArrayKeys=[]}use(s){return s.namespace&&s.namespace({base:this}),s.load&&s.load({base:this}),this}useDefault(){return this.register("null",j.NullElement).register("string",j.StringElement).register("number",j.NumberElement).register("boolean",j.BooleanElement).register("array",j.ArrayElement).register("object",j.ObjectElement).register("member",j.MemberElement).register("ref",j.RefElement).register("link",j.LinkElement),this.detect(a,j.NullElement,!1).detect(u,j.StringElement,!1).detect(_,j.NumberElement,!1).detect(w,j.BooleanElement,!1).detect(Array.isArray,j.ArrayElement,!1).detect(x,j.ObjectElement,!1),this}register(s,o){return this._elements=void 0,this.elementMap[s]=o,this}unregister(s){return this._elements=void 0,delete this.elementMap[s],this}detect(s,o,i){return void 0===i||i?this.elementDetection.unshift([s,o]):this.elementDetection.push([s,o]),this}toElement(s){if(s instanceof this.Element)return s;let o;for(let i=0;i<this.elementDetection.length;i+=1){const a=this.elementDetection[i][0],u=this.elementDetection[i][1];if(a(s)){o=new u(s);break}}return o}getElementClass(s){const o=this.elementMap[s];return void 0===o?this.Element:o}fromRefract(s){return this.serialiser.deserialise(s)}toRefract(s){return this.serialiser.serialise(s)}get elements(){return void 0===this._elements&&(this._elements={Element:this.Element},Object.keys(this.elementMap).forEach((s=>{const o=s[0].toUpperCase()+s.substr(1);this._elements[o]=this.elementMap[s]}))),this._elements}get serialiser(){return new C(this)}}C.prototype.Namespace=Namespace,s.exports=Namespace},3121:(s,o,i)=>{"use strict";var a=i(65482),u=Math.min;s.exports=function(s){var o=a(s);return o>0?u(o,9007199254740991):0}},3209:(s,o,i)=>{var a=i(91596),u=i(53320),_=i(36306),w="__lodash_placeholder__",x=128,C=Math.min;s.exports=function mergeData(s,o){var i=s[1],j=o[1],L=i|j,B=L<131,$=j==x&&8==i||j==x&&256==i&&s[7].length<=o[8]||384==j&&o[7].length<=o[8]&&8==i;if(!B&&!$)return s;1&j&&(s[2]=o[2],L|=1&i?0:4);var U=o[3];if(U){var V=s[3];s[3]=V?a(V,U,o[4]):U,s[4]=V?_(s[3],w):o[4]}return(U=o[5])&&(V=s[5],s[5]=V?u(V,U,o[6]):U,s[6]=V?_(s[5],w):o[6]),(U=o[7])&&(s[7]=U),j&x&&(s[8]=null==s[8]?o[8]:C(s[8],o[8])),null==s[9]&&(s[9]=o[9]),s[0]=o[0],s[1]=L,s}},3650:(s,o,i)=>{var a=i(74335)(Object.keys,Object);s.exports=a},3656:(s,o,i)=>{s=i.nmd(s);var a=i(9325),u=i(89935),_=o&&!o.nodeType&&o,w=_&&s&&!s.nodeType&&s,x=w&&w.exports===_?a.Buffer:void 0,C=(x?x.isBuffer:void 0)||u;s.exports=C},4509:(s,o,i)=>{var a=i(12651);s.exports=function mapCacheHas(s){return a(this,s).has(s)}},4640:s=>{"use strict";var o=String;s.exports=function(s){try{return o(s)}catch(s){return"Object"}}},4664:(s,o,i)=>{var a=i(79770),u=i(63345),_=Object.prototype.propertyIsEnumerable,w=Object.getOwnPropertySymbols,x=w?function(s){return null==s?[]:(s=Object(s),a(w(s),(function(o){return _.call(s,o)})))}:u;s.exports=x},4901:(s,o,i)=>{var a=i(72552),u=i(30294),_=i(40346),w={};w["[object Float32Array]"]=w["[object Float64Array]"]=w["[object Int8Array]"]=w["[object Int16Array]"]=w["[object Int32Array]"]=w["[object Uint8Array]"]=w["[object Uint8ClampedArray]"]=w["[object Uint16Array]"]=w["[object Uint32Array]"]=!0,w["[object Arguments]"]=w["[object Array]"]=w["[object ArrayBuffer]"]=w["[object Boolean]"]=w["[object DataView]"]=w["[object Date]"]=w["[object Error]"]=w["[object Function]"]=w["[object Map]"]=w["[object Number]"]=w["[object Object]"]=w["[object RegExp]"]=w["[object Set]"]=w["[object String]"]=w["[object WeakMap]"]=!1,s.exports=function baseIsTypedArray(s){return _(s)&&u(s.length)&&!!w[a(s)]}},4993:(s,o,i)=>{"use strict";var a=i(16946),u=i(74239);s.exports=function(s){return a(u(s))}},5187:s=>{s.exports=function isNull(s){return null===s}},5419:s=>{s.exports=function(s,o,i,a){var u=new Blob(void 0!==a?[a,s]:[s],{type:i||"application/octet-stream"});if(void 0!==window.navigator.msSaveBlob)window.navigator.msSaveBlob(u,o);else{var _=window.URL&&window.URL.createObjectURL?window.URL.createObjectURL(u):window.webkitURL.createObjectURL(u),w=document.createElement("a");w.style.display="none",w.href=_,w.setAttribute("download",o),void 0===w.download&&w.setAttribute("target","_blank"),document.body.appendChild(w),w.click(),setTimeout((function(){document.body.removeChild(w),window.URL.revokeObjectURL(_)}),200)}}},5556:(s,o,i)=>{s.exports=i(2694)()},5861:(s,o,i)=>{var a=i(55580),u=i(68223),_=i(32804),w=i(76545),x=i(28303),C=i(72552),j=i(47473),L="[object Map]",B="[object Promise]",$="[object Set]",U="[object WeakMap]",V="[object DataView]",z=j(a),Y=j(u),Z=j(_),ee=j(w),ie=j(x),ae=C;(a&&ae(new a(new ArrayBuffer(1)))!=V||u&&ae(new u)!=L||_&&ae(_.resolve())!=B||w&&ae(new w)!=$||x&&ae(new x)!=U)&&(ae=function(s){var o=C(s),i="[object Object]"==o?s.constructor:void 0,a=i?j(i):"";if(a)switch(a){case z:return V;case Y:return L;case Z:return B;case ee:return $;case ie:return U}return o}),s.exports=ae},6048:s=>{s.exports=function negate(s){if("function"!=typeof s)throw new TypeError("Expected a function");return function(){var o=arguments;switch(o.length){case 0:return!s.call(this);case 1:return!s.call(this,o[0]);case 2:return!s.call(this,o[0],o[1]);case 3:return!s.call(this,o[0],o[1],o[2])}return!s.apply(this,o)}}},6188:s=>{"use strict";s.exports=Math.max},6205:s=>{s.exports={ROOT:0,GROUP:1,POSITION:2,SET:3,RANGE:4,REPETITION:5,REFERENCE:6,CHAR:7}},6233:(s,o,i)=>{const a=i(6048),u=i(10316),_=i(92340);class ArrayElement extends u{constructor(s,o,i){super(s||[],o,i),this.element="array"}primitive(){return"array"}get(s){return this.content[s]}getValue(s){const o=this.get(s);if(o)return o.toValue()}getIndex(s){return this.content[s]}set(s,o){return this.content[s]=this.refract(o),this}remove(s){const o=this.content.splice(s,1);return o.length?o[0]:null}map(s,o){return this.content.map(s,o)}flatMap(s,o){return this.map(s,o).reduce(((s,o)=>s.concat(o)),[])}compactMap(s,o){const i=[];return this.forEach((a=>{const u=s.bind(o)(a);u&&i.push(u)})),i}filter(s,o){return new _(this.content.filter(s,o))}reject(s,o){return this.filter(a(s),o)}reduce(s,o){let i,a;void 0!==o?(i=0,a=this.refract(o)):(i=1,a="object"===this.primitive()?this.first.value:this.first);for(let o=i;o<this.length;o+=1){const i=this.content[o];a="object"===this.primitive()?this.refract(s(a,i.value,i.key,i,this)):this.refract(s(a,i,o,this))}return a}forEach(s,o){this.content.forEach(((i,a)=>{s.bind(o)(i,this.refract(a))}))}shift(){return this.content.shift()}unshift(s){this.content.unshift(this.refract(s))}push(s){return this.content.push(this.refract(s)),this}add(s){this.push(s)}findElements(s,o){const i=o||{},a=!!i.recursive,u=void 0===i.results?[]:i.results;return this.forEach(((o,i,_)=>{a&&void 0!==o.findElements&&o.findElements(s,{results:u,recursive:a}),s(o,i,_)&&u.push(o)})),u}find(s){return new _(this.findElements(s,{recursive:!0}))}findByElement(s){return this.find((o=>o.element===s))}findByClass(s){return this.find((o=>o.classes.includes(s)))}getById(s){return this.find((o=>o.id.toValue()===s)).first}includes(s){return this.content.some((o=>o.equals(s)))}contains(s){return this.includes(s)}empty(){return new this.constructor([])}"fantasy-land/empty"(){return this.empty()}concat(s){return new this.constructor(this.content.concat(s.content))}"fantasy-land/concat"(s){return this.concat(s)}"fantasy-land/map"(s){return new this.constructor(this.map(s))}"fantasy-land/chain"(s){return this.map((o=>s(o)),this).reduce(((s,o)=>s.concat(o)),this.empty())}"fantasy-land/filter"(s){return new this.constructor(this.content.filter(s))}"fantasy-land/reduce"(s,o){return this.content.reduce(s,o)}get length(){return this.content.length}get isEmpty(){return 0===this.content.length}get first(){return this.getIndex(0)}get second(){return this.getIndex(1)}get last(){return this.getIndex(this.length-1)}}ArrayElement.empty=function empty(){return new this},ArrayElement["fantasy-land/empty"]=ArrayElement.empty,"undefined"!=typeof Symbol&&(ArrayElement.prototype[Symbol.iterator]=function symbol(){return this.content[Symbol.iterator]()}),s.exports=ArrayElement},6499:(s,o,i)=>{"use strict";var a=i(1907),u=0,_=Math.random(),w=a(1..toString);s.exports=function(s){return"Symbol("+(void 0===s?"":s)+")_"+w(++u+_,36)}},6549:s=>{"use strict";s.exports=Object.getOwnPropertyDescriptor},6925:s=>{"use strict";s.exports="SECRET_DO_NOT_PASS_THIS_OR_YOU_WILL_BE_FIRED"},7057:(s,o,i)=>{"use strict";var a=i(11470).charAt,u=i(90160),_=i(64932),w=i(60183),x=i(59550),C="String Iterator",j=_.set,L=_.getterFor(C);w(String,"String",(function(s){j(this,{type:C,string:u(s),index:0})}),(function next(){var s,o=L(this),i=o.string,u=o.index;return u>=i.length?x(void 0,!0):(s=a(i,u),o.index+=s.length,x(s,!1))}))},7176:(s,o,i)=>{"use strict";var a,u=i(73126),_=i(75795);try{a=[].__proto__===Array.prototype}catch(s){if(!s||"object"!=typeof s||!("code"in s)||"ERR_PROTO_ACCESS"!==s.code)throw s}var w=!!a&&_&&_(Object.prototype,"__proto__"),x=Object,C=x.getPrototypeOf;s.exports=w&&"function"==typeof w.get?u([w.get]):"function"==typeof C&&function getDunder(s){return C(null==s?s:x(s))}},7309:(s,o,i)=>{var a=i(62006)(i(24713));s.exports=a},7376:s=>{"use strict";s.exports=!0},7463:(s,o,i)=>{"use strict";var a=i(98828),u=i(62250),_=/#|\.prototype\./,isForced=function(s,o){var i=x[w(s)];return i===j||i!==C&&(u(o)?a(o):!!o)},w=isForced.normalize=function(s){return String(s).replace(_,".").toLowerCase()},x=isForced.data={},C=isForced.NATIVE="N",j=isForced.POLYFILL="P";s.exports=isForced},7666:(s,o,i)=>{var a=i(84851),u=i(953);function _extends(){var o;return s.exports=_extends=a?u(o=a).call(o):function(s){for(var o=1;o<arguments.length;o++){var i=arguments[o];for(var a in i)({}).hasOwnProperty.call(i,a)&&(s[a]=i[a])}return s},s.exports.__esModule=!0,s.exports.default=s.exports,_extends.apply(null,arguments)}s.exports=_extends,s.exports.__esModule=!0,s.exports.default=s.exports},8048:(s,o,i)=>{const a=i(6205);o.wordBoundary=()=>({type:a.POSITION,value:"b"}),o.nonWordBoundary=()=>({type:a.POSITION,value:"B"}),o.begin=()=>({type:a.POSITION,value:"^"}),o.end=()=>({type:a.POSITION,value:"$"})},8068:s=>{"use strict";var o=(()=>{var s=Object.defineProperty,o=Object.getOwnPropertyDescriptor,i=Object.getOwnPropertyNames,a=Object.getOwnPropertySymbols,u=Object.prototype.hasOwnProperty,_=Object.prototype.propertyIsEnumerable,__defNormalProp=(o,i,a)=>i in o?s(o,i,{enumerable:!0,configurable:!0,writable:!0,value:a}):o[i]=a,__spreadValues=(s,o)=>{for(var i in o||(o={}))u.call(o,i)&&__defNormalProp(s,i,o[i]);if(a)for(var i of a(o))_.call(o,i)&&__defNormalProp(s,i,o[i]);return s},__publicField=(s,o,i)=>__defNormalProp(s,"symbol"!=typeof o?o+"":o,i),w={};((o,i)=>{for(var a in i)s(o,a,{get:i[a],enumerable:!0})})(w,{DEFAULT_OPTIONS:()=>C,DEFAULT_UUID_LENGTH:()=>x,default:()=>B});var x=6,C={dictionary:"alphanum",shuffle:!0,debug:!1,length:x,counter:0},j=class _ShortUniqueId{constructor(s={}){__publicField(this,"counter"),__publicField(this,"debug"),__publicField(this,"dict"),__publicField(this,"version"),__publicField(this,"dictIndex",0),__publicField(this,"dictRange",[]),__publicField(this,"lowerBound",0),__publicField(this,"upperBound",0),__publicField(this,"dictLength",0),__publicField(this,"uuidLength"),__publicField(this,"_digit_first_ascii",48),__publicField(this,"_digit_last_ascii",58),__publicField(this,"_alpha_lower_first_ascii",97),__publicField(this,"_alpha_lower_last_ascii",123),__publicField(this,"_hex_last_ascii",103),__publicField(this,"_alpha_upper_first_ascii",65),__publicField(this,"_alpha_upper_last_ascii",91),__publicField(this,"_number_dict_ranges",{digits:[this._digit_first_ascii,this._digit_last_ascii]}),__publicField(this,"_alpha_dict_ranges",{lowerCase:[this._alpha_lower_first_ascii,this._alpha_lower_last_ascii],upperCase:[this._alpha_upper_first_ascii,this._alpha_upper_last_ascii]}),__publicField(this,"_alpha_lower_dict_ranges",{lowerCase:[this._alpha_lower_first_ascii,this._alpha_lower_last_ascii]}),__publicField(this,"_alpha_upper_dict_ranges",{upperCase:[this._alpha_upper_first_ascii,this._alpha_upper_last_ascii]}),__publicField(this,"_alphanum_dict_ranges",{digits:[this._digit_first_ascii,this._digit_last_ascii],lowerCase:[this._alpha_lower_first_ascii,this._alpha_lower_last_ascii],upperCase:[this._alpha_upper_first_ascii,this._alpha_upper_last_ascii]}),__publicField(this,"_alphanum_lower_dict_ranges",{digits:[this._digit_first_ascii,this._digit_last_ascii],lowerCase:[this._alpha_lower_first_ascii,this._alpha_lower_last_ascii]}),__publicField(this,"_alphanum_upper_dict_ranges",{digits:[this._digit_first_ascii,this._digit_last_ascii],upperCase:[this._alpha_upper_first_ascii,this._alpha_upper_last_ascii]}),__publicField(this,"_hex_dict_ranges",{decDigits:[this._digit_first_ascii,this._digit_last_ascii],alphaDigits:[this._alpha_lower_first_ascii,this._hex_last_ascii]}),__publicField(this,"_dict_ranges",{_number_dict_ranges:this._number_dict_ranges,_alpha_dict_ranges:this._alpha_dict_ranges,_alpha_lower_dict_ranges:this._alpha_lower_dict_ranges,_alpha_upper_dict_ranges:this._alpha_upper_dict_ranges,_alphanum_dict_ranges:this._alphanum_dict_ranges,_alphanum_lower_dict_ranges:this._alphanum_lower_dict_ranges,_alphanum_upper_dict_ranges:this._alphanum_upper_dict_ranges,_hex_dict_ranges:this._hex_dict_ranges}),__publicField(this,"log",((...s)=>{const o=[...s];o[0]="[short-unique-id] ".concat(s[0]),!0!==this.debug||"undefined"==typeof console||null===console||console.log(...o)})),__publicField(this,"_normalizeDictionary",((s,o)=>{let i;if(s&&Array.isArray(s)&&s.length>1)i=s;else{i=[],this.dictIndex=0;const o="_".concat(s,"_dict_ranges"),a=this._dict_ranges[o];let u=0;for(const[,s]of Object.entries(a)){const[o,i]=s;u+=Math.abs(i-o)}i=new Array(u);let _=0;for(const[,s]of Object.entries(a)){this.dictRange=s,this.lowerBound=this.dictRange[0],this.upperBound=this.dictRange[1];const o=this.lowerBound<=this.upperBound,a=this.lowerBound,u=this.upperBound;if(o)for(let s=a;s<u;s++)i[_++]=String.fromCharCode(s),this.dictIndex=s;else for(let s=a;s>u;s--)i[_++]=String.fromCharCode(s),this.dictIndex=s}i.length=_}if(o){for(let s=i.length-1;s>0;s--){const o=Math.floor(Math.random()*(s+1));[i[s],i[o]]=[i[o],i[s]]}}return i})),__publicField(this,"setDictionary",((s,o)=>{this.dict=this._normalizeDictionary(s,o),this.dictLength=this.dict.length,this.setCounter(0)})),__publicField(this,"seq",(()=>this.sequentialUUID())),__publicField(this,"sequentialUUID",(()=>{const s=this.dictLength,o=this.dict;let i=this.counter;const a=[];do{const u=i%s;i=Math.trunc(i/s),a.push(o[u])}while(0!==i);const u=a.join("");return this.counter+=1,u})),__publicField(this,"rnd",((s=this.uuidLength||x)=>this.randomUUID(s))),__publicField(this,"randomUUID",((s=this.uuidLength||x)=>{if(null==s||s<1)throw new Error("Invalid UUID Length Provided");const o=new Array(s),i=this.dictLength,a=this.dict;for(let u=0;u<s;u++){const s=Math.floor(Math.random()*i);o[u]=a[s]}return o.join("")})),__publicField(this,"fmt",((s,o)=>this.formattedUUID(s,o))),__publicField(this,"formattedUUID",((s,o)=>{const i={$r:this.randomUUID,$s:this.sequentialUUID,$t:this.stamp};return s.replace(/\$[rs]\d{0,}|\$t0|\$t[1-9]\d{1,}/g,(s=>{const a=s.slice(0,2),u=Number.parseInt(s.slice(2),10);return"$s"===a?i[a]().padStart(u,"0"):"$t"===a&&o?i[a](u,o):i[a](u)}))})),__publicField(this,"availableUUIDs",((s=this.uuidLength)=>Number.parseFloat(([...new Set(this.dict)].length**s).toFixed(0)))),__publicField(this,"_collisionCache",new Map),__publicField(this,"approxMaxBeforeCollision",((s=this.availableUUIDs(this.uuidLength))=>{const o=s,i=this._collisionCache.get(o);if(void 0!==i)return i;const a=Number.parseFloat(Math.sqrt(Math.PI/2*s).toFixed(20));return this._collisionCache.set(o,a),a})),__publicField(this,"collisionProbability",((s=this.availableUUIDs(this.uuidLength),o=this.uuidLength)=>Number.parseFloat((this.approxMaxBeforeCollision(s)/this.availableUUIDs(o)).toFixed(20)))),__publicField(this,"uniqueness",((s=this.availableUUIDs(this.uuidLength))=>{const o=Number.parseFloat((1-this.approxMaxBeforeCollision(s)/s).toFixed(20));return o>1?1:o<0?0:o})),__publicField(this,"getVersion",(()=>this.version)),__publicField(this,"stamp",((s,o)=>{const i=Math.floor(+(o||new Date)/1e3).toString(16);if("number"==typeof s&&0===s)return i;if("number"!=typeof s||s<10)throw new Error(["Param finalLength must be a number greater than or equal to 10,","or 0 if you want the raw hexadecimal timestamp"].join("\n"));const a=s-9,u=Math.round(Math.random()*(a>15?15:a)),_=this.randomUUID(a);return"".concat(_.substring(0,u)).concat(i).concat(_.substring(u)).concat(u.toString(16))})),__publicField(this,"parseStamp",((s,o)=>{if(o&&!/t0|t[1-9]\d{1,}/.test(o))throw new Error("Cannot extract date from a formated UUID with no timestamp in the format");const i=o?o.replace(/\$[rs]\d{0,}|\$t0|\$t[1-9]\d{1,}/g,(s=>{const o={$r:s=>[...Array(s)].map((()=>"r")).join(""),$s:s=>[...Array(s)].map((()=>"s")).join(""),$t:s=>[...Array(s)].map((()=>"t")).join("")},i=s.slice(0,2),a=Number.parseInt(s.slice(2),10);return o[i](a)})).replace(/^(.*?)(t{8,})(.*)$/g,((o,i,a)=>s.substring(i.length,i.length+a.length))):s;if(8===i.length)return new Date(1e3*Number.parseInt(i,16));if(i.length<10)throw new Error("Stamp length invalid");const a=Number.parseInt(i.substring(i.length-1),16);return new Date(1e3*Number.parseInt(i.substring(a,a+8),16))})),__publicField(this,"setCounter",(s=>{this.counter=s})),__publicField(this,"validate",((s,o)=>{const i=o?this._normalizeDictionary(o):this.dict;return s.split("").every((s=>i.includes(s)))}));const o=__spreadValues(__spreadValues({},C),s);this.counter=0,this.debug=!1,this.dict=[],this.version="5.3.2";const{dictionary:i,shuffle:a,length:u,counter:_}=o;this.uuidLength=u,this.setDictionary(i,a),this.setCounter(_),this.debug=o.debug,this.log(this.dict),this.log("Generator instantiated with Dictionary Size ".concat(this.dictLength," and counter set to ").concat(this.counter)),this.log=this.log.bind(this),this.setDictionary=this.setDictionary.bind(this),this.setCounter=this.setCounter.bind(this),this.seq=this.seq.bind(this),this.sequentialUUID=this.sequentialUUID.bind(this),this.rnd=this.rnd.bind(this),this.randomUUID=this.randomUUID.bind(this),this.fmt=this.fmt.bind(this),this.formattedUUID=this.formattedUUID.bind(this),this.availableUUIDs=this.availableUUIDs.bind(this),this.approxMaxBeforeCollision=this.approxMaxBeforeCollision.bind(this),this.collisionProbability=this.collisionProbability.bind(this),this.uniqueness=this.uniqueness.bind(this),this.getVersion=this.getVersion.bind(this),this.stamp=this.stamp.bind(this),this.parseStamp=this.parseStamp.bind(this)}};__publicField(j,"default",j);var L,B=j;return L=w,((a,_,w,x)=>{if(_&&"object"==typeof _||"function"==typeof _)for(let C of i(_))u.call(a,C)||C===w||s(a,C,{get:()=>_[C],enumerable:!(x=o(_,C))||x.enumerable});return a})(s({},"__esModule",{value:!0}),L)})();s.exports=o.default,"undefined"!=typeof window&&(o=o.default)},9325:(s,o,i)=>{var a=i(34840),u="object"==typeof self&&self&&self.Object===Object&&self,_=a||u||Function("return this")();s.exports=_},9404:function(s){s.exports=function(){"use strict";var s=Array.prototype.slice;function createClass(s,o){o&&(s.prototype=Object.create(o.prototype)),s.prototype.constructor=s}function Iterable(s){return isIterable(s)?s:Seq(s)}function KeyedIterable(s){return isKeyed(s)?s:KeyedSeq(s)}function IndexedIterable(s){return isIndexed(s)?s:IndexedSeq(s)}function SetIterable(s){return isIterable(s)&&!isAssociative(s)?s:SetSeq(s)}function isIterable(s){return!(!s||!s[o])}function isKeyed(s){return!(!s||!s[i])}function isIndexed(s){return!(!s||!s[a])}function isAssociative(s){return isKeyed(s)||isIndexed(s)}function isOrdered(s){return!(!s||!s[u])}createClass(KeyedIterable,Iterable),createClass(IndexedIterable,Iterable),createClass(SetIterable,Iterable),Iterable.isIterable=isIterable,Iterable.isKeyed=isKeyed,Iterable.isIndexed=isIndexed,Iterable.isAssociative=isAssociative,Iterable.isOrdered=isOrdered,Iterable.Keyed=KeyedIterable,Iterable.Indexed=IndexedIterable,Iterable.Set=SetIterable;var o="@@__IMMUTABLE_ITERABLE__@@",i="@@__IMMUTABLE_KEYED__@@",a="@@__IMMUTABLE_INDEXED__@@",u="@@__IMMUTABLE_ORDERED__@@",_="delete",w=5,x=1<<w,C=x-1,j={},L={value:!1},B={value:!1};function MakeRef(s){return s.value=!1,s}function SetRef(s){s&&(s.value=!0)}function OwnerID(){}function arrCopy(s,o){o=o||0;for(var i=Math.max(0,s.length-o),a=new Array(i),u=0;u<i;u++)a[u]=s[u+o];return a}function ensureSize(s){return void 0===s.size&&(s.size=s.__iterate(returnTrue)),s.size}function wrapIndex(s,o){if("number"!=typeof o){var i=o>>>0;if(""+i!==o||4294967295===i)return NaN;o=i}return o<0?ensureSize(s)+o:o}function returnTrue(){return!0}function wholeSlice(s,o,i){return(0===s||void 0!==i&&s<=-i)&&(void 0===o||void 0!==i&&o>=i)}function resolveBegin(s,o){return resolveIndex(s,o,0)}function resolveEnd(s,o){return resolveIndex(s,o,o)}function resolveIndex(s,o,i){return void 0===s?i:s<0?Math.max(0,o+s):void 0===o?s:Math.min(o,s)}var $=0,U=1,V=2,z="function"==typeof Symbol&&Symbol.iterator,Y="@@iterator",Z=z||Y;function Iterator(s){this.next=s}function iteratorValue(s,o,i,a){var u=0===s?o:1===s?i:[o,i];return a?a.value=u:a={value:u,done:!1},a}function iteratorDone(){return{value:void 0,done:!0}}function hasIterator(s){return!!getIteratorFn(s)}function isIterator(s){return s&&"function"==typeof s.next}function getIterator(s){var o=getIteratorFn(s);return o&&o.call(s)}function getIteratorFn(s){var o=s&&(z&&s[z]||s[Y]);if("function"==typeof o)return o}function isArrayLike(s){return s&&"number"==typeof s.length}function Seq(s){return null==s?emptySequence():isIterable(s)?s.toSeq():seqFromValue(s)}function KeyedSeq(s){return null==s?emptySequence().toKeyedSeq():isIterable(s)?isKeyed(s)?s.toSeq():s.fromEntrySeq():keyedSeqFromValue(s)}function IndexedSeq(s){return null==s?emptySequence():isIterable(s)?isKeyed(s)?s.entrySeq():s.toIndexedSeq():indexedSeqFromValue(s)}function SetSeq(s){return(null==s?emptySequence():isIterable(s)?isKeyed(s)?s.entrySeq():s:indexedSeqFromValue(s)).toSetSeq()}Iterator.prototype.toString=function(){return"[Iterator]"},Iterator.KEYS=$,Iterator.VALUES=U,Iterator.ENTRIES=V,Iterator.prototype.inspect=Iterator.prototype.toSource=function(){return this.toString()},Iterator.prototype[Z]=function(){return this},createClass(Seq,Iterable),Seq.of=function(){return Seq(arguments)},Seq.prototype.toSeq=function(){return this},Seq.prototype.toString=function(){return this.__toString("Seq {","}")},Seq.prototype.cacheResult=function(){return!this._cache&&this.__iterateUncached&&(this._cache=this.entrySeq().toArray(),this.size=this._cache.length),this},Seq.prototype.__iterate=function(s,o){return seqIterate(this,s,o,!0)},Seq.prototype.__iterator=function(s,o){return seqIterator(this,s,o,!0)},createClass(KeyedSeq,Seq),KeyedSeq.prototype.toKeyedSeq=function(){return this},createClass(IndexedSeq,Seq),IndexedSeq.of=function(){return IndexedSeq(arguments)},IndexedSeq.prototype.toIndexedSeq=function(){return this},IndexedSeq.prototype.toString=function(){return this.__toString("Seq [","]")},IndexedSeq.prototype.__iterate=function(s,o){return seqIterate(this,s,o,!1)},IndexedSeq.prototype.__iterator=function(s,o){return seqIterator(this,s,o,!1)},createClass(SetSeq,Seq),SetSeq.of=function(){return SetSeq(arguments)},SetSeq.prototype.toSetSeq=function(){return this},Seq.isSeq=isSeq,Seq.Keyed=KeyedSeq,Seq.Set=SetSeq,Seq.Indexed=IndexedSeq;var ee,ie,ae,ce="@@__IMMUTABLE_SEQ__@@";function ArraySeq(s){this._array=s,this.size=s.length}function ObjectSeq(s){var o=Object.keys(s);this._object=s,this._keys=o,this.size=o.length}function IterableSeq(s){this._iterable=s,this.size=s.length||s.size}function IteratorSeq(s){this._iterator=s,this._iteratorCache=[]}function isSeq(s){return!(!s||!s[ce])}function emptySequence(){return ee||(ee=new ArraySeq([]))}function keyedSeqFromValue(s){var o=Array.isArray(s)?new ArraySeq(s).fromEntrySeq():isIterator(s)?new IteratorSeq(s).fromEntrySeq():hasIterator(s)?new IterableSeq(s).fromEntrySeq():"object"==typeof s?new ObjectSeq(s):void 0;if(!o)throw new TypeError("Expected Array or iterable object of [k, v] entries, or keyed object: "+s);return o}function indexedSeqFromValue(s){var o=maybeIndexedSeqFromValue(s);if(!o)throw new TypeError("Expected Array or iterable object of values: "+s);return o}function seqFromValue(s){var o=maybeIndexedSeqFromValue(s)||"object"==typeof s&&new ObjectSeq(s);if(!o)throw new TypeError("Expected Array or iterable object of values, or keyed object: "+s);return o}function maybeIndexedSeqFromValue(s){return isArrayLike(s)?new ArraySeq(s):isIterator(s)?new IteratorSeq(s):hasIterator(s)?new IterableSeq(s):void 0}function seqIterate(s,o,i,a){var u=s._cache;if(u){for(var _=u.length-1,w=0;w<=_;w++){var x=u[i?_-w:w];if(!1===o(x[1],a?x[0]:w,s))return w+1}return w}return s.__iterateUncached(o,i)}function seqIterator(s,o,i,a){var u=s._cache;if(u){var _=u.length-1,w=0;return new Iterator((function(){var s=u[i?_-w:w];return w++>_?iteratorDone():iteratorValue(o,a?s[0]:w-1,s[1])}))}return s.__iteratorUncached(o,i)}function fromJS(s,o){return o?fromJSWith(o,s,"",{"":s}):fromJSDefault(s)}function fromJSWith(s,o,i,a){return Array.isArray(o)?s.call(a,i,IndexedSeq(o).map((function(i,a){return fromJSWith(s,i,a,o)}))):isPlainObj(o)?s.call(a,i,KeyedSeq(o).map((function(i,a){return fromJSWith(s,i,a,o)}))):o}function fromJSDefault(s){return Array.isArray(s)?IndexedSeq(s).map(fromJSDefault).toList():isPlainObj(s)?KeyedSeq(s).map(fromJSDefault).toMap():s}function isPlainObj(s){return s&&(s.constructor===Object||void 0===s.constructor)}function is(s,o){if(s===o||s!=s&&o!=o)return!0;if(!s||!o)return!1;if("function"==typeof s.valueOf&&"function"==typeof o.valueOf){if((s=s.valueOf())===(o=o.valueOf())||s!=s&&o!=o)return!0;if(!s||!o)return!1}return!("function"!=typeof s.equals||"function"!=typeof o.equals||!s.equals(o))}function deepEqual(s,o){if(s===o)return!0;if(!isIterable(o)||void 0!==s.size&&void 0!==o.size&&s.size!==o.size||void 0!==s.__hash&&void 0!==o.__hash&&s.__hash!==o.__hash||isKeyed(s)!==isKeyed(o)||isIndexed(s)!==isIndexed(o)||isOrdered(s)!==isOrdered(o))return!1;if(0===s.size&&0===o.size)return!0;var i=!isAssociative(s);if(isOrdered(s)){var a=s.entries();return o.every((function(s,o){var u=a.next().value;return u&&is(u[1],s)&&(i||is(u[0],o))}))&&a.next().done}var u=!1;if(void 0===s.size)if(void 0===o.size)"function"==typeof s.cacheResult&&s.cacheResult();else{u=!0;var _=s;s=o,o=_}var w=!0,x=o.__iterate((function(o,a){if(i?!s.has(o):u?!is(o,s.get(a,j)):!is(s.get(a,j),o))return w=!1,!1}));return w&&s.size===x}function Repeat(s,o){if(!(this instanceof Repeat))return new Repeat(s,o);if(this._value=s,this.size=void 0===o?1/0:Math.max(0,o),0===this.size){if(ie)return ie;ie=this}}function invariant(s,o){if(!s)throw new Error(o)}function Range(s,o,i){if(!(this instanceof Range))return new Range(s,o,i);if(invariant(0!==i,"Cannot step a Range by 0"),s=s||0,void 0===o&&(o=1/0),i=void 0===i?1:Math.abs(i),o<s&&(i=-i),this._start=s,this._end=o,this._step=i,this.size=Math.max(0,Math.ceil((o-s)/i-1)+1),0===this.size){if(ae)return ae;ae=this}}function Collection(){throw TypeError("Abstract")}function KeyedCollection(){}function IndexedCollection(){}function SetCollection(){}Seq.prototype[ce]=!0,createClass(ArraySeq,IndexedSeq),ArraySeq.prototype.get=function(s,o){return this.has(s)?this._array[wrapIndex(this,s)]:o},ArraySeq.prototype.__iterate=function(s,o){for(var i=this._array,a=i.length-1,u=0;u<=a;u++)if(!1===s(i[o?a-u:u],u,this))return u+1;return u},ArraySeq.prototype.__iterator=function(s,o){var i=this._array,a=i.length-1,u=0;return new Iterator((function(){return u>a?iteratorDone():iteratorValue(s,u,i[o?a-u++:u++])}))},createClass(ObjectSeq,KeyedSeq),ObjectSeq.prototype.get=function(s,o){return void 0===o||this.has(s)?this._object[s]:o},ObjectSeq.prototype.has=function(s){return this._object.hasOwnProperty(s)},ObjectSeq.prototype.__iterate=function(s,o){for(var i=this._object,a=this._keys,u=a.length-1,_=0;_<=u;_++){var w=a[o?u-_:_];if(!1===s(i[w],w,this))return _+1}return _},ObjectSeq.prototype.__iterator=function(s,o){var i=this._object,a=this._keys,u=a.length-1,_=0;return new Iterator((function(){var w=a[o?u-_:_];return _++>u?iteratorDone():iteratorValue(s,w,i[w])}))},ObjectSeq.prototype[u]=!0,createClass(IterableSeq,IndexedSeq),IterableSeq.prototype.__iterateUncached=function(s,o){if(o)return this.cacheResult().__iterate(s,o);var i=getIterator(this._iterable),a=0;if(isIterator(i))for(var u;!(u=i.next()).done&&!1!==s(u.value,a++,this););return a},IterableSeq.prototype.__iteratorUncached=function(s,o){if(o)return this.cacheResult().__iterator(s,o);var i=getIterator(this._iterable);if(!isIterator(i))return new Iterator(iteratorDone);var a=0;return new Iterator((function(){var o=i.next();return o.done?o:iteratorValue(s,a++,o.value)}))},createClass(IteratorSeq,IndexedSeq),IteratorSeq.prototype.__iterateUncached=function(s,o){if(o)return this.cacheResult().__iterate(s,o);for(var i,a=this._iterator,u=this._iteratorCache,_=0;_<u.length;)if(!1===s(u[_],_++,this))return _;for(;!(i=a.next()).done;){var w=i.value;if(u[_]=w,!1===s(w,_++,this))break}return _},IteratorSeq.prototype.__iteratorUncached=function(s,o){if(o)return this.cacheResult().__iterator(s,o);var i=this._iterator,a=this._iteratorCache,u=0;return new Iterator((function(){if(u>=a.length){var o=i.next();if(o.done)return o;a[u]=o.value}return iteratorValue(s,u,a[u++])}))},createClass(Repeat,IndexedSeq),Repeat.prototype.toString=function(){return 0===this.size?"Repeat []":"Repeat [ "+this._value+" "+this.size+" times ]"},Repeat.prototype.get=function(s,o){return this.has(s)?this._value:o},Repeat.prototype.includes=function(s){return is(this._value,s)},Repeat.prototype.slice=function(s,o){var i=this.size;return wholeSlice(s,o,i)?this:new Repeat(this._value,resolveEnd(o,i)-resolveBegin(s,i))},Repeat.prototype.reverse=function(){return this},Repeat.prototype.indexOf=function(s){return is(this._value,s)?0:-1},Repeat.prototype.lastIndexOf=function(s){return is(this._value,s)?this.size:-1},Repeat.prototype.__iterate=function(s,o){for(var i=0;i<this.size;i++)if(!1===s(this._value,i,this))return i+1;return i},Repeat.prototype.__iterator=function(s,o){var i=this,a=0;return new Iterator((function(){return a<i.size?iteratorValue(s,a++,i._value):iteratorDone()}))},Repeat.prototype.equals=function(s){return s instanceof Repeat?is(this._value,s._value):deepEqual(s)},createClass(Range,IndexedSeq),Range.prototype.toString=function(){return 0===this.size?"Range []":"Range [ "+this._start+"..."+this._end+(1!==this._step?" by "+this._step:"")+" ]"},Range.prototype.get=function(s,o){return this.has(s)?this._start+wrapIndex(this,s)*this._step:o},Range.prototype.includes=function(s){var o=(s-this._start)/this._step;return o>=0&&o<this.size&&o===Math.floor(o)},Range.prototype.slice=function(s,o){return wholeSlice(s,o,this.size)?this:(s=resolveBegin(s,this.size),(o=resolveEnd(o,this.size))<=s?new Range(0,0):new Range(this.get(s,this._end),this.get(o,this._end),this._step))},Range.prototype.indexOf=function(s){var o=s-this._start;if(o%this._step==0){var i=o/this._step;if(i>=0&&i<this.size)return i}return-1},Range.prototype.lastIndexOf=function(s){return this.indexOf(s)},Range.prototype.__iterate=function(s,o){for(var i=this.size-1,a=this._step,u=o?this._start+i*a:this._start,_=0;_<=i;_++){if(!1===s(u,_,this))return _+1;u+=o?-a:a}return _},Range.prototype.__iterator=function(s,o){var i=this.size-1,a=this._step,u=o?this._start+i*a:this._start,_=0;return new Iterator((function(){var w=u;return u+=o?-a:a,_>i?iteratorDone():iteratorValue(s,_++,w)}))},Range.prototype.equals=function(s){return s instanceof Range?this._start===s._start&&this._end===s._end&&this._step===s._step:deepEqual(this,s)},createClass(Collection,Iterable),createClass(KeyedCollection,Collection),createClass(IndexedCollection,Collection),createClass(SetCollection,Collection),Collection.Keyed=KeyedCollection,Collection.Indexed=IndexedCollection,Collection.Set=SetCollection;var le="function"==typeof Math.imul&&-2===Math.imul(4294967295,2)?Math.imul:function imul(s,o){var i=65535&(s|=0),a=65535&(o|=0);return i*a+((s>>>16)*a+i*(o>>>16)<<16>>>0)|0};function smi(s){return s>>>1&1073741824|3221225471&s}function hash(s){if(!1===s||null==s)return 0;if("function"==typeof s.valueOf&&(!1===(s=s.valueOf())||null==s))return 0;if(!0===s)return 1;var o=typeof s;if("number"===o){if(s!=s||s===1/0)return 0;var i=0|s;for(i!==s&&(i^=4294967295*s);s>4294967295;)i^=s/=4294967295;return smi(i)}if("string"===o)return s.length>Se?cachedHashString(s):hashString(s);if("function"==typeof s.hashCode)return s.hashCode();if("object"===o)return hashJSObj(s);if("function"==typeof s.toString)return hashString(s.toString());throw new Error("Value type "+o+" cannot be hashed.")}function cachedHashString(s){var o=Pe[s];return void 0===o&&(o=hashString(s),xe===we&&(xe=0,Pe={}),xe++,Pe[s]=o),o}function hashString(s){for(var o=0,i=0;i<s.length;i++)o=31*o+s.charCodeAt(i)|0;return smi(o)}function hashJSObj(s){var o;if(ye&&void 0!==(o=fe.get(s)))return o;if(void 0!==(o=s[_e]))return o;if(!de){if(void 0!==(o=s.propertyIsEnumerable&&s.propertyIsEnumerable[_e]))return o;if(void 0!==(o=getIENodeHash(s)))return o}if(o=++be,1073741824&be&&(be=0),ye)fe.set(s,o);else{if(void 0!==pe&&!1===pe(s))throw new Error("Non-extensible objects are not allowed as keys.");if(de)Object.defineProperty(s,_e,{enumerable:!1,configurable:!1,writable:!1,value:o});else if(void 0!==s.propertyIsEnumerable&&s.propertyIsEnumerable===s.constructor.prototype.propertyIsEnumerable)s.propertyIsEnumerable=function(){return this.constructor.prototype.propertyIsEnumerable.apply(this,arguments)},s.propertyIsEnumerable[_e]=o;else{if(void 0===s.nodeType)throw new Error("Unable to set a non-enumerable property on object.");s[_e]=o}}return o}var pe=Object.isExtensible,de=function(){try{return Object.defineProperty({},"@",{}),!0}catch(s){return!1}}();function getIENodeHash(s){if(s&&s.nodeType>0)switch(s.nodeType){case 1:return s.uniqueID;case 9:return s.documentElement&&s.documentElement.uniqueID}}var fe,ye="function"==typeof WeakMap;ye&&(fe=new WeakMap);var be=0,_e="__immutablehash__";"function"==typeof Symbol&&(_e=Symbol(_e));var Se=16,we=255,xe=0,Pe={};function assertNotInfinite(s){invariant(s!==1/0,"Cannot perform this action with an infinite size.")}function Map(s){return null==s?emptyMap():isMap(s)&&!isOrdered(s)?s:emptyMap().withMutations((function(o){var i=KeyedIterable(s);assertNotInfinite(i.size),i.forEach((function(s,i){return o.set(i,s)}))}))}function isMap(s){return!(!s||!s[Re])}createClass(Map,KeyedCollection),Map.of=function(){var o=s.call(arguments,0);return emptyMap().withMutations((function(s){for(var i=0;i<o.length;i+=2){if(i+1>=o.length)throw new Error("Missing value for key: "+o[i]);s.set(o[i],o[i+1])}}))},Map.prototype.toString=function(){return this.__toString("Map {","}")},Map.prototype.get=function(s,o){return this._root?this._root.get(0,void 0,s,o):o},Map.prototype.set=function(s,o){return updateMap(this,s,o)},Map.prototype.setIn=function(s,o){return this.updateIn(s,j,(function(){return o}))},Map.prototype.remove=function(s){return updateMap(this,s,j)},Map.prototype.deleteIn=function(s){return this.updateIn(s,(function(){return j}))},Map.prototype.update=function(s,o,i){return 1===arguments.length?s(this):this.updateIn([s],o,i)},Map.prototype.updateIn=function(s,o,i){i||(i=o,o=void 0);var a=updateInDeepMap(this,forceIterator(s),o,i);return a===j?void 0:a},Map.prototype.clear=function(){return 0===this.size?this:this.__ownerID?(this.size=0,this._root=null,this.__hash=void 0,this.__altered=!0,this):emptyMap()},Map.prototype.merge=function(){return mergeIntoMapWith(this,void 0,arguments)},Map.prototype.mergeWith=function(o){return mergeIntoMapWith(this,o,s.call(arguments,1))},Map.prototype.mergeIn=function(o){var i=s.call(arguments,1);return this.updateIn(o,emptyMap(),(function(s){return"function"==typeof s.merge?s.merge.apply(s,i):i[i.length-1]}))},Map.prototype.mergeDeep=function(){return mergeIntoMapWith(this,deepMerger,arguments)},Map.prototype.mergeDeepWith=function(o){var i=s.call(arguments,1);return mergeIntoMapWith(this,deepMergerWith(o),i)},Map.prototype.mergeDeepIn=function(o){var i=s.call(arguments,1);return this.updateIn(o,emptyMap(),(function(s){return"function"==typeof s.mergeDeep?s.mergeDeep.apply(s,i):i[i.length-1]}))},Map.prototype.sort=function(s){return OrderedMap(sortFactory(this,s))},Map.prototype.sortBy=function(s,o){return OrderedMap(sortFactory(this,o,s))},Map.prototype.withMutations=function(s){var o=this.asMutable();return s(o),o.wasAltered()?o.__ensureOwner(this.__ownerID):this},Map.prototype.asMutable=function(){return this.__ownerID?this:this.__ensureOwner(new OwnerID)},Map.prototype.asImmutable=function(){return this.__ensureOwner()},Map.prototype.wasAltered=function(){return this.__altered},Map.prototype.__iterator=function(s,o){return new MapIterator(this,s,o)},Map.prototype.__iterate=function(s,o){var i=this,a=0;return this._root&&this._root.iterate((function(o){return a++,s(o[1],o[0],i)}),o),a},Map.prototype.__ensureOwner=function(s){return s===this.__ownerID?this:s?makeMap(this.size,this._root,s,this.__hash):(this.__ownerID=s,this.__altered=!1,this)},Map.isMap=isMap;var Te,Re="@@__IMMUTABLE_MAP__@@",$e=Map.prototype;function ArrayMapNode(s,o){this.ownerID=s,this.entries=o}function BitmapIndexedNode(s,o,i){this.ownerID=s,this.bitmap=o,this.nodes=i}function HashArrayMapNode(s,o,i){this.ownerID=s,this.count=o,this.nodes=i}function HashCollisionNode(s,o,i){this.ownerID=s,this.keyHash=o,this.entries=i}function ValueNode(s,o,i){this.ownerID=s,this.keyHash=o,this.entry=i}function MapIterator(s,o,i){this._type=o,this._reverse=i,this._stack=s._root&&mapIteratorFrame(s._root)}function mapIteratorValue(s,o){return iteratorValue(s,o[0],o[1])}function mapIteratorFrame(s,o){return{node:s,index:0,__prev:o}}function makeMap(s,o,i,a){var u=Object.create($e);return u.size=s,u._root=o,u.__ownerID=i,u.__hash=a,u.__altered=!1,u}function emptyMap(){return Te||(Te=makeMap(0))}function updateMap(s,o,i){var a,u;if(s._root){var _=MakeRef(L),w=MakeRef(B);if(a=updateNode(s._root,s.__ownerID,0,void 0,o,i,_,w),!w.value)return s;u=s.size+(_.value?i===j?-1:1:0)}else{if(i===j)return s;u=1,a=new ArrayMapNode(s.__ownerID,[[o,i]])}return s.__ownerID?(s.size=u,s._root=a,s.__hash=void 0,s.__altered=!0,s):a?makeMap(u,a):emptyMap()}function updateNode(s,o,i,a,u,_,w,x){return s?s.update(o,i,a,u,_,w,x):_===j?s:(SetRef(x),SetRef(w),new ValueNode(o,a,[u,_]))}function isLeafNode(s){return s.constructor===ValueNode||s.constructor===HashCollisionNode}function mergeIntoNode(s,o,i,a,u){if(s.keyHash===a)return new HashCollisionNode(o,a,[s.entry,u]);var _,x=(0===i?s.keyHash:s.keyHash>>>i)&C,j=(0===i?a:a>>>i)&C;return new BitmapIndexedNode(o,1<<x|1<<j,x===j?[mergeIntoNode(s,o,i+w,a,u)]:(_=new ValueNode(o,a,u),x<j?[s,_]:[_,s]))}function createNodes(s,o,i,a){s||(s=new OwnerID);for(var u=new ValueNode(s,hash(i),[i,a]),_=0;_<o.length;_++){var w=o[_];u=u.update(s,0,void 0,w[0],w[1])}return u}function packNodes(s,o,i,a){for(var u=0,_=0,w=new Array(i),x=0,C=1,j=o.length;x<j;x++,C<<=1){var L=o[x];void 0!==L&&x!==a&&(u|=C,w[_++]=L)}return new BitmapIndexedNode(s,u,w)}function expandNodes(s,o,i,a,u){for(var _=0,w=new Array(x),C=0;0!==i;C++,i>>>=1)w[C]=1&i?o[_++]:void 0;return w[a]=u,new HashArrayMapNode(s,_+1,w)}function mergeIntoMapWith(s,o,i){for(var a=[],u=0;u<i.length;u++){var _=i[u],w=KeyedIterable(_);isIterable(_)||(w=w.map((function(s){return fromJS(s)}))),a.push(w)}return mergeIntoCollectionWith(s,o,a)}function deepMerger(s,o,i){return s&&s.mergeDeep&&isIterable(o)?s.mergeDeep(o):is(s,o)?s:o}function deepMergerWith(s){return function(o,i,a){if(o&&o.mergeDeepWith&&isIterable(i))return o.mergeDeepWith(s,i);var u=s(o,i,a);return is(o,u)?o:u}}function mergeIntoCollectionWith(s,o,i){return 0===(i=i.filter((function(s){return 0!==s.size}))).length?s:0!==s.size||s.__ownerID||1!==i.length?s.withMutations((function(s){for(var a=o?function(i,a){s.update(a,j,(function(s){return s===j?i:o(s,i,a)}))}:function(o,i){s.set(i,o)},u=0;u<i.length;u++)i[u].forEach(a)})):s.constructor(i[0])}function updateInDeepMap(s,o,i,a){var u=s===j,_=o.next();if(_.done){var w=u?i:s,x=a(w);return x===w?s:x}invariant(u||s&&s.set,"invalid keyPath");var C=_.value,L=u?j:s.get(C,j),B=updateInDeepMap(L,o,i,a);return B===L?s:B===j?s.remove(C):(u?emptyMap():s).set(C,B)}function popCount(s){return s=(s=(858993459&(s-=s>>1&1431655765))+(s>>2&858993459))+(s>>4)&252645135,s+=s>>8,127&(s+=s>>16)}function setIn(s,o,i,a){var u=a?s:arrCopy(s);return u[o]=i,u}function spliceIn(s,o,i,a){var u=s.length+1;if(a&&o+1===u)return s[o]=i,s;for(var _=new Array(u),w=0,x=0;x<u;x++)x===o?(_[x]=i,w=-1):_[x]=s[x+w];return _}function spliceOut(s,o,i){var a=s.length-1;if(i&&o===a)return s.pop(),s;for(var u=new Array(a),_=0,w=0;w<a;w++)w===o&&(_=1),u[w]=s[w+_];return u}$e[Re]=!0,$e[_]=$e.remove,$e.removeIn=$e.deleteIn,ArrayMapNode.prototype.get=function(s,o,i,a){for(var u=this.entries,_=0,w=u.length;_<w;_++)if(is(i,u[_][0]))return u[_][1];return a},ArrayMapNode.prototype.update=function(s,o,i,a,u,_,w){for(var x=u===j,C=this.entries,L=0,B=C.length;L<B&&!is(a,C[L][0]);L++);var $=L<B;if($?C[L][1]===u:x)return this;if(SetRef(w),(x||!$)&&SetRef(_),!x||1!==C.length){if(!$&&!x&&C.length>=qe)return createNodes(s,C,a,u);var U=s&&s===this.ownerID,V=U?C:arrCopy(C);return $?x?L===B-1?V.pop():V[L]=V.pop():V[L]=[a,u]:V.push([a,u]),U?(this.entries=V,this):new ArrayMapNode(s,V)}},BitmapIndexedNode.prototype.get=function(s,o,i,a){void 0===o&&(o=hash(i));var u=1<<((0===s?o:o>>>s)&C),_=this.bitmap;return _&u?this.nodes[popCount(_&u-1)].get(s+w,o,i,a):a},BitmapIndexedNode.prototype.update=function(s,o,i,a,u,_,x){void 0===i&&(i=hash(a));var L=(0===o?i:i>>>o)&C,B=1<<L,$=this.bitmap,U=!!($&B);if(!U&&u===j)return this;var V=popCount($&B-1),z=this.nodes,Y=U?z[V]:void 0,Z=updateNode(Y,s,o+w,i,a,u,_,x);if(Z===Y)return this;if(!U&&Z&&z.length>=ze)return expandNodes(s,z,$,L,Z);if(U&&!Z&&2===z.length&&isLeafNode(z[1^V]))return z[1^V];if(U&&Z&&1===z.length&&isLeafNode(Z))return Z;var ee=s&&s===this.ownerID,ie=U?Z?$:$^B:$|B,ae=U?Z?setIn(z,V,Z,ee):spliceOut(z,V,ee):spliceIn(z,V,Z,ee);return ee?(this.bitmap=ie,this.nodes=ae,this):new BitmapIndexedNode(s,ie,ae)},HashArrayMapNode.prototype.get=function(s,o,i,a){void 0===o&&(o=hash(i));var u=(0===s?o:o>>>s)&C,_=this.nodes[u];return _?_.get(s+w,o,i,a):a},HashArrayMapNode.prototype.update=function(s,o,i,a,u,_,x){void 0===i&&(i=hash(a));var L=(0===o?i:i>>>o)&C,B=u===j,$=this.nodes,U=$[L];if(B&&!U)return this;var V=updateNode(U,s,o+w,i,a,u,_,x);if(V===U)return this;var z=this.count;if(U){if(!V&&--z<We)return packNodes(s,$,z,L)}else z++;var Y=s&&s===this.ownerID,Z=setIn($,L,V,Y);return Y?(this.count=z,this.nodes=Z,this):new HashArrayMapNode(s,z,Z)},HashCollisionNode.prototype.get=function(s,o,i,a){for(var u=this.entries,_=0,w=u.length;_<w;_++)if(is(i,u[_][0]))return u[_][1];return a},HashCollisionNode.prototype.update=function(s,o,i,a,u,_,w){void 0===i&&(i=hash(a));var x=u===j;if(i!==this.keyHash)return x?this:(SetRef(w),SetRef(_),mergeIntoNode(this,s,o,i,[a,u]));for(var C=this.entries,L=0,B=C.length;L<B&&!is(a,C[L][0]);L++);var $=L<B;if($?C[L][1]===u:x)return this;if(SetRef(w),(x||!$)&&SetRef(_),x&&2===B)return new ValueNode(s,this.keyHash,C[1^L]);var U=s&&s===this.ownerID,V=U?C:arrCopy(C);return $?x?L===B-1?V.pop():V[L]=V.pop():V[L]=[a,u]:V.push([a,u]),U?(this.entries=V,this):new HashCollisionNode(s,this.keyHash,V)},ValueNode.prototype.get=function(s,o,i,a){return is(i,this.entry[0])?this.entry[1]:a},ValueNode.prototype.update=function(s,o,i,a,u,_,w){var x=u===j,C=is(a,this.entry[0]);return(C?u===this.entry[1]:x)?this:(SetRef(w),x?void SetRef(_):C?s&&s===this.ownerID?(this.entry[1]=u,this):new ValueNode(s,this.keyHash,[a,u]):(SetRef(_),mergeIntoNode(this,s,o,hash(a),[a,u])))},ArrayMapNode.prototype.iterate=HashCollisionNode.prototype.iterate=function(s,o){for(var i=this.entries,a=0,u=i.length-1;a<=u;a++)if(!1===s(i[o?u-a:a]))return!1},BitmapIndexedNode.prototype.iterate=HashArrayMapNode.prototype.iterate=function(s,o){for(var i=this.nodes,a=0,u=i.length-1;a<=u;a++){var _=i[o?u-a:a];if(_&&!1===_.iterate(s,o))return!1}},ValueNode.prototype.iterate=function(s,o){return s(this.entry)},createClass(MapIterator,Iterator),MapIterator.prototype.next=function(){for(var s=this._type,o=this._stack;o;){var i,a=o.node,u=o.index++;if(a.entry){if(0===u)return mapIteratorValue(s,a.entry)}else if(a.entries){if(u<=(i=a.entries.length-1))return mapIteratorValue(s,a.entries[this._reverse?i-u:u])}else if(u<=(i=a.nodes.length-1)){var _=a.nodes[this._reverse?i-u:u];if(_){if(_.entry)return mapIteratorValue(s,_.entry);o=this._stack=mapIteratorFrame(_,o)}continue}o=this._stack=this._stack.__prev}return iteratorDone()};var qe=x/4,ze=x/2,We=x/4;function List(s){var o=emptyList();if(null==s)return o;if(isList(s))return s;var i=IndexedIterable(s),a=i.size;return 0===a?o:(assertNotInfinite(a),a>0&&a<x?makeList(0,a,w,null,new VNode(i.toArray())):o.withMutations((function(s){s.setSize(a),i.forEach((function(o,i){return s.set(i,o)}))})))}function isList(s){return!(!s||!s[He])}createClass(List,IndexedCollection),List.of=function(){return this(arguments)},List.prototype.toString=function(){return this.__toString("List [","]")},List.prototype.get=function(s,o){if((s=wrapIndex(this,s))>=0&&s<this.size){var i=listNodeFor(this,s+=this._origin);return i&&i.array[s&C]}return o},List.prototype.set=function(s,o){return updateList(this,s,o)},List.prototype.remove=function(s){return this.has(s)?0===s?this.shift():s===this.size-1?this.pop():this.splice(s,1):this},List.prototype.insert=function(s,o){return this.splice(s,0,o)},List.prototype.clear=function(){return 0===this.size?this:this.__ownerID?(this.size=this._origin=this._capacity=0,this._level=w,this._root=this._tail=null,this.__hash=void 0,this.__altered=!0,this):emptyList()},List.prototype.push=function(){var s=arguments,o=this.size;return this.withMutations((function(i){setListBounds(i,0,o+s.length);for(var a=0;a<s.length;a++)i.set(o+a,s[a])}))},List.prototype.pop=function(){return setListBounds(this,0,-1)},List.prototype.unshift=function(){var s=arguments;return this.withMutations((function(o){setListBounds(o,-s.length);for(var i=0;i<s.length;i++)o.set(i,s[i])}))},List.prototype.shift=function(){return setListBounds(this,1)},List.prototype.merge=function(){return mergeIntoListWith(this,void 0,arguments)},List.prototype.mergeWith=function(o){return mergeIntoListWith(this,o,s.call(arguments,1))},List.prototype.mergeDeep=function(){return mergeIntoListWith(this,deepMerger,arguments)},List.prototype.mergeDeepWith=function(o){var i=s.call(arguments,1);return mergeIntoListWith(this,deepMergerWith(o),i)},List.prototype.setSize=function(s){return setListBounds(this,0,s)},List.prototype.slice=function(s,o){var i=this.size;return wholeSlice(s,o,i)?this:setListBounds(this,resolveBegin(s,i),resolveEnd(o,i))},List.prototype.__iterator=function(s,o){var i=0,a=iterateList(this,o);return new Iterator((function(){var o=a();return o===et?iteratorDone():iteratorValue(s,i++,o)}))},List.prototype.__iterate=function(s,o){for(var i,a=0,u=iterateList(this,o);(i=u())!==et&&!1!==s(i,a++,this););return a},List.prototype.__ensureOwner=function(s){return s===this.__ownerID?this:s?makeList(this._origin,this._capacity,this._level,this._root,this._tail,s,this.__hash):(this.__ownerID=s,this)},List.isList=isList;var He="@@__IMMUTABLE_LIST__@@",Ye=List.prototype;function VNode(s,o){this.array=s,this.ownerID=o}Ye[He]=!0,Ye[_]=Ye.remove,Ye.setIn=$e.setIn,Ye.deleteIn=Ye.removeIn=$e.removeIn,Ye.update=$e.update,Ye.updateIn=$e.updateIn,Ye.mergeIn=$e.mergeIn,Ye.mergeDeepIn=$e.mergeDeepIn,Ye.withMutations=$e.withMutations,Ye.asMutable=$e.asMutable,Ye.asImmutable=$e.asImmutable,Ye.wasAltered=$e.wasAltered,VNode.prototype.removeBefore=function(s,o,i){if(i===o?1<<o:0===this.array.length)return this;var a=i>>>o&C;if(a>=this.array.length)return new VNode([],s);var u,_=0===a;if(o>0){var x=this.array[a];if((u=x&&x.removeBefore(s,o-w,i))===x&&_)return this}if(_&&!u)return this;var j=editableVNode(this,s);if(!_)for(var L=0;L<a;L++)j.array[L]=void 0;return u&&(j.array[a]=u),j},VNode.prototype.removeAfter=function(s,o,i){if(i===(o?1<<o:0)||0===this.array.length)return this;var a,u=i-1>>>o&C;if(u>=this.array.length)return this;if(o>0){var _=this.array[u];if((a=_&&_.removeAfter(s,o-w,i))===_&&u===this.array.length-1)return this}var x=editableVNode(this,s);return x.array.splice(u+1),a&&(x.array[u]=a),x};var Xe,Qe,et={};function iterateList(s,o){var i=s._origin,a=s._capacity,u=getTailOffset(a),_=s._tail;return iterateNodeOrLeaf(s._root,s._level,0);function iterateNodeOrLeaf(s,o,i){return 0===o?iterateLeaf(s,i):iterateNode(s,o,i)}function iterateLeaf(s,w){var C=w===u?_&&_.array:s&&s.array,j=w>i?0:i-w,L=a-w;return L>x&&(L=x),function(){if(j===L)return et;var s=o?--L:j++;return C&&C[s]}}function iterateNode(s,u,_){var C,j=s&&s.array,L=_>i?0:i-_>>u,B=1+(a-_>>u);return B>x&&(B=x),function(){for(;;){if(C){var s=C();if(s!==et)return s;C=null}if(L===B)return et;var i=o?--B:L++;C=iterateNodeOrLeaf(j&&j[i],u-w,_+(i<<u))}}}}function makeList(s,o,i,a,u,_,w){var x=Object.create(Ye);return x.size=o-s,x._origin=s,x._capacity=o,x._level=i,x._root=a,x._tail=u,x.__ownerID=_,x.__hash=w,x.__altered=!1,x}function emptyList(){return Xe||(Xe=makeList(0,0,w))}function updateList(s,o,i){if((o=wrapIndex(s,o))!=o)return s;if(o>=s.size||o<0)return s.withMutations((function(s){o<0?setListBounds(s,o).set(0,i):setListBounds(s,0,o+1).set(o,i)}));o+=s._origin;var a=s._tail,u=s._root,_=MakeRef(B);return o>=getTailOffset(s._capacity)?a=updateVNode(a,s.__ownerID,0,o,i,_):u=updateVNode(u,s.__ownerID,s._level,o,i,_),_.value?s.__ownerID?(s._root=u,s._tail=a,s.__hash=void 0,s.__altered=!0,s):makeList(s._origin,s._capacity,s._level,u,a):s}function updateVNode(s,o,i,a,u,_){var x,j=a>>>i&C,L=s&&j<s.array.length;if(!L&&void 0===u)return s;if(i>0){var B=s&&s.array[j],$=updateVNode(B,o,i-w,a,u,_);return $===B?s:((x=editableVNode(s,o)).array[j]=$,x)}return L&&s.array[j]===u?s:(SetRef(_),x=editableVNode(s,o),void 0===u&&j===x.array.length-1?x.array.pop():x.array[j]=u,x)}function editableVNode(s,o){return o&&s&&o===s.ownerID?s:new VNode(s?s.array.slice():[],o)}function listNodeFor(s,o){if(o>=getTailOffset(s._capacity))return s._tail;if(o<1<<s._level+w){for(var i=s._root,a=s._level;i&&a>0;)i=i.array[o>>>a&C],a-=w;return i}}function setListBounds(s,o,i){void 0!==o&&(o|=0),void 0!==i&&(i|=0);var a=s.__ownerID||new OwnerID,u=s._origin,_=s._capacity,x=u+o,j=void 0===i?_:i<0?_+i:u+i;if(x===u&&j===_)return s;if(x>=j)return s.clear();for(var L=s._level,B=s._root,$=0;x+$<0;)B=new VNode(B&&B.array.length?[void 0,B]:[],a),$+=1<<(L+=w);$&&(x+=$,u+=$,j+=$,_+=$);for(var U=getTailOffset(_),V=getTailOffset(j);V>=1<<L+w;)B=new VNode(B&&B.array.length?[B]:[],a),L+=w;var z=s._tail,Y=V<U?listNodeFor(s,j-1):V>U?new VNode([],a):z;if(z&&V>U&&x<_&&z.array.length){for(var Z=B=editableVNode(B,a),ee=L;ee>w;ee-=w){var ie=U>>>ee&C;Z=Z.array[ie]=editableVNode(Z.array[ie],a)}Z.array[U>>>w&C]=z}if(j<_&&(Y=Y&&Y.removeAfter(a,0,j)),x>=V)x-=V,j-=V,L=w,B=null,Y=Y&&Y.removeBefore(a,0,x);else if(x>u||V<U){for($=0;B;){var ae=x>>>L&C;if(ae!==V>>>L&C)break;ae&&($+=(1<<L)*ae),L-=w,B=B.array[ae]}B&&x>u&&(B=B.removeBefore(a,L,x-$)),B&&V<U&&(B=B.removeAfter(a,L,V-$)),$&&(x-=$,j-=$)}return s.__ownerID?(s.size=j-x,s._origin=x,s._capacity=j,s._level=L,s._root=B,s._tail=Y,s.__hash=void 0,s.__altered=!0,s):makeList(x,j,L,B,Y)}function mergeIntoListWith(s,o,i){for(var a=[],u=0,_=0;_<i.length;_++){var w=i[_],x=IndexedIterable(w);x.size>u&&(u=x.size),isIterable(w)||(x=x.map((function(s){return fromJS(s)}))),a.push(x)}return u>s.size&&(s=s.setSize(u)),mergeIntoCollectionWith(s,o,a)}function getTailOffset(s){return s<x?0:s-1>>>w<<w}function OrderedMap(s){return null==s?emptyOrderedMap():isOrderedMap(s)?s:emptyOrderedMap().withMutations((function(o){var i=KeyedIterable(s);assertNotInfinite(i.size),i.forEach((function(s,i){return o.set(i,s)}))}))}function isOrderedMap(s){return isMap(s)&&isOrdered(s)}function makeOrderedMap(s,o,i,a){var u=Object.create(OrderedMap.prototype);return u.size=s?s.size:0,u._map=s,u._list=o,u.__ownerID=i,u.__hash=a,u}function emptyOrderedMap(){return Qe||(Qe=makeOrderedMap(emptyMap(),emptyList()))}function updateOrderedMap(s,o,i){var a,u,_=s._map,w=s._list,C=_.get(o),L=void 0!==C;if(i===j){if(!L)return s;w.size>=x&&w.size>=2*_.size?(a=(u=w.filter((function(s,o){return void 0!==s&&C!==o}))).toKeyedSeq().map((function(s){return s[0]})).flip().toMap(),s.__ownerID&&(a.__ownerID=u.__ownerID=s.__ownerID)):(a=_.remove(o),u=C===w.size-1?w.pop():w.set(C,void 0))}else if(L){if(i===w.get(C)[1])return s;a=_,u=w.set(C,[o,i])}else a=_.set(o,w.size),u=w.set(w.size,[o,i]);return s.__ownerID?(s.size=a.size,s._map=a,s._list=u,s.__hash=void 0,s):makeOrderedMap(a,u)}function ToKeyedSequence(s,o){this._iter=s,this._useKeys=o,this.size=s.size}function ToIndexedSequence(s){this._iter=s,this.size=s.size}function ToSetSequence(s){this._iter=s,this.size=s.size}function FromEntriesSequence(s){this._iter=s,this.size=s.size}function flipFactory(s){var o=makeSequence(s);return o._iter=s,o.size=s.size,o.flip=function(){return s},o.reverse=function(){var o=s.reverse.apply(this);return o.flip=function(){return s.reverse()},o},o.has=function(o){return s.includes(o)},o.includes=function(o){return s.has(o)},o.cacheResult=cacheResultThrough,o.__iterateUncached=function(o,i){var a=this;return s.__iterate((function(s,i){return!1!==o(i,s,a)}),i)},o.__iteratorUncached=function(o,i){if(o===V){var a=s.__iterator(o,i);return new Iterator((function(){var s=a.next();if(!s.done){var o=s.value[0];s.value[0]=s.value[1],s.value[1]=o}return s}))}return s.__iterator(o===U?$:U,i)},o}function mapFactory(s,o,i){var a=makeSequence(s);return a.size=s.size,a.has=function(o){return s.has(o)},a.get=function(a,u){var _=s.get(a,j);return _===j?u:o.call(i,_,a,s)},a.__iterateUncached=function(a,u){var _=this;return s.__iterate((function(s,u,w){return!1!==a(o.call(i,s,u,w),u,_)}),u)},a.__iteratorUncached=function(a,u){var _=s.__iterator(V,u);return new Iterator((function(){var u=_.next();if(u.done)return u;var w=u.value,x=w[0];return iteratorValue(a,x,o.call(i,w[1],x,s),u)}))},a}function reverseFactory(s,o){var i=makeSequence(s);return i._iter=s,i.size=s.size,i.reverse=function(){return s},s.flip&&(i.flip=function(){var o=flipFactory(s);return o.reverse=function(){return s.flip()},o}),i.get=function(i,a){return s.get(o?i:-1-i,a)},i.has=function(i){return s.has(o?i:-1-i)},i.includes=function(o){return s.includes(o)},i.cacheResult=cacheResultThrough,i.__iterate=function(o,i){var a=this;return s.__iterate((function(s,i){return o(s,i,a)}),!i)},i.__iterator=function(o,i){return s.__iterator(o,!i)},i}function filterFactory(s,o,i,a){var u=makeSequence(s);return a&&(u.has=function(a){var u=s.get(a,j);return u!==j&&!!o.call(i,u,a,s)},u.get=function(a,u){var _=s.get(a,j);return _!==j&&o.call(i,_,a,s)?_:u}),u.__iterateUncached=function(u,_){var w=this,x=0;return s.__iterate((function(s,_,C){if(o.call(i,s,_,C))return x++,u(s,a?_:x-1,w)}),_),x},u.__iteratorUncached=function(u,_){var w=s.__iterator(V,_),x=0;return new Iterator((function(){for(;;){var _=w.next();if(_.done)return _;var C=_.value,j=C[0],L=C[1];if(o.call(i,L,j,s))return iteratorValue(u,a?j:x++,L,_)}}))},u}function countByFactory(s,o,i){var a=Map().asMutable();return s.__iterate((function(u,_){a.update(o.call(i,u,_,s),0,(function(s){return s+1}))})),a.asImmutable()}function groupByFactory(s,o,i){var a=isKeyed(s),u=(isOrdered(s)?OrderedMap():Map()).asMutable();s.__iterate((function(_,w){u.update(o.call(i,_,w,s),(function(s){return(s=s||[]).push(a?[w,_]:_),s}))}));var _=iterableClass(s);return u.map((function(o){return reify(s,_(o))}))}function sliceFactory(s,o,i,a){var u=s.size;if(void 0!==o&&(o|=0),void 0!==i&&(i===1/0?i=u:i|=0),wholeSlice(o,i,u))return s;var _=resolveBegin(o,u),w=resolveEnd(i,u);if(_!=_||w!=w)return sliceFactory(s.toSeq().cacheResult(),o,i,a);var x,C=w-_;C==C&&(x=C<0?0:C);var j=makeSequence(s);return j.size=0===x?x:s.size&&x||void 0,!a&&isSeq(s)&&x>=0&&(j.get=function(o,i){return(o=wrapIndex(this,o))>=0&&o<x?s.get(o+_,i):i}),j.__iterateUncached=function(o,i){var u=this;if(0===x)return 0;if(i)return this.cacheResult().__iterate(o,i);var w=0,C=!0,j=0;return s.__iterate((function(s,i){if(!C||!(C=w++<_))return j++,!1!==o(s,a?i:j-1,u)&&j!==x})),j},j.__iteratorUncached=function(o,i){if(0!==x&&i)return this.cacheResult().__iterator(o,i);var u=0!==x&&s.__iterator(o,i),w=0,C=0;return new Iterator((function(){for(;w++<_;)u.next();if(++C>x)return iteratorDone();var s=u.next();return a||o===U?s:iteratorValue(o,C-1,o===$?void 0:s.value[1],s)}))},j}function takeWhileFactory(s,o,i){var a=makeSequence(s);return a.__iterateUncached=function(a,u){var _=this;if(u)return this.cacheResult().__iterate(a,u);var w=0;return s.__iterate((function(s,u,x){return o.call(i,s,u,x)&&++w&&a(s,u,_)})),w},a.__iteratorUncached=function(a,u){var _=this;if(u)return this.cacheResult().__iterator(a,u);var w=s.__iterator(V,u),x=!0;return new Iterator((function(){if(!x)return iteratorDone();var s=w.next();if(s.done)return s;var u=s.value,C=u[0],j=u[1];return o.call(i,j,C,_)?a===V?s:iteratorValue(a,C,j,s):(x=!1,iteratorDone())}))},a}function skipWhileFactory(s,o,i,a){var u=makeSequence(s);return u.__iterateUncached=function(u,_){var w=this;if(_)return this.cacheResult().__iterate(u,_);var x=!0,C=0;return s.__iterate((function(s,_,j){if(!x||!(x=o.call(i,s,_,j)))return C++,u(s,a?_:C-1,w)})),C},u.__iteratorUncached=function(u,_){var w=this;if(_)return this.cacheResult().__iterator(u,_);var x=s.__iterator(V,_),C=!0,j=0;return new Iterator((function(){var s,_,L;do{if((s=x.next()).done)return a||u===U?s:iteratorValue(u,j++,u===$?void 0:s.value[1],s);var B=s.value;_=B[0],L=B[1],C&&(C=o.call(i,L,_,w))}while(C);return u===V?s:iteratorValue(u,_,L,s)}))},u}function concatFactory(s,o){var i=isKeyed(s),a=[s].concat(o).map((function(s){return isIterable(s)?i&&(s=KeyedIterable(s)):s=i?keyedSeqFromValue(s):indexedSeqFromValue(Array.isArray(s)?s:[s]),s})).filter((function(s){return 0!==s.size}));if(0===a.length)return s;if(1===a.length){var u=a[0];if(u===s||i&&isKeyed(u)||isIndexed(s)&&isIndexed(u))return u}var _=new ArraySeq(a);return i?_=_.toKeyedSeq():isIndexed(s)||(_=_.toSetSeq()),(_=_.flatten(!0)).size=a.reduce((function(s,o){if(void 0!==s){var i=o.size;if(void 0!==i)return s+i}}),0),_}function flattenFactory(s,o,i){var a=makeSequence(s);return a.__iterateUncached=function(a,u){var _=0,w=!1;function flatDeep(s,x){var C=this;s.__iterate((function(s,u){return(!o||x<o)&&isIterable(s)?flatDeep(s,x+1):!1===a(s,i?u:_++,C)&&(w=!0),!w}),u)}return flatDeep(s,0),_},a.__iteratorUncached=function(a,u){var _=s.__iterator(a,u),w=[],x=0;return new Iterator((function(){for(;_;){var s=_.next();if(!1===s.done){var C=s.value;if(a===V&&(C=C[1]),o&&!(w.length<o)||!isIterable(C))return i?s:iteratorValue(a,x++,C,s);w.push(_),_=C.__iterator(a,u)}else _=w.pop()}return iteratorDone()}))},a}function flatMapFactory(s,o,i){var a=iterableClass(s);return s.toSeq().map((function(u,_){return a(o.call(i,u,_,s))})).flatten(!0)}function interposeFactory(s,o){var i=makeSequence(s);return i.size=s.size&&2*s.size-1,i.__iterateUncached=function(i,a){var u=this,_=0;return s.__iterate((function(s,a){return(!_||!1!==i(o,_++,u))&&!1!==i(s,_++,u)}),a),_},i.__iteratorUncached=function(i,a){var u,_=s.__iterator(U,a),w=0;return new Iterator((function(){return(!u||w%2)&&(u=_.next()).done?u:w%2?iteratorValue(i,w++,o):iteratorValue(i,w++,u.value,u)}))},i}function sortFactory(s,o,i){o||(o=defaultComparator);var a=isKeyed(s),u=0,_=s.toSeq().map((function(o,a){return[a,o,u++,i?i(o,a,s):o]})).toArray();return _.sort((function(s,i){return o(s[3],i[3])||s[2]-i[2]})).forEach(a?function(s,o){_[o].length=2}:function(s,o){_[o]=s[1]}),a?KeyedSeq(_):isIndexed(s)?IndexedSeq(_):SetSeq(_)}function maxFactory(s,o,i){if(o||(o=defaultComparator),i){var a=s.toSeq().map((function(o,a){return[o,i(o,a,s)]})).reduce((function(s,i){return maxCompare(o,s[1],i[1])?i:s}));return a&&a[0]}return s.reduce((function(s,i){return maxCompare(o,s,i)?i:s}))}function maxCompare(s,o,i){var a=s(i,o);return 0===a&&i!==o&&(null==i||i!=i)||a>0}function zipWithFactory(s,o,i){var a=makeSequence(s);return a.size=new ArraySeq(i).map((function(s){return s.size})).min(),a.__iterate=function(s,o){for(var i,a=this.__iterator(U,o),u=0;!(i=a.next()).done&&!1!==s(i.value,u++,this););return u},a.__iteratorUncached=function(s,a){var u=i.map((function(s){return s=Iterable(s),getIterator(a?s.reverse():s)})),_=0,w=!1;return new Iterator((function(){var i;return w||(i=u.map((function(s){return s.next()})),w=i.some((function(s){return s.done}))),w?iteratorDone():iteratorValue(s,_++,o.apply(null,i.map((function(s){return s.value}))))}))},a}function reify(s,o){return isSeq(s)?o:s.constructor(o)}function validateEntry(s){if(s!==Object(s))throw new TypeError("Expected [K, V] tuple: "+s)}function resolveSize(s){return assertNotInfinite(s.size),ensureSize(s)}function iterableClass(s){return isKeyed(s)?KeyedIterable:isIndexed(s)?IndexedIterable:SetIterable}function makeSequence(s){return Object.create((isKeyed(s)?KeyedSeq:isIndexed(s)?IndexedSeq:SetSeq).prototype)}function cacheResultThrough(){return this._iter.cacheResult?(this._iter.cacheResult(),this.size=this._iter.size,this):Seq.prototype.cacheResult.call(this)}function defaultComparator(s,o){return s>o?1:s<o?-1:0}function forceIterator(s){var o=getIterator(s);if(!o){if(!isArrayLike(s))throw new TypeError("Expected iterable or array-like: "+s);o=getIterator(Iterable(s))}return o}function Record(s,o){var i,a=function Record(_){if(_ instanceof a)return _;if(!(this instanceof a))return new a(_);if(!i){i=!0;var w=Object.keys(s);setProps(u,w),u.size=w.length,u._name=o,u._keys=w,u._defaultValues=s}this._map=Map(_)},u=a.prototype=Object.create(tt);return u.constructor=a,a}createClass(OrderedMap,Map),OrderedMap.of=function(){return this(arguments)},OrderedMap.prototype.toString=function(){return this.__toString("OrderedMap {","}")},OrderedMap.prototype.get=function(s,o){var i=this._map.get(s);return void 0!==i?this._list.get(i)[1]:o},OrderedMap.prototype.clear=function(){return 0===this.size?this:this.__ownerID?(this.size=0,this._map.clear(),this._list.clear(),this):emptyOrderedMap()},OrderedMap.prototype.set=function(s,o){return updateOrderedMap(this,s,o)},OrderedMap.prototype.remove=function(s){return updateOrderedMap(this,s,j)},OrderedMap.prototype.wasAltered=function(){return this._map.wasAltered()||this._list.wasAltered()},OrderedMap.prototype.__iterate=function(s,o){var i=this;return this._list.__iterate((function(o){return o&&s(o[1],o[0],i)}),o)},OrderedMap.prototype.__iterator=function(s,o){return this._list.fromEntrySeq().__iterator(s,o)},OrderedMap.prototype.__ensureOwner=function(s){if(s===this.__ownerID)return this;var o=this._map.__ensureOwner(s),i=this._list.__ensureOwner(s);return s?makeOrderedMap(o,i,s,this.__hash):(this.__ownerID=s,this._map=o,this._list=i,this)},OrderedMap.isOrderedMap=isOrderedMap,OrderedMap.prototype[u]=!0,OrderedMap.prototype[_]=OrderedMap.prototype.remove,createClass(ToKeyedSequence,KeyedSeq),ToKeyedSequence.prototype.get=function(s,o){return this._iter.get(s,o)},ToKeyedSequence.prototype.has=function(s){return this._iter.has(s)},ToKeyedSequence.prototype.valueSeq=function(){return this._iter.valueSeq()},ToKeyedSequence.prototype.reverse=function(){var s=this,o=reverseFactory(this,!0);return this._useKeys||(o.valueSeq=function(){return s._iter.toSeq().reverse()}),o},ToKeyedSequence.prototype.map=function(s,o){var i=this,a=mapFactory(this,s,o);return this._useKeys||(a.valueSeq=function(){return i._iter.toSeq().map(s,o)}),a},ToKeyedSequence.prototype.__iterate=function(s,o){var i,a=this;return this._iter.__iterate(this._useKeys?function(o,i){return s(o,i,a)}:(i=o?resolveSize(this):0,function(u){return s(u,o?--i:i++,a)}),o)},ToKeyedSequence.prototype.__iterator=function(s,o){if(this._useKeys)return this._iter.__iterator(s,o);var i=this._iter.__iterator(U,o),a=o?resolveSize(this):0;return new Iterator((function(){var u=i.next();return u.done?u:iteratorValue(s,o?--a:a++,u.value,u)}))},ToKeyedSequence.prototype[u]=!0,createClass(ToIndexedSequence,IndexedSeq),ToIndexedSequence.prototype.includes=function(s){return this._iter.includes(s)},ToIndexedSequence.prototype.__iterate=function(s,o){var i=this,a=0;return this._iter.__iterate((function(o){return s(o,a++,i)}),o)},ToIndexedSequence.prototype.__iterator=function(s,o){var i=this._iter.__iterator(U,o),a=0;return new Iterator((function(){var o=i.next();return o.done?o:iteratorValue(s,a++,o.value,o)}))},createClass(ToSetSequence,SetSeq),ToSetSequence.prototype.has=function(s){return this._iter.includes(s)},ToSetSequence.prototype.__iterate=function(s,o){var i=this;return this._iter.__iterate((function(o){return s(o,o,i)}),o)},ToSetSequence.prototype.__iterator=function(s,o){var i=this._iter.__iterator(U,o);return new Iterator((function(){var o=i.next();return o.done?o:iteratorValue(s,o.value,o.value,o)}))},createClass(FromEntriesSequence,KeyedSeq),FromEntriesSequence.prototype.entrySeq=function(){return this._iter.toSeq()},FromEntriesSequence.prototype.__iterate=function(s,o){var i=this;return this._iter.__iterate((function(o){if(o){validateEntry(o);var a=isIterable(o);return s(a?o.get(1):o[1],a?o.get(0):o[0],i)}}),o)},FromEntriesSequence.prototype.__iterator=function(s,o){var i=this._iter.__iterator(U,o);return new Iterator((function(){for(;;){var o=i.next();if(o.done)return o;var a=o.value;if(a){validateEntry(a);var u=isIterable(a);return iteratorValue(s,u?a.get(0):a[0],u?a.get(1):a[1],o)}}}))},ToIndexedSequence.prototype.cacheResult=ToKeyedSequence.prototype.cacheResult=ToSetSequence.prototype.cacheResult=FromEntriesSequence.prototype.cacheResult=cacheResultThrough,createClass(Record,KeyedCollection),Record.prototype.toString=function(){return this.__toString(recordName(this)+" {","}")},Record.prototype.has=function(s){return this._defaultValues.hasOwnProperty(s)},Record.prototype.get=function(s,o){if(!this.has(s))return o;var i=this._defaultValues[s];return this._map?this._map.get(s,i):i},Record.prototype.clear=function(){if(this.__ownerID)return this._map&&this._map.clear(),this;var s=this.constructor;return s._empty||(s._empty=makeRecord(this,emptyMap()))},Record.prototype.set=function(s,o){if(!this.has(s))throw new Error('Cannot set unknown key "'+s+'" on '+recordName(this));if(this._map&&!this._map.has(s)&&o===this._defaultValues[s])return this;var i=this._map&&this._map.set(s,o);return this.__ownerID||i===this._map?this:makeRecord(this,i)},Record.prototype.remove=function(s){if(!this.has(s))return this;var o=this._map&&this._map.remove(s);return this.__ownerID||o===this._map?this:makeRecord(this,o)},Record.prototype.wasAltered=function(){return this._map.wasAltered()},Record.prototype.__iterator=function(s,o){var i=this;return KeyedIterable(this._defaultValues).map((function(s,o){return i.get(o)})).__iterator(s,o)},Record.prototype.__iterate=function(s,o){var i=this;return KeyedIterable(this._defaultValues).map((function(s,o){return i.get(o)})).__iterate(s,o)},Record.prototype.__ensureOwner=function(s){if(s===this.__ownerID)return this;var o=this._map&&this._map.__ensureOwner(s);return s?makeRecord(this,o,s):(this.__ownerID=s,this._map=o,this)};var tt=Record.prototype;function makeRecord(s,o,i){var a=Object.create(Object.getPrototypeOf(s));return a._map=o,a.__ownerID=i,a}function recordName(s){return s._name||s.constructor.name||"Record"}function setProps(s,o){try{o.forEach(setProp.bind(void 0,s))}catch(s){}}function setProp(s,o){Object.defineProperty(s,o,{get:function(){return this.get(o)},set:function(s){invariant(this.__ownerID,"Cannot set on an immutable record."),this.set(o,s)}})}function Set(s){return null==s?emptySet():isSet(s)&&!isOrdered(s)?s:emptySet().withMutations((function(o){var i=SetIterable(s);assertNotInfinite(i.size),i.forEach((function(s){return o.add(s)}))}))}function isSet(s){return!(!s||!s[nt])}tt[_]=tt.remove,tt.deleteIn=tt.removeIn=$e.removeIn,tt.merge=$e.merge,tt.mergeWith=$e.mergeWith,tt.mergeIn=$e.mergeIn,tt.mergeDeep=$e.mergeDeep,tt.mergeDeepWith=$e.mergeDeepWith,tt.mergeDeepIn=$e.mergeDeepIn,tt.setIn=$e.setIn,tt.update=$e.update,tt.updateIn=$e.updateIn,tt.withMutations=$e.withMutations,tt.asMutable=$e.asMutable,tt.asImmutable=$e.asImmutable,createClass(Set,SetCollection),Set.of=function(){return this(arguments)},Set.fromKeys=function(s){return this(KeyedIterable(s).keySeq())},Set.prototype.toString=function(){return this.__toString("Set {","}")},Set.prototype.has=function(s){return this._map.has(s)},Set.prototype.add=function(s){return updateSet(this,this._map.set(s,!0))},Set.prototype.remove=function(s){return updateSet(this,this._map.remove(s))},Set.prototype.clear=function(){return updateSet(this,this._map.clear())},Set.prototype.union=function(){var o=s.call(arguments,0);return 0===(o=o.filter((function(s){return 0!==s.size}))).length?this:0!==this.size||this.__ownerID||1!==o.length?this.withMutations((function(s){for(var i=0;i<o.length;i++)SetIterable(o[i]).forEach((function(o){return s.add(o)}))})):this.constructor(o[0])},Set.prototype.intersect=function(){var o=s.call(arguments,0);if(0===o.length)return this;o=o.map((function(s){return SetIterable(s)}));var i=this;return this.withMutations((function(s){i.forEach((function(i){o.every((function(s){return s.includes(i)}))||s.remove(i)}))}))},Set.prototype.subtract=function(){var o=s.call(arguments,0);if(0===o.length)return this;o=o.map((function(s){return SetIterable(s)}));var i=this;return this.withMutations((function(s){i.forEach((function(i){o.some((function(s){return s.includes(i)}))&&s.remove(i)}))}))},Set.prototype.merge=function(){return this.union.apply(this,arguments)},Set.prototype.mergeWith=function(o){var i=s.call(arguments,1);return this.union.apply(this,i)},Set.prototype.sort=function(s){return OrderedSet(sortFactory(this,s))},Set.prototype.sortBy=function(s,o){return OrderedSet(sortFactory(this,o,s))},Set.prototype.wasAltered=function(){return this._map.wasAltered()},Set.prototype.__iterate=function(s,o){var i=this;return this._map.__iterate((function(o,a){return s(a,a,i)}),o)},Set.prototype.__iterator=function(s,o){return this._map.map((function(s,o){return o})).__iterator(s,o)},Set.prototype.__ensureOwner=function(s){if(s===this.__ownerID)return this;var o=this._map.__ensureOwner(s);return s?this.__make(o,s):(this.__ownerID=s,this._map=o,this)},Set.isSet=isSet;var rt,nt="@@__IMMUTABLE_SET__@@",st=Set.prototype;function updateSet(s,o){return s.__ownerID?(s.size=o.size,s._map=o,s):o===s._map?s:0===o.size?s.__empty():s.__make(o)}function makeSet(s,o){var i=Object.create(st);return i.size=s?s.size:0,i._map=s,i.__ownerID=o,i}function emptySet(){return rt||(rt=makeSet(emptyMap()))}function OrderedSet(s){return null==s?emptyOrderedSet():isOrderedSet(s)?s:emptyOrderedSet().withMutations((function(o){var i=SetIterable(s);assertNotInfinite(i.size),i.forEach((function(s){return o.add(s)}))}))}function isOrderedSet(s){return isSet(s)&&isOrdered(s)}st[nt]=!0,st[_]=st.remove,st.mergeDeep=st.merge,st.mergeDeepWith=st.mergeWith,st.withMutations=$e.withMutations,st.asMutable=$e.asMutable,st.asImmutable=$e.asImmutable,st.__empty=emptySet,st.__make=makeSet,createClass(OrderedSet,Set),OrderedSet.of=function(){return this(arguments)},OrderedSet.fromKeys=function(s){return this(KeyedIterable(s).keySeq())},OrderedSet.prototype.toString=function(){return this.__toString("OrderedSet {","}")},OrderedSet.isOrderedSet=isOrderedSet;var ot,it=OrderedSet.prototype;function makeOrderedSet(s,o){var i=Object.create(it);return i.size=s?s.size:0,i._map=s,i.__ownerID=o,i}function emptyOrderedSet(){return ot||(ot=makeOrderedSet(emptyOrderedMap()))}function Stack(s){return null==s?emptyStack():isStack(s)?s:emptyStack().unshiftAll(s)}function isStack(s){return!(!s||!s[ct])}it[u]=!0,it.__empty=emptyOrderedSet,it.__make=makeOrderedSet,createClass(Stack,IndexedCollection),Stack.of=function(){return this(arguments)},Stack.prototype.toString=function(){return this.__toString("Stack [","]")},Stack.prototype.get=function(s,o){var i=this._head;for(s=wrapIndex(this,s);i&&s--;)i=i.next;return i?i.value:o},Stack.prototype.peek=function(){return this._head&&this._head.value},Stack.prototype.push=function(){if(0===arguments.length)return this;for(var s=this.size+arguments.length,o=this._head,i=arguments.length-1;i>=0;i--)o={value:arguments[i],next:o};return this.__ownerID?(this.size=s,this._head=o,this.__hash=void 0,this.__altered=!0,this):makeStack(s,o)},Stack.prototype.pushAll=function(s){if(0===(s=IndexedIterable(s)).size)return this;assertNotInfinite(s.size);var o=this.size,i=this._head;return s.reverse().forEach((function(s){o++,i={value:s,next:i}})),this.__ownerID?(this.size=o,this._head=i,this.__hash=void 0,this.__altered=!0,this):makeStack(o,i)},Stack.prototype.pop=function(){return this.slice(1)},Stack.prototype.unshift=function(){return this.push.apply(this,arguments)},Stack.prototype.unshiftAll=function(s){return this.pushAll(s)},Stack.prototype.shift=function(){return this.pop.apply(this,arguments)},Stack.prototype.clear=function(){return 0===this.size?this:this.__ownerID?(this.size=0,this._head=void 0,this.__hash=void 0,this.__altered=!0,this):emptyStack()},Stack.prototype.slice=function(s,o){if(wholeSlice(s,o,this.size))return this;var i=resolveBegin(s,this.size);if(resolveEnd(o,this.size)!==this.size)return IndexedCollection.prototype.slice.call(this,s,o);for(var a=this.size-i,u=this._head;i--;)u=u.next;return this.__ownerID?(this.size=a,this._head=u,this.__hash=void 0,this.__altered=!0,this):makeStack(a,u)},Stack.prototype.__ensureOwner=function(s){return s===this.__ownerID?this:s?makeStack(this.size,this._head,s,this.__hash):(this.__ownerID=s,this.__altered=!1,this)},Stack.prototype.__iterate=function(s,o){if(o)return this.reverse().__iterate(s);for(var i=0,a=this._head;a&&!1!==s(a.value,i++,this);)a=a.next;return i},Stack.prototype.__iterator=function(s,o){if(o)return this.reverse().__iterator(s);var i=0,a=this._head;return new Iterator((function(){if(a){var o=a.value;return a=a.next,iteratorValue(s,i++,o)}return iteratorDone()}))},Stack.isStack=isStack;var at,ct="@@__IMMUTABLE_STACK__@@",lt=Stack.prototype;function makeStack(s,o,i,a){var u=Object.create(lt);return u.size=s,u._head=o,u.__ownerID=i,u.__hash=a,u.__altered=!1,u}function emptyStack(){return at||(at=makeStack(0))}function mixin(s,o){var keyCopier=function(i){s.prototype[i]=o[i]};return Object.keys(o).forEach(keyCopier),Object.getOwnPropertySymbols&&Object.getOwnPropertySymbols(o).forEach(keyCopier),s}lt[ct]=!0,lt.withMutations=$e.withMutations,lt.asMutable=$e.asMutable,lt.asImmutable=$e.asImmutable,lt.wasAltered=$e.wasAltered,Iterable.Iterator=Iterator,mixin(Iterable,{toArray:function(){assertNotInfinite(this.size);var s=new Array(this.size||0);return this.valueSeq().__iterate((function(o,i){s[i]=o})),s},toIndexedSeq:function(){return new ToIndexedSequence(this)},toJS:function(){return this.toSeq().map((function(s){return s&&"function"==typeof s.toJS?s.toJS():s})).__toJS()},toJSON:function(){return this.toSeq().map((function(s){return s&&"function"==typeof s.toJSON?s.toJSON():s})).__toJS()},toKeyedSeq:function(){return new ToKeyedSequence(this,!0)},toMap:function(){return Map(this.toKeyedSeq())},toObject:function(){assertNotInfinite(this.size);var s={};return this.__iterate((function(o,i){s[i]=o})),s},toOrderedMap:function(){return OrderedMap(this.toKeyedSeq())},toOrderedSet:function(){return OrderedSet(isKeyed(this)?this.valueSeq():this)},toSet:function(){return Set(isKeyed(this)?this.valueSeq():this)},toSetSeq:function(){return new ToSetSequence(this)},toSeq:function(){return isIndexed(this)?this.toIndexedSeq():isKeyed(this)?this.toKeyedSeq():this.toSetSeq()},toStack:function(){return Stack(isKeyed(this)?this.valueSeq():this)},toList:function(){return List(isKeyed(this)?this.valueSeq():this)},toString:function(){return"[Iterable]"},__toString:function(s,o){return 0===this.size?s+o:s+" "+this.toSeq().map(this.__toStringMapper).join(", ")+" "+o},concat:function(){return reify(this,concatFactory(this,s.call(arguments,0)))},includes:function(s){return this.some((function(o){return is(o,s)}))},entries:function(){return this.__iterator(V)},every:function(s,o){assertNotInfinite(this.size);var i=!0;return this.__iterate((function(a,u,_){if(!s.call(o,a,u,_))return i=!1,!1})),i},filter:function(s,o){return reify(this,filterFactory(this,s,o,!0))},find:function(s,o,i){var a=this.findEntry(s,o);return a?a[1]:i},forEach:function(s,o){return assertNotInfinite(this.size),this.__iterate(o?s.bind(o):s)},join:function(s){assertNotInfinite(this.size),s=void 0!==s?""+s:",";var o="",i=!0;return this.__iterate((function(a){i?i=!1:o+=s,o+=null!=a?a.toString():""})),o},keys:function(){return this.__iterator($)},map:function(s,o){return reify(this,mapFactory(this,s,o))},reduce:function(s,o,i){var a,u;return assertNotInfinite(this.size),arguments.length<2?u=!0:a=o,this.__iterate((function(o,_,w){u?(u=!1,a=o):a=s.call(i,a,o,_,w)})),a},reduceRight:function(s,o,i){var a=this.toKeyedSeq().reverse();return a.reduce.apply(a,arguments)},reverse:function(){return reify(this,reverseFactory(this,!0))},slice:function(s,o){return reify(this,sliceFactory(this,s,o,!0))},some:function(s,o){return!this.every(not(s),o)},sort:function(s){return reify(this,sortFactory(this,s))},values:function(){return this.__iterator(U)},butLast:function(){return this.slice(0,-1)},isEmpty:function(){return void 0!==this.size?0===this.size:!this.some((function(){return!0}))},count:function(s,o){return ensureSize(s?this.toSeq().filter(s,o):this)},countBy:function(s,o){return countByFactory(this,s,o)},equals:function(s){return deepEqual(this,s)},entrySeq:function(){var s=this;if(s._cache)return new ArraySeq(s._cache);var o=s.toSeq().map(entryMapper).toIndexedSeq();return o.fromEntrySeq=function(){return s.toSeq()},o},filterNot:function(s,o){return this.filter(not(s),o)},findEntry:function(s,o,i){var a=i;return this.__iterate((function(i,u,_){if(s.call(o,i,u,_))return a=[u,i],!1})),a},findKey:function(s,o){var i=this.findEntry(s,o);return i&&i[0]},findLast:function(s,o,i){return this.toKeyedSeq().reverse().find(s,o,i)},findLastEntry:function(s,o,i){return this.toKeyedSeq().reverse().findEntry(s,o,i)},findLastKey:function(s,o){return this.toKeyedSeq().reverse().findKey(s,o)},first:function(){return this.find(returnTrue)},flatMap:function(s,o){return reify(this,flatMapFactory(this,s,o))},flatten:function(s){return reify(this,flattenFactory(this,s,!0))},fromEntrySeq:function(){return new FromEntriesSequence(this)},get:function(s,o){return this.find((function(o,i){return is(i,s)}),void 0,o)},getIn:function(s,o){for(var i,a=this,u=forceIterator(s);!(i=u.next()).done;){var _=i.value;if((a=a&&a.get?a.get(_,j):j)===j)return o}return a},groupBy:function(s,o){return groupByFactory(this,s,o)},has:function(s){return this.get(s,j)!==j},hasIn:function(s){return this.getIn(s,j)!==j},isSubset:function(s){return s="function"==typeof s.includes?s:Iterable(s),this.every((function(o){return s.includes(o)}))},isSuperset:function(s){return(s="function"==typeof s.isSubset?s:Iterable(s)).isSubset(this)},keyOf:function(s){return this.findKey((function(o){return is(o,s)}))},keySeq:function(){return this.toSeq().map(keyMapper).toIndexedSeq()},last:function(){return this.toSeq().reverse().first()},lastKeyOf:function(s){return this.toKeyedSeq().reverse().keyOf(s)},max:function(s){return maxFactory(this,s)},maxBy:function(s,o){return maxFactory(this,o,s)},min:function(s){return maxFactory(this,s?neg(s):defaultNegComparator)},minBy:function(s,o){return maxFactory(this,o?neg(o):defaultNegComparator,s)},rest:function(){return this.slice(1)},skip:function(s){return this.slice(Math.max(0,s))},skipLast:function(s){return reify(this,this.toSeq().reverse().skip(s).reverse())},skipWhile:function(s,o){return reify(this,skipWhileFactory(this,s,o,!0))},skipUntil:function(s,o){return this.skipWhile(not(s),o)},sortBy:function(s,o){return reify(this,sortFactory(this,o,s))},take:function(s){return this.slice(0,Math.max(0,s))},takeLast:function(s){return reify(this,this.toSeq().reverse().take(s).reverse())},takeWhile:function(s,o){return reify(this,takeWhileFactory(this,s,o))},takeUntil:function(s,o){return this.takeWhile(not(s),o)},valueSeq:function(){return this.toIndexedSeq()},hashCode:function(){return this.__hash||(this.__hash=hashIterable(this))}});var ut=Iterable.prototype;ut[o]=!0,ut[Z]=ut.values,ut.__toJS=ut.toArray,ut.__toStringMapper=quoteString,ut.inspect=ut.toSource=function(){return this.toString()},ut.chain=ut.flatMap,ut.contains=ut.includes,mixin(KeyedIterable,{flip:function(){return reify(this,flipFactory(this))},mapEntries:function(s,o){var i=this,a=0;return reify(this,this.toSeq().map((function(u,_){return s.call(o,[_,u],a++,i)})).fromEntrySeq())},mapKeys:function(s,o){var i=this;return reify(this,this.toSeq().flip().map((function(a,u){return s.call(o,a,u,i)})).flip())}});var pt=KeyedIterable.prototype;function keyMapper(s,o){return o}function entryMapper(s,o){return[o,s]}function not(s){return function(){return!s.apply(this,arguments)}}function neg(s){return function(){return-s.apply(this,arguments)}}function quoteString(s){return"string"==typeof s?JSON.stringify(s):String(s)}function defaultZipper(){return arrCopy(arguments)}function defaultNegComparator(s,o){return s<o?1:s>o?-1:0}function hashIterable(s){if(s.size===1/0)return 0;var o=isOrdered(s),i=isKeyed(s),a=o?1:0;return murmurHashOfSize(s.__iterate(i?o?function(s,o){a=31*a+hashMerge(hash(s),hash(o))|0}:function(s,o){a=a+hashMerge(hash(s),hash(o))|0}:o?function(s){a=31*a+hash(s)|0}:function(s){a=a+hash(s)|0}),a)}function murmurHashOfSize(s,o){return o=le(o,3432918353),o=le(o<<15|o>>>-15,461845907),o=le(o<<13|o>>>-13,5),o=le((o=o+3864292196^s)^o>>>16,2246822507),o=smi((o=le(o^o>>>13,3266489909))^o>>>16)}function hashMerge(s,o){return s^o+2654435769+(s<<6)+(s>>2)}return pt[i]=!0,pt[Z]=ut.entries,pt.__toJS=ut.toObject,pt.__toStringMapper=function(s,o){return JSON.stringify(o)+": "+quoteString(s)},mixin(IndexedIterable,{toKeyedSeq:function(){return new ToKeyedSequence(this,!1)},filter:function(s,o){return reify(this,filterFactory(this,s,o,!1))},findIndex:function(s,o){var i=this.findEntry(s,o);return i?i[0]:-1},indexOf:function(s){var o=this.keyOf(s);return void 0===o?-1:o},lastIndexOf:function(s){var o=this.lastKeyOf(s);return void 0===o?-1:o},reverse:function(){return reify(this,reverseFactory(this,!1))},slice:function(s,o){return reify(this,sliceFactory(this,s,o,!1))},splice:function(s,o){var i=arguments.length;if(o=Math.max(0|o,0),0===i||2===i&&!o)return this;s=resolveBegin(s,s<0?this.count():this.size);var a=this.slice(0,s);return reify(this,1===i?a:a.concat(arrCopy(arguments,2),this.slice(s+o)))},findLastIndex:function(s,o){var i=this.findLastEntry(s,o);return i?i[0]:-1},first:function(){return this.get(0)},flatten:function(s){return reify(this,flattenFactory(this,s,!1))},get:function(s,o){return(s=wrapIndex(this,s))<0||this.size===1/0||void 0!==this.size&&s>this.size?o:this.find((function(o,i){return i===s}),void 0,o)},has:function(s){return(s=wrapIndex(this,s))>=0&&(void 0!==this.size?this.size===1/0||s<this.size:-1!==this.indexOf(s))},interpose:function(s){return reify(this,interposeFactory(this,s))},interleave:function(){var s=[this].concat(arrCopy(arguments)),o=zipWithFactory(this.toSeq(),IndexedSeq.of,s),i=o.flatten(!0);return o.size&&(i.size=o.size*s.length),reify(this,i)},keySeq:function(){return Range(0,this.size)},last:function(){return this.get(-1)},skipWhile:function(s,o){return reify(this,skipWhileFactory(this,s,o,!1))},zip:function(){return reify(this,zipWithFactory(this,defaultZipper,[this].concat(arrCopy(arguments))))},zipWith:function(s){var o=arrCopy(arguments);return o[0]=this,reify(this,zipWithFactory(this,s,o))}}),IndexedIterable.prototype[a]=!0,IndexedIterable.prototype[u]=!0,mixin(SetIterable,{get:function(s,o){return this.has(s)?s:o},includes:function(s){return this.has(s)},keySeq:function(){return this.valueSeq()}}),SetIterable.prototype.has=ut.includes,SetIterable.prototype.contains=SetIterable.prototype.includes,mixin(KeyedSeq,KeyedIterable.prototype),mixin(IndexedSeq,IndexedIterable.prototype),mixin(SetSeq,SetIterable.prototype),mixin(KeyedCollection,KeyedIterable.prototype),mixin(IndexedCollection,IndexedIterable.prototype),mixin(SetCollection,SetIterable.prototype),{Iterable,Seq,Collection,Map,OrderedMap,List,Stack,Set,OrderedSet,Record,Range,Repeat,is,fromJS}}()},9748:(s,o,i)=>{"use strict";i(71340);var a=i(92046);s.exports=a.Object.assign},9957:(s,o,i)=>{"use strict";var a=Function.prototype.call,u=Object.prototype.hasOwnProperty,_=i(66743);s.exports=_.call(a,u)},9999:(s,o,i)=>{var a=i(37217),u=i(83729),_=i(16547),w=i(74733),x=i(43838),C=i(93290),j=i(23007),L=i(92271),B=i(48948),$=i(50002),U=i(83349),V=i(5861),z=i(76189),Y=i(77199),Z=i(35529),ee=i(56449),ie=i(3656),ae=i(87730),ce=i(23805),le=i(38440),pe=i(95950),de=i(37241),fe="[object Arguments]",ye="[object Function]",be="[object Object]",_e={};_e[fe]=_e["[object Array]"]=_e["[object ArrayBuffer]"]=_e["[object DataView]"]=_e["[object Boolean]"]=_e["[object Date]"]=_e["[object Float32Array]"]=_e["[object Float64Array]"]=_e["[object Int8Array]"]=_e["[object Int16Array]"]=_e["[object Int32Array]"]=_e["[object Map]"]=_e["[object Number]"]=_e[be]=_e["[object RegExp]"]=_e["[object Set]"]=_e["[object String]"]=_e["[object Symbol]"]=_e["[object Uint8Array]"]=_e["[object Uint8ClampedArray]"]=_e["[object Uint16Array]"]=_e["[object Uint32Array]"]=!0,_e["[object Error]"]=_e[ye]=_e["[object WeakMap]"]=!1,s.exports=function baseClone(s,o,i,Se,we,xe){var Pe,Te=1&o,Re=2&o,$e=4&o;if(i&&(Pe=we?i(s,Se,we,xe):i(s)),void 0!==Pe)return Pe;if(!ce(s))return s;var qe=ee(s);if(qe){if(Pe=z(s),!Te)return j(s,Pe)}else{var ze=V(s),We=ze==ye||"[object GeneratorFunction]"==ze;if(ie(s))return C(s,Te);if(ze==be||ze==fe||We&&!we){if(Pe=Re||We?{}:Z(s),!Te)return Re?B(s,x(Pe,s)):L(s,w(Pe,s))}else{if(!_e[ze])return we?s:{};Pe=Y(s,ze,Te)}}xe||(xe=new a);var He=xe.get(s);if(He)return He;xe.set(s,Pe),le(s)?s.forEach((function(a){Pe.add(baseClone(a,o,i,a,s,xe))})):ae(s)&&s.forEach((function(a,u){Pe.set(u,baseClone(a,o,i,u,s,xe))}));var Ye=qe?void 0:($e?Re?U:$:Re?de:pe)(s);return u(Ye||s,(function(a,u){Ye&&(a=s[u=a]),_(Pe,u,baseClone(a,o,i,u,s,xe))})),Pe}},10023:(s,o,i)=>{const a=i(6205),INTS=()=>[{type:a.RANGE,from:48,to:57}],WORDS=()=>[{type:a.CHAR,value:95},{type:a.RANGE,from:97,to:122},{type:a.RANGE,from:65,to:90}].concat(INTS()),WHITESPACE=()=>[{type:a.CHAR,value:9},{type:a.CHAR,value:10},{type:a.CHAR,value:11},{type:a.CHAR,value:12},{type:a.CHAR,value:13},{type:a.CHAR,value:32},{type:a.CHAR,value:160},{type:a.CHAR,value:5760},{type:a.RANGE,from:8192,to:8202},{type:a.CHAR,value:8232},{type:a.CHAR,value:8233},{type:a.CHAR,value:8239},{type:a.CHAR,value:8287},{type:a.CHAR,value:12288},{type:a.CHAR,value:65279}];o.words=()=>({type:a.SET,set:WORDS(),not:!1}),o.notWords=()=>({type:a.SET,set:WORDS(),not:!0}),o.ints=()=>({type:a.SET,set:INTS(),not:!1}),o.notInts=()=>({type:a.SET,set:INTS(),not:!0}),o.whitespace=()=>({type:a.SET,set:WHITESPACE(),not:!1}),o.notWhitespace=()=>({type:a.SET,set:WHITESPACE(),not:!0}),o.anyChar=()=>({type:a.SET,set:[{type:a.CHAR,value:10},{type:a.CHAR,value:13},{type:a.CHAR,value:8232},{type:a.CHAR,value:8233}],not:!0})},10043:(s,o,i)=>{"use strict";var a=i(54018),u=String,_=TypeError;s.exports=function(s){if(a(s))return s;throw new _("Can't set "+u(s)+" as a prototype")}},10076:s=>{"use strict";s.exports=Function.prototype.call},10124:(s,o,i)=>{var a=i(9325);s.exports=function(){return a.Date.now()}},10300:(s,o,i)=>{"use strict";var a=i(13930),u=i(82159),_=i(36624),w=i(4640),x=i(73448),C=TypeError;s.exports=function(s,o){var i=arguments.length<2?x(s):o;if(u(i))return _(a(i,s));throw new C(w(s)+" is not iterable")}},10316:(s,o,i)=>{const a=i(2404),u=i(55973),_=i(92340);class Element{constructor(s,o,i){o&&(this.meta=o),i&&(this.attributes=i),this.content=s}freeze(){Object.isFrozen(this)||(this._meta&&(this.meta.parent=this,this.meta.freeze()),this._attributes&&(this.attributes.parent=this,this.attributes.freeze()),this.children.forEach((s=>{s.parent=this,s.freeze()}),this),this.content&&Array.isArray(this.content)&&Object.freeze(this.content),Object.freeze(this))}primitive(){}clone(){const s=new this.constructor;return s.element=this.element,this.meta.length&&(s._meta=this.meta.clone()),this.attributes.length&&(s._attributes=this.attributes.clone()),this.content?this.content.clone?s.content=this.content.clone():Array.isArray(this.content)?s.content=this.content.map((s=>s.clone())):s.content=this.content:s.content=this.content,s}toValue(){return this.content instanceof Element?this.content.toValue():this.content instanceof u?{key:this.content.key.toValue(),value:this.content.value?this.content.value.toValue():void 0}:this.content&&this.content.map?this.content.map((s=>s.toValue()),this):this.content}toRef(s){if(""===this.id.toValue())throw Error("Cannot create reference to an element that does not contain an ID");const o=new this.RefElement(this.id.toValue());return s&&(o.path=s),o}findRecursive(...s){if(arguments.length>1&&!this.isFrozen)throw new Error("Cannot find recursive with multiple element names without first freezing the element. Call `element.freeze()`");const o=s.pop();let i=new _;const append=(s,o)=>(s.push(o),s),checkElement=(s,i)=>{i.element===o&&s.push(i);const a=i.findRecursive(o);return a&&a.reduce(append,s),i.content instanceof u&&(i.content.key&&checkElement(s,i.content.key),i.content.value&&checkElement(s,i.content.value)),s};return this.content&&(this.content.element&&checkElement(i,this.content),Array.isArray(this.content)&&this.content.reduce(checkElement,i)),s.isEmpty||(i=i.filter((o=>{let i=o.parents.map((s=>s.element));for(const o in s){const a=s[o],u=i.indexOf(a);if(-1===u)return!1;i=i.splice(0,u)}return!0}))),i}set(s){return this.content=s,this}equals(s){return a(this.toValue(),s)}getMetaProperty(s,o){if(!this.meta.hasKey(s)){if(this.isFrozen){const s=this.refract(o);return s.freeze(),s}this.meta.set(s,o)}return this.meta.get(s)}setMetaProperty(s,o){this.meta.set(s,o)}get element(){return this._storedElement||"element"}set element(s){this._storedElement=s}get content(){return this._content}set content(s){if(s instanceof Element)this._content=s;else if(s instanceof _)this.content=s.elements;else if("string"==typeof s||"number"==typeof s||"boolean"==typeof s||"null"===s||null==s)this._content=s;else if(s instanceof u)this._content=s;else if(Array.isArray(s))this._content=s.map(this.refract);else{if("object"!=typeof s)throw new Error("Cannot set content to given value");this._content=Object.keys(s).map((o=>new this.MemberElement(o,s[o])))}}get meta(){if(!this._meta){if(this.isFrozen){const s=new this.ObjectElement;return s.freeze(),s}this._meta=new this.ObjectElement}return this._meta}set meta(s){s instanceof this.ObjectElement?this._meta=s:this.meta.set(s||{})}get attributes(){if(!this._attributes){if(this.isFrozen){const s=new this.ObjectElement;return s.freeze(),s}this._attributes=new this.ObjectElement}return this._attributes}set attributes(s){s instanceof this.ObjectElement?this._attributes=s:this.attributes.set(s||{})}get id(){return this.getMetaProperty("id","")}set id(s){this.setMetaProperty("id",s)}get classes(){return this.getMetaProperty("classes",[])}set classes(s){this.setMetaProperty("classes",s)}get title(){return this.getMetaProperty("title","")}set title(s){this.setMetaProperty("title",s)}get description(){return this.getMetaProperty("description","")}set description(s){this.setMetaProperty("description",s)}get links(){return this.getMetaProperty("links",[])}set links(s){this.setMetaProperty("links",s)}get isFrozen(){return Object.isFrozen(this)}get parents(){let{parent:s}=this;const o=new _;for(;s;)o.push(s),s=s.parent;return o}get children(){if(Array.isArray(this.content))return new _(this.content);if(this.content instanceof u){const s=new _([this.content.key]);return this.content.value&&s.push(this.content.value),s}return this.content instanceof Element?new _([this.content]):new _}get recursiveChildren(){const s=new _;return this.children.forEach((o=>{s.push(o),o.recursiveChildren.forEach((o=>{s.push(o)}))})),s}}s.exports=Element},10392:s=>{s.exports=function getValue(s,o){return null==s?void 0:s[o]}},10487:(s,o,i)=>{"use strict";var a=i(96897),u=i(30655),_=i(73126),w=i(12205);s.exports=function callBind(s){var o=_(arguments),i=s.length-(arguments.length-1);return a(o,1+(i>0?i:0),!0)},u?u(s.exports,"apply",{value:w}):s.exports.apply=w},10776:(s,o,i)=>{var a=i(30756),u=i(95950);s.exports=function getMatchData(s){for(var o=u(s),i=o.length;i--;){var _=o[i],w=s[_];o[i]=[_,w,a(w)]}return o}},10866:(s,o,i)=>{const a=i(6048),u=i(92340);class ObjectSlice extends u{map(s,o){return this.elements.map((i=>s.bind(o)(i.value,i.key,i)))}filter(s,o){return new ObjectSlice(this.elements.filter((i=>s.bind(o)(i.value,i.key,i))))}reject(s,o){return this.filter(a(s.bind(o)))}forEach(s,o){return this.elements.forEach(((i,a)=>{s.bind(o)(i.value,i.key,i,a)}))}keys(){return this.map(((s,o)=>o.toValue()))}values(){return this.map((s=>s.toValue()))}}s.exports=ObjectSlice},11002:s=>{"use strict";s.exports=Function.prototype.apply},11042:(s,o,i)=>{"use strict";var a=i(85582),u=i(1907),_=i(24443),w=i(87170),x=i(36624),C=u([].concat);s.exports=a("Reflect","ownKeys")||function ownKeys(s){var o=_.f(x(s)),i=w.f;return i?C(o,i(s)):o}},11091:(s,o,i)=>{"use strict";var a=i(45951),u=i(76024),_=i(92361),w=i(62250),x=i(13846).f,C=i(7463),j=i(92046),L=i(28311),B=i(61626),$=i(49724);i(36128);var wrapConstructor=function(s){var Wrapper=function(o,i,a){if(this instanceof Wrapper){switch(arguments.length){case 0:return new s;case 1:return new s(o);case 2:return new s(o,i)}return new s(o,i,a)}return u(s,this,arguments)};return Wrapper.prototype=s.prototype,Wrapper};s.exports=function(s,o){var i,u,U,V,z,Y,Z,ee,ie,ae=s.target,ce=s.global,le=s.stat,pe=s.proto,de=ce?a:le?a[ae]:a[ae]&&a[ae].prototype,fe=ce?j:j[ae]||B(j,ae,{})[ae],ye=fe.prototype;for(V in o)u=!(i=C(ce?V:ae+(le?".":"#")+V,s.forced))&&de&&$(de,V),Y=fe[V],u&&(Z=s.dontCallGetSet?(ie=x(de,V))&&ie.value:de[V]),z=u&&Z?Z:o[V],(i||pe||typeof Y!=typeof z)&&(ee=s.bind&&u?L(z,a):s.wrap&&u?wrapConstructor(z):pe&&w(z)?_(z):z,(s.sham||z&&z.sham||Y&&Y.sham)&&B(ee,"sham",!0),B(fe,V,ee),pe&&($(j,U=ae+"Prototype")||B(j,U,{}),B(j[U],V,z),s.real&&ye&&(i||!ye[V])&&B(ye,V,z)))}},11287:s=>{s.exports=function getHolder(s){return s.placeholder}},11331:(s,o,i)=>{var a=i(72552),u=i(28879),_=i(40346),w=Function.prototype,x=Object.prototype,C=w.toString,j=x.hasOwnProperty,L=C.call(Object);s.exports=function isPlainObject(s){if(!_(s)||"[object Object]"!=a(s))return!1;var o=u(s);if(null===o)return!0;var i=j.call(o,"constructor")&&o.constructor;return"function"==typeof i&&i instanceof i&&C.call(i)==L}},11470:(s,o,i)=>{"use strict";var a=i(1907),u=i(65482),_=i(90160),w=i(74239),x=a("".charAt),C=a("".charCodeAt),j=a("".slice),createMethod=function(s){return function(o,i){var a,L,B=_(w(o)),$=u(i),U=B.length;return $<0||$>=U?s?"":void 0:(a=C(B,$))<55296||a>56319||$+1===U||(L=C(B,$+1))<56320||L>57343?s?x(B,$):a:s?j(B,$,$+2):L-56320+(a-55296<<10)+65536}};s.exports={codeAt:createMethod(!1),charAt:createMethod(!0)}},11842:(s,o,i)=>{var a=i(82819),u=i(9325);s.exports=function createBind(s,o,i){var _=1&o,w=a(s);return function wrapper(){return(this&&this!==u&&this instanceof wrapper?w:s).apply(_?i:this,arguments)}}},12205:(s,o,i)=>{"use strict";var a=i(66743),u=i(11002),_=i(13144);s.exports=function applyBind(){return _(a,u,arguments)}},12242:(s,o,i)=>{const a=i(10316);s.exports=class BooleanElement extends a{constructor(s,o,i){super(s,o,i),this.element="boolean"}primitive(){return"boolean"}}},12507:(s,o,i)=>{var a=i(28754),u=i(49698),_=i(63912),w=i(13222);s.exports=function createCaseFirst(s){return function(o){o=w(o);var i=u(o)?_(o):void 0,x=i?i[0]:o.charAt(0),C=i?a(i,1).join(""):o.slice(1);return x[s]()+C}}},12560:(s,o,i)=>{"use strict";i(99363);var a=i(19287),u=i(45951),_=i(14840),w=i(93742);for(var x in a)_(u[x],x),w[x]=w.Array},12651:(s,o,i)=>{var a=i(74218);s.exports=function getMapData(s,o){var i=s.__data__;return a(o)?i["string"==typeof o?"string":"hash"]:i.map}},12749:(s,o,i)=>{var a=i(81042),u=Object.prototype.hasOwnProperty;s.exports=function hashHas(s){var o=this.__data__;return a?void 0!==o[s]:u.call(o,s)}},13144:(s,o,i)=>{"use strict";var a=i(66743),u=i(11002),_=i(10076),w=i(47119);s.exports=w||a.call(_,u)},13222:(s,o,i)=>{var a=i(77556);s.exports=function toString(s){return null==s?"":a(s)}},13846:(s,o,i)=>{"use strict";var a=i(39447),u=i(13930),_=i(22574),w=i(75817),x=i(4993),C=i(70470),j=i(49724),L=i(73648),B=Object.getOwnPropertyDescriptor;o.f=a?B:function getOwnPropertyDescriptor(s,o){if(s=x(s),o=C(o),L)try{return B(s,o)}catch(s){}if(j(s,o))return w(!u(_.f,s,o),s[o])}},13930:(s,o,i)=>{"use strict";var a=i(41505),u=Function.prototype.call;s.exports=a?u.bind(u):function(){return u.apply(u,arguments)}},14248:s=>{s.exports=function arraySome(s,o){for(var i=-1,a=null==s?0:s.length;++i<a;)if(o(s[i],i,s))return!0;return!1}},14528:s=>{s.exports=function arrayPush(s,o){for(var i=-1,a=o.length,u=s.length;++i<a;)s[u+i]=o[i];return s}},14540:(s,o,i)=>{const a=i(10316);s.exports=class RefElement extends a{constructor(s,o,i){super(s||[],o,i),this.element="ref",this.path||(this.path="element")}get path(){return this.attributes.get("path")}set path(s){this.attributes.set("path",s)}}},14744:s=>{"use strict";var o=function isMergeableObject(s){return function isNonNullObject(s){return!!s&&"object"==typeof s}(s)&&!function isSpecial(s){var o=Object.prototype.toString.call(s);return"[object RegExp]"===o||"[object Date]"===o||function isReactElement(s){return s.$$typeof===i}(s)}(s)};var i="function"==typeof Symbol&&Symbol.for?Symbol.for("react.element"):60103;function cloneUnlessOtherwiseSpecified(s,o){return!1!==o.clone&&o.isMergeableObject(s)?deepmerge(function emptyTarget(s){return Array.isArray(s)?[]:{}}(s),s,o):s}function defaultArrayMerge(s,o,i){return s.concat(o).map((function(s){return cloneUnlessOtherwiseSpecified(s,i)}))}function getKeys(s){return Object.keys(s).concat(function getEnumerableOwnPropertySymbols(s){return Object.getOwnPropertySymbols?Object.getOwnPropertySymbols(s).filter((function(o){return Object.propertyIsEnumerable.call(s,o)})):[]}(s))}function propertyIsOnObject(s,o){try{return o in s}catch(s){return!1}}function mergeObject(s,o,i){var a={};return i.isMergeableObject(s)&&getKeys(s).forEach((function(o){a[o]=cloneUnlessOtherwiseSpecified(s[o],i)})),getKeys(o).forEach((function(u){(function propertyIsUnsafe(s,o){return propertyIsOnObject(s,o)&&!(Object.hasOwnProperty.call(s,o)&&Object.propertyIsEnumerable.call(s,o))})(s,u)||(propertyIsOnObject(s,u)&&i.isMergeableObject(o[u])?a[u]=function getMergeFunction(s,o){if(!o.customMerge)return deepmerge;var i=o.customMerge(s);return"function"==typeof i?i:deepmerge}(u,i)(s[u],o[u],i):a[u]=cloneUnlessOtherwiseSpecified(o[u],i))})),a}function deepmerge(s,i,a){(a=a||{}).arrayMerge=a.arrayMerge||defaultArrayMerge,a.isMergeableObject=a.isMergeableObject||o,a.cloneUnlessOtherwiseSpecified=cloneUnlessOtherwiseSpecified;var u=Array.isArray(i);return u===Array.isArray(s)?u?a.arrayMerge(s,i,a):mergeObject(s,i,a):cloneUnlessOtherwiseSpecified(i,a)}deepmerge.all=function deepmergeAll(s,o){if(!Array.isArray(s))throw new Error("first argument should be an array");return s.reduce((function(s,i){return deepmerge(s,i,o)}),{})};var a=deepmerge;s.exports=a},14792:(s,o,i)=>{var a=i(13222),u=i(55808);s.exports=function capitalize(s){return u(a(s).toLowerCase())}},14840:(s,o,i)=>{"use strict";var a=i(52623),u=i(74284).f,_=i(61626),w=i(49724),x=i(54878),C=i(76264)("toStringTag");s.exports=function(s,o,i,j){var L=i?s:s&&s.prototype;L&&(w(L,C)||u(L,C,{configurable:!0,value:o}),j&&!a&&_(L,"toString",x))}},14974:s=>{s.exports=function safeGet(s,o){if(("constructor"!==o||"function"!=typeof s[o])&&"__proto__"!=o)return s[o]}},15287:(s,o)=>{"use strict";var i=Symbol.for("react.element"),a=Symbol.for("react.portal"),u=Symbol.for("react.fragment"),_=Symbol.for("react.strict_mode"),w=Symbol.for("react.profiler"),x=Symbol.for("react.provider"),C=Symbol.for("react.context"),j=Symbol.for("react.forward_ref"),L=Symbol.for("react.suspense"),B=Symbol.for("react.memo"),$=Symbol.for("react.lazy"),U=Symbol.iterator;var V={isMounted:function(){return!1},enqueueForceUpdate:function(){},enqueueReplaceState:function(){},enqueueSetState:function(){}},z=Object.assign,Y={};function E(s,o,i){this.props=s,this.context=o,this.refs=Y,this.updater=i||V}function F(){}function G(s,o,i){this.props=s,this.context=o,this.refs=Y,this.updater=i||V}E.prototype.isReactComponent={},E.prototype.setState=function(s,o){if("object"!=typeof s&&"function"!=typeof s&&null!=s)throw Error("setState(...): takes an object of state variables to update or a function which returns an object of state variables.");this.updater.enqueueSetState(this,s,o,"setState")},E.prototype.forceUpdate=function(s){this.updater.enqueueForceUpdate(this,s,"forceUpdate")},F.prototype=E.prototype;var Z=G.prototype=new F;Z.constructor=G,z(Z,E.prototype),Z.isPureReactComponent=!0;var ee=Array.isArray,ie=Object.prototype.hasOwnProperty,ae={current:null},ce={key:!0,ref:!0,__self:!0,__source:!0};function M(s,o,a){var u,_={},w=null,x=null;if(null!=o)for(u in void 0!==o.ref&&(x=o.ref),void 0!==o.key&&(w=""+o.key),o)ie.call(o,u)&&!ce.hasOwnProperty(u)&&(_[u]=o[u]);var C=arguments.length-2;if(1===C)_.children=a;else if(1<C){for(var j=Array(C),L=0;L<C;L++)j[L]=arguments[L+2];_.children=j}if(s&&s.defaultProps)for(u in C=s.defaultProps)void 0===_[u]&&(_[u]=C[u]);return{$$typeof:i,type:s,key:w,ref:x,props:_,_owner:ae.current}}function O(s){return"object"==typeof s&&null!==s&&s.$$typeof===i}var le=/\/+/g;function Q(s,o){return"object"==typeof s&&null!==s&&null!=s.key?function escape(s){var o={"=":"=0",":":"=2"};return"$"+s.replace(/[=:]/g,(function(s){return o[s]}))}(""+s.key):o.toString(36)}function R(s,o,u,_,w){var x=typeof s;"undefined"!==x&&"boolean"!==x||(s=null);var C=!1;if(null===s)C=!0;else switch(x){case"string":case"number":C=!0;break;case"object":switch(s.$$typeof){case i:case a:C=!0}}if(C)return w=w(C=s),s=""===_?"."+Q(C,0):_,ee(w)?(u="",null!=s&&(u=s.replace(le,"$&/")+"/"),R(w,o,u,"",(function(s){return s}))):null!=w&&(O(w)&&(w=function N(s,o){return{$$typeof:i,type:s.type,key:o,ref:s.ref,props:s.props,_owner:s._owner}}(w,u+(!w.key||C&&C.key===w.key?"":(""+w.key).replace(le,"$&/")+"/")+s)),o.push(w)),1;if(C=0,_=""===_?".":_+":",ee(s))for(var j=0;j<s.length;j++){var L=_+Q(x=s[j],j);C+=R(x,o,u,L,w)}else if(L=function A(s){return null===s||"object"!=typeof s?null:"function"==typeof(s=U&&s[U]||s["@@iterator"])?s:null}(s),"function"==typeof L)for(s=L.call(s),j=0;!(x=s.next()).done;)C+=R(x=x.value,o,u,L=_+Q(x,j++),w);else if("object"===x)throw o=String(s),Error("Objects are not valid as a React child (found: "+("[object Object]"===o?"object with keys {"+Object.keys(s).join(", ")+"}":o)+"). If you meant to render a collection of children, use an array instead.");return C}function S(s,o,i){if(null==s)return s;var a=[],u=0;return R(s,a,"","",(function(s){return o.call(i,s,u++)})),a}function T(s){if(-1===s._status){var o=s._result;(o=o()).then((function(o){0!==s._status&&-1!==s._status||(s._status=1,s._result=o)}),(function(o){0!==s._status&&-1!==s._status||(s._status=2,s._result=o)})),-1===s._status&&(s._status=0,s._result=o)}if(1===s._status)return s._result.default;throw s._result}var pe={current:null},de={transition:null},fe={ReactCurrentDispatcher:pe,ReactCurrentBatchConfig:de,ReactCurrentOwner:ae};function X(){throw Error("act(...) is not supported in production builds of React.")}o.Children={map:S,forEach:function(s,o,i){S(s,(function(){o.apply(this,arguments)}),i)},count:function(s){var o=0;return S(s,(function(){o++})),o},toArray:function(s){return S(s,(function(s){return s}))||[]},only:function(s){if(!O(s))throw Error("React.Children.only expected to receive a single React element child.");return s}},o.Component=E,o.Fragment=u,o.Profiler=w,o.PureComponent=G,o.StrictMode=_,o.Suspense=L,o.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED=fe,o.act=X,o.cloneElement=function(s,o,a){if(null==s)throw Error("React.cloneElement(...): The argument must be a React element, but you passed "+s+".");var u=z({},s.props),_=s.key,w=s.ref,x=s._owner;if(null!=o){if(void 0!==o.ref&&(w=o.ref,x=ae.current),void 0!==o.key&&(_=""+o.key),s.type&&s.type.defaultProps)var C=s.type.defaultProps;for(j in o)ie.call(o,j)&&!ce.hasOwnProperty(j)&&(u[j]=void 0===o[j]&&void 0!==C?C[j]:o[j])}var j=arguments.length-2;if(1===j)u.children=a;else if(1<j){C=Array(j);for(var L=0;L<j;L++)C[L]=arguments[L+2];u.children=C}return{$$typeof:i,type:s.type,key:_,ref:w,props:u,_owner:x}},o.createContext=function(s){return(s={$$typeof:C,_currentValue:s,_currentValue2:s,_threadCount:0,Provider:null,Consumer:null,_defaultValue:null,_globalName:null}).Provider={$$typeof:x,_context:s},s.Consumer=s},o.createElement=M,o.createFactory=function(s){var o=M.bind(null,s);return o.type=s,o},o.createRef=function(){return{current:null}},o.forwardRef=function(s){return{$$typeof:j,render:s}},o.isValidElement=O,o.lazy=function(s){return{$$typeof:$,_payload:{_status:-1,_result:s},_init:T}},o.memo=function(s,o){return{$$typeof:B,type:s,compare:void 0===o?null:o}},o.startTransition=function(s){var o=de.transition;de.transition={};try{s()}finally{de.transition=o}},o.unstable_act=X,o.useCallback=function(s,o){return pe.current.useCallback(s,o)},o.useContext=function(s){return pe.current.useContext(s)},o.useDebugValue=function(){},o.useDeferredValue=function(s){return pe.current.useDeferredValue(s)},o.useEffect=function(s,o){return pe.current.useEffect(s,o)},o.useId=function(){return pe.current.useId()},o.useImperativeHandle=function(s,o,i){return pe.current.useImperativeHandle(s,o,i)},o.useInsertionEffect=function(s,o){return pe.current.useInsertionEffect(s,o)},o.useLayoutEffect=function(s,o){return pe.current.useLayoutEffect(s,o)},o.useMemo=function(s,o){return pe.current.useMemo(s,o)},o.useReducer=function(s,o,i){return pe.current.useReducer(s,o,i)},o.useRef=function(s){return pe.current.useRef(s)},o.useState=function(s){return pe.current.useState(s)},o.useSyncExternalStore=function(s,o,i){return pe.current.useSyncExternalStore(s,o,i)},o.useTransition=function(){return pe.current.useTransition()},o.version="18.3.1"},15325:(s,o,i)=>{var a=i(96131);s.exports=function arrayIncludes(s,o){return!!(null==s?0:s.length)&&a(s,o,0)>-1}},15340:()=>{},15377:(s,o,i)=>{"use strict";var a=i(92861).Buffer,u=i(64634),_=i(74372),w=ArrayBuffer.isView||function isView(s){try{return _(s),!0}catch(s){return!1}},x="undefined"!=typeof Uint8Array,C="undefined"!=typeof ArrayBuffer&&"undefined"!=typeof Uint8Array,j=C&&(a.prototype instanceof Uint8Array||a.TYPED_ARRAY_SUPPORT);s.exports=function toBuffer(s,o){if(s instanceof a)return s;if("string"==typeof s)return a.from(s,o);if(C&&w(s)){if(0===s.byteLength)return a.alloc(0);if(j){var i=a.from(s.buffer,s.byteOffset,s.byteLength);if(i.byteLength===s.byteLength)return i}var _=s instanceof Uint8Array?s:new Uint8Array(s.buffer,s.byteOffset,s.byteLength),L=a.from(_);if(L.length===s.byteLength)return L}if(x&&s instanceof Uint8Array)return a.from(s);var B=u(s);if(B)for(var $=0;$<s.length;$+=1){var U=s[$];if("number"!=typeof U||U<0||U>255||~~U!==U)throw new RangeError("Array items must be numbers in the range 0-255.")}if(B||a.isBuffer(s)&&s.constructor&&"function"==typeof s.constructor.isBuffer&&s.constructor.isBuffer(s))return a.from(s);throw new TypeError('The "data" argument must be a string, an Array, a Buffer, a Uint8Array, or a DataView.')}},15389:(s,o,i)=>{var a=i(93663),u=i(87978),_=i(83488),w=i(56449),x=i(50583);s.exports=function baseIteratee(s){return"function"==typeof s?s:null==s?_:"object"==typeof s?w(s)?u(s[0],s[1]):a(s):x(s)}},15972:(s,o,i)=>{"use strict";var a=i(49724),u=i(62250),_=i(39298),w=i(92522),x=i(57382),C=w("IE_PROTO"),j=Object,L=j.prototype;s.exports=x?j.getPrototypeOf:function(s){var o=_(s);if(a(o,C))return o[C];var i=o.constructor;return u(i)&&o instanceof i?i.prototype:o instanceof j?L:null}},16038:(s,o,i)=>{var a=i(5861),u=i(40346);s.exports=function baseIsSet(s){return u(s)&&"[object Set]"==a(s)}},16426:s=>{s.exports=function(){var s=document.getSelection();if(!s.rangeCount)return function(){};for(var o=document.activeElement,i=[],a=0;a<s.rangeCount;a++)i.push(s.getRangeAt(a));switch(o.tagName.toUpperCase()){case"INPUT":case"TEXTAREA":o.blur();break;default:o=null}return s.removeAllRanges(),function(){"Caret"===s.type&&s.removeAllRanges(),s.rangeCount||i.forEach((function(o){s.addRange(o)})),o&&o.focus()}}},16547:(s,o,i)=>{var a=i(43360),u=i(75288),_=Object.prototype.hasOwnProperty;s.exports=function assignValue(s,o,i){var w=s[o];_.call(s,o)&&u(w,i)&&(void 0!==i||o in s)||a(s,o,i)}},16708:(s,o,i)=>{"use strict";var a,u=i(65606);function CorkedRequest(s){var o=this;this.next=null,this.entry=null,this.finish=function(){!function onCorkedFinish(s,o,i){var a=s.entry;s.entry=null;for(;a;){var u=a.callback;o.pendingcb--,u(i),a=a.next}o.corkedRequestsFree.next=s}(o,s)}}s.exports=Writable,Writable.WritableState=WritableState;var _={deprecate:i(94643)},w=i(40345),x=i(48287).Buffer,C=(void 0!==i.g?i.g:"undefined"!=typeof window?window:"undefined"!=typeof self?self:{}).Uint8Array||function(){};var j,L=i(75896),B=i(65291).getHighWaterMark,$=i(86048).F,U=$.ERR_INVALID_ARG_TYPE,V=$.ERR_METHOD_NOT_IMPLEMENTED,z=$.ERR_MULTIPLE_CALLBACK,Y=$.ERR_STREAM_CANNOT_PIPE,Z=$.ERR_STREAM_DESTROYED,ee=$.ERR_STREAM_NULL_VALUES,ie=$.ERR_STREAM_WRITE_AFTER_END,ae=$.ERR_UNKNOWN_ENCODING,ce=L.errorOrDestroy;function nop(){}function WritableState(s,o,_){a=a||i(25382),s=s||{},"boolean"!=typeof _&&(_=o instanceof a),this.objectMode=!!s.objectMode,_&&(this.objectMode=this.objectMode||!!s.writableObjectMode),this.highWaterMark=B(this,s,"writableHighWaterMark",_),this.finalCalled=!1,this.needDrain=!1,this.ending=!1,this.ended=!1,this.finished=!1,this.destroyed=!1;var w=!1===s.decodeStrings;this.decodeStrings=!w,this.defaultEncoding=s.defaultEncoding||"utf8",this.length=0,this.writing=!1,this.corked=0,this.sync=!0,this.bufferProcessing=!1,this.onwrite=function(s){!function onwrite(s,o){var i=s._writableState,a=i.sync,_=i.writecb;if("function"!=typeof _)throw new z;if(function onwriteStateUpdate(s){s.writing=!1,s.writecb=null,s.length-=s.writelen,s.writelen=0}(i),o)!function onwriteError(s,o,i,a,_){--o.pendingcb,i?(u.nextTick(_,a),u.nextTick(finishMaybe,s,o),s._writableState.errorEmitted=!0,ce(s,a)):(_(a),s._writableState.errorEmitted=!0,ce(s,a),finishMaybe(s,o))}(s,i,a,o,_);else{var w=needFinish(i)||s.destroyed;w||i.corked||i.bufferProcessing||!i.bufferedRequest||clearBuffer(s,i),a?u.nextTick(afterWrite,s,i,w,_):afterWrite(s,i,w,_)}}(o,s)},this.writecb=null,this.writelen=0,this.bufferedRequest=null,this.lastBufferedRequest=null,this.pendingcb=0,this.prefinished=!1,this.errorEmitted=!1,this.emitClose=!1!==s.emitClose,this.autoDestroy=!!s.autoDestroy,this.bufferedRequestCount=0,this.corkedRequestsFree=new CorkedRequest(this)}function Writable(s){var o=this instanceof(a=a||i(25382));if(!o&&!j.call(Writable,this))return new Writable(s);this._writableState=new WritableState(s,this,o),this.writable=!0,s&&("function"==typeof s.write&&(this._write=s.write),"function"==typeof s.writev&&(this._writev=s.writev),"function"==typeof s.destroy&&(this._destroy=s.destroy),"function"==typeof s.final&&(this._final=s.final)),w.call(this)}function doWrite(s,o,i,a,u,_,w){o.writelen=a,o.writecb=w,o.writing=!0,o.sync=!0,o.destroyed?o.onwrite(new Z("write")):i?s._writev(u,o.onwrite):s._write(u,_,o.onwrite),o.sync=!1}function afterWrite(s,o,i,a){i||function onwriteDrain(s,o){0===o.length&&o.needDrain&&(o.needDrain=!1,s.emit("drain"))}(s,o),o.pendingcb--,a(),finishMaybe(s,o)}function clearBuffer(s,o){o.bufferProcessing=!0;var i=o.bufferedRequest;if(s._writev&&i&&i.next){var a=o.bufferedRequestCount,u=new Array(a),_=o.corkedRequestsFree;_.entry=i;for(var w=0,x=!0;i;)u[w]=i,i.isBuf||(x=!1),i=i.next,w+=1;u.allBuffers=x,doWrite(s,o,!0,o.length,u,"",_.finish),o.pendingcb++,o.lastBufferedRequest=null,_.next?(o.corkedRequestsFree=_.next,_.next=null):o.corkedRequestsFree=new CorkedRequest(o),o.bufferedRequestCount=0}else{for(;i;){var C=i.chunk,j=i.encoding,L=i.callback;if(doWrite(s,o,!1,o.objectMode?1:C.length,C,j,L),i=i.next,o.bufferedRequestCount--,o.writing)break}null===i&&(o.lastBufferedRequest=null)}o.bufferedRequest=i,o.bufferProcessing=!1}function needFinish(s){return s.ending&&0===s.length&&null===s.bufferedRequest&&!s.finished&&!s.writing}function callFinal(s,o){s._final((function(i){o.pendingcb--,i&&ce(s,i),o.prefinished=!0,s.emit("prefinish"),finishMaybe(s,o)}))}function finishMaybe(s,o){var i=needFinish(o);if(i&&(function prefinish(s,o){o.prefinished||o.finalCalled||("function"!=typeof s._final||o.destroyed?(o.prefinished=!0,s.emit("prefinish")):(o.pendingcb++,o.finalCalled=!0,u.nextTick(callFinal,s,o)))}(s,o),0===o.pendingcb&&(o.finished=!0,s.emit("finish"),o.autoDestroy))){var a=s._readableState;(!a||a.autoDestroy&&a.endEmitted)&&s.destroy()}return i}i(56698)(Writable,w),WritableState.prototype.getBuffer=function getBuffer(){for(var s=this.bufferedRequest,o=[];s;)o.push(s),s=s.next;return o},function(){try{Object.defineProperty(WritableState.prototype,"buffer",{get:_.deprecate((function writableStateBufferGetter(){return this.getBuffer()}),"_writableState.buffer is deprecated. Use _writableState.getBuffer instead.","DEP0003")})}catch(s){}}(),"function"==typeof Symbol&&Symbol.hasInstance&&"function"==typeof Function.prototype[Symbol.hasInstance]?(j=Function.prototype[Symbol.hasInstance],Object.defineProperty(Writable,Symbol.hasInstance,{value:function value(s){return!!j.call(this,s)||this===Writable&&(s&&s._writableState instanceof WritableState)}})):j=function realHasInstance(s){return s instanceof this},Writable.prototype.pipe=function(){ce(this,new Y)},Writable.prototype.write=function(s,o,i){var a=this._writableState,_=!1,w=!a.objectMode&&function _isUint8Array(s){return x.isBuffer(s)||s instanceof C}(s);return w&&!x.isBuffer(s)&&(s=function _uint8ArrayToBuffer(s){return x.from(s)}(s)),"function"==typeof o&&(i=o,o=null),w?o="buffer":o||(o=a.defaultEncoding),"function"!=typeof i&&(i=nop),a.ending?function writeAfterEnd(s,o){var i=new ie;ce(s,i),u.nextTick(o,i)}(this,i):(w||function validChunk(s,o,i,a){var _;return null===i?_=new ee:"string"==typeof i||o.objectMode||(_=new U("chunk",["string","Buffer"],i)),!_||(ce(s,_),u.nextTick(a,_),!1)}(this,a,s,i))&&(a.pendingcb++,_=function writeOrBuffer(s,o,i,a,u,_){if(!i){var w=function decodeChunk(s,o,i){s.objectMode||!1===s.decodeStrings||"string"!=typeof o||(o=x.from(o,i));return o}(o,a,u);a!==w&&(i=!0,u="buffer",a=w)}var C=o.objectMode?1:a.length;o.length+=C;var j=o.length<o.highWaterMark;j||(o.needDrain=!0);if(o.writing||o.corked){var L=o.lastBufferedRequest;o.lastBufferedRequest={chunk:a,encoding:u,isBuf:i,callback:_,next:null},L?L.next=o.lastBufferedRequest:o.bufferedRequest=o.lastBufferedRequest,o.bufferedRequestCount+=1}else doWrite(s,o,!1,C,a,u,_);return j}(this,a,w,s,o,i)),_},Writable.prototype.cork=function(){this._writableState.corked++},Writable.prototype.uncork=function(){var s=this._writableState;s.corked&&(s.corked--,s.writing||s.corked||s.bufferProcessing||!s.bufferedRequest||clearBuffer(this,s))},Writable.prototype.setDefaultEncoding=function setDefaultEncoding(s){if("string"==typeof s&&(s=s.toLowerCase()),!(["hex","utf8","utf-8","ascii","binary","base64","ucs2","ucs-2","utf16le","utf-16le","raw"].indexOf((s+"").toLowerCase())>-1))throw new ae(s);return this._writableState.defaultEncoding=s,this},Object.defineProperty(Writable.prototype,"writableBuffer",{enumerable:!1,get:function get(){return this._writableState&&this._writableState.getBuffer()}}),Object.defineProperty(Writable.prototype,"writableHighWaterMark",{enumerable:!1,get:function get(){return this._writableState.highWaterMark}}),Writable.prototype._write=function(s,o,i){i(new V("_write()"))},Writable.prototype._writev=null,Writable.prototype.end=function(s,o,i){var a=this._writableState;return"function"==typeof s?(i=s,s=null,o=null):"function"==typeof o&&(i=o,o=null),null!=s&&this.write(s,o),a.corked&&(a.corked=1,this.uncork()),a.ending||function endWritable(s,o,i){o.ending=!0,finishMaybe(s,o),i&&(o.finished?u.nextTick(i):s.once("finish",i));o.ended=!0,s.writable=!1}(this,a,i),this},Object.defineProperty(Writable.prototype,"writableLength",{enumerable:!1,get:function get(){return this._writableState.length}}),Object.defineProperty(Writable.prototype,"destroyed",{enumerable:!1,get:function get(){return void 0!==this._writableState&&this._writableState.destroyed},set:function set(s){this._writableState&&(this._writableState.destroyed=s)}}),Writable.prototype.destroy=L.destroy,Writable.prototype._undestroy=L.undestroy,Writable.prototype._destroy=function(s,o){o(s)}},16946:(s,o,i)=>{"use strict";var a=i(1907),u=i(98828),_=i(45807),w=Object,x=a("".split);s.exports=u((function(){return!w("z").propertyIsEnumerable(0)}))?function(s){return"String"===_(s)?x(s,""):w(s)}:w},16962:(s,o)=>{o.aliasToReal={each:"forEach",eachRight:"forEachRight",entries:"toPairs",entriesIn:"toPairsIn",extend:"assignIn",extendAll:"assignInAll",extendAllWith:"assignInAllWith",extendWith:"assignInWith",first:"head",conforms:"conformsTo",matches:"isMatch",property:"get",__:"placeholder",F:"stubFalse",T:"stubTrue",all:"every",allPass:"overEvery",always:"constant",any:"some",anyPass:"overSome",apply:"spread",assoc:"set",assocPath:"set",complement:"negate",compose:"flowRight",contains:"includes",dissoc:"unset",dissocPath:"unset",dropLast:"dropRight",dropLastWhile:"dropRightWhile",equals:"isEqual",identical:"eq",indexBy:"keyBy",init:"initial",invertObj:"invert",juxt:"over",omitAll:"omit",nAry:"ary",path:"get",pathEq:"matchesProperty",pathOr:"getOr",paths:"at",pickAll:"pick",pipe:"flow",pluck:"map",prop:"get",propEq:"matchesProperty",propOr:"getOr",props:"at",symmetricDifference:"xor",symmetricDifferenceBy:"xorBy",symmetricDifferenceWith:"xorWith",takeLast:"takeRight",takeLastWhile:"takeRightWhile",unapply:"rest",unnest:"flatten",useWith:"overArgs",where:"conformsTo",whereEq:"isMatch",zipObj:"zipObject"},o.aryMethod={1:["assignAll","assignInAll","attempt","castArray","ceil","create","curry","curryRight","defaultsAll","defaultsDeepAll","floor","flow","flowRight","fromPairs","invert","iteratee","memoize","method","mergeAll","methodOf","mixin","nthArg","over","overEvery","overSome","rest","reverse","round","runInContext","spread","template","trim","trimEnd","trimStart","uniqueId","words","zipAll"],2:["add","after","ary","assign","assignAllWith","assignIn","assignInAllWith","at","before","bind","bindAll","bindKey","chunk","cloneDeepWith","cloneWith","concat","conformsTo","countBy","curryN","curryRightN","debounce","defaults","defaultsDeep","defaultTo","delay","difference","divide","drop","dropRight","dropRightWhile","dropWhile","endsWith","eq","every","filter","find","findIndex","findKey","findLast","findLastIndex","findLastKey","flatMap","flatMapDeep","flattenDepth","forEach","forEachRight","forIn","forInRight","forOwn","forOwnRight","get","groupBy","gt","gte","has","hasIn","includes","indexOf","intersection","invertBy","invoke","invokeMap","isEqual","isMatch","join","keyBy","lastIndexOf","lt","lte","map","mapKeys","mapValues","matchesProperty","maxBy","meanBy","merge","mergeAllWith","minBy","multiply","nth","omit","omitBy","overArgs","pad","padEnd","padStart","parseInt","partial","partialRight","partition","pick","pickBy","propertyOf","pull","pullAll","pullAt","random","range","rangeRight","rearg","reject","remove","repeat","restFrom","result","sampleSize","some","sortBy","sortedIndex","sortedIndexOf","sortedLastIndex","sortedLastIndexOf","sortedUniqBy","split","spreadFrom","startsWith","subtract","sumBy","take","takeRight","takeRightWhile","takeWhile","tap","throttle","thru","times","trimChars","trimCharsEnd","trimCharsStart","truncate","union","uniqBy","uniqWith","unset","unzipWith","without","wrap","xor","zip","zipObject","zipObjectDeep"],3:["assignInWith","assignWith","clamp","differenceBy","differenceWith","findFrom","findIndexFrom","findLastFrom","findLastIndexFrom","getOr","includesFrom","indexOfFrom","inRange","intersectionBy","intersectionWith","invokeArgs","invokeArgsMap","isEqualWith","isMatchWith","flatMapDepth","lastIndexOfFrom","mergeWith","orderBy","padChars","padCharsEnd","padCharsStart","pullAllBy","pullAllWith","rangeStep","rangeStepRight","reduce","reduceRight","replace","set","slice","sortedIndexBy","sortedLastIndexBy","transform","unionBy","unionWith","update","xorBy","xorWith","zipWith"],4:["fill","setWith","updateWith"]},o.aryRearg={2:[1,0],3:[2,0,1],4:[3,2,0,1]},o.iterateeAry={dropRightWhile:1,dropWhile:1,every:1,filter:1,find:1,findFrom:1,findIndex:1,findIndexFrom:1,findKey:1,findLast:1,findLastFrom:1,findLastIndex:1,findLastIndexFrom:1,findLastKey:1,flatMap:1,flatMapDeep:1,flatMapDepth:1,forEach:1,forEachRight:1,forIn:1,forInRight:1,forOwn:1,forOwnRight:1,map:1,mapKeys:1,mapValues:1,partition:1,reduce:2,reduceRight:2,reject:1,remove:1,some:1,takeRightWhile:1,takeWhile:1,times:1,transform:2},o.iterateeRearg={mapKeys:[1],reduceRight:[1,0]},o.methodRearg={assignInAllWith:[1,0],assignInWith:[1,2,0],assignAllWith:[1,0],assignWith:[1,2,0],differenceBy:[1,2,0],differenceWith:[1,2,0],getOr:[2,1,0],intersectionBy:[1,2,0],intersectionWith:[1,2,0],isEqualWith:[1,2,0],isMatchWith:[2,1,0],mergeAllWith:[1,0],mergeWith:[1,2,0],padChars:[2,1,0],padCharsEnd:[2,1,0],padCharsStart:[2,1,0],pullAllBy:[2,1,0],pullAllWith:[2,1,0],rangeStep:[1,2,0],rangeStepRight:[1,2,0],setWith:[3,1,2,0],sortedIndexBy:[2,1,0],sortedLastIndexBy:[2,1,0],unionBy:[1,2,0],unionWith:[1,2,0],updateWith:[3,1,2,0],xorBy:[1,2,0],xorWith:[1,2,0],zipWith:[1,2,0]},o.methodSpread={assignAll:{start:0},assignAllWith:{start:0},assignInAll:{start:0},assignInAllWith:{start:0},defaultsAll:{start:0},defaultsDeepAll:{start:0},invokeArgs:{start:2},invokeArgsMap:{start:2},mergeAll:{start:0},mergeAllWith:{start:0},partial:{start:1},partialRight:{start:1},without:{start:1},zipAll:{start:0}},o.mutate={array:{fill:!0,pull:!0,pullAll:!0,pullAllBy:!0,pullAllWith:!0,pullAt:!0,remove:!0,reverse:!0},object:{assign:!0,assignAll:!0,assignAllWith:!0,assignIn:!0,assignInAll:!0,assignInAllWith:!0,assignInWith:!0,assignWith:!0,defaults:!0,defaultsAll:!0,defaultsDeep:!0,defaultsDeepAll:!0,merge:!0,mergeAll:!0,mergeAllWith:!0,mergeWith:!0},set:{set:!0,setWith:!0,unset:!0,update:!0,updateWith:!0}},o.realToAlias=function(){var s=Object.prototype.hasOwnProperty,i=o.aliasToReal,a={};for(var u in i){var _=i[u];s.call(a,_)?a[_].push(u):a[_]=[u]}return a}(),o.remap={assignAll:"assign",assignAllWith:"assignWith",assignInAll:"assignIn",assignInAllWith:"assignInWith",curryN:"curry",curryRightN:"curryRight",defaultsAll:"defaults",defaultsDeepAll:"defaultsDeep",findFrom:"find",findIndexFrom:"findIndex",findLastFrom:"findLast",findLastIndexFrom:"findLastIndex",getOr:"get",includesFrom:"includes",indexOfFrom:"indexOf",invokeArgs:"invoke",invokeArgsMap:"invokeMap",lastIndexOfFrom:"lastIndexOf",mergeAll:"merge",mergeAllWith:"mergeWith",padChars:"pad",padCharsEnd:"padEnd",padCharsStart:"padStart",propertyOf:"get",rangeStep:"range",rangeStepRight:"rangeRight",restFrom:"rest",spreadFrom:"spread",trimChars:"trim",trimCharsEnd:"trimEnd",trimCharsStart:"trimStart",zipAll:"zip"},o.skipFixed={castArray:!0,flow:!0,flowRight:!0,iteratee:!0,mixin:!0,rearg:!0,runInContext:!0},o.skipRearg={add:!0,assign:!0,assignIn:!0,bind:!0,bindKey:!0,concat:!0,difference:!0,divide:!0,eq:!0,gt:!0,gte:!0,isEqual:!0,lt:!0,lte:!0,matchesProperty:!0,merge:!0,multiply:!0,overArgs:!0,partial:!0,partialRight:!0,propertyOf:!0,random:!0,range:!0,rangeRight:!0,subtract:!0,zip:!0,zipObject:!0,zipObjectDeep:!0}},17255:(s,o,i)=>{var a=i(47422);s.exports=function basePropertyDeep(s){return function(o){return a(o,s)}}},17285:s=>{function source(s){return s?"string"==typeof s?s:s.source:null}function lookahead(s){return concat("(?=",s,")")}function concat(...s){return s.map((s=>source(s))).join("")}function either(...s){return"("+s.map((s=>source(s))).join("|")+")"}s.exports=function xml(s){const o=concat(/[A-Z_]/,function optional(s){return concat("(",s,")?")}(/[A-Z0-9_.-]*:/),/[A-Z0-9_.-]*/),i={className:"symbol",begin:/&[a-z]+;|&#[0-9]+;|&#x[a-f0-9]+;/},a={begin:/\s/,contains:[{className:"meta-keyword",begin:/#?[a-z_][a-z1-9_-]+/,illegal:/\n/}]},u=s.inherit(a,{begin:/\(/,end:/\)/}),_=s.inherit(s.APOS_STRING_MODE,{className:"meta-string"}),w=s.inherit(s.QUOTE_STRING_MODE,{className:"meta-string"}),x={endsWithParent:!0,illegal:/</,relevance:0,contains:[{className:"attr",begin:/[A-Za-z0-9._:-]+/,relevance:0},{begin:/=\s*/,relevance:0,contains:[{className:"string",endsParent:!0,variants:[{begin:/"/,end:/"/,contains:[i]},{begin:/'/,end:/'/,contains:[i]},{begin:/[^\s"'=<>`]+/}]}]}]};return{name:"HTML, XML",aliases:["html","xhtml","rss","atom","xjb","xsd","xsl","plist","wsf","svg"],case_insensitive:!0,contains:[{className:"meta",begin:/<![a-z]/,end:/>/,relevance:10,contains:[a,w,_,u,{begin:/\[/,end:/\]/,contains:[{className:"meta",begin:/<![a-z]/,end:/>/,contains:[a,u,w,_]}]}]},s.COMMENT(/<!--/,/-->/,{relevance:10}),{begin:/<!\[CDATA\[/,end:/\]\]>/,relevance:10},i,{className:"meta",begin:/<\?xml/,end:/\?>/,relevance:10},{className:"tag",begin:/<style(?=\s|>)/,end:/>/,keywords:{name:"style"},contains:[x],starts:{end:/<\/style>/,returnEnd:!0,subLanguage:["css","xml"]}},{className:"tag",begin:/<script(?=\s|>)/,end:/>/,keywords:{name:"script"},contains:[x],starts:{end:/<\/script>/,returnEnd:!0,subLanguage:["javascript","handlebars","xml"]}},{className:"tag",begin:/<>|<\/>/},{className:"tag",begin:concat(/</,lookahead(concat(o,either(/\/>/,/>/,/\s/)))),end:/\/?>/,contains:[{className:"name",begin:o,relevance:0,starts:x}]},{className:"tag",begin:concat(/<\//,lookahead(concat(o,/>/))),contains:[{className:"name",begin:o,relevance:0},{begin:/>/,relevance:0,endsParent:!0}]}]}}},17400:(s,o,i)=>{var a=i(99374),u=1/0;s.exports=function toFinite(s){return s?(s=a(s))===u||s===-1/0?17976931348623157e292*(s<0?-1:1):s==s?s:0:0===s?s:0}},17533:s=>{s.exports=function yaml(s){var o="true false yes no null",i="[\\w#;/?:@&=+$,.~*'()[\\]]+",a={className:"string",relevance:0,variants:[{begin:/'/,end:/'/},{begin:/"/,end:/"/},{begin:/\S+/}],contains:[s.BACKSLASH_ESCAPE,{className:"template-variable",variants:[{begin:/\{\{/,end:/\}\}/},{begin:/%\{/,end:/\}/}]}]},u=s.inherit(a,{variants:[{begin:/'/,end:/'/},{begin:/"/,end:/"/},{begin:/[^\s,{}[\]]+/}]}),_={className:"number",begin:"\\b[0-9]{4}(-[0-9][0-9]){0,2}([Tt \\t][0-9][0-9]?(:[0-9][0-9]){2})?(\\.[0-9]*)?([ \\t])*(Z|[-+][0-9][0-9]?(:[0-9][0-9])?)?\\b"},w={end:",",endsWithParent:!0,excludeEnd:!0,keywords:o,relevance:0},x={begin:/\{/,end:/\}/,contains:[w],illegal:"\\n",relevance:0},C={begin:"\\[",end:"\\]",contains:[w],illegal:"\\n",relevance:0},j=[{className:"attr",variants:[{begin:"\\w[\\w :\\/.-]*:(?=[ \t]|$)"},{begin:'"\\w[\\w :\\/.-]*":(?=[ \t]|$)'},{begin:"'\\w[\\w :\\/.-]*':(?=[ \t]|$)"}]},{className:"meta",begin:"^---\\s*$",relevance:10},{className:"string",begin:"[\\|>]([1-9]?[+-])?[ ]*\\n( +)[^ ][^\\n]*\\n(\\2[^\\n]+\\n?)*"},{begin:"<%[%=-]?",end:"[%-]?%>",subLanguage:"ruby",excludeBegin:!0,excludeEnd:!0,relevance:0},{className:"type",begin:"!\\w+!"+i},{className:"type",begin:"!<"+i+">"},{className:"type",begin:"!"+i},{className:"type",begin:"!!"+i},{className:"meta",begin:"&"+s.UNDERSCORE_IDENT_RE+"$"},{className:"meta",begin:"\\*"+s.UNDERSCORE_IDENT_RE+"$"},{className:"bullet",begin:"-(?=[ ]|$)",relevance:0},s.HASH_COMMENT_MODE,{beginKeywords:o,keywords:{literal:o}},_,{className:"number",begin:s.C_NUMBER_RE+"\\b",relevance:0},x,C,a],L=[...j];return L.pop(),L.push(u),w.contains=L,{name:"YAML",case_insensitive:!0,aliases:["yml"],contains:j}}},17670:(s,o,i)=>{var a=i(12651);s.exports=function mapCacheDelete(s){var o=a(this,s).delete(s);return this.size-=o?1:0,o}},17965:(s,o,i)=>{"use strict";var a=i(16426),u={"text/plain":"Text","text/html":"Url",default:"Text"};s.exports=function copy(s,o){var i,_,w,x,C,j,L=!1;o||(o={}),i=o.debug||!1;try{if(w=a(),x=document.createRange(),C=document.getSelection(),(j=document.createElement("span")).textContent=s,j.ariaHidden="true",j.style.all="unset",j.style.position="fixed",j.style.top=0,j.style.clip="rect(0, 0, 0, 0)",j.style.whiteSpace="pre",j.style.webkitUserSelect="text",j.style.MozUserSelect="text",j.style.msUserSelect="text",j.style.userSelect="text",j.addEventListener("copy",(function(a){if(a.stopPropagation(),o.format)if(a.preventDefault(),void 0===a.clipboardData){i&&console.warn("unable to use e.clipboardData"),i&&console.warn("trying IE specific stuff"),window.clipboardData.clearData();var _=u[o.format]||u.default;window.clipboardData.setData(_,s)}else a.clipboardData.clearData(),a.clipboardData.setData(o.format,s);o.onCopy&&(a.preventDefault(),o.onCopy(a.clipboardData))})),document.body.appendChild(j),x.selectNodeContents(j),C.addRange(x),!document.execCommand("copy"))throw new Error("copy command was unsuccessful");L=!0}catch(a){i&&console.error("unable to copy using execCommand: ",a),i&&console.warn("trying IE specific stuff");try{window.clipboardData.setData(o.format||"text",s),o.onCopy&&o.onCopy(window.clipboardData),L=!0}catch(a){i&&console.error("unable to copy using clipboardData: ",a),i&&console.error("falling back to prompt"),_=function format(s){var o=(/mac os x/i.test(navigator.userAgent)?"⌘":"Ctrl")+"+C";return s.replace(/#{\s*key\s*}/g,o)}("message"in o?o.message:"Copy to clipboard: #{key}, Enter"),window.prompt(_,s)}}finally{C&&("function"==typeof C.removeRange?C.removeRange(x):C.removeAllRanges()),j&&document.body.removeChild(j),w()}return L}},18073:(s,o,i)=>{var a=i(85087),u=i(54641),_=i(70981);s.exports=function createRecurry(s,o,i,w,x,C,j,L,B,$){var U=8&o;o|=U?32:64,4&(o&=~(U?64:32))||(o&=-4);var V=[s,o,x,U?C:void 0,U?j:void 0,U?void 0:C,U?void 0:j,L,B,$],z=i.apply(void 0,V);return a(s)&&u(z,V),z.placeholder=w,_(z,s,o)}},19123:(s,o,i)=>{var a=i(65606),u=i(31499),_=i(88310).Stream;function resolve(s,o,i){var a,_=function create_indent(s,o){return new Array(o||0).join(s||"")}(o,i=i||0),w=s;if("object"==typeof s&&((w=s[a=Object.keys(s)[0]])&&w._elem))return w._elem.name=a,w._elem.icount=i,w._elem.indent=o,w._elem.indents=_,w._elem.interrupt=w,w._elem;var x,C=[],j=[];function get_attributes(s){Object.keys(s).forEach((function(o){C.push(function attribute(s,o){return s+'="'+u(o)+'"'}(o,s[o]))}))}switch(typeof w){case"object":if(null===w)break;w._attr&&get_attributes(w._attr),w._cdata&&j.push(("<![CDATA["+w._cdata).replace(/\]\]>/g,"]]]]><![CDATA[>")+"]]>"),w.forEach&&(x=!1,j.push(""),w.forEach((function(s){"object"==typeof s?"_attr"==Object.keys(s)[0]?get_attributes(s._attr):j.push(resolve(s,o,i+1)):(j.pop(),x=!0,j.push(u(s)))})),x||j.push(""));break;default:j.push(u(w))}return{name:a,interrupt:!1,attributes:C,content:j,icount:i,indents:_,indent:o}}function format(s,o,i){if("object"!=typeof o)return s(!1,o);var a=o.interrupt?1:o.content.length;function proceed(){for(;o.content.length;){var u=o.content.shift();if(void 0!==u){if(interrupt(u))return;format(s,u)}}s(!1,(a>1?o.indents:"")+(o.name?"</"+o.name+">":"")+(o.indent&&!i?"\n":"")),i&&i()}function interrupt(o){return!!o.interrupt&&(o.interrupt.append=s,o.interrupt.end=proceed,o.interrupt=!1,s(!0),!0)}if(s(!1,o.indents+(o.name?"<"+o.name:"")+(o.attributes.length?" "+o.attributes.join(" "):"")+(a?o.name?">":"":o.name?"/>":"")+(o.indent&&a>1?"\n":"")),!a)return s(!1,o.indent?"\n":"");interrupt(o)||proceed()}s.exports=function xml(s,o){"object"!=typeof o&&(o={indent:o});var i=o.stream?new _:null,u="",w=!1,x=o.indent?!0===o.indent?"    ":o.indent:"",C=!0;function delay(s){C?a.nextTick(s):s()}function append(s,o){if(void 0!==o&&(u+=o),s&&!w&&(i=i||new _,w=!0),s&&w){var a=u;delay((function(){i.emit("data",a)})),u=""}}function add(s,o){format(append,resolve(s,x,x?1:0),o)}function end(){if(i){var s=u;delay((function(){i.emit("data",s),i.emit("end"),i.readable=!1,i.emit("close")}))}}return delay((function(){C=!1})),o.declaration&&function addXmlDeclaration(s){var o={version:"1.0",encoding:s.encoding||"UTF-8"};s.standalone&&(o.standalone=s.standalone),add({"?xml":{_attr:o}}),u=u.replace("/>","?>")}(o.declaration),s&&s.forEach?s.forEach((function(o,i){var a;i+1===s.length&&(a=end),add(o,a)})):add(s,end),i?(i.readable=!0,i):u},s.exports.element=s.exports.Element=function element(){var s={_elem:resolve(Array.prototype.slice.call(arguments)),push:function(s){if(!this.append)throw new Error("not assigned to a parent!");var o=this,i=this._elem.indent;format(this.append,resolve(s,i,this._elem.icount+(i?1:0)),(function(){o.append(!0)}))},close:function(s){void 0!==s&&this.push(s),this.end&&this.end()}};return s}},19219:s=>{s.exports=function cacheHas(s,o){return s.has(o)}},19287:s=>{"use strict";s.exports={CSSRuleList:0,CSSStyleDeclaration:0,CSSValueList:0,ClientRectList:0,DOMRectList:0,DOMStringList:0,DOMTokenList:1,DataTransferItemList:0,FileList:0,HTMLAllCollection:0,HTMLCollection:0,HTMLFormElement:0,HTMLSelectElement:0,MediaList:0,MimeTypeArray:0,NamedNodeMap:0,NodeList:1,PaintRequestList:0,Plugin:0,PluginArray:0,SVGLengthList:0,SVGNumberList:0,SVGPathSegList:0,SVGPointList:0,SVGStringList:0,SVGTransformList:0,SourceBufferList:0,StyleSheetList:0,TextTrackCueList:0,TextTrackList:0,TouchList:0}},19358:(s,o,i)=>{"use strict";var a=i(85582),u=i(49724),_=i(61626),w=i(88280),x=i(79192),C=i(19595),j=i(54829),L=i(34084),B=i(32096),$=i(39259),U=i(85884),V=i(39447),z=i(7376);s.exports=function(s,o,i,Y){var Z="stackTraceLimit",ee=Y?2:1,ie=s.split("."),ae=ie[ie.length-1],ce=a.apply(null,ie);if(ce){var le=ce.prototype;if(!z&&u(le,"cause")&&delete le.cause,!i)return ce;var pe=a("Error"),de=o((function(s,o){var i=B(Y?o:s,void 0),a=Y?new ce(s):new ce;return void 0!==i&&_(a,"message",i),U(a,de,a.stack,2),this&&w(le,this)&&L(a,this,de),arguments.length>ee&&$(a,arguments[ee]),a}));if(de.prototype=le,"Error"!==ae?x?x(de,pe):C(de,pe,{name:!0}):V&&Z in ce&&(j(de,ce,Z),j(de,ce,"prepareStackTrace")),C(de,ce),!z)try{le.name!==ae&&_(le,"name",ae),le.constructor=de}catch(s){}return de}}},19570:(s,o,i)=>{var a=i(37334),u=i(93243),_=i(83488),w=u?function(s,o){return u(s,"toString",{configurable:!0,enumerable:!1,value:a(o),writable:!0})}:_;s.exports=w},19595:(s,o,i)=>{"use strict";var a=i(49724),u=i(11042),_=i(13846),w=i(74284);s.exports=function(s,o,i){for(var x=u(o),C=w.f,j=_.f,L=0;L<x.length;L++){var B=x[L];a(s,B)||i&&a(i,B)||C(s,B,j(o,B))}}},19709:(s,o,i)=>{"use strict";var a=i(23034);s.exports=a},19846:(s,o,i)=>{"use strict";var a=i(20798),u=i(98828),_=i(45951).String;s.exports=!!Object.getOwnPropertySymbols&&!u((function(){var s=Symbol("symbol detection");return!_(s)||!(Object(s)instanceof Symbol)||!Symbol.sham&&a&&a<41}))},19931:(s,o,i)=>{var a=i(31769),u=i(68090),_=i(68969),w=i(77797);s.exports=function baseUnset(s,o){return o=a(o,s),null==(s=_(s,o))||delete s[w(u(o))]}},20181:(s,o,i)=>{var a=/^\s+|\s+$/g,u=/^[-+]0x[0-9a-f]+$/i,_=/^0b[01]+$/i,w=/^0o[0-7]+$/i,x=parseInt,C="object"==typeof i.g&&i.g&&i.g.Object===Object&&i.g,j="object"==typeof self&&self&&self.Object===Object&&self,L=C||j||Function("return this")(),B=Object.prototype.toString,$=Math.max,U=Math.min,now=function(){return L.Date.now()};function isObject(s){var o=typeof s;return!!s&&("object"==o||"function"==o)}function toNumber(s){if("number"==typeof s)return s;if(function isSymbol(s){return"symbol"==typeof s||function isObjectLike(s){return!!s&&"object"==typeof s}(s)&&"[object Symbol]"==B.call(s)}(s))return NaN;if(isObject(s)){var o="function"==typeof s.valueOf?s.valueOf():s;s=isObject(o)?o+"":o}if("string"!=typeof s)return 0===s?s:+s;s=s.replace(a,"");var i=_.test(s);return i||w.test(s)?x(s.slice(2),i?2:8):u.test(s)?NaN:+s}s.exports=function debounce(s,o,i){var a,u,_,w,x,C,j=0,L=!1,B=!1,V=!0;if("function"!=typeof s)throw new TypeError("Expected a function");function invokeFunc(o){var i=a,_=u;return a=u=void 0,j=o,w=s.apply(_,i)}function shouldInvoke(s){var i=s-C;return void 0===C||i>=o||i<0||B&&s-j>=_}function timerExpired(){var s=now();if(shouldInvoke(s))return trailingEdge(s);x=setTimeout(timerExpired,function remainingWait(s){var i=o-(s-C);return B?U(i,_-(s-j)):i}(s))}function trailingEdge(s){return x=void 0,V&&a?invokeFunc(s):(a=u=void 0,w)}function debounced(){var s=now(),i=shouldInvoke(s);if(a=arguments,u=this,C=s,i){if(void 0===x)return function leadingEdge(s){return j=s,x=setTimeout(timerExpired,o),L?invokeFunc(s):w}(C);if(B)return x=setTimeout(timerExpired,o),invokeFunc(C)}return void 0===x&&(x=setTimeout(timerExpired,o)),w}return o=toNumber(o)||0,isObject(i)&&(L=!!i.leading,_=(B="maxWait"in i)?$(toNumber(i.maxWait)||0,o):_,V="trailing"in i?!!i.trailing:V),debounced.cancel=function cancel(){void 0!==x&&clearTimeout(x),j=0,a=C=u=x=void 0},debounced.flush=function flush(){return void 0===x?w:trailingEdge(now())},debounced}},20317:s=>{s.exports=function mapToArray(s){var o=-1,i=Array(s.size);return s.forEach((function(s,a){i[++o]=[a,s]})),i}},20334:(s,o,i)=>{"use strict";var a=i(48287).Buffer;class NonError extends Error{constructor(s){super(NonError._prepareSuperMessage(s)),Object.defineProperty(this,"name",{value:"NonError",configurable:!0,writable:!0}),Error.captureStackTrace&&Error.captureStackTrace(this,NonError)}static _prepareSuperMessage(s){try{return JSON.stringify(s)}catch{return String(s)}}}const u=[{property:"name",enumerable:!1},{property:"message",enumerable:!1},{property:"stack",enumerable:!1},{property:"code",enumerable:!0}],_=Symbol(".toJSON called"),destroyCircular=({from:s,seen:o,to_:i,forceEnumerable:w,maxDepth:x,depth:C})=>{const j=i||(Array.isArray(s)?[]:{});if(o.push(s),C>=x)return j;if("function"==typeof s.toJSON&&!0!==s[_])return(s=>{s[_]=!0;const o=s.toJSON();return delete s[_],o})(s);for(const[i,u]of Object.entries(s))"function"==typeof a&&a.isBuffer(u)?j[i]="[object Buffer]":"function"!=typeof u&&(u&&"object"==typeof u?o.includes(s[i])?j[i]="[Circular]":(C++,j[i]=destroyCircular({from:s[i],seen:o.slice(),forceEnumerable:w,maxDepth:x,depth:C})):j[i]=u);for(const{property:o,enumerable:i}of u)"string"==typeof s[o]&&Object.defineProperty(j,o,{value:s[o],enumerable:!!w||i,configurable:!0,writable:!0});return j};s.exports={serializeError:(s,o={})=>{const{maxDepth:i=Number.POSITIVE_INFINITY}=o;return"object"==typeof s&&null!==s?destroyCircular({from:s,seen:[],forceEnumerable:!0,maxDepth:i,depth:0}):"function"==typeof s?`[Function: ${s.name||"anonymous"}]`:s},deserializeError:(s,o={})=>{const{maxDepth:i=Number.POSITIVE_INFINITY}=o;if(s instanceof Error)return s;if("object"==typeof s&&null!==s&&!Array.isArray(s)){const o=new Error;return destroyCircular({from:s,seen:[],to_:o,maxDepth:i,depth:0}),o}return new NonError(s)}}},20426:s=>{var o=Object.prototype.hasOwnProperty;s.exports=function baseHas(s,i){return null!=s&&o.call(s,i)}},20575:(s,o,i)=>{"use strict";var a=i(3121);s.exports=function(s){return a(s.length)}},20798:(s,o,i)=>{"use strict";var a,u,_=i(45951),w=i(96794),x=_.process,C=_.Deno,j=x&&x.versions||C&&C.version,L=j&&j.v8;L&&(u=(a=L.split("."))[0]>0&&a[0]<4?1:+(a[0]+a[1])),!u&&w&&(!(a=w.match(/Edge\/(\d+)/))||a[1]>=74)&&(a=w.match(/Chrome\/(\d+)/))&&(u=+a[1]),s.exports=u},20850:(s,o,i)=>{"use strict";s.exports=i(46076)},20999:(s,o,i)=>{var a=i(69302),u=i(36800);s.exports=function createAssigner(s){return a((function(o,i){var a=-1,_=i.length,w=_>1?i[_-1]:void 0,x=_>2?i[2]:void 0;for(w=s.length>3&&"function"==typeof w?(_--,w):void 0,x&&u(i[0],i[1],x)&&(w=_<3?void 0:w,_=1),o=Object(o);++a<_;){var C=i[a];C&&s(o,C,a,w)}return o}))}},21549:(s,o,i)=>{var a=i(22032),u=i(63862),_=i(66721),w=i(12749),x=i(35749);function Hash(s){var o=-1,i=null==s?0:s.length;for(this.clear();++o<i;){var a=s[o];this.set(a[0],a[1])}}Hash.prototype.clear=a,Hash.prototype.delete=u,Hash.prototype.get=_,Hash.prototype.has=w,Hash.prototype.set=x,s.exports=Hash},21791:(s,o,i)=>{var a=i(16547),u=i(43360);s.exports=function copyObject(s,o,i,_){var w=!i;i||(i={});for(var x=-1,C=o.length;++x<C;){var j=o[x],L=_?_(i[j],s[j],j,i,s):void 0;void 0===L&&(L=s[j]),w?u(i,j,L):a(i,j,L)}return i}},21986:(s,o,i)=>{var a=i(51873),u=i(37828),_=i(75288),w=i(25911),x=i(20317),C=i(84247),j=a?a.prototype:void 0,L=j?j.valueOf:void 0;s.exports=function equalByTag(s,o,i,a,j,B,$){switch(i){case"[object DataView]":if(s.byteLength!=o.byteLength||s.byteOffset!=o.byteOffset)return!1;s=s.buffer,o=o.buffer;case"[object ArrayBuffer]":return!(s.byteLength!=o.byteLength||!B(new u(s),new u(o)));case"[object Boolean]":case"[object Date]":case"[object Number]":return _(+s,+o);case"[object Error]":return s.name==o.name&&s.message==o.message;case"[object RegExp]":case"[object String]":return s==o+"";case"[object Map]":var U=x;case"[object Set]":var V=1&a;if(U||(U=C),s.size!=o.size&&!V)return!1;var z=$.get(s);if(z)return z==o;a|=2,$.set(s,o);var Y=w(U(s),U(o),a,j,B,$);return $.delete(s),Y;case"[object Symbol]":if(L)return L.call(s)==L.call(o)}return!1}},22032:(s,o,i)=>{var a=i(81042);s.exports=function hashClear(){this.__data__=a?a(null):{},this.size=0}},22225:s=>{var o="\\ud800-\\udfff",i="\\u2700-\\u27bf",a="a-z\\xdf-\\xf6\\xf8-\\xff",u="A-Z\\xc0-\\xd6\\xd8-\\xde",_="\\xac\\xb1\\xd7\\xf7\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf\\u2000-\\u206f \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000",w="["+_+"]",x="\\d+",C="["+i+"]",j="["+a+"]",L="[^"+o+_+x+i+a+u+"]",B="(?:\\ud83c[\\udde6-\\uddff]){2}",$="[\\ud800-\\udbff][\\udc00-\\udfff]",U="["+u+"]",V="(?:"+j+"|"+L+")",z="(?:"+U+"|"+L+")",Y="(?:['’](?:d|ll|m|re|s|t|ve))?",Z="(?:['’](?:D|LL|M|RE|S|T|VE))?",ee="(?:[\\u0300-\\u036f\\ufe20-\\ufe2f\\u20d0-\\u20ff]|\\ud83c[\\udffb-\\udfff])?",ie="[\\ufe0e\\ufe0f]?",ae=ie+ee+("(?:\\u200d(?:"+["[^"+o+"]",B,$].join("|")+")"+ie+ee+")*"),ce="(?:"+[C,B,$].join("|")+")"+ae,le=RegExp([U+"?"+j+"+"+Y+"(?="+[w,U,"$"].join("|")+")",z+"+"+Z+"(?="+[w,U+V,"$"].join("|")+")",U+"?"+V+"+"+Y,U+"+"+Z,"\\d*(?:1ST|2ND|3RD|(?![123])\\dTH)(?=\\b|[a-z_])","\\d*(?:1st|2nd|3rd|(?![123])\\dth)(?=\\b|[A-Z_])",x,ce].join("|"),"g");s.exports=function unicodeWords(s){return s.match(le)||[]}},22551:(s,o,i)=>{"use strict";var a=i(96540),u=i(69982);function p(s){for(var o="https://reactjs.org/docs/error-decoder.html?invariant="+s,i=1;i<arguments.length;i++)o+="&args[]="+encodeURIComponent(arguments[i]);return"Minified React error #"+s+"; visit "+o+" for the full message or use the non-minified dev environment for full errors and additional helpful warnings."}var _=new Set,w={};function fa(s,o){ha(s,o),ha(s+"Capture",o)}function ha(s,o){for(w[s]=o,s=0;s<o.length;s++)_.add(o[s])}var x=!("undefined"==typeof window||void 0===window.document||void 0===window.document.createElement),C=Object.prototype.hasOwnProperty,j=/^[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD][:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\-.0-9\u00B7\u0300-\u036F\u203F-\u2040]*$/,L={},B={};function v(s,o,i,a,u,_,w){this.acceptsBooleans=2===o||3===o||4===o,this.attributeName=a,this.attributeNamespace=u,this.mustUseProperty=i,this.propertyName=s,this.type=o,this.sanitizeURL=_,this.removeEmptyString=w}var $={};"children dangerouslySetInnerHTML defaultValue defaultChecked innerHTML suppressContentEditableWarning suppressHydrationWarning style".split(" ").forEach((function(s){$[s]=new v(s,0,!1,s,null,!1,!1)})),[["acceptCharset","accept-charset"],["className","class"],["htmlFor","for"],["httpEquiv","http-equiv"]].forEach((function(s){var o=s[0];$[o]=new v(o,1,!1,s[1],null,!1,!1)})),["contentEditable","draggable","spellCheck","value"].forEach((function(s){$[s]=new v(s,2,!1,s.toLowerCase(),null,!1,!1)})),["autoReverse","externalResourcesRequired","focusable","preserveAlpha"].forEach((function(s){$[s]=new v(s,2,!1,s,null,!1,!1)})),"allowFullScreen async autoFocus autoPlay controls default defer disabled disablePictureInPicture disableRemotePlayback formNoValidate hidden loop noModule noValidate open playsInline readOnly required reversed scoped seamless itemScope".split(" ").forEach((function(s){$[s]=new v(s,3,!1,s.toLowerCase(),null,!1,!1)})),["checked","multiple","muted","selected"].forEach((function(s){$[s]=new v(s,3,!0,s,null,!1,!1)})),["capture","download"].forEach((function(s){$[s]=new v(s,4,!1,s,null,!1,!1)})),["cols","rows","size","span"].forEach((function(s){$[s]=new v(s,6,!1,s,null,!1,!1)})),["rowSpan","start"].forEach((function(s){$[s]=new v(s,5,!1,s.toLowerCase(),null,!1,!1)}));var U=/[\-:]([a-z])/g;function sa(s){return s[1].toUpperCase()}function ta(s,o,i,a){var u=$.hasOwnProperty(o)?$[o]:null;(null!==u?0!==u.type:a||!(2<o.length)||"o"!==o[0]&&"O"!==o[0]||"n"!==o[1]&&"N"!==o[1])&&(function qa(s,o,i,a){if(null==o||function pa(s,o,i,a){if(null!==i&&0===i.type)return!1;switch(typeof o){case"function":case"symbol":return!0;case"boolean":return!a&&(null!==i?!i.acceptsBooleans:"data-"!==(s=s.toLowerCase().slice(0,5))&&"aria-"!==s);default:return!1}}(s,o,i,a))return!0;if(a)return!1;if(null!==i)switch(i.type){case 3:return!o;case 4:return!1===o;case 5:return isNaN(o);case 6:return isNaN(o)||1>o}return!1}(o,i,u,a)&&(i=null),a||null===u?function oa(s){return!!C.call(B,s)||!C.call(L,s)&&(j.test(s)?B[s]=!0:(L[s]=!0,!1))}(o)&&(null===i?s.removeAttribute(o):s.setAttribute(o,""+i)):u.mustUseProperty?s[u.propertyName]=null===i?3!==u.type&&"":i:(o=u.attributeName,a=u.attributeNamespace,null===i?s.removeAttribute(o):(i=3===(u=u.type)||4===u&&!0===i?"":""+i,a?s.setAttributeNS(a,o,i):s.setAttribute(o,i))))}"accent-height alignment-baseline arabic-form baseline-shift cap-height clip-path clip-rule color-interpolation color-interpolation-filters color-profile color-rendering dominant-baseline enable-background fill-opacity fill-rule flood-color flood-opacity font-family font-size font-size-adjust font-stretch font-style font-variant font-weight glyph-name glyph-orientation-horizontal glyph-orientation-vertical horiz-adv-x horiz-origin-x image-rendering letter-spacing lighting-color marker-end marker-mid marker-start overline-position overline-thickness paint-order panose-1 pointer-events rendering-intent shape-rendering stop-color stop-opacity strikethrough-position strikethrough-thickness stroke-dasharray stroke-dashoffset stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity stroke-width text-anchor text-decoration text-rendering underline-position underline-thickness unicode-bidi unicode-range units-per-em v-alphabetic v-hanging v-ideographic v-mathematical vector-effect vert-adv-y vert-origin-x vert-origin-y word-spacing writing-mode xmlns:xlink x-height".split(" ").forEach((function(s){var o=s.replace(U,sa);$[o]=new v(o,1,!1,s,null,!1,!1)})),"xlink:actuate xlink:arcrole xlink:role xlink:show xlink:title xlink:type".split(" ").forEach((function(s){var o=s.replace(U,sa);$[o]=new v(o,1,!1,s,"http://www.w3.org/1999/xlink",!1,!1)})),["xml:base","xml:lang","xml:space"].forEach((function(s){var o=s.replace(U,sa);$[o]=new v(o,1,!1,s,"http://www.w3.org/XML/1998/namespace",!1,!1)})),["tabIndex","crossOrigin"].forEach((function(s){$[s]=new v(s,1,!1,s.toLowerCase(),null,!1,!1)})),$.xlinkHref=new v("xlinkHref",1,!1,"xlink:href","http://www.w3.org/1999/xlink",!0,!1),["src","href","action","formAction"].forEach((function(s){$[s]=new v(s,1,!1,s.toLowerCase(),null,!0,!0)}));var V=a.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED,z=Symbol.for("react.element"),Y=Symbol.for("react.portal"),Z=Symbol.for("react.fragment"),ee=Symbol.for("react.strict_mode"),ie=Symbol.for("react.profiler"),ae=Symbol.for("react.provider"),ce=Symbol.for("react.context"),le=Symbol.for("react.forward_ref"),pe=Symbol.for("react.suspense"),de=Symbol.for("react.suspense_list"),fe=Symbol.for("react.memo"),ye=Symbol.for("react.lazy");Symbol.for("react.scope"),Symbol.for("react.debug_trace_mode");var be=Symbol.for("react.offscreen");Symbol.for("react.legacy_hidden"),Symbol.for("react.cache"),Symbol.for("react.tracing_marker");var _e=Symbol.iterator;function Ka(s){return null===s||"object"!=typeof s?null:"function"==typeof(s=_e&&s[_e]||s["@@iterator"])?s:null}var Se,we=Object.assign;function Ma(s){if(void 0===Se)try{throw Error()}catch(s){var o=s.stack.trim().match(/\n( *(at )?)/);Se=o&&o[1]||""}return"\n"+Se+s}var xe=!1;function Oa(s,o){if(!s||xe)return"";xe=!0;var i=Error.prepareStackTrace;Error.prepareStackTrace=void 0;try{if(o)if(o=function(){throw Error()},Object.defineProperty(o.prototype,"props",{set:function(){throw Error()}}),"object"==typeof Reflect&&Reflect.construct){try{Reflect.construct(o,[])}catch(s){var a=s}Reflect.construct(s,[],o)}else{try{o.call()}catch(s){a=s}s.call(o.prototype)}else{try{throw Error()}catch(s){a=s}s()}}catch(o){if(o&&a&&"string"==typeof o.stack){for(var u=o.stack.split("\n"),_=a.stack.split("\n"),w=u.length-1,x=_.length-1;1<=w&&0<=x&&u[w]!==_[x];)x--;for(;1<=w&&0<=x;w--,x--)if(u[w]!==_[x]){if(1!==w||1!==x)do{if(w--,0>--x||u[w]!==_[x]){var C="\n"+u[w].replace(" at new "," at ");return s.displayName&&C.includes("<anonymous>")&&(C=C.replace("<anonymous>",s.displayName)),C}}while(1<=w&&0<=x);break}}}finally{xe=!1,Error.prepareStackTrace=i}return(s=s?s.displayName||s.name:"")?Ma(s):""}function Pa(s){switch(s.tag){case 5:return Ma(s.type);case 16:return Ma("Lazy");case 13:return Ma("Suspense");case 19:return Ma("SuspenseList");case 0:case 2:case 15:return s=Oa(s.type,!1);case 11:return s=Oa(s.type.render,!1);case 1:return s=Oa(s.type,!0);default:return""}}function Qa(s){if(null==s)return null;if("function"==typeof s)return s.displayName||s.name||null;if("string"==typeof s)return s;switch(s){case Z:return"Fragment";case Y:return"Portal";case ie:return"Profiler";case ee:return"StrictMode";case pe:return"Suspense";case de:return"SuspenseList"}if("object"==typeof s)switch(s.$$typeof){case ce:return(s.displayName||"Context")+".Consumer";case ae:return(s._context.displayName||"Context")+".Provider";case le:var o=s.render;return(s=s.displayName)||(s=""!==(s=o.displayName||o.name||"")?"ForwardRef("+s+")":"ForwardRef"),s;case fe:return null!==(o=s.displayName||null)?o:Qa(s.type)||"Memo";case ye:o=s._payload,s=s._init;try{return Qa(s(o))}catch(s){}}return null}function Ra(s){var o=s.type;switch(s.tag){case 24:return"Cache";case 9:return(o.displayName||"Context")+".Consumer";case 10:return(o._context.displayName||"Context")+".Provider";case 18:return"DehydratedFragment";case 11:return s=(s=o.render).displayName||s.name||"",o.displayName||(""!==s?"ForwardRef("+s+")":"ForwardRef");case 7:return"Fragment";case 5:return o;case 4:return"Portal";case 3:return"Root";case 6:return"Text";case 16:return Qa(o);case 8:return o===ee?"StrictMode":"Mode";case 22:return"Offscreen";case 12:return"Profiler";case 21:return"Scope";case 13:return"Suspense";case 19:return"SuspenseList";case 25:return"TracingMarker";case 1:case 0:case 17:case 2:case 14:case 15:if("function"==typeof o)return o.displayName||o.name||null;if("string"==typeof o)return o}return null}function Sa(s){switch(typeof s){case"boolean":case"number":case"string":case"undefined":case"object":return s;default:return""}}function Ta(s){var o=s.type;return(s=s.nodeName)&&"input"===s.toLowerCase()&&("checkbox"===o||"radio"===o)}function Va(s){s._valueTracker||(s._valueTracker=function Ua(s){var o=Ta(s)?"checked":"value",i=Object.getOwnPropertyDescriptor(s.constructor.prototype,o),a=""+s[o];if(!s.hasOwnProperty(o)&&void 0!==i&&"function"==typeof i.get&&"function"==typeof i.set){var u=i.get,_=i.set;return Object.defineProperty(s,o,{configurable:!0,get:function(){return u.call(this)},set:function(s){a=""+s,_.call(this,s)}}),Object.defineProperty(s,o,{enumerable:i.enumerable}),{getValue:function(){return a},setValue:function(s){a=""+s},stopTracking:function(){s._valueTracker=null,delete s[o]}}}}(s))}function Wa(s){if(!s)return!1;var o=s._valueTracker;if(!o)return!0;var i=o.getValue(),a="";return s&&(a=Ta(s)?s.checked?"true":"false":s.value),(s=a)!==i&&(o.setValue(s),!0)}function Xa(s){if(void 0===(s=s||("undefined"!=typeof document?document:void 0)))return null;try{return s.activeElement||s.body}catch(o){return s.body}}function Ya(s,o){var i=o.checked;return we({},o,{defaultChecked:void 0,defaultValue:void 0,value:void 0,checked:null!=i?i:s._wrapperState.initialChecked})}function Za(s,o){var i=null==o.defaultValue?"":o.defaultValue,a=null!=o.checked?o.checked:o.defaultChecked;i=Sa(null!=o.value?o.value:i),s._wrapperState={initialChecked:a,initialValue:i,controlled:"checkbox"===o.type||"radio"===o.type?null!=o.checked:null!=o.value}}function ab(s,o){null!=(o=o.checked)&&ta(s,"checked",o,!1)}function bb(s,o){ab(s,o);var i=Sa(o.value),a=o.type;if(null!=i)"number"===a?(0===i&&""===s.value||s.value!=i)&&(s.value=""+i):s.value!==""+i&&(s.value=""+i);else if("submit"===a||"reset"===a)return void s.removeAttribute("value");o.hasOwnProperty("value")?cb(s,o.type,i):o.hasOwnProperty("defaultValue")&&cb(s,o.type,Sa(o.defaultValue)),null==o.checked&&null!=o.defaultChecked&&(s.defaultChecked=!!o.defaultChecked)}function db(s,o,i){if(o.hasOwnProperty("value")||o.hasOwnProperty("defaultValue")){var a=o.type;if(!("submit"!==a&&"reset"!==a||void 0!==o.value&&null!==o.value))return;o=""+s._wrapperState.initialValue,i||o===s.value||(s.value=o),s.defaultValue=o}""!==(i=s.name)&&(s.name=""),s.defaultChecked=!!s._wrapperState.initialChecked,""!==i&&(s.name=i)}function cb(s,o,i){"number"===o&&Xa(s.ownerDocument)===s||(null==i?s.defaultValue=""+s._wrapperState.initialValue:s.defaultValue!==""+i&&(s.defaultValue=""+i))}var Pe=Array.isArray;function fb(s,o,i,a){if(s=s.options,o){o={};for(var u=0;u<i.length;u++)o["$"+i[u]]=!0;for(i=0;i<s.length;i++)u=o.hasOwnProperty("$"+s[i].value),s[i].selected!==u&&(s[i].selected=u),u&&a&&(s[i].defaultSelected=!0)}else{for(i=""+Sa(i),o=null,u=0;u<s.length;u++){if(s[u].value===i)return s[u].selected=!0,void(a&&(s[u].defaultSelected=!0));null!==o||s[u].disabled||(o=s[u])}null!==o&&(o.selected=!0)}}function gb(s,o){if(null!=o.dangerouslySetInnerHTML)throw Error(p(91));return we({},o,{value:void 0,defaultValue:void 0,children:""+s._wrapperState.initialValue})}function hb(s,o){var i=o.value;if(null==i){if(i=o.children,o=o.defaultValue,null!=i){if(null!=o)throw Error(p(92));if(Pe(i)){if(1<i.length)throw Error(p(93));i=i[0]}o=i}null==o&&(o=""),i=o}s._wrapperState={initialValue:Sa(i)}}function ib(s,o){var i=Sa(o.value),a=Sa(o.defaultValue);null!=i&&((i=""+i)!==s.value&&(s.value=i),null==o.defaultValue&&s.defaultValue!==i&&(s.defaultValue=i)),null!=a&&(s.defaultValue=""+a)}function jb(s){var o=s.textContent;o===s._wrapperState.initialValue&&""!==o&&null!==o&&(s.value=o)}function kb(s){switch(s){case"svg":return"http://www.w3.org/2000/svg";case"math":return"http://www.w3.org/1998/Math/MathML";default:return"http://www.w3.org/1999/xhtml"}}function lb(s,o){return null==s||"http://www.w3.org/1999/xhtml"===s?kb(o):"http://www.w3.org/2000/svg"===s&&"foreignObject"===o?"http://www.w3.org/1999/xhtml":s}var Te,Re,$e=(Re=function(s,o){if("http://www.w3.org/2000/svg"!==s.namespaceURI||"innerHTML"in s)s.innerHTML=o;else{for((Te=Te||document.createElement("div")).innerHTML="<svg>"+o.valueOf().toString()+"</svg>",o=Te.firstChild;s.firstChild;)s.removeChild(s.firstChild);for(;o.firstChild;)s.appendChild(o.firstChild)}},"undefined"!=typeof MSApp&&MSApp.execUnsafeLocalFunction?function(s,o,i,a){MSApp.execUnsafeLocalFunction((function(){return Re(s,o)}))}:Re);function ob(s,o){if(o){var i=s.firstChild;if(i&&i===s.lastChild&&3===i.nodeType)return void(i.nodeValue=o)}s.textContent=o}var qe={animationIterationCount:!0,aspectRatio:!0,borderImageOutset:!0,borderImageSlice:!0,borderImageWidth:!0,boxFlex:!0,boxFlexGroup:!0,boxOrdinalGroup:!0,columnCount:!0,columns:!0,flex:!0,flexGrow:!0,flexPositive:!0,flexShrink:!0,flexNegative:!0,flexOrder:!0,gridArea:!0,gridRow:!0,gridRowEnd:!0,gridRowSpan:!0,gridRowStart:!0,gridColumn:!0,gridColumnEnd:!0,gridColumnSpan:!0,gridColumnStart:!0,fontWeight:!0,lineClamp:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,tabSize:!0,widows:!0,zIndex:!0,zoom:!0,fillOpacity:!0,floodOpacity:!0,stopOpacity:!0,strokeDasharray:!0,strokeDashoffset:!0,strokeMiterlimit:!0,strokeOpacity:!0,strokeWidth:!0},ze=["Webkit","ms","Moz","O"];function rb(s,o,i){return null==o||"boolean"==typeof o||""===o?"":i||"number"!=typeof o||0===o||qe.hasOwnProperty(s)&&qe[s]?(""+o).trim():o+"px"}function sb(s,o){for(var i in s=s.style,o)if(o.hasOwnProperty(i)){var a=0===i.indexOf("--"),u=rb(i,o[i],a);"float"===i&&(i="cssFloat"),a?s.setProperty(i,u):s[i]=u}}Object.keys(qe).forEach((function(s){ze.forEach((function(o){o=o+s.charAt(0).toUpperCase()+s.substring(1),qe[o]=qe[s]}))}));var We=we({menuitem:!0},{area:!0,base:!0,br:!0,col:!0,embed:!0,hr:!0,img:!0,input:!0,keygen:!0,link:!0,meta:!0,param:!0,source:!0,track:!0,wbr:!0});function ub(s,o){if(o){if(We[s]&&(null!=o.children||null!=o.dangerouslySetInnerHTML))throw Error(p(137,s));if(null!=o.dangerouslySetInnerHTML){if(null!=o.children)throw Error(p(60));if("object"!=typeof o.dangerouslySetInnerHTML||!("__html"in o.dangerouslySetInnerHTML))throw Error(p(61))}if(null!=o.style&&"object"!=typeof o.style)throw Error(p(62))}}function vb(s,o){if(-1===s.indexOf("-"))return"string"==typeof o.is;switch(s){case"annotation-xml":case"color-profile":case"font-face":case"font-face-src":case"font-face-uri":case"font-face-format":case"font-face-name":case"missing-glyph":return!1;default:return!0}}var He=null;function xb(s){return(s=s.target||s.srcElement||window).correspondingUseElement&&(s=s.correspondingUseElement),3===s.nodeType?s.parentNode:s}var Ye=null,Xe=null,Qe=null;function Bb(s){if(s=Cb(s)){if("function"!=typeof Ye)throw Error(p(280));var o=s.stateNode;o&&(o=Db(o),Ye(s.stateNode,s.type,o))}}function Eb(s){Xe?Qe?Qe.push(s):Qe=[s]:Xe=s}function Fb(){if(Xe){var s=Xe,o=Qe;if(Qe=Xe=null,Bb(s),o)for(s=0;s<o.length;s++)Bb(o[s])}}function Gb(s,o){return s(o)}function Hb(){}var et=!1;function Jb(s,o,i){if(et)return s(o,i);et=!0;try{return Gb(s,o,i)}finally{et=!1,(null!==Xe||null!==Qe)&&(Hb(),Fb())}}function Kb(s,o){var i=s.stateNode;if(null===i)return null;var a=Db(i);if(null===a)return null;i=a[o];e:switch(o){case"onClick":case"onClickCapture":case"onDoubleClick":case"onDoubleClickCapture":case"onMouseDown":case"onMouseDownCapture":case"onMouseMove":case"onMouseMoveCapture":case"onMouseUp":case"onMouseUpCapture":case"onMouseEnter":(a=!a.disabled)||(a=!("button"===(s=s.type)||"input"===s||"select"===s||"textarea"===s)),s=!a;break e;default:s=!1}if(s)return null;if(i&&"function"!=typeof i)throw Error(p(231,o,typeof i));return i}var tt=!1;if(x)try{var rt={};Object.defineProperty(rt,"passive",{get:function(){tt=!0}}),window.addEventListener("test",rt,rt),window.removeEventListener("test",rt,rt)}catch(Re){tt=!1}function Nb(s,o,i,a,u,_,w,x,C){var j=Array.prototype.slice.call(arguments,3);try{o.apply(i,j)}catch(s){this.onError(s)}}var nt=!1,st=null,ot=!1,it=null,at={onError:function(s){nt=!0,st=s}};function Tb(s,o,i,a,u,_,w,x,C){nt=!1,st=null,Nb.apply(at,arguments)}function Vb(s){var o=s,i=s;if(s.alternate)for(;o.return;)o=o.return;else{s=o;do{!!(4098&(o=s).flags)&&(i=o.return),s=o.return}while(s)}return 3===o.tag?i:null}function Wb(s){if(13===s.tag){var o=s.memoizedState;if(null===o&&(null!==(s=s.alternate)&&(o=s.memoizedState)),null!==o)return o.dehydrated}return null}function Xb(s){if(Vb(s)!==s)throw Error(p(188))}function Zb(s){return null!==(s=function Yb(s){var o=s.alternate;if(!o){if(null===(o=Vb(s)))throw Error(p(188));return o!==s?null:s}for(var i=s,a=o;;){var u=i.return;if(null===u)break;var _=u.alternate;if(null===_){if(null!==(a=u.return)){i=a;continue}break}if(u.child===_.child){for(_=u.child;_;){if(_===i)return Xb(u),s;if(_===a)return Xb(u),o;_=_.sibling}throw Error(p(188))}if(i.return!==a.return)i=u,a=_;else{for(var w=!1,x=u.child;x;){if(x===i){w=!0,i=u,a=_;break}if(x===a){w=!0,a=u,i=_;break}x=x.sibling}if(!w){for(x=_.child;x;){if(x===i){w=!0,i=_,a=u;break}if(x===a){w=!0,a=_,i=u;break}x=x.sibling}if(!w)throw Error(p(189))}}if(i.alternate!==a)throw Error(p(190))}if(3!==i.tag)throw Error(p(188));return i.stateNode.current===i?s:o}(s))?$b(s):null}function $b(s){if(5===s.tag||6===s.tag)return s;for(s=s.child;null!==s;){var o=$b(s);if(null!==o)return o;s=s.sibling}return null}var ct=u.unstable_scheduleCallback,lt=u.unstable_cancelCallback,ut=u.unstable_shouldYield,pt=u.unstable_requestPaint,ht=u.unstable_now,dt=u.unstable_getCurrentPriorityLevel,mt=u.unstable_ImmediatePriority,gt=u.unstable_UserBlockingPriority,yt=u.unstable_NormalPriority,vt=u.unstable_LowPriority,bt=u.unstable_IdlePriority,_t=null,St=null;var Et=Math.clz32?Math.clz32:function nc(s){return s>>>=0,0===s?32:31-(wt(s)/xt|0)|0},wt=Math.log,xt=Math.LN2;var kt=64,Ot=4194304;function tc(s){switch(s&-s){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return 4194240&s;case 4194304:case 8388608:case 16777216:case 33554432:case 67108864:return 130023424&s;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 1073741824:return 1073741824;default:return s}}function uc(s,o){var i=s.pendingLanes;if(0===i)return 0;var a=0,u=s.suspendedLanes,_=s.pingedLanes,w=268435455&i;if(0!==w){var x=w&~u;0!==x?a=tc(x):0!==(_&=w)&&(a=tc(_))}else 0!==(w=i&~u)?a=tc(w):0!==_&&(a=tc(_));if(0===a)return 0;if(0!==o&&o!==a&&!(o&u)&&((u=a&-a)>=(_=o&-o)||16===u&&4194240&_))return o;if(4&a&&(a|=16&i),0!==(o=s.entangledLanes))for(s=s.entanglements,o&=a;0<o;)u=1<<(i=31-Et(o)),a|=s[i],o&=~u;return a}function vc(s,o){switch(s){case 1:case 2:case 4:return o+250;case 8:case 16:case 32:case 64:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return o+5e3;default:return-1}}function xc(s){return 0!==(s=-1073741825&s.pendingLanes)?s:1073741824&s?1073741824:0}function yc(){var s=kt;return!(4194240&(kt<<=1))&&(kt=64),s}function zc(s){for(var o=[],i=0;31>i;i++)o.push(s);return o}function Ac(s,o,i){s.pendingLanes|=o,536870912!==o&&(s.suspendedLanes=0,s.pingedLanes=0),(s=s.eventTimes)[o=31-Et(o)]=i}function Cc(s,o){var i=s.entangledLanes|=o;for(s=s.entanglements;i;){var a=31-Et(i),u=1<<a;u&o|s[a]&o&&(s[a]|=o),i&=~u}}var At=0;function Dc(s){return 1<(s&=-s)?4<s?268435455&s?16:536870912:4:1}var Ct,jt,Pt,It,Tt,Nt=!1,Mt=[],Rt=null,Dt=null,Lt=null,Ft=new Map,Bt=new Map,$t=[],qt="mousedown mouseup touchcancel touchend touchstart auxclick dblclick pointercancel pointerdown pointerup dragend dragstart drop compositionend compositionstart keydown keypress keyup input textInput copy cut paste click change contextmenu reset submit".split(" ");function Sc(s,o){switch(s){case"focusin":case"focusout":Rt=null;break;case"dragenter":case"dragleave":Dt=null;break;case"mouseover":case"mouseout":Lt=null;break;case"pointerover":case"pointerout":Ft.delete(o.pointerId);break;case"gotpointercapture":case"lostpointercapture":Bt.delete(o.pointerId)}}function Tc(s,o,i,a,u,_){return null===s||s.nativeEvent!==_?(s={blockedOn:o,domEventName:i,eventSystemFlags:a,nativeEvent:_,targetContainers:[u]},null!==o&&(null!==(o=Cb(o))&&jt(o)),s):(s.eventSystemFlags|=a,o=s.targetContainers,null!==u&&-1===o.indexOf(u)&&o.push(u),s)}function Vc(s){var o=Wc(s.target);if(null!==o){var i=Vb(o);if(null!==i)if(13===(o=i.tag)){if(null!==(o=Wb(i)))return s.blockedOn=o,void Tt(s.priority,(function(){Pt(i)}))}else if(3===o&&i.stateNode.current.memoizedState.isDehydrated)return void(s.blockedOn=3===i.tag?i.stateNode.containerInfo:null)}s.blockedOn=null}function Xc(s){if(null!==s.blockedOn)return!1;for(var o=s.targetContainers;0<o.length;){var i=Yc(s.domEventName,s.eventSystemFlags,o[0],s.nativeEvent);if(null!==i)return null!==(o=Cb(i))&&jt(o),s.blockedOn=i,!1;var a=new(i=s.nativeEvent).constructor(i.type,i);He=a,i.target.dispatchEvent(a),He=null,o.shift()}return!0}function Zc(s,o,i){Xc(s)&&i.delete(o)}function $c(){Nt=!1,null!==Rt&&Xc(Rt)&&(Rt=null),null!==Dt&&Xc(Dt)&&(Dt=null),null!==Lt&&Xc(Lt)&&(Lt=null),Ft.forEach(Zc),Bt.forEach(Zc)}function ad(s,o){s.blockedOn===o&&(s.blockedOn=null,Nt||(Nt=!0,u.unstable_scheduleCallback(u.unstable_NormalPriority,$c)))}function bd(s){function b(o){return ad(o,s)}if(0<Mt.length){ad(Mt[0],s);for(var o=1;o<Mt.length;o++){var i=Mt[o];i.blockedOn===s&&(i.blockedOn=null)}}for(null!==Rt&&ad(Rt,s),null!==Dt&&ad(Dt,s),null!==Lt&&ad(Lt,s),Ft.forEach(b),Bt.forEach(b),o=0;o<$t.length;o++)(i=$t[o]).blockedOn===s&&(i.blockedOn=null);for(;0<$t.length&&null===(o=$t[0]).blockedOn;)Vc(o),null===o.blockedOn&&$t.shift()}var Ut=V.ReactCurrentBatchConfig,Vt=!0;function ed(s,o,i,a){var u=At,_=Ut.transition;Ut.transition=null;try{At=1,fd(s,o,i,a)}finally{At=u,Ut.transition=_}}function gd(s,o,i,a){var u=At,_=Ut.transition;Ut.transition=null;try{At=4,fd(s,o,i,a)}finally{At=u,Ut.transition=_}}function fd(s,o,i,a){if(Vt){var u=Yc(s,o,i,a);if(null===u)hd(s,o,a,zt,i),Sc(s,a);else if(function Uc(s,o,i,a,u){switch(o){case"focusin":return Rt=Tc(Rt,s,o,i,a,u),!0;case"dragenter":return Dt=Tc(Dt,s,o,i,a,u),!0;case"mouseover":return Lt=Tc(Lt,s,o,i,a,u),!0;case"pointerover":var _=u.pointerId;return Ft.set(_,Tc(Ft.get(_)||null,s,o,i,a,u)),!0;case"gotpointercapture":return _=u.pointerId,Bt.set(_,Tc(Bt.get(_)||null,s,o,i,a,u)),!0}return!1}(u,s,o,i,a))a.stopPropagation();else if(Sc(s,a),4&o&&-1<qt.indexOf(s)){for(;null!==u;){var _=Cb(u);if(null!==_&&Ct(_),null===(_=Yc(s,o,i,a))&&hd(s,o,a,zt,i),_===u)break;u=_}null!==u&&a.stopPropagation()}else hd(s,o,a,null,i)}}var zt=null;function Yc(s,o,i,a){if(zt=null,null!==(s=Wc(s=xb(a))))if(null===(o=Vb(s)))s=null;else if(13===(i=o.tag)){if(null!==(s=Wb(o)))return s;s=null}else if(3===i){if(o.stateNode.current.memoizedState.isDehydrated)return 3===o.tag?o.stateNode.containerInfo:null;s=null}else o!==s&&(s=null);return zt=s,null}function jd(s){switch(s){case"cancel":case"click":case"close":case"contextmenu":case"copy":case"cut":case"auxclick":case"dblclick":case"dragend":case"dragstart":case"drop":case"focusin":case"focusout":case"input":case"invalid":case"keydown":case"keypress":case"keyup":case"mousedown":case"mouseup":case"paste":case"pause":case"play":case"pointercancel":case"pointerdown":case"pointerup":case"ratechange":case"reset":case"resize":case"seeked":case"submit":case"touchcancel":case"touchend":case"touchstart":case"volumechange":case"change":case"selectionchange":case"textInput":case"compositionstart":case"compositionend":case"compositionupdate":case"beforeblur":case"afterblur":case"beforeinput":case"blur":case"fullscreenchange":case"focus":case"hashchange":case"popstate":case"select":case"selectstart":return 1;case"drag":case"dragenter":case"dragexit":case"dragleave":case"dragover":case"mousemove":case"mouseout":case"mouseover":case"pointermove":case"pointerout":case"pointerover":case"scroll":case"toggle":case"touchmove":case"wheel":case"mouseenter":case"mouseleave":case"pointerenter":case"pointerleave":return 4;case"message":switch(dt()){case mt:return 1;case gt:return 4;case yt:case vt:return 16;case bt:return 536870912;default:return 16}default:return 16}}var Wt=null,Jt=null,Ht=null;function nd(){if(Ht)return Ht;var s,o,i=Jt,a=i.length,u="value"in Wt?Wt.value:Wt.textContent,_=u.length;for(s=0;s<a&&i[s]===u[s];s++);var w=a-s;for(o=1;o<=w&&i[a-o]===u[_-o];o++);return Ht=u.slice(s,1<o?1-o:void 0)}function od(s){var o=s.keyCode;return"charCode"in s?0===(s=s.charCode)&&13===o&&(s=13):s=o,10===s&&(s=13),32<=s||13===s?s:0}function pd(){return!0}function qd(){return!1}function rd(s){function b(o,i,a,u,_){for(var w in this._reactName=o,this._targetInst=a,this.type=i,this.nativeEvent=u,this.target=_,this.currentTarget=null,s)s.hasOwnProperty(w)&&(o=s[w],this[w]=o?o(u):u[w]);return this.isDefaultPrevented=(null!=u.defaultPrevented?u.defaultPrevented:!1===u.returnValue)?pd:qd,this.isPropagationStopped=qd,this}return we(b.prototype,{preventDefault:function(){this.defaultPrevented=!0;var s=this.nativeEvent;s&&(s.preventDefault?s.preventDefault():"unknown"!=typeof s.returnValue&&(s.returnValue=!1),this.isDefaultPrevented=pd)},stopPropagation:function(){var s=this.nativeEvent;s&&(s.stopPropagation?s.stopPropagation():"unknown"!=typeof s.cancelBubble&&(s.cancelBubble=!0),this.isPropagationStopped=pd)},persist:function(){},isPersistent:pd}),b}var Kt,Gt,Yt,Xt={eventPhase:0,bubbles:0,cancelable:0,timeStamp:function(s){return s.timeStamp||Date.now()},defaultPrevented:0,isTrusted:0},Qt=rd(Xt),Zt=we({},Xt,{view:0,detail:0}),er=rd(Zt),tr=we({},Zt,{screenX:0,screenY:0,clientX:0,clientY:0,pageX:0,pageY:0,ctrlKey:0,shiftKey:0,altKey:0,metaKey:0,getModifierState:zd,button:0,buttons:0,relatedTarget:function(s){return void 0===s.relatedTarget?s.fromElement===s.srcElement?s.toElement:s.fromElement:s.relatedTarget},movementX:function(s){return"movementX"in s?s.movementX:(s!==Yt&&(Yt&&"mousemove"===s.type?(Kt=s.screenX-Yt.screenX,Gt=s.screenY-Yt.screenY):Gt=Kt=0,Yt=s),Kt)},movementY:function(s){return"movementY"in s?s.movementY:Gt}}),rr=rd(tr),nr=rd(we({},tr,{dataTransfer:0})),sr=rd(we({},Zt,{relatedTarget:0})),ir=rd(we({},Xt,{animationName:0,elapsedTime:0,pseudoElement:0})),ar=we({},Xt,{clipboardData:function(s){return"clipboardData"in s?s.clipboardData:window.clipboardData}}),cr=rd(ar),lr=rd(we({},Xt,{data:0})),ur={Esc:"Escape",Spacebar:" ",Left:"ArrowLeft",Up:"ArrowUp",Right:"ArrowRight",Down:"ArrowDown",Del:"Delete",Win:"OS",Menu:"ContextMenu",Apps:"ContextMenu",Scroll:"ScrollLock",MozPrintableKey:"Unidentified"},pr={8:"Backspace",9:"Tab",12:"Clear",13:"Enter",16:"Shift",17:"Control",18:"Alt",19:"Pause",20:"CapsLock",27:"Escape",32:" ",33:"PageUp",34:"PageDown",35:"End",36:"Home",37:"ArrowLeft",38:"ArrowUp",39:"ArrowRight",40:"ArrowDown",45:"Insert",46:"Delete",112:"F1",113:"F2",114:"F3",115:"F4",116:"F5",117:"F6",118:"F7",119:"F8",120:"F9",121:"F10",122:"F11",123:"F12",144:"NumLock",145:"ScrollLock",224:"Meta"},dr={Alt:"altKey",Control:"ctrlKey",Meta:"metaKey",Shift:"shiftKey"};function Pd(s){var o=this.nativeEvent;return o.getModifierState?o.getModifierState(s):!!(s=dr[s])&&!!o[s]}function zd(){return Pd}var fr=we({},Zt,{key:function(s){if(s.key){var o=ur[s.key]||s.key;if("Unidentified"!==o)return o}return"keypress"===s.type?13===(s=od(s))?"Enter":String.fromCharCode(s):"keydown"===s.type||"keyup"===s.type?pr[s.keyCode]||"Unidentified":""},code:0,location:0,ctrlKey:0,shiftKey:0,altKey:0,metaKey:0,repeat:0,locale:0,getModifierState:zd,charCode:function(s){return"keypress"===s.type?od(s):0},keyCode:function(s){return"keydown"===s.type||"keyup"===s.type?s.keyCode:0},which:function(s){return"keypress"===s.type?od(s):"keydown"===s.type||"keyup"===s.type?s.keyCode:0}}),mr=rd(fr),gr=rd(we({},tr,{pointerId:0,width:0,height:0,pressure:0,tangentialPressure:0,tiltX:0,tiltY:0,twist:0,pointerType:0,isPrimary:0})),yr=rd(we({},Zt,{touches:0,targetTouches:0,changedTouches:0,altKey:0,metaKey:0,ctrlKey:0,shiftKey:0,getModifierState:zd})),vr=rd(we({},Xt,{propertyName:0,elapsedTime:0,pseudoElement:0})),br=we({},tr,{deltaX:function(s){return"deltaX"in s?s.deltaX:"wheelDeltaX"in s?-s.wheelDeltaX:0},deltaY:function(s){return"deltaY"in s?s.deltaY:"wheelDeltaY"in s?-s.wheelDeltaY:"wheelDelta"in s?-s.wheelDelta:0},deltaZ:0,deltaMode:0}),_r=rd(br),Sr=[9,13,27,32],Er=x&&"CompositionEvent"in window,wr=null;x&&"documentMode"in document&&(wr=document.documentMode);var xr=x&&"TextEvent"in window&&!wr,kr=x&&(!Er||wr&&8<wr&&11>=wr),Or=String.fromCharCode(32),Ar=!1;function ge(s,o){switch(s){case"keyup":return-1!==Sr.indexOf(o.keyCode);case"keydown":return 229!==o.keyCode;case"keypress":case"mousedown":case"focusout":return!0;default:return!1}}function he(s){return"object"==typeof(s=s.detail)&&"data"in s?s.data:null}var Cr=!1;var jr={color:!0,date:!0,datetime:!0,"datetime-local":!0,email:!0,month:!0,number:!0,password:!0,range:!0,search:!0,tel:!0,text:!0,time:!0,url:!0,week:!0};function me(s){var o=s&&s.nodeName&&s.nodeName.toLowerCase();return"input"===o?!!jr[s.type]:"textarea"===o}function ne(s,o,i,a){Eb(a),0<(o=oe(o,"onChange")).length&&(i=new Qt("onChange","change",null,i,a),s.push({event:i,listeners:o}))}var Pr=null,Ir=null;function re(s){se(s,0)}function te(s){if(Wa(ue(s)))return s}function ve(s,o){if("change"===s)return o}var Tr=!1;if(x){var Nr;if(x){var Mr="oninput"in document;if(!Mr){var Rr=document.createElement("div");Rr.setAttribute("oninput","return;"),Mr="function"==typeof Rr.oninput}Nr=Mr}else Nr=!1;Tr=Nr&&(!document.documentMode||9<document.documentMode)}function Ae(){Pr&&(Pr.detachEvent("onpropertychange",Be),Ir=Pr=null)}function Be(s){if("value"===s.propertyName&&te(Ir)){var o=[];ne(o,Ir,s,xb(s)),Jb(re,o)}}function Ce(s,o,i){"focusin"===s?(Ae(),Ir=i,(Pr=o).attachEvent("onpropertychange",Be)):"focusout"===s&&Ae()}function De(s){if("selectionchange"===s||"keyup"===s||"keydown"===s)return te(Ir)}function Ee(s,o){if("click"===s)return te(o)}function Fe(s,o){if("input"===s||"change"===s)return te(o)}var Dr="function"==typeof Object.is?Object.is:function Ge(s,o){return s===o&&(0!==s||1/s==1/o)||s!=s&&o!=o};function Ie(s,o){if(Dr(s,o))return!0;if("object"!=typeof s||null===s||"object"!=typeof o||null===o)return!1;var i=Object.keys(s),a=Object.keys(o);if(i.length!==a.length)return!1;for(a=0;a<i.length;a++){var u=i[a];if(!C.call(o,u)||!Dr(s[u],o[u]))return!1}return!0}function Je(s){for(;s&&s.firstChild;)s=s.firstChild;return s}function Ke(s,o){var i,a=Je(s);for(s=0;a;){if(3===a.nodeType){if(i=s+a.textContent.length,s<=o&&i>=o)return{node:a,offset:o-s};s=i}e:{for(;a;){if(a.nextSibling){a=a.nextSibling;break e}a=a.parentNode}a=void 0}a=Je(a)}}function Le(s,o){return!(!s||!o)&&(s===o||(!s||3!==s.nodeType)&&(o&&3===o.nodeType?Le(s,o.parentNode):"contains"in s?s.contains(o):!!s.compareDocumentPosition&&!!(16&s.compareDocumentPosition(o))))}function Me(){for(var s=window,o=Xa();o instanceof s.HTMLIFrameElement;){try{var i="string"==typeof o.contentWindow.location.href}catch(s){i=!1}if(!i)break;o=Xa((s=o.contentWindow).document)}return o}function Ne(s){var o=s&&s.nodeName&&s.nodeName.toLowerCase();return o&&("input"===o&&("text"===s.type||"search"===s.type||"tel"===s.type||"url"===s.type||"password"===s.type)||"textarea"===o||"true"===s.contentEditable)}function Oe(s){var o=Me(),i=s.focusedElem,a=s.selectionRange;if(o!==i&&i&&i.ownerDocument&&Le(i.ownerDocument.documentElement,i)){if(null!==a&&Ne(i))if(o=a.start,void 0===(s=a.end)&&(s=o),"selectionStart"in i)i.selectionStart=o,i.selectionEnd=Math.min(s,i.value.length);else if((s=(o=i.ownerDocument||document)&&o.defaultView||window).getSelection){s=s.getSelection();var u=i.textContent.length,_=Math.min(a.start,u);a=void 0===a.end?_:Math.min(a.end,u),!s.extend&&_>a&&(u=a,a=_,_=u),u=Ke(i,_);var w=Ke(i,a);u&&w&&(1!==s.rangeCount||s.anchorNode!==u.node||s.anchorOffset!==u.offset||s.focusNode!==w.node||s.focusOffset!==w.offset)&&((o=o.createRange()).setStart(u.node,u.offset),s.removeAllRanges(),_>a?(s.addRange(o),s.extend(w.node,w.offset)):(o.setEnd(w.node,w.offset),s.addRange(o)))}for(o=[],s=i;s=s.parentNode;)1===s.nodeType&&o.push({element:s,left:s.scrollLeft,top:s.scrollTop});for("function"==typeof i.focus&&i.focus(),i=0;i<o.length;i++)(s=o[i]).element.scrollLeft=s.left,s.element.scrollTop=s.top}}var Lr=x&&"documentMode"in document&&11>=document.documentMode,Fr=null,Br=null,$r=null,qr=!1;function Ue(s,o,i){var a=i.window===i?i.document:9===i.nodeType?i:i.ownerDocument;qr||null==Fr||Fr!==Xa(a)||("selectionStart"in(a=Fr)&&Ne(a)?a={start:a.selectionStart,end:a.selectionEnd}:a={anchorNode:(a=(a.ownerDocument&&a.ownerDocument.defaultView||window).getSelection()).anchorNode,anchorOffset:a.anchorOffset,focusNode:a.focusNode,focusOffset:a.focusOffset},$r&&Ie($r,a)||($r=a,0<(a=oe(Br,"onSelect")).length&&(o=new Qt("onSelect","select",null,o,i),s.push({event:o,listeners:a}),o.target=Fr)))}function Ve(s,o){var i={};return i[s.toLowerCase()]=o.toLowerCase(),i["Webkit"+s]="webkit"+o,i["Moz"+s]="moz"+o,i}var Ur={animationend:Ve("Animation","AnimationEnd"),animationiteration:Ve("Animation","AnimationIteration"),animationstart:Ve("Animation","AnimationStart"),transitionend:Ve("Transition","TransitionEnd")},Vr={},zr={};function Ze(s){if(Vr[s])return Vr[s];if(!Ur[s])return s;var o,i=Ur[s];for(o in i)if(i.hasOwnProperty(o)&&o in zr)return Vr[s]=i[o];return s}x&&(zr=document.createElement("div").style,"AnimationEvent"in window||(delete Ur.animationend.animation,delete Ur.animationiteration.animation,delete Ur.animationstart.animation),"TransitionEvent"in window||delete Ur.transitionend.transition);var Wr=Ze("animationend"),Jr=Ze("animationiteration"),Hr=Ze("animationstart"),Kr=Ze("transitionend"),Gr=new Map,Yr="abort auxClick cancel canPlay canPlayThrough click close contextMenu copy cut drag dragEnd dragEnter dragExit dragLeave dragOver dragStart drop durationChange emptied encrypted ended error gotPointerCapture input invalid keyDown keyPress keyUp load loadedData loadedMetadata loadStart lostPointerCapture mouseDown mouseMove mouseOut mouseOver mouseUp paste pause play playing pointerCancel pointerDown pointerMove pointerOut pointerOver pointerUp progress rateChange reset resize seeked seeking stalled submit suspend timeUpdate touchCancel touchEnd touchStart volumeChange scroll toggle touchMove waiting wheel".split(" ");function ff(s,o){Gr.set(s,o),fa(o,[s])}for(var Xr=0;Xr<Yr.length;Xr++){var Qr=Yr[Xr];ff(Qr.toLowerCase(),"on"+(Qr[0].toUpperCase()+Qr.slice(1)))}ff(Wr,"onAnimationEnd"),ff(Jr,"onAnimationIteration"),ff(Hr,"onAnimationStart"),ff("dblclick","onDoubleClick"),ff("focusin","onFocus"),ff("focusout","onBlur"),ff(Kr,"onTransitionEnd"),ha("onMouseEnter",["mouseout","mouseover"]),ha("onMouseLeave",["mouseout","mouseover"]),ha("onPointerEnter",["pointerout","pointerover"]),ha("onPointerLeave",["pointerout","pointerover"]),fa("onChange","change click focusin focusout input keydown keyup selectionchange".split(" ")),fa("onSelect","focusout contextmenu dragend focusin keydown keyup mousedown mouseup selectionchange".split(" ")),fa("onBeforeInput",["compositionend","keypress","textInput","paste"]),fa("onCompositionEnd","compositionend focusout keydown keypress keyup mousedown".split(" ")),fa("onCompositionStart","compositionstart focusout keydown keypress keyup mousedown".split(" ")),fa("onCompositionUpdate","compositionupdate focusout keydown keypress keyup mousedown".split(" "));var Zr="abort canplay canplaythrough durationchange emptied encrypted ended error loadeddata loadedmetadata loadstart pause play playing progress ratechange resize seeked seeking stalled suspend timeupdate volumechange waiting".split(" "),en=new Set("cancel close invalid load scroll toggle".split(" ").concat(Zr));function nf(s,o,i){var a=s.type||"unknown-event";s.currentTarget=i,function Ub(s,o,i,a,u,_,w,x,C){if(Tb.apply(this,arguments),nt){if(!nt)throw Error(p(198));var j=st;nt=!1,st=null,ot||(ot=!0,it=j)}}(a,o,void 0,s),s.currentTarget=null}function se(s,o){o=!!(4&o);for(var i=0;i<s.length;i++){var a=s[i],u=a.event;a=a.listeners;e:{var _=void 0;if(o)for(var w=a.length-1;0<=w;w--){var x=a[w],C=x.instance,j=x.currentTarget;if(x=x.listener,C!==_&&u.isPropagationStopped())break e;nf(u,x,j),_=C}else for(w=0;w<a.length;w++){if(C=(x=a[w]).instance,j=x.currentTarget,x=x.listener,C!==_&&u.isPropagationStopped())break e;nf(u,x,j),_=C}}}if(ot)throw s=it,ot=!1,it=null,s}function D(s,o){var i=o[mn];void 0===i&&(i=o[mn]=new Set);var a=s+"__bubble";i.has(a)||(pf(o,s,2,!1),i.add(a))}function qf(s,o,i){var a=0;o&&(a|=4),pf(i,s,a,o)}var tn="_reactListening"+Math.random().toString(36).slice(2);function sf(s){if(!s[tn]){s[tn]=!0,_.forEach((function(o){"selectionchange"!==o&&(en.has(o)||qf(o,!1,s),qf(o,!0,s))}));var o=9===s.nodeType?s:s.ownerDocument;null===o||o[tn]||(o[tn]=!0,qf("selectionchange",!1,o))}}function pf(s,o,i,a){switch(jd(o)){case 1:var u=ed;break;case 4:u=gd;break;default:u=fd}i=u.bind(null,o,i,s),u=void 0,!tt||"touchstart"!==o&&"touchmove"!==o&&"wheel"!==o||(u=!0),a?void 0!==u?s.addEventListener(o,i,{capture:!0,passive:u}):s.addEventListener(o,i,!0):void 0!==u?s.addEventListener(o,i,{passive:u}):s.addEventListener(o,i,!1)}function hd(s,o,i,a,u){var _=a;if(!(1&o||2&o||null===a))e:for(;;){if(null===a)return;var w=a.tag;if(3===w||4===w){var x=a.stateNode.containerInfo;if(x===u||8===x.nodeType&&x.parentNode===u)break;if(4===w)for(w=a.return;null!==w;){var C=w.tag;if((3===C||4===C)&&((C=w.stateNode.containerInfo)===u||8===C.nodeType&&C.parentNode===u))return;w=w.return}for(;null!==x;){if(null===(w=Wc(x)))return;if(5===(C=w.tag)||6===C){a=_=w;continue e}x=x.parentNode}}a=a.return}Jb((function(){var a=_,u=xb(i),w=[];e:{var x=Gr.get(s);if(void 0!==x){var C=Qt,j=s;switch(s){case"keypress":if(0===od(i))break e;case"keydown":case"keyup":C=mr;break;case"focusin":j="focus",C=sr;break;case"focusout":j="blur",C=sr;break;case"beforeblur":case"afterblur":C=sr;break;case"click":if(2===i.button)break e;case"auxclick":case"dblclick":case"mousedown":case"mousemove":case"mouseup":case"mouseout":case"mouseover":case"contextmenu":C=rr;break;case"drag":case"dragend":case"dragenter":case"dragexit":case"dragleave":case"dragover":case"dragstart":case"drop":C=nr;break;case"touchcancel":case"touchend":case"touchmove":case"touchstart":C=yr;break;case Wr:case Jr:case Hr:C=ir;break;case Kr:C=vr;break;case"scroll":C=er;break;case"wheel":C=_r;break;case"copy":case"cut":case"paste":C=cr;break;case"gotpointercapture":case"lostpointercapture":case"pointercancel":case"pointerdown":case"pointermove":case"pointerout":case"pointerover":case"pointerup":C=gr}var L=!!(4&o),B=!L&&"scroll"===s,$=L?null!==x?x+"Capture":null:x;L=[];for(var U,V=a;null!==V;){var z=(U=V).stateNode;if(5===U.tag&&null!==z&&(U=z,null!==$&&(null!=(z=Kb(V,$))&&L.push(tf(V,z,U)))),B)break;V=V.return}0<L.length&&(x=new C(x,j,null,i,u),w.push({event:x,listeners:L}))}}if(!(7&o)){if(C="mouseout"===s||"pointerout"===s,(!(x="mouseover"===s||"pointerover"===s)||i===He||!(j=i.relatedTarget||i.fromElement)||!Wc(j)&&!j[fn])&&(C||x)&&(x=u.window===u?u:(x=u.ownerDocument)?x.defaultView||x.parentWindow:window,C?(C=a,null!==(j=(j=i.relatedTarget||i.toElement)?Wc(j):null)&&(j!==(B=Vb(j))||5!==j.tag&&6!==j.tag)&&(j=null)):(C=null,j=a),C!==j)){if(L=rr,z="onMouseLeave",$="onMouseEnter",V="mouse","pointerout"!==s&&"pointerover"!==s||(L=gr,z="onPointerLeave",$="onPointerEnter",V="pointer"),B=null==C?x:ue(C),U=null==j?x:ue(j),(x=new L(z,V+"leave",C,i,u)).target=B,x.relatedTarget=U,z=null,Wc(u)===a&&((L=new L($,V+"enter",j,i,u)).target=U,L.relatedTarget=B,z=L),B=z,C&&j)e:{for($=j,V=0,U=L=C;U;U=vf(U))V++;for(U=0,z=$;z;z=vf(z))U++;for(;0<V-U;)L=vf(L),V--;for(;0<U-V;)$=vf($),U--;for(;V--;){if(L===$||null!==$&&L===$.alternate)break e;L=vf(L),$=vf($)}L=null}else L=null;null!==C&&wf(w,x,C,L,!1),null!==j&&null!==B&&wf(w,B,j,L,!0)}if("select"===(C=(x=a?ue(a):window).nodeName&&x.nodeName.toLowerCase())||"input"===C&&"file"===x.type)var Y=ve;else if(me(x))if(Tr)Y=Fe;else{Y=De;var Z=Ce}else(C=x.nodeName)&&"input"===C.toLowerCase()&&("checkbox"===x.type||"radio"===x.type)&&(Y=Ee);switch(Y&&(Y=Y(s,a))?ne(w,Y,i,u):(Z&&Z(s,x,a),"focusout"===s&&(Z=x._wrapperState)&&Z.controlled&&"number"===x.type&&cb(x,"number",x.value)),Z=a?ue(a):window,s){case"focusin":(me(Z)||"true"===Z.contentEditable)&&(Fr=Z,Br=a,$r=null);break;case"focusout":$r=Br=Fr=null;break;case"mousedown":qr=!0;break;case"contextmenu":case"mouseup":case"dragend":qr=!1,Ue(w,i,u);break;case"selectionchange":if(Lr)break;case"keydown":case"keyup":Ue(w,i,u)}var ee;if(Er)e:{switch(s){case"compositionstart":var ie="onCompositionStart";break e;case"compositionend":ie="onCompositionEnd";break e;case"compositionupdate":ie="onCompositionUpdate";break e}ie=void 0}else Cr?ge(s,i)&&(ie="onCompositionEnd"):"keydown"===s&&229===i.keyCode&&(ie="onCompositionStart");ie&&(kr&&"ko"!==i.locale&&(Cr||"onCompositionStart"!==ie?"onCompositionEnd"===ie&&Cr&&(ee=nd()):(Jt="value"in(Wt=u)?Wt.value:Wt.textContent,Cr=!0)),0<(Z=oe(a,ie)).length&&(ie=new lr(ie,s,null,i,u),w.push({event:ie,listeners:Z}),ee?ie.data=ee:null!==(ee=he(i))&&(ie.data=ee))),(ee=xr?function je(s,o){switch(s){case"compositionend":return he(o);case"keypress":return 32!==o.which?null:(Ar=!0,Or);case"textInput":return(s=o.data)===Or&&Ar?null:s;default:return null}}(s,i):function ke(s,o){if(Cr)return"compositionend"===s||!Er&&ge(s,o)?(s=nd(),Ht=Jt=Wt=null,Cr=!1,s):null;switch(s){case"paste":default:return null;case"keypress":if(!(o.ctrlKey||o.altKey||o.metaKey)||o.ctrlKey&&o.altKey){if(o.char&&1<o.char.length)return o.char;if(o.which)return String.fromCharCode(o.which)}return null;case"compositionend":return kr&&"ko"!==o.locale?null:o.data}}(s,i))&&(0<(a=oe(a,"onBeforeInput")).length&&(u=new lr("onBeforeInput","beforeinput",null,i,u),w.push({event:u,listeners:a}),u.data=ee))}se(w,o)}))}function tf(s,o,i){return{instance:s,listener:o,currentTarget:i}}function oe(s,o){for(var i=o+"Capture",a=[];null!==s;){var u=s,_=u.stateNode;5===u.tag&&null!==_&&(u=_,null!=(_=Kb(s,i))&&a.unshift(tf(s,_,u)),null!=(_=Kb(s,o))&&a.push(tf(s,_,u))),s=s.return}return a}function vf(s){if(null===s)return null;do{s=s.return}while(s&&5!==s.tag);return s||null}function wf(s,o,i,a,u){for(var _=o._reactName,w=[];null!==i&&i!==a;){var x=i,C=x.alternate,j=x.stateNode;if(null!==C&&C===a)break;5===x.tag&&null!==j&&(x=j,u?null!=(C=Kb(i,_))&&w.unshift(tf(i,C,x)):u||null!=(C=Kb(i,_))&&w.push(tf(i,C,x))),i=i.return}0!==w.length&&s.push({event:o,listeners:w})}var rn=/\r\n?/g,nn=/\u0000|\uFFFD/g;function zf(s){return("string"==typeof s?s:""+s).replace(rn,"\n").replace(nn,"")}function Af(s,o,i){if(o=zf(o),zf(s)!==o&&i)throw Error(p(425))}function Bf(){}var sn=null,on=null;function Ef(s,o){return"textarea"===s||"noscript"===s||"string"==typeof o.children||"number"==typeof o.children||"object"==typeof o.dangerouslySetInnerHTML&&null!==o.dangerouslySetInnerHTML&&null!=o.dangerouslySetInnerHTML.__html}var an="function"==typeof setTimeout?setTimeout:void 0,cn="function"==typeof clearTimeout?clearTimeout:void 0,ln="function"==typeof Promise?Promise:void 0,un="function"==typeof queueMicrotask?queueMicrotask:void 0!==ln?function(s){return ln.resolve(null).then(s).catch(If)}:an;function If(s){setTimeout((function(){throw s}))}function Kf(s,o){var i=o,a=0;do{var u=i.nextSibling;if(s.removeChild(i),u&&8===u.nodeType)if("/$"===(i=u.data)){if(0===a)return s.removeChild(u),void bd(o);a--}else"$"!==i&&"$?"!==i&&"$!"!==i||a++;i=u}while(i);bd(o)}function Lf(s){for(;null!=s;s=s.nextSibling){var o=s.nodeType;if(1===o||3===o)break;if(8===o){if("$"===(o=s.data)||"$!"===o||"$?"===o)break;if("/$"===o)return null}}return s}function Mf(s){s=s.previousSibling;for(var o=0;s;){if(8===s.nodeType){var i=s.data;if("$"===i||"$!"===i||"$?"===i){if(0===o)return s;o--}else"/$"===i&&o++}s=s.previousSibling}return null}var pn=Math.random().toString(36).slice(2),hn="__reactFiber$"+pn,dn="__reactProps$"+pn,fn="__reactContainer$"+pn,mn="__reactEvents$"+pn,gn="__reactListeners$"+pn,yn="__reactHandles$"+pn;function Wc(s){var o=s[hn];if(o)return o;for(var i=s.parentNode;i;){if(o=i[fn]||i[hn]){if(i=o.alternate,null!==o.child||null!==i&&null!==i.child)for(s=Mf(s);null!==s;){if(i=s[hn])return i;s=Mf(s)}return o}i=(s=i).parentNode}return null}function Cb(s){return!(s=s[hn]||s[fn])||5!==s.tag&&6!==s.tag&&13!==s.tag&&3!==s.tag?null:s}function ue(s){if(5===s.tag||6===s.tag)return s.stateNode;throw Error(p(33))}function Db(s){return s[dn]||null}var vn=[],bn=-1;function Uf(s){return{current:s}}function E(s){0>bn||(s.current=vn[bn],vn[bn]=null,bn--)}function G(s,o){bn++,vn[bn]=s.current,s.current=o}var _n={},Sn=Uf(_n),En=Uf(!1),wn=_n;function Yf(s,o){var i=s.type.contextTypes;if(!i)return _n;var a=s.stateNode;if(a&&a.__reactInternalMemoizedUnmaskedChildContext===o)return a.__reactInternalMemoizedMaskedChildContext;var u,_={};for(u in i)_[u]=o[u];return a&&((s=s.stateNode).__reactInternalMemoizedUnmaskedChildContext=o,s.__reactInternalMemoizedMaskedChildContext=_),_}function Zf(s){return null!=(s=s.childContextTypes)}function $f(){E(En),E(Sn)}function ag(s,o,i){if(Sn.current!==_n)throw Error(p(168));G(Sn,o),G(En,i)}function bg(s,o,i){var a=s.stateNode;if(o=o.childContextTypes,"function"!=typeof a.getChildContext)return i;for(var u in a=a.getChildContext())if(!(u in o))throw Error(p(108,Ra(s)||"Unknown",u));return we({},i,a)}function cg(s){return s=(s=s.stateNode)&&s.__reactInternalMemoizedMergedChildContext||_n,wn=Sn.current,G(Sn,s),G(En,En.current),!0}function dg(s,o,i){var a=s.stateNode;if(!a)throw Error(p(169));i?(s=bg(s,o,wn),a.__reactInternalMemoizedMergedChildContext=s,E(En),E(Sn),G(Sn,s)):E(En),G(En,i)}var xn=null,kn=!1,On=!1;function hg(s){null===xn?xn=[s]:xn.push(s)}function jg(){if(!On&&null!==xn){On=!0;var s=0,o=At;try{var i=xn;for(At=1;s<i.length;s++){var a=i[s];do{a=a(!0)}while(null!==a)}xn=null,kn=!1}catch(o){throw null!==xn&&(xn=xn.slice(s+1)),ct(mt,jg),o}finally{At=o,On=!1}}return null}var An=[],Cn=0,jn=null,Pn=0,In=[],Tn=0,Nn=null,Mn=1,Rn="";function tg(s,o){An[Cn++]=Pn,An[Cn++]=jn,jn=s,Pn=o}function ug(s,o,i){In[Tn++]=Mn,In[Tn++]=Rn,In[Tn++]=Nn,Nn=s;var a=Mn;s=Rn;var u=32-Et(a)-1;a&=~(1<<u),i+=1;var _=32-Et(o)+u;if(30<_){var w=u-u%5;_=(a&(1<<w)-1).toString(32),a>>=w,u-=w,Mn=1<<32-Et(o)+u|i<<u|a,Rn=_+s}else Mn=1<<_|i<<u|a,Rn=s}function vg(s){null!==s.return&&(tg(s,1),ug(s,1,0))}function wg(s){for(;s===jn;)jn=An[--Cn],An[Cn]=null,Pn=An[--Cn],An[Cn]=null;for(;s===Nn;)Nn=In[--Tn],In[Tn]=null,Rn=In[--Tn],In[Tn]=null,Mn=In[--Tn],In[Tn]=null}var Dn=null,Ln=null,Fn=!1,Bn=null;function Ag(s,o){var i=Bg(5,null,null,0);i.elementType="DELETED",i.stateNode=o,i.return=s,null===(o=s.deletions)?(s.deletions=[i],s.flags|=16):o.push(i)}function Cg(s,o){switch(s.tag){case 5:var i=s.type;return null!==(o=1!==o.nodeType||i.toLowerCase()!==o.nodeName.toLowerCase()?null:o)&&(s.stateNode=o,Dn=s,Ln=Lf(o.firstChild),!0);case 6:return null!==(o=""===s.pendingProps||3!==o.nodeType?null:o)&&(s.stateNode=o,Dn=s,Ln=null,!0);case 13:return null!==(o=8!==o.nodeType?null:o)&&(i=null!==Nn?{id:Mn,overflow:Rn}:null,s.memoizedState={dehydrated:o,treeContext:i,retryLane:1073741824},(i=Bg(18,null,null,0)).stateNode=o,i.return=s,s.child=i,Dn=s,Ln=null,!0);default:return!1}}function Dg(s){return!(!(1&s.mode)||128&s.flags)}function Eg(s){if(Fn){var o=Ln;if(o){var i=o;if(!Cg(s,o)){if(Dg(s))throw Error(p(418));o=Lf(i.nextSibling);var a=Dn;o&&Cg(s,o)?Ag(a,i):(s.flags=-4097&s.flags|2,Fn=!1,Dn=s)}}else{if(Dg(s))throw Error(p(418));s.flags=-4097&s.flags|2,Fn=!1,Dn=s}}}function Fg(s){for(s=s.return;null!==s&&5!==s.tag&&3!==s.tag&&13!==s.tag;)s=s.return;Dn=s}function Gg(s){if(s!==Dn)return!1;if(!Fn)return Fg(s),Fn=!0,!1;var o;if((o=3!==s.tag)&&!(o=5!==s.tag)&&(o="head"!==(o=s.type)&&"body"!==o&&!Ef(s.type,s.memoizedProps)),o&&(o=Ln)){if(Dg(s))throw Hg(),Error(p(418));for(;o;)Ag(s,o),o=Lf(o.nextSibling)}if(Fg(s),13===s.tag){if(!(s=null!==(s=s.memoizedState)?s.dehydrated:null))throw Error(p(317));e:{for(s=s.nextSibling,o=0;s;){if(8===s.nodeType){var i=s.data;if("/$"===i){if(0===o){Ln=Lf(s.nextSibling);break e}o--}else"$"!==i&&"$!"!==i&&"$?"!==i||o++}s=s.nextSibling}Ln=null}}else Ln=Dn?Lf(s.stateNode.nextSibling):null;return!0}function Hg(){for(var s=Ln;s;)s=Lf(s.nextSibling)}function Ig(){Ln=Dn=null,Fn=!1}function Jg(s){null===Bn?Bn=[s]:Bn.push(s)}var $n=V.ReactCurrentBatchConfig;function Lg(s,o,i){if(null!==(s=i.ref)&&"function"!=typeof s&&"object"!=typeof s){if(i._owner){if(i=i._owner){if(1!==i.tag)throw Error(p(309));var a=i.stateNode}if(!a)throw Error(p(147,s));var u=a,_=""+s;return null!==o&&null!==o.ref&&"function"==typeof o.ref&&o.ref._stringRef===_?o.ref:(o=function(s){var o=u.refs;null===s?delete o[_]:o[_]=s},o._stringRef=_,o)}if("string"!=typeof s)throw Error(p(284));if(!i._owner)throw Error(p(290,s))}return s}function Mg(s,o){throw s=Object.prototype.toString.call(o),Error(p(31,"[object Object]"===s?"object with keys {"+Object.keys(o).join(", ")+"}":s))}function Ng(s){return(0,s._init)(s._payload)}function Og(s){function b(o,i){if(s){var a=o.deletions;null===a?(o.deletions=[i],o.flags|=16):a.push(i)}}function c(o,i){if(!s)return null;for(;null!==i;)b(o,i),i=i.sibling;return null}function d(s,o){for(s=new Map;null!==o;)null!==o.key?s.set(o.key,o):s.set(o.index,o),o=o.sibling;return s}function e(s,o){return(s=Pg(s,o)).index=0,s.sibling=null,s}function f(o,i,a){return o.index=a,s?null!==(a=o.alternate)?(a=a.index)<i?(o.flags|=2,i):a:(o.flags|=2,i):(o.flags|=1048576,i)}function g(o){return s&&null===o.alternate&&(o.flags|=2),o}function h(s,o,i,a){return null===o||6!==o.tag?((o=Qg(i,s.mode,a)).return=s,o):((o=e(o,i)).return=s,o)}function k(s,o,i,a){var u=i.type;return u===Z?m(s,o,i.props.children,a,i.key):null!==o&&(o.elementType===u||"object"==typeof u&&null!==u&&u.$$typeof===ye&&Ng(u)===o.type)?((a=e(o,i.props)).ref=Lg(s,o,i),a.return=s,a):((a=Rg(i.type,i.key,i.props,null,s.mode,a)).ref=Lg(s,o,i),a.return=s,a)}function l(s,o,i,a){return null===o||4!==o.tag||o.stateNode.containerInfo!==i.containerInfo||o.stateNode.implementation!==i.implementation?((o=Sg(i,s.mode,a)).return=s,o):((o=e(o,i.children||[])).return=s,o)}function m(s,o,i,a,u){return null===o||7!==o.tag?((o=Tg(i,s.mode,a,u)).return=s,o):((o=e(o,i)).return=s,o)}function q(s,o,i){if("string"==typeof o&&""!==o||"number"==typeof o)return(o=Qg(""+o,s.mode,i)).return=s,o;if("object"==typeof o&&null!==o){switch(o.$$typeof){case z:return(i=Rg(o.type,o.key,o.props,null,s.mode,i)).ref=Lg(s,null,o),i.return=s,i;case Y:return(o=Sg(o,s.mode,i)).return=s,o;case ye:return q(s,(0,o._init)(o._payload),i)}if(Pe(o)||Ka(o))return(o=Tg(o,s.mode,i,null)).return=s,o;Mg(s,o)}return null}function r(s,o,i,a){var u=null!==o?o.key:null;if("string"==typeof i&&""!==i||"number"==typeof i)return null!==u?null:h(s,o,""+i,a);if("object"==typeof i&&null!==i){switch(i.$$typeof){case z:return i.key===u?k(s,o,i,a):null;case Y:return i.key===u?l(s,o,i,a):null;case ye:return r(s,o,(u=i._init)(i._payload),a)}if(Pe(i)||Ka(i))return null!==u?null:m(s,o,i,a,null);Mg(s,i)}return null}function y(s,o,i,a,u){if("string"==typeof a&&""!==a||"number"==typeof a)return h(o,s=s.get(i)||null,""+a,u);if("object"==typeof a&&null!==a){switch(a.$$typeof){case z:return k(o,s=s.get(null===a.key?i:a.key)||null,a,u);case Y:return l(o,s=s.get(null===a.key?i:a.key)||null,a,u);case ye:return y(s,o,i,(0,a._init)(a._payload),u)}if(Pe(a)||Ka(a))return m(o,s=s.get(i)||null,a,u,null);Mg(o,a)}return null}function n(o,i,a,u){for(var _=null,w=null,x=i,C=i=0,j=null;null!==x&&C<a.length;C++){x.index>C?(j=x,x=null):j=x.sibling;var L=r(o,x,a[C],u);if(null===L){null===x&&(x=j);break}s&&x&&null===L.alternate&&b(o,x),i=f(L,i,C),null===w?_=L:w.sibling=L,w=L,x=j}if(C===a.length)return c(o,x),Fn&&tg(o,C),_;if(null===x){for(;C<a.length;C++)null!==(x=q(o,a[C],u))&&(i=f(x,i,C),null===w?_=x:w.sibling=x,w=x);return Fn&&tg(o,C),_}for(x=d(o,x);C<a.length;C++)null!==(j=y(x,o,C,a[C],u))&&(s&&null!==j.alternate&&x.delete(null===j.key?C:j.key),i=f(j,i,C),null===w?_=j:w.sibling=j,w=j);return s&&x.forEach((function(s){return b(o,s)})),Fn&&tg(o,C),_}function t(o,i,a,u){var _=Ka(a);if("function"!=typeof _)throw Error(p(150));if(null==(a=_.call(a)))throw Error(p(151));for(var w=_=null,x=i,C=i=0,j=null,L=a.next();null!==x&&!L.done;C++,L=a.next()){x.index>C?(j=x,x=null):j=x.sibling;var B=r(o,x,L.value,u);if(null===B){null===x&&(x=j);break}s&&x&&null===B.alternate&&b(o,x),i=f(B,i,C),null===w?_=B:w.sibling=B,w=B,x=j}if(L.done)return c(o,x),Fn&&tg(o,C),_;if(null===x){for(;!L.done;C++,L=a.next())null!==(L=q(o,L.value,u))&&(i=f(L,i,C),null===w?_=L:w.sibling=L,w=L);return Fn&&tg(o,C),_}for(x=d(o,x);!L.done;C++,L=a.next())null!==(L=y(x,o,C,L.value,u))&&(s&&null!==L.alternate&&x.delete(null===L.key?C:L.key),i=f(L,i,C),null===w?_=L:w.sibling=L,w=L);return s&&x.forEach((function(s){return b(o,s)})),Fn&&tg(o,C),_}return function J(s,o,i,a){if("object"==typeof i&&null!==i&&i.type===Z&&null===i.key&&(i=i.props.children),"object"==typeof i&&null!==i){switch(i.$$typeof){case z:e:{for(var u=i.key,_=o;null!==_;){if(_.key===u){if((u=i.type)===Z){if(7===_.tag){c(s,_.sibling),(o=e(_,i.props.children)).return=s,s=o;break e}}else if(_.elementType===u||"object"==typeof u&&null!==u&&u.$$typeof===ye&&Ng(u)===_.type){c(s,_.sibling),(o=e(_,i.props)).ref=Lg(s,_,i),o.return=s,s=o;break e}c(s,_);break}b(s,_),_=_.sibling}i.type===Z?((o=Tg(i.props.children,s.mode,a,i.key)).return=s,s=o):((a=Rg(i.type,i.key,i.props,null,s.mode,a)).ref=Lg(s,o,i),a.return=s,s=a)}return g(s);case Y:e:{for(_=i.key;null!==o;){if(o.key===_){if(4===o.tag&&o.stateNode.containerInfo===i.containerInfo&&o.stateNode.implementation===i.implementation){c(s,o.sibling),(o=e(o,i.children||[])).return=s,s=o;break e}c(s,o);break}b(s,o),o=o.sibling}(o=Sg(i,s.mode,a)).return=s,s=o}return g(s);case ye:return J(s,o,(_=i._init)(i._payload),a)}if(Pe(i))return n(s,o,i,a);if(Ka(i))return t(s,o,i,a);Mg(s,i)}return"string"==typeof i&&""!==i||"number"==typeof i?(i=""+i,null!==o&&6===o.tag?(c(s,o.sibling),(o=e(o,i)).return=s,s=o):(c(s,o),(o=Qg(i,s.mode,a)).return=s,s=o),g(s)):c(s,o)}}var qn=Og(!0),Un=Og(!1),Vn=Uf(null),zn=null,Wn=null,Jn=null;function $g(){Jn=Wn=zn=null}function ah(s){var o=Vn.current;E(Vn),s._currentValue=o}function bh(s,o,i){for(;null!==s;){var a=s.alternate;if((s.childLanes&o)!==o?(s.childLanes|=o,null!==a&&(a.childLanes|=o)):null!==a&&(a.childLanes&o)!==o&&(a.childLanes|=o),s===i)break;s=s.return}}function ch(s,o){zn=s,Jn=Wn=null,null!==(s=s.dependencies)&&null!==s.firstContext&&(!!(s.lanes&o)&&(bs=!0),s.firstContext=null)}function eh(s){var o=s._currentValue;if(Jn!==s)if(s={context:s,memoizedValue:o,next:null},null===Wn){if(null===zn)throw Error(p(308));Wn=s,zn.dependencies={lanes:0,firstContext:s}}else Wn=Wn.next=s;return o}var Hn=null;function gh(s){null===Hn?Hn=[s]:Hn.push(s)}function hh(s,o,i,a){var u=o.interleaved;return null===u?(i.next=i,gh(o)):(i.next=u.next,u.next=i),o.interleaved=i,ih(s,a)}function ih(s,o){s.lanes|=o;var i=s.alternate;for(null!==i&&(i.lanes|=o),i=s,s=s.return;null!==s;)s.childLanes|=o,null!==(i=s.alternate)&&(i.childLanes|=o),i=s,s=s.return;return 3===i.tag?i.stateNode:null}var Kn=!1;function kh(s){s.updateQueue={baseState:s.memoizedState,firstBaseUpdate:null,lastBaseUpdate:null,shared:{pending:null,interleaved:null,lanes:0},effects:null}}function lh(s,o){s=s.updateQueue,o.updateQueue===s&&(o.updateQueue={baseState:s.baseState,firstBaseUpdate:s.firstBaseUpdate,lastBaseUpdate:s.lastBaseUpdate,shared:s.shared,effects:s.effects})}function mh(s,o){return{eventTime:s,lane:o,tag:0,payload:null,callback:null,next:null}}function nh(s,o,i){var a=s.updateQueue;if(null===a)return null;if(a=a.shared,2&Ls){var u=a.pending;return null===u?o.next=o:(o.next=u.next,u.next=o),a.pending=o,ih(s,i)}return null===(u=a.interleaved)?(o.next=o,gh(a)):(o.next=u.next,u.next=o),a.interleaved=o,ih(s,i)}function oh(s,o,i){if(null!==(o=o.updateQueue)&&(o=o.shared,4194240&i)){var a=o.lanes;i|=a&=s.pendingLanes,o.lanes=i,Cc(s,i)}}function ph(s,o){var i=s.updateQueue,a=s.alternate;if(null!==a&&i===(a=a.updateQueue)){var u=null,_=null;if(null!==(i=i.firstBaseUpdate)){do{var w={eventTime:i.eventTime,lane:i.lane,tag:i.tag,payload:i.payload,callback:i.callback,next:null};null===_?u=_=w:_=_.next=w,i=i.next}while(null!==i);null===_?u=_=o:_=_.next=o}else u=_=o;return i={baseState:a.baseState,firstBaseUpdate:u,lastBaseUpdate:_,shared:a.shared,effects:a.effects},void(s.updateQueue=i)}null===(s=i.lastBaseUpdate)?i.firstBaseUpdate=o:s.next=o,i.lastBaseUpdate=o}function qh(s,o,i,a){var u=s.updateQueue;Kn=!1;var _=u.firstBaseUpdate,w=u.lastBaseUpdate,x=u.shared.pending;if(null!==x){u.shared.pending=null;var C=x,j=C.next;C.next=null,null===w?_=j:w.next=j,w=C;var L=s.alternate;null!==L&&((x=(L=L.updateQueue).lastBaseUpdate)!==w&&(null===x?L.firstBaseUpdate=j:x.next=j,L.lastBaseUpdate=C))}if(null!==_){var B=u.baseState;for(w=0,L=j=C=null,x=_;;){var $=x.lane,U=x.eventTime;if((a&$)===$){null!==L&&(L=L.next={eventTime:U,lane:0,tag:x.tag,payload:x.payload,callback:x.callback,next:null});e:{var V=s,z=x;switch($=o,U=i,z.tag){case 1:if("function"==typeof(V=z.payload)){B=V.call(U,B,$);break e}B=V;break e;case 3:V.flags=-65537&V.flags|128;case 0:if(null==($="function"==typeof(V=z.payload)?V.call(U,B,$):V))break e;B=we({},B,$);break e;case 2:Kn=!0}}null!==x.callback&&0!==x.lane&&(s.flags|=64,null===($=u.effects)?u.effects=[x]:$.push(x))}else U={eventTime:U,lane:$,tag:x.tag,payload:x.payload,callback:x.callback,next:null},null===L?(j=L=U,C=B):L=L.next=U,w|=$;if(null===(x=x.next)){if(null===(x=u.shared.pending))break;x=($=x).next,$.next=null,u.lastBaseUpdate=$,u.shared.pending=null}}if(null===L&&(C=B),u.baseState=C,u.firstBaseUpdate=j,u.lastBaseUpdate=L,null!==(o=u.shared.interleaved)){u=o;do{w|=u.lane,u=u.next}while(u!==o)}else null===_&&(u.shared.lanes=0);Ws|=w,s.lanes=w,s.memoizedState=B}}function sh(s,o,i){if(s=o.effects,o.effects=null,null!==s)for(o=0;o<s.length;o++){var a=s[o],u=a.callback;if(null!==u){if(a.callback=null,a=i,"function"!=typeof u)throw Error(p(191,u));u.call(a)}}}var Gn={},Yn=Uf(Gn),Xn=Uf(Gn),Qn=Uf(Gn);function xh(s){if(s===Gn)throw Error(p(174));return s}function yh(s,o){switch(G(Qn,o),G(Xn,s),G(Yn,Gn),s=o.nodeType){case 9:case 11:o=(o=o.documentElement)?o.namespaceURI:lb(null,"");break;default:o=lb(o=(s=8===s?o.parentNode:o).namespaceURI||null,s=s.tagName)}E(Yn),G(Yn,o)}function zh(){E(Yn),E(Xn),E(Qn)}function Ah(s){xh(Qn.current);var o=xh(Yn.current),i=lb(o,s.type);o!==i&&(G(Xn,s),G(Yn,i))}function Bh(s){Xn.current===s&&(E(Yn),E(Xn))}var Zn=Uf(0);function Ch(s){for(var o=s;null!==o;){if(13===o.tag){var i=o.memoizedState;if(null!==i&&(null===(i=i.dehydrated)||"$?"===i.data||"$!"===i.data))return o}else if(19===o.tag&&void 0!==o.memoizedProps.revealOrder){if(128&o.flags)return o}else if(null!==o.child){o.child.return=o,o=o.child;continue}if(o===s)break;for(;null===o.sibling;){if(null===o.return||o.return===s)return null;o=o.return}o.sibling.return=o.return,o=o.sibling}return null}var es=[];function Eh(){for(var s=0;s<es.length;s++)es[s]._workInProgressVersionPrimary=null;es.length=0}var ts=V.ReactCurrentDispatcher,rs=V.ReactCurrentBatchConfig,ns=0,ss=null,os=null,as=null,cs=!1,ls=!1,us=0,ps=0;function P(){throw Error(p(321))}function Mh(s,o){if(null===o)return!1;for(var i=0;i<o.length&&i<s.length;i++)if(!Dr(s[i],o[i]))return!1;return!0}function Nh(s,o,i,a,u,_){if(ns=_,ss=o,o.memoizedState=null,o.updateQueue=null,o.lanes=0,ts.current=null===s||null===s.memoizedState?ds:fs,s=i(a,u),ls){_=0;do{if(ls=!1,us=0,25<=_)throw Error(p(301));_+=1,as=os=null,o.updateQueue=null,ts.current=ms,s=i(a,u)}while(ls)}if(ts.current=hs,o=null!==os&&null!==os.next,ns=0,as=os=ss=null,cs=!1,o)throw Error(p(300));return s}function Sh(){var s=0!==us;return us=0,s}function Th(){var s={memoizedState:null,baseState:null,baseQueue:null,queue:null,next:null};return null===as?ss.memoizedState=as=s:as=as.next=s,as}function Uh(){if(null===os){var s=ss.alternate;s=null!==s?s.memoizedState:null}else s=os.next;var o=null===as?ss.memoizedState:as.next;if(null!==o)as=o,os=s;else{if(null===s)throw Error(p(310));s={memoizedState:(os=s).memoizedState,baseState:os.baseState,baseQueue:os.baseQueue,queue:os.queue,next:null},null===as?ss.memoizedState=as=s:as=as.next=s}return as}function Vh(s,o){return"function"==typeof o?o(s):o}function Wh(s){var o=Uh(),i=o.queue;if(null===i)throw Error(p(311));i.lastRenderedReducer=s;var a=os,u=a.baseQueue,_=i.pending;if(null!==_){if(null!==u){var w=u.next;u.next=_.next,_.next=w}a.baseQueue=u=_,i.pending=null}if(null!==u){_=u.next,a=a.baseState;var x=w=null,C=null,j=_;do{var L=j.lane;if((ns&L)===L)null!==C&&(C=C.next={lane:0,action:j.action,hasEagerState:j.hasEagerState,eagerState:j.eagerState,next:null}),a=j.hasEagerState?j.eagerState:s(a,j.action);else{var B={lane:L,action:j.action,hasEagerState:j.hasEagerState,eagerState:j.eagerState,next:null};null===C?(x=C=B,w=a):C=C.next=B,ss.lanes|=L,Ws|=L}j=j.next}while(null!==j&&j!==_);null===C?w=a:C.next=x,Dr(a,o.memoizedState)||(bs=!0),o.memoizedState=a,o.baseState=w,o.baseQueue=C,i.lastRenderedState=a}if(null!==(s=i.interleaved)){u=s;do{_=u.lane,ss.lanes|=_,Ws|=_,u=u.next}while(u!==s)}else null===u&&(i.lanes=0);return[o.memoizedState,i.dispatch]}function Xh(s){var o=Uh(),i=o.queue;if(null===i)throw Error(p(311));i.lastRenderedReducer=s;var a=i.dispatch,u=i.pending,_=o.memoizedState;if(null!==u){i.pending=null;var w=u=u.next;do{_=s(_,w.action),w=w.next}while(w!==u);Dr(_,o.memoizedState)||(bs=!0),o.memoizedState=_,null===o.baseQueue&&(o.baseState=_),i.lastRenderedState=_}return[_,a]}function Yh(){}function Zh(s,o){var i=ss,a=Uh(),u=o(),_=!Dr(a.memoizedState,u);if(_&&(a.memoizedState=u,bs=!0),a=a.queue,$h(ai.bind(null,i,a,s),[s]),a.getSnapshot!==o||_||null!==as&&1&as.memoizedState.tag){if(i.flags|=2048,bi(9,ci.bind(null,i,a,u,o),void 0,null),null===Fs)throw Error(p(349));30&ns||di(i,o,u)}return u}function di(s,o,i){s.flags|=16384,s={getSnapshot:o,value:i},null===(o=ss.updateQueue)?(o={lastEffect:null,stores:null},ss.updateQueue=o,o.stores=[s]):null===(i=o.stores)?o.stores=[s]:i.push(s)}function ci(s,o,i,a){o.value=i,o.getSnapshot=a,ei(o)&&fi(s)}function ai(s,o,i){return i((function(){ei(o)&&fi(s)}))}function ei(s){var o=s.getSnapshot;s=s.value;try{var i=o();return!Dr(s,i)}catch(s){return!0}}function fi(s){var o=ih(s,1);null!==o&&gi(o,s,1,-1)}function hi(s){var o=Th();return"function"==typeof s&&(s=s()),o.memoizedState=o.baseState=s,s={pending:null,interleaved:null,lanes:0,dispatch:null,lastRenderedReducer:Vh,lastRenderedState:s},o.queue=s,s=s.dispatch=ii.bind(null,ss,s),[o.memoizedState,s]}function bi(s,o,i,a){return s={tag:s,create:o,destroy:i,deps:a,next:null},null===(o=ss.updateQueue)?(o={lastEffect:null,stores:null},ss.updateQueue=o,o.lastEffect=s.next=s):null===(i=o.lastEffect)?o.lastEffect=s.next=s:(a=i.next,i.next=s,s.next=a,o.lastEffect=s),s}function ji(){return Uh().memoizedState}function ki(s,o,i,a){var u=Th();ss.flags|=s,u.memoizedState=bi(1|o,i,void 0,void 0===a?null:a)}function li(s,o,i,a){var u=Uh();a=void 0===a?null:a;var _=void 0;if(null!==os){var w=os.memoizedState;if(_=w.destroy,null!==a&&Mh(a,w.deps))return void(u.memoizedState=bi(o,i,_,a))}ss.flags|=s,u.memoizedState=bi(1|o,i,_,a)}function mi(s,o){return ki(8390656,8,s,o)}function $h(s,o){return li(2048,8,s,o)}function ni(s,o){return li(4,2,s,o)}function oi(s,o){return li(4,4,s,o)}function pi(s,o){return"function"==typeof o?(s=s(),o(s),function(){o(null)}):null!=o?(s=s(),o.current=s,function(){o.current=null}):void 0}function qi(s,o,i){return i=null!=i?i.concat([s]):null,li(4,4,pi.bind(null,o,s),i)}function ri(){}function si(s,o){var i=Uh();o=void 0===o?null:o;var a=i.memoizedState;return null!==a&&null!==o&&Mh(o,a[1])?a[0]:(i.memoizedState=[s,o],s)}function ti(s,o){var i=Uh();o=void 0===o?null:o;var a=i.memoizedState;return null!==a&&null!==o&&Mh(o,a[1])?a[0]:(s=s(),i.memoizedState=[s,o],s)}function ui(s,o,i){return 21&ns?(Dr(i,o)||(i=yc(),ss.lanes|=i,Ws|=i,s.baseState=!0),o):(s.baseState&&(s.baseState=!1,bs=!0),s.memoizedState=i)}function vi(s,o){var i=At;At=0!==i&&4>i?i:4,s(!0);var a=rs.transition;rs.transition={};try{s(!1),o()}finally{At=i,rs.transition=a}}function wi(){return Uh().memoizedState}function xi(s,o,i){var a=yi(s);if(i={lane:a,action:i,hasEagerState:!1,eagerState:null,next:null},zi(s))Ai(o,i);else if(null!==(i=hh(s,o,i,a))){gi(i,s,a,R()),Bi(i,o,a)}}function ii(s,o,i){var a=yi(s),u={lane:a,action:i,hasEagerState:!1,eagerState:null,next:null};if(zi(s))Ai(o,u);else{var _=s.alternate;if(0===s.lanes&&(null===_||0===_.lanes)&&null!==(_=o.lastRenderedReducer))try{var w=o.lastRenderedState,x=_(w,i);if(u.hasEagerState=!0,u.eagerState=x,Dr(x,w)){var C=o.interleaved;return null===C?(u.next=u,gh(o)):(u.next=C.next,C.next=u),void(o.interleaved=u)}}catch(s){}null!==(i=hh(s,o,u,a))&&(gi(i,s,a,u=R()),Bi(i,o,a))}}function zi(s){var o=s.alternate;return s===ss||null!==o&&o===ss}function Ai(s,o){ls=cs=!0;var i=s.pending;null===i?o.next=o:(o.next=i.next,i.next=o),s.pending=o}function Bi(s,o,i){if(4194240&i){var a=o.lanes;i|=a&=s.pendingLanes,o.lanes=i,Cc(s,i)}}var hs={readContext:eh,useCallback:P,useContext:P,useEffect:P,useImperativeHandle:P,useInsertionEffect:P,useLayoutEffect:P,useMemo:P,useReducer:P,useRef:P,useState:P,useDebugValue:P,useDeferredValue:P,useTransition:P,useMutableSource:P,useSyncExternalStore:P,useId:P,unstable_isNewReconciler:!1},ds={readContext:eh,useCallback:function(s,o){return Th().memoizedState=[s,void 0===o?null:o],s},useContext:eh,useEffect:mi,useImperativeHandle:function(s,o,i){return i=null!=i?i.concat([s]):null,ki(4194308,4,pi.bind(null,o,s),i)},useLayoutEffect:function(s,o){return ki(4194308,4,s,o)},useInsertionEffect:function(s,o){return ki(4,2,s,o)},useMemo:function(s,o){var i=Th();return o=void 0===o?null:o,s=s(),i.memoizedState=[s,o],s},useReducer:function(s,o,i){var a=Th();return o=void 0!==i?i(o):o,a.memoizedState=a.baseState=o,s={pending:null,interleaved:null,lanes:0,dispatch:null,lastRenderedReducer:s,lastRenderedState:o},a.queue=s,s=s.dispatch=xi.bind(null,ss,s),[a.memoizedState,s]},useRef:function(s){return s={current:s},Th().memoizedState=s},useState:hi,useDebugValue:ri,useDeferredValue:function(s){return Th().memoizedState=s},useTransition:function(){var s=hi(!1),o=s[0];return s=vi.bind(null,s[1]),Th().memoizedState=s,[o,s]},useMutableSource:function(){},useSyncExternalStore:function(s,o,i){var a=ss,u=Th();if(Fn){if(void 0===i)throw Error(p(407));i=i()}else{if(i=o(),null===Fs)throw Error(p(349));30&ns||di(a,o,i)}u.memoizedState=i;var _={value:i,getSnapshot:o};return u.queue=_,mi(ai.bind(null,a,_,s),[s]),a.flags|=2048,bi(9,ci.bind(null,a,_,i,o),void 0,null),i},useId:function(){var s=Th(),o=Fs.identifierPrefix;if(Fn){var i=Rn;o=":"+o+"R"+(i=(Mn&~(1<<32-Et(Mn)-1)).toString(32)+i),0<(i=us++)&&(o+="H"+i.toString(32)),o+=":"}else o=":"+o+"r"+(i=ps++).toString(32)+":";return s.memoizedState=o},unstable_isNewReconciler:!1},fs={readContext:eh,useCallback:si,useContext:eh,useEffect:$h,useImperativeHandle:qi,useInsertionEffect:ni,useLayoutEffect:oi,useMemo:ti,useReducer:Wh,useRef:ji,useState:function(){return Wh(Vh)},useDebugValue:ri,useDeferredValue:function(s){return ui(Uh(),os.memoizedState,s)},useTransition:function(){return[Wh(Vh)[0],Uh().memoizedState]},useMutableSource:Yh,useSyncExternalStore:Zh,useId:wi,unstable_isNewReconciler:!1},ms={readContext:eh,useCallback:si,useContext:eh,useEffect:$h,useImperativeHandle:qi,useInsertionEffect:ni,useLayoutEffect:oi,useMemo:ti,useReducer:Xh,useRef:ji,useState:function(){return Xh(Vh)},useDebugValue:ri,useDeferredValue:function(s){var o=Uh();return null===os?o.memoizedState=s:ui(o,os.memoizedState,s)},useTransition:function(){return[Xh(Vh)[0],Uh().memoizedState]},useMutableSource:Yh,useSyncExternalStore:Zh,useId:wi,unstable_isNewReconciler:!1};function Ci(s,o){if(s&&s.defaultProps){for(var i in o=we({},o),s=s.defaultProps)void 0===o[i]&&(o[i]=s[i]);return o}return o}function Di(s,o,i,a){i=null==(i=i(a,o=s.memoizedState))?o:we({},o,i),s.memoizedState=i,0===s.lanes&&(s.updateQueue.baseState=i)}var gs={isMounted:function(s){return!!(s=s._reactInternals)&&Vb(s)===s},enqueueSetState:function(s,o,i){s=s._reactInternals;var a=R(),u=yi(s),_=mh(a,u);_.payload=o,null!=i&&(_.callback=i),null!==(o=nh(s,_,u))&&(gi(o,s,u,a),oh(o,s,u))},enqueueReplaceState:function(s,o,i){s=s._reactInternals;var a=R(),u=yi(s),_=mh(a,u);_.tag=1,_.payload=o,null!=i&&(_.callback=i),null!==(o=nh(s,_,u))&&(gi(o,s,u,a),oh(o,s,u))},enqueueForceUpdate:function(s,o){s=s._reactInternals;var i=R(),a=yi(s),u=mh(i,a);u.tag=2,null!=o&&(u.callback=o),null!==(o=nh(s,u,a))&&(gi(o,s,a,i),oh(o,s,a))}};function Fi(s,o,i,a,u,_,w){return"function"==typeof(s=s.stateNode).shouldComponentUpdate?s.shouldComponentUpdate(a,_,w):!o.prototype||!o.prototype.isPureReactComponent||(!Ie(i,a)||!Ie(u,_))}function Gi(s,o,i){var a=!1,u=_n,_=o.contextType;return"object"==typeof _&&null!==_?_=eh(_):(u=Zf(o)?wn:Sn.current,_=(a=null!=(a=o.contextTypes))?Yf(s,u):_n),o=new o(i,_),s.memoizedState=null!==o.state&&void 0!==o.state?o.state:null,o.updater=gs,s.stateNode=o,o._reactInternals=s,a&&((s=s.stateNode).__reactInternalMemoizedUnmaskedChildContext=u,s.__reactInternalMemoizedMaskedChildContext=_),o}function Hi(s,o,i,a){s=o.state,"function"==typeof o.componentWillReceiveProps&&o.componentWillReceiveProps(i,a),"function"==typeof o.UNSAFE_componentWillReceiveProps&&o.UNSAFE_componentWillReceiveProps(i,a),o.state!==s&&gs.enqueueReplaceState(o,o.state,null)}function Ii(s,o,i,a){var u=s.stateNode;u.props=i,u.state=s.memoizedState,u.refs={},kh(s);var _=o.contextType;"object"==typeof _&&null!==_?u.context=eh(_):(_=Zf(o)?wn:Sn.current,u.context=Yf(s,_)),u.state=s.memoizedState,"function"==typeof(_=o.getDerivedStateFromProps)&&(Di(s,o,_,i),u.state=s.memoizedState),"function"==typeof o.getDerivedStateFromProps||"function"==typeof u.getSnapshotBeforeUpdate||"function"!=typeof u.UNSAFE_componentWillMount&&"function"!=typeof u.componentWillMount||(o=u.state,"function"==typeof u.componentWillMount&&u.componentWillMount(),"function"==typeof u.UNSAFE_componentWillMount&&u.UNSAFE_componentWillMount(),o!==u.state&&gs.enqueueReplaceState(u,u.state,null),qh(s,i,u,a),u.state=s.memoizedState),"function"==typeof u.componentDidMount&&(s.flags|=4194308)}function Ji(s,o){try{var i="",a=o;do{i+=Pa(a),a=a.return}while(a);var u=i}catch(s){u="\nError generating stack: "+s.message+"\n"+s.stack}return{value:s,source:o,stack:u,digest:null}}function Ki(s,o,i){return{value:s,source:null,stack:null!=i?i:null,digest:null!=o?o:null}}function Li(s,o){try{console.error(o.value)}catch(s){setTimeout((function(){throw s}))}}var ys="function"==typeof WeakMap?WeakMap:Map;function Ni(s,o,i){(i=mh(-1,i)).tag=3,i.payload={element:null};var a=o.value;return i.callback=function(){Zs||(Zs=!0,eo=a),Li(0,o)},i}function Qi(s,o,i){(i=mh(-1,i)).tag=3;var a=s.type.getDerivedStateFromError;if("function"==typeof a){var u=o.value;i.payload=function(){return a(u)},i.callback=function(){Li(0,o)}}var _=s.stateNode;return null!==_&&"function"==typeof _.componentDidCatch&&(i.callback=function(){Li(0,o),"function"!=typeof a&&(null===to?to=new Set([this]):to.add(this));var s=o.stack;this.componentDidCatch(o.value,{componentStack:null!==s?s:""})}),i}function Si(s,o,i){var a=s.pingCache;if(null===a){a=s.pingCache=new ys;var u=new Set;a.set(o,u)}else void 0===(u=a.get(o))&&(u=new Set,a.set(o,u));u.has(i)||(u.add(i),s=Ti.bind(null,s,o,i),o.then(s,s))}function Ui(s){do{var o;if((o=13===s.tag)&&(o=null===(o=s.memoizedState)||null!==o.dehydrated),o)return s;s=s.return}while(null!==s);return null}function Vi(s,o,i,a,u){return 1&s.mode?(s.flags|=65536,s.lanes=u,s):(s===o?s.flags|=65536:(s.flags|=128,i.flags|=131072,i.flags&=-52805,1===i.tag&&(null===i.alternate?i.tag=17:((o=mh(-1,1)).tag=2,nh(i,o,1))),i.lanes|=1),s)}var vs=V.ReactCurrentOwner,bs=!1;function Xi(s,o,i,a){o.child=null===s?Un(o,null,i,a):qn(o,s.child,i,a)}function Yi(s,o,i,a,u){i=i.render;var _=o.ref;return ch(o,u),a=Nh(s,o,i,a,_,u),i=Sh(),null===s||bs?(Fn&&i&&vg(o),o.flags|=1,Xi(s,o,a,u),o.child):(o.updateQueue=s.updateQueue,o.flags&=-2053,s.lanes&=~u,Zi(s,o,u))}function $i(s,o,i,a,u){if(null===s){var _=i.type;return"function"!=typeof _||aj(_)||void 0!==_.defaultProps||null!==i.compare||void 0!==i.defaultProps?((s=Rg(i.type,null,a,o,o.mode,u)).ref=o.ref,s.return=o,o.child=s):(o.tag=15,o.type=_,bj(s,o,_,a,u))}if(_=s.child,!(s.lanes&u)){var w=_.memoizedProps;if((i=null!==(i=i.compare)?i:Ie)(w,a)&&s.ref===o.ref)return Zi(s,o,u)}return o.flags|=1,(s=Pg(_,a)).ref=o.ref,s.return=o,o.child=s}function bj(s,o,i,a,u){if(null!==s){var _=s.memoizedProps;if(Ie(_,a)&&s.ref===o.ref){if(bs=!1,o.pendingProps=a=_,!(s.lanes&u))return o.lanes=s.lanes,Zi(s,o,u);131072&s.flags&&(bs=!0)}}return cj(s,o,i,a,u)}function dj(s,o,i){var a=o.pendingProps,u=a.children,_=null!==s?s.memoizedState:null;if("hidden"===a.mode)if(1&o.mode){if(!(1073741824&i))return s=null!==_?_.baseLanes|i:i,o.lanes=o.childLanes=1073741824,o.memoizedState={baseLanes:s,cachePool:null,transitions:null},o.updateQueue=null,G(Us,qs),qs|=s,null;o.memoizedState={baseLanes:0,cachePool:null,transitions:null},a=null!==_?_.baseLanes:i,G(Us,qs),qs|=a}else o.memoizedState={baseLanes:0,cachePool:null,transitions:null},G(Us,qs),qs|=i;else null!==_?(a=_.baseLanes|i,o.memoizedState=null):a=i,G(Us,qs),qs|=a;return Xi(s,o,u,i),o.child}function gj(s,o){var i=o.ref;(null===s&&null!==i||null!==s&&s.ref!==i)&&(o.flags|=512,o.flags|=2097152)}function cj(s,o,i,a,u){var _=Zf(i)?wn:Sn.current;return _=Yf(o,_),ch(o,u),i=Nh(s,o,i,a,_,u),a=Sh(),null===s||bs?(Fn&&a&&vg(o),o.flags|=1,Xi(s,o,i,u),o.child):(o.updateQueue=s.updateQueue,o.flags&=-2053,s.lanes&=~u,Zi(s,o,u))}function hj(s,o,i,a,u){if(Zf(i)){var _=!0;cg(o)}else _=!1;if(ch(o,u),null===o.stateNode)ij(s,o),Gi(o,i,a),Ii(o,i,a,u),a=!0;else if(null===s){var w=o.stateNode,x=o.memoizedProps;w.props=x;var C=w.context,j=i.contextType;"object"==typeof j&&null!==j?j=eh(j):j=Yf(o,j=Zf(i)?wn:Sn.current);var L=i.getDerivedStateFromProps,B="function"==typeof L||"function"==typeof w.getSnapshotBeforeUpdate;B||"function"!=typeof w.UNSAFE_componentWillReceiveProps&&"function"!=typeof w.componentWillReceiveProps||(x!==a||C!==j)&&Hi(o,w,a,j),Kn=!1;var $=o.memoizedState;w.state=$,qh(o,a,w,u),C=o.memoizedState,x!==a||$!==C||En.current||Kn?("function"==typeof L&&(Di(o,i,L,a),C=o.memoizedState),(x=Kn||Fi(o,i,x,a,$,C,j))?(B||"function"!=typeof w.UNSAFE_componentWillMount&&"function"!=typeof w.componentWillMount||("function"==typeof w.componentWillMount&&w.componentWillMount(),"function"==typeof w.UNSAFE_componentWillMount&&w.UNSAFE_componentWillMount()),"function"==typeof w.componentDidMount&&(o.flags|=4194308)):("function"==typeof w.componentDidMount&&(o.flags|=4194308),o.memoizedProps=a,o.memoizedState=C),w.props=a,w.state=C,w.context=j,a=x):("function"==typeof w.componentDidMount&&(o.flags|=4194308),a=!1)}else{w=o.stateNode,lh(s,o),x=o.memoizedProps,j=o.type===o.elementType?x:Ci(o.type,x),w.props=j,B=o.pendingProps,$=w.context,"object"==typeof(C=i.contextType)&&null!==C?C=eh(C):C=Yf(o,C=Zf(i)?wn:Sn.current);var U=i.getDerivedStateFromProps;(L="function"==typeof U||"function"==typeof w.getSnapshotBeforeUpdate)||"function"!=typeof w.UNSAFE_componentWillReceiveProps&&"function"!=typeof w.componentWillReceiveProps||(x!==B||$!==C)&&Hi(o,w,a,C),Kn=!1,$=o.memoizedState,w.state=$,qh(o,a,w,u);var V=o.memoizedState;x!==B||$!==V||En.current||Kn?("function"==typeof U&&(Di(o,i,U,a),V=o.memoizedState),(j=Kn||Fi(o,i,j,a,$,V,C)||!1)?(L||"function"!=typeof w.UNSAFE_componentWillUpdate&&"function"!=typeof w.componentWillUpdate||("function"==typeof w.componentWillUpdate&&w.componentWillUpdate(a,V,C),"function"==typeof w.UNSAFE_componentWillUpdate&&w.UNSAFE_componentWillUpdate(a,V,C)),"function"==typeof w.componentDidUpdate&&(o.flags|=4),"function"==typeof w.getSnapshotBeforeUpdate&&(o.flags|=1024)):("function"!=typeof w.componentDidUpdate||x===s.memoizedProps&&$===s.memoizedState||(o.flags|=4),"function"!=typeof w.getSnapshotBeforeUpdate||x===s.memoizedProps&&$===s.memoizedState||(o.flags|=1024),o.memoizedProps=a,o.memoizedState=V),w.props=a,w.state=V,w.context=C,a=j):("function"!=typeof w.componentDidUpdate||x===s.memoizedProps&&$===s.memoizedState||(o.flags|=4),"function"!=typeof w.getSnapshotBeforeUpdate||x===s.memoizedProps&&$===s.memoizedState||(o.flags|=1024),a=!1)}return jj(s,o,i,a,_,u)}function jj(s,o,i,a,u,_){gj(s,o);var w=!!(128&o.flags);if(!a&&!w)return u&&dg(o,i,!1),Zi(s,o,_);a=o.stateNode,vs.current=o;var x=w&&"function"!=typeof i.getDerivedStateFromError?null:a.render();return o.flags|=1,null!==s&&w?(o.child=qn(o,s.child,null,_),o.child=qn(o,null,x,_)):Xi(s,o,x,_),o.memoizedState=a.state,u&&dg(o,i,!0),o.child}function kj(s){var o=s.stateNode;o.pendingContext?ag(0,o.pendingContext,o.pendingContext!==o.context):o.context&&ag(0,o.context,!1),yh(s,o.containerInfo)}function lj(s,o,i,a,u){return Ig(),Jg(u),o.flags|=256,Xi(s,o,i,a),o.child}var _s,Ss,Es,ws,xs={dehydrated:null,treeContext:null,retryLane:0};function nj(s){return{baseLanes:s,cachePool:null,transitions:null}}function oj(s,o,i){var a,u=o.pendingProps,_=Zn.current,w=!1,x=!!(128&o.flags);if((a=x)||(a=(null===s||null!==s.memoizedState)&&!!(2&_)),a?(w=!0,o.flags&=-129):null!==s&&null===s.memoizedState||(_|=1),G(Zn,1&_),null===s)return Eg(o),null!==(s=o.memoizedState)&&null!==(s=s.dehydrated)?(1&o.mode?"$!"===s.data?o.lanes=8:o.lanes=1073741824:o.lanes=1,null):(x=u.children,s=u.fallback,w?(u=o.mode,w=o.child,x={mode:"hidden",children:x},1&u||null===w?w=pj(x,u,0,null):(w.childLanes=0,w.pendingProps=x),s=Tg(s,u,i,null),w.return=o,s.return=o,w.sibling=s,o.child=w,o.child.memoizedState=nj(i),o.memoizedState=xs,s):qj(o,x));if(null!==(_=s.memoizedState)&&null!==(a=_.dehydrated))return function rj(s,o,i,a,u,_,w){if(i)return 256&o.flags?(o.flags&=-257,sj(s,o,w,a=Ki(Error(p(422))))):null!==o.memoizedState?(o.child=s.child,o.flags|=128,null):(_=a.fallback,u=o.mode,a=pj({mode:"visible",children:a.children},u,0,null),(_=Tg(_,u,w,null)).flags|=2,a.return=o,_.return=o,a.sibling=_,o.child=a,1&o.mode&&qn(o,s.child,null,w),o.child.memoizedState=nj(w),o.memoizedState=xs,_);if(!(1&o.mode))return sj(s,o,w,null);if("$!"===u.data){if(a=u.nextSibling&&u.nextSibling.dataset)var x=a.dgst;return a=x,sj(s,o,w,a=Ki(_=Error(p(419)),a,void 0))}if(x=!!(w&s.childLanes),bs||x){if(null!==(a=Fs)){switch(w&-w){case 4:u=2;break;case 16:u=8;break;case 64:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:case 4194304:case 8388608:case 16777216:case 33554432:case 67108864:u=32;break;case 536870912:u=268435456;break;default:u=0}0!==(u=u&(a.suspendedLanes|w)?0:u)&&u!==_.retryLane&&(_.retryLane=u,ih(s,u),gi(a,s,u,-1))}return tj(),sj(s,o,w,a=Ki(Error(p(421))))}return"$?"===u.data?(o.flags|=128,o.child=s.child,o=uj.bind(null,s),u._reactRetry=o,null):(s=_.treeContext,Ln=Lf(u.nextSibling),Dn=o,Fn=!0,Bn=null,null!==s&&(In[Tn++]=Mn,In[Tn++]=Rn,In[Tn++]=Nn,Mn=s.id,Rn=s.overflow,Nn=o),o=qj(o,a.children),o.flags|=4096,o)}(s,o,x,u,a,_,i);if(w){w=u.fallback,x=o.mode,a=(_=s.child).sibling;var C={mode:"hidden",children:u.children};return 1&x||o.child===_?(u=Pg(_,C)).subtreeFlags=14680064&_.subtreeFlags:((u=o.child).childLanes=0,u.pendingProps=C,o.deletions=null),null!==a?w=Pg(a,w):(w=Tg(w,x,i,null)).flags|=2,w.return=o,u.return=o,u.sibling=w,o.child=u,u=w,w=o.child,x=null===(x=s.child.memoizedState)?nj(i):{baseLanes:x.baseLanes|i,cachePool:null,transitions:x.transitions},w.memoizedState=x,w.childLanes=s.childLanes&~i,o.memoizedState=xs,u}return s=(w=s.child).sibling,u=Pg(w,{mode:"visible",children:u.children}),!(1&o.mode)&&(u.lanes=i),u.return=o,u.sibling=null,null!==s&&(null===(i=o.deletions)?(o.deletions=[s],o.flags|=16):i.push(s)),o.child=u,o.memoizedState=null,u}function qj(s,o){return(o=pj({mode:"visible",children:o},s.mode,0,null)).return=s,s.child=o}function sj(s,o,i,a){return null!==a&&Jg(a),qn(o,s.child,null,i),(s=qj(o,o.pendingProps.children)).flags|=2,o.memoizedState=null,s}function vj(s,o,i){s.lanes|=o;var a=s.alternate;null!==a&&(a.lanes|=o),bh(s.return,o,i)}function wj(s,o,i,a,u){var _=s.memoizedState;null===_?s.memoizedState={isBackwards:o,rendering:null,renderingStartTime:0,last:a,tail:i,tailMode:u}:(_.isBackwards=o,_.rendering=null,_.renderingStartTime=0,_.last=a,_.tail=i,_.tailMode=u)}function xj(s,o,i){var a=o.pendingProps,u=a.revealOrder,_=a.tail;if(Xi(s,o,a.children,i),2&(a=Zn.current))a=1&a|2,o.flags|=128;else{if(null!==s&&128&s.flags)e:for(s=o.child;null!==s;){if(13===s.tag)null!==s.memoizedState&&vj(s,i,o);else if(19===s.tag)vj(s,i,o);else if(null!==s.child){s.child.return=s,s=s.child;continue}if(s===o)break e;for(;null===s.sibling;){if(null===s.return||s.return===o)break e;s=s.return}s.sibling.return=s.return,s=s.sibling}a&=1}if(G(Zn,a),1&o.mode)switch(u){case"forwards":for(i=o.child,u=null;null!==i;)null!==(s=i.alternate)&&null===Ch(s)&&(u=i),i=i.sibling;null===(i=u)?(u=o.child,o.child=null):(u=i.sibling,i.sibling=null),wj(o,!1,u,i,_);break;case"backwards":for(i=null,u=o.child,o.child=null;null!==u;){if(null!==(s=u.alternate)&&null===Ch(s)){o.child=u;break}s=u.sibling,u.sibling=i,i=u,u=s}wj(o,!0,i,null,_);break;case"together":wj(o,!1,null,null,void 0);break;default:o.memoizedState=null}else o.memoizedState=null;return o.child}function ij(s,o){!(1&o.mode)&&null!==s&&(s.alternate=null,o.alternate=null,o.flags|=2)}function Zi(s,o,i){if(null!==s&&(o.dependencies=s.dependencies),Ws|=o.lanes,!(i&o.childLanes))return null;if(null!==s&&o.child!==s.child)throw Error(p(153));if(null!==o.child){for(i=Pg(s=o.child,s.pendingProps),o.child=i,i.return=o;null!==s.sibling;)s=s.sibling,(i=i.sibling=Pg(s,s.pendingProps)).return=o;i.sibling=null}return o.child}function Dj(s,o){if(!Fn)switch(s.tailMode){case"hidden":o=s.tail;for(var i=null;null!==o;)null!==o.alternate&&(i=o),o=o.sibling;null===i?s.tail=null:i.sibling=null;break;case"collapsed":i=s.tail;for(var a=null;null!==i;)null!==i.alternate&&(a=i),i=i.sibling;null===a?o||null===s.tail?s.tail=null:s.tail.sibling=null:a.sibling=null}}function S(s){var o=null!==s.alternate&&s.alternate.child===s.child,i=0,a=0;if(o)for(var u=s.child;null!==u;)i|=u.lanes|u.childLanes,a|=14680064&u.subtreeFlags,a|=14680064&u.flags,u.return=s,u=u.sibling;else for(u=s.child;null!==u;)i|=u.lanes|u.childLanes,a|=u.subtreeFlags,a|=u.flags,u.return=s,u=u.sibling;return s.subtreeFlags|=a,s.childLanes=i,o}function Ej(s,o,i){var a=o.pendingProps;switch(wg(o),o.tag){case 2:case 16:case 15:case 0:case 11:case 7:case 8:case 12:case 9:case 14:return S(o),null;case 1:case 17:return Zf(o.type)&&$f(),S(o),null;case 3:return a=o.stateNode,zh(),E(En),E(Sn),Eh(),a.pendingContext&&(a.context=a.pendingContext,a.pendingContext=null),null!==s&&null!==s.child||(Gg(o)?o.flags|=4:null===s||s.memoizedState.isDehydrated&&!(256&o.flags)||(o.flags|=1024,null!==Bn&&(Fj(Bn),Bn=null))),Ss(s,o),S(o),null;case 5:Bh(o);var u=xh(Qn.current);if(i=o.type,null!==s&&null!=o.stateNode)Es(s,o,i,a,u),s.ref!==o.ref&&(o.flags|=512,o.flags|=2097152);else{if(!a){if(null===o.stateNode)throw Error(p(166));return S(o),null}if(s=xh(Yn.current),Gg(o)){a=o.stateNode,i=o.type;var _=o.memoizedProps;switch(a[hn]=o,a[dn]=_,s=!!(1&o.mode),i){case"dialog":D("cancel",a),D("close",a);break;case"iframe":case"object":case"embed":D("load",a);break;case"video":case"audio":for(u=0;u<Zr.length;u++)D(Zr[u],a);break;case"source":D("error",a);break;case"img":case"image":case"link":D("error",a),D("load",a);break;case"details":D("toggle",a);break;case"input":Za(a,_),D("invalid",a);break;case"select":a._wrapperState={wasMultiple:!!_.multiple},D("invalid",a);break;case"textarea":hb(a,_),D("invalid",a)}for(var x in ub(i,_),u=null,_)if(_.hasOwnProperty(x)){var C=_[x];"children"===x?"string"==typeof C?a.textContent!==C&&(!0!==_.suppressHydrationWarning&&Af(a.textContent,C,s),u=["children",C]):"number"==typeof C&&a.textContent!==""+C&&(!0!==_.suppressHydrationWarning&&Af(a.textContent,C,s),u=["children",""+C]):w.hasOwnProperty(x)&&null!=C&&"onScroll"===x&&D("scroll",a)}switch(i){case"input":Va(a),db(a,_,!0);break;case"textarea":Va(a),jb(a);break;case"select":case"option":break;default:"function"==typeof _.onClick&&(a.onclick=Bf)}a=u,o.updateQueue=a,null!==a&&(o.flags|=4)}else{x=9===u.nodeType?u:u.ownerDocument,"http://www.w3.org/1999/xhtml"===s&&(s=kb(i)),"http://www.w3.org/1999/xhtml"===s?"script"===i?((s=x.createElement("div")).innerHTML="<script><\/script>",s=s.removeChild(s.firstChild)):"string"==typeof a.is?s=x.createElement(i,{is:a.is}):(s=x.createElement(i),"select"===i&&(x=s,a.multiple?x.multiple=!0:a.size&&(x.size=a.size))):s=x.createElementNS(s,i),s[hn]=o,s[dn]=a,_s(s,o,!1,!1),o.stateNode=s;e:{switch(x=vb(i,a),i){case"dialog":D("cancel",s),D("close",s),u=a;break;case"iframe":case"object":case"embed":D("load",s),u=a;break;case"video":case"audio":for(u=0;u<Zr.length;u++)D(Zr[u],s);u=a;break;case"source":D("error",s),u=a;break;case"img":case"image":case"link":D("error",s),D("load",s),u=a;break;case"details":D("toggle",s),u=a;break;case"input":Za(s,a),u=Ya(s,a),D("invalid",s);break;case"option":default:u=a;break;case"select":s._wrapperState={wasMultiple:!!a.multiple},u=we({},a,{value:void 0}),D("invalid",s);break;case"textarea":hb(s,a),u=gb(s,a),D("invalid",s)}for(_ in ub(i,u),C=u)if(C.hasOwnProperty(_)){var j=C[_];"style"===_?sb(s,j):"dangerouslySetInnerHTML"===_?null!=(j=j?j.__html:void 0)&&$e(s,j):"children"===_?"string"==typeof j?("textarea"!==i||""!==j)&&ob(s,j):"number"==typeof j&&ob(s,""+j):"suppressContentEditableWarning"!==_&&"suppressHydrationWarning"!==_&&"autoFocus"!==_&&(w.hasOwnProperty(_)?null!=j&&"onScroll"===_&&D("scroll",s):null!=j&&ta(s,_,j,x))}switch(i){case"input":Va(s),db(s,a,!1);break;case"textarea":Va(s),jb(s);break;case"option":null!=a.value&&s.setAttribute("value",""+Sa(a.value));break;case"select":s.multiple=!!a.multiple,null!=(_=a.value)?fb(s,!!a.multiple,_,!1):null!=a.defaultValue&&fb(s,!!a.multiple,a.defaultValue,!0);break;default:"function"==typeof u.onClick&&(s.onclick=Bf)}switch(i){case"button":case"input":case"select":case"textarea":a=!!a.autoFocus;break e;case"img":a=!0;break e;default:a=!1}}a&&(o.flags|=4)}null!==o.ref&&(o.flags|=512,o.flags|=2097152)}return S(o),null;case 6:if(s&&null!=o.stateNode)ws(s,o,s.memoizedProps,a);else{if("string"!=typeof a&&null===o.stateNode)throw Error(p(166));if(i=xh(Qn.current),xh(Yn.current),Gg(o)){if(a=o.stateNode,i=o.memoizedProps,a[hn]=o,(_=a.nodeValue!==i)&&null!==(s=Dn))switch(s.tag){case 3:Af(a.nodeValue,i,!!(1&s.mode));break;case 5:!0!==s.memoizedProps.suppressHydrationWarning&&Af(a.nodeValue,i,!!(1&s.mode))}_&&(o.flags|=4)}else(a=(9===i.nodeType?i:i.ownerDocument).createTextNode(a))[hn]=o,o.stateNode=a}return S(o),null;case 13:if(E(Zn),a=o.memoizedState,null===s||null!==s.memoizedState&&null!==s.memoizedState.dehydrated){if(Fn&&null!==Ln&&1&o.mode&&!(128&o.flags))Hg(),Ig(),o.flags|=98560,_=!1;else if(_=Gg(o),null!==a&&null!==a.dehydrated){if(null===s){if(!_)throw Error(p(318));if(!(_=null!==(_=o.memoizedState)?_.dehydrated:null))throw Error(p(317));_[hn]=o}else Ig(),!(128&o.flags)&&(o.memoizedState=null),o.flags|=4;S(o),_=!1}else null!==Bn&&(Fj(Bn),Bn=null),_=!0;if(!_)return 65536&o.flags?o:null}return 128&o.flags?(o.lanes=i,o):((a=null!==a)!==(null!==s&&null!==s.memoizedState)&&a&&(o.child.flags|=8192,1&o.mode&&(null===s||1&Zn.current?0===Vs&&(Vs=3):tj())),null!==o.updateQueue&&(o.flags|=4),S(o),null);case 4:return zh(),Ss(s,o),null===s&&sf(o.stateNode.containerInfo),S(o),null;case 10:return ah(o.type._context),S(o),null;case 19:if(E(Zn),null===(_=o.memoizedState))return S(o),null;if(a=!!(128&o.flags),null===(x=_.rendering))if(a)Dj(_,!1);else{if(0!==Vs||null!==s&&128&s.flags)for(s=o.child;null!==s;){if(null!==(x=Ch(s))){for(o.flags|=128,Dj(_,!1),null!==(a=x.updateQueue)&&(o.updateQueue=a,o.flags|=4),o.subtreeFlags=0,a=i,i=o.child;null!==i;)s=a,(_=i).flags&=14680066,null===(x=_.alternate)?(_.childLanes=0,_.lanes=s,_.child=null,_.subtreeFlags=0,_.memoizedProps=null,_.memoizedState=null,_.updateQueue=null,_.dependencies=null,_.stateNode=null):(_.childLanes=x.childLanes,_.lanes=x.lanes,_.child=x.child,_.subtreeFlags=0,_.deletions=null,_.memoizedProps=x.memoizedProps,_.memoizedState=x.memoizedState,_.updateQueue=x.updateQueue,_.type=x.type,s=x.dependencies,_.dependencies=null===s?null:{lanes:s.lanes,firstContext:s.firstContext}),i=i.sibling;return G(Zn,1&Zn.current|2),o.child}s=s.sibling}null!==_.tail&&ht()>Xs&&(o.flags|=128,a=!0,Dj(_,!1),o.lanes=4194304)}else{if(!a)if(null!==(s=Ch(x))){if(o.flags|=128,a=!0,null!==(i=s.updateQueue)&&(o.updateQueue=i,o.flags|=4),Dj(_,!0),null===_.tail&&"hidden"===_.tailMode&&!x.alternate&&!Fn)return S(o),null}else 2*ht()-_.renderingStartTime>Xs&&1073741824!==i&&(o.flags|=128,a=!0,Dj(_,!1),o.lanes=4194304);_.isBackwards?(x.sibling=o.child,o.child=x):(null!==(i=_.last)?i.sibling=x:o.child=x,_.last=x)}return null!==_.tail?(o=_.tail,_.rendering=o,_.tail=o.sibling,_.renderingStartTime=ht(),o.sibling=null,i=Zn.current,G(Zn,a?1&i|2:1&i),o):(S(o),null);case 22:case 23:return Hj(),a=null!==o.memoizedState,null!==s&&null!==s.memoizedState!==a&&(o.flags|=8192),a&&1&o.mode?!!(1073741824&qs)&&(S(o),6&o.subtreeFlags&&(o.flags|=8192)):S(o),null;case 24:case 25:return null}throw Error(p(156,o.tag))}function Ij(s,o){switch(wg(o),o.tag){case 1:return Zf(o.type)&&$f(),65536&(s=o.flags)?(o.flags=-65537&s|128,o):null;case 3:return zh(),E(En),E(Sn),Eh(),65536&(s=o.flags)&&!(128&s)?(o.flags=-65537&s|128,o):null;case 5:return Bh(o),null;case 13:if(E(Zn),null!==(s=o.memoizedState)&&null!==s.dehydrated){if(null===o.alternate)throw Error(p(340));Ig()}return 65536&(s=o.flags)?(o.flags=-65537&s|128,o):null;case 19:return E(Zn),null;case 4:return zh(),null;case 10:return ah(o.type._context),null;case 22:case 23:return Hj(),null;default:return null}}_s=function(s,o){for(var i=o.child;null!==i;){if(5===i.tag||6===i.tag)s.appendChild(i.stateNode);else if(4!==i.tag&&null!==i.child){i.child.return=i,i=i.child;continue}if(i===o)break;for(;null===i.sibling;){if(null===i.return||i.return===o)return;i=i.return}i.sibling.return=i.return,i=i.sibling}},Ss=function(){},Es=function(s,o,i,a){var u=s.memoizedProps;if(u!==a){s=o.stateNode,xh(Yn.current);var _,x=null;switch(i){case"input":u=Ya(s,u),a=Ya(s,a),x=[];break;case"select":u=we({},u,{value:void 0}),a=we({},a,{value:void 0}),x=[];break;case"textarea":u=gb(s,u),a=gb(s,a),x=[];break;default:"function"!=typeof u.onClick&&"function"==typeof a.onClick&&(s.onclick=Bf)}for(L in ub(i,a),i=null,u)if(!a.hasOwnProperty(L)&&u.hasOwnProperty(L)&&null!=u[L])if("style"===L){var C=u[L];for(_ in C)C.hasOwnProperty(_)&&(i||(i={}),i[_]="")}else"dangerouslySetInnerHTML"!==L&&"children"!==L&&"suppressContentEditableWarning"!==L&&"suppressHydrationWarning"!==L&&"autoFocus"!==L&&(w.hasOwnProperty(L)?x||(x=[]):(x=x||[]).push(L,null));for(L in a){var j=a[L];if(C=null!=u?u[L]:void 0,a.hasOwnProperty(L)&&j!==C&&(null!=j||null!=C))if("style"===L)if(C){for(_ in C)!C.hasOwnProperty(_)||j&&j.hasOwnProperty(_)||(i||(i={}),i[_]="");for(_ in j)j.hasOwnProperty(_)&&C[_]!==j[_]&&(i||(i={}),i[_]=j[_])}else i||(x||(x=[]),x.push(L,i)),i=j;else"dangerouslySetInnerHTML"===L?(j=j?j.__html:void 0,C=C?C.__html:void 0,null!=j&&C!==j&&(x=x||[]).push(L,j)):"children"===L?"string"!=typeof j&&"number"!=typeof j||(x=x||[]).push(L,""+j):"suppressContentEditableWarning"!==L&&"suppressHydrationWarning"!==L&&(w.hasOwnProperty(L)?(null!=j&&"onScroll"===L&&D("scroll",s),x||C===j||(x=[])):(x=x||[]).push(L,j))}i&&(x=x||[]).push("style",i);var L=x;(o.updateQueue=L)&&(o.flags|=4)}},ws=function(s,o,i,a){i!==a&&(o.flags|=4)};var ks=!1,Os=!1,As="function"==typeof WeakSet?WeakSet:Set,Cs=null;function Lj(s,o){var i=s.ref;if(null!==i)if("function"==typeof i)try{i(null)}catch(i){W(s,o,i)}else i.current=null}function Mj(s,o,i){try{i()}catch(i){W(s,o,i)}}var js=!1;function Pj(s,o,i){var a=o.updateQueue;if(null!==(a=null!==a?a.lastEffect:null)){var u=a=a.next;do{if((u.tag&s)===s){var _=u.destroy;u.destroy=void 0,void 0!==_&&Mj(o,i,_)}u=u.next}while(u!==a)}}function Qj(s,o){if(null!==(o=null!==(o=o.updateQueue)?o.lastEffect:null)){var i=o=o.next;do{if((i.tag&s)===s){var a=i.create;i.destroy=a()}i=i.next}while(i!==o)}}function Rj(s){var o=s.ref;if(null!==o){var i=s.stateNode;s.tag,s=i,"function"==typeof o?o(s):o.current=s}}function Sj(s){var o=s.alternate;null!==o&&(s.alternate=null,Sj(o)),s.child=null,s.deletions=null,s.sibling=null,5===s.tag&&(null!==(o=s.stateNode)&&(delete o[hn],delete o[dn],delete o[mn],delete o[gn],delete o[yn])),s.stateNode=null,s.return=null,s.dependencies=null,s.memoizedProps=null,s.memoizedState=null,s.pendingProps=null,s.stateNode=null,s.updateQueue=null}function Tj(s){return 5===s.tag||3===s.tag||4===s.tag}function Uj(s){e:for(;;){for(;null===s.sibling;){if(null===s.return||Tj(s.return))return null;s=s.return}for(s.sibling.return=s.return,s=s.sibling;5!==s.tag&&6!==s.tag&&18!==s.tag;){if(2&s.flags)continue e;if(null===s.child||4===s.tag)continue e;s.child.return=s,s=s.child}if(!(2&s.flags))return s.stateNode}}function Vj(s,o,i){var a=s.tag;if(5===a||6===a)s=s.stateNode,o?8===i.nodeType?i.parentNode.insertBefore(s,o):i.insertBefore(s,o):(8===i.nodeType?(o=i.parentNode).insertBefore(s,i):(o=i).appendChild(s),null!=(i=i._reactRootContainer)||null!==o.onclick||(o.onclick=Bf));else if(4!==a&&null!==(s=s.child))for(Vj(s,o,i),s=s.sibling;null!==s;)Vj(s,o,i),s=s.sibling}function Wj(s,o,i){var a=s.tag;if(5===a||6===a)s=s.stateNode,o?i.insertBefore(s,o):i.appendChild(s);else if(4!==a&&null!==(s=s.child))for(Wj(s,o,i),s=s.sibling;null!==s;)Wj(s,o,i),s=s.sibling}var Ps=null,Is=!1;function Yj(s,o,i){for(i=i.child;null!==i;)Zj(s,o,i),i=i.sibling}function Zj(s,o,i){if(St&&"function"==typeof St.onCommitFiberUnmount)try{St.onCommitFiberUnmount(_t,i)}catch(s){}switch(i.tag){case 5:Os||Lj(i,o);case 6:var a=Ps,u=Is;Ps=null,Yj(s,o,i),Is=u,null!==(Ps=a)&&(Is?(s=Ps,i=i.stateNode,8===s.nodeType?s.parentNode.removeChild(i):s.removeChild(i)):Ps.removeChild(i.stateNode));break;case 18:null!==Ps&&(Is?(s=Ps,i=i.stateNode,8===s.nodeType?Kf(s.parentNode,i):1===s.nodeType&&Kf(s,i),bd(s)):Kf(Ps,i.stateNode));break;case 4:a=Ps,u=Is,Ps=i.stateNode.containerInfo,Is=!0,Yj(s,o,i),Ps=a,Is=u;break;case 0:case 11:case 14:case 15:if(!Os&&(null!==(a=i.updateQueue)&&null!==(a=a.lastEffect))){u=a=a.next;do{var _=u,w=_.destroy;_=_.tag,void 0!==w&&(2&_||4&_)&&Mj(i,o,w),u=u.next}while(u!==a)}Yj(s,o,i);break;case 1:if(!Os&&(Lj(i,o),"function"==typeof(a=i.stateNode).componentWillUnmount))try{a.props=i.memoizedProps,a.state=i.memoizedState,a.componentWillUnmount()}catch(s){W(i,o,s)}Yj(s,o,i);break;case 21:Yj(s,o,i);break;case 22:1&i.mode?(Os=(a=Os)||null!==i.memoizedState,Yj(s,o,i),Os=a):Yj(s,o,i);break;default:Yj(s,o,i)}}function ak(s){var o=s.updateQueue;if(null!==o){s.updateQueue=null;var i=s.stateNode;null===i&&(i=s.stateNode=new As),o.forEach((function(o){var a=bk.bind(null,s,o);i.has(o)||(i.add(o),o.then(a,a))}))}}function ck(s,o){var i=o.deletions;if(null!==i)for(var a=0;a<i.length;a++){var u=i[a];try{var _=s,w=o,x=w;e:for(;null!==x;){switch(x.tag){case 5:Ps=x.stateNode,Is=!1;break e;case 3:case 4:Ps=x.stateNode.containerInfo,Is=!0;break e}x=x.return}if(null===Ps)throw Error(p(160));Zj(_,w,u),Ps=null,Is=!1;var C=u.alternate;null!==C&&(C.return=null),u.return=null}catch(s){W(u,o,s)}}if(12854&o.subtreeFlags)for(o=o.child;null!==o;)dk(o,s),o=o.sibling}function dk(s,o){var i=s.alternate,a=s.flags;switch(s.tag){case 0:case 11:case 14:case 15:if(ck(o,s),ek(s),4&a){try{Pj(3,s,s.return),Qj(3,s)}catch(o){W(s,s.return,o)}try{Pj(5,s,s.return)}catch(o){W(s,s.return,o)}}break;case 1:ck(o,s),ek(s),512&a&&null!==i&&Lj(i,i.return);break;case 5:if(ck(o,s),ek(s),512&a&&null!==i&&Lj(i,i.return),32&s.flags){var u=s.stateNode;try{ob(u,"")}catch(o){W(s,s.return,o)}}if(4&a&&null!=(u=s.stateNode)){var _=s.memoizedProps,w=null!==i?i.memoizedProps:_,x=s.type,C=s.updateQueue;if(s.updateQueue=null,null!==C)try{"input"===x&&"radio"===_.type&&null!=_.name&&ab(u,_),vb(x,w);var j=vb(x,_);for(w=0;w<C.length;w+=2){var L=C[w],B=C[w+1];"style"===L?sb(u,B):"dangerouslySetInnerHTML"===L?$e(u,B):"children"===L?ob(u,B):ta(u,L,B,j)}switch(x){case"input":bb(u,_);break;case"textarea":ib(u,_);break;case"select":var $=u._wrapperState.wasMultiple;u._wrapperState.wasMultiple=!!_.multiple;var U=_.value;null!=U?fb(u,!!_.multiple,U,!1):$!==!!_.multiple&&(null!=_.defaultValue?fb(u,!!_.multiple,_.defaultValue,!0):fb(u,!!_.multiple,_.multiple?[]:"",!1))}u[dn]=_}catch(o){W(s,s.return,o)}}break;case 6:if(ck(o,s),ek(s),4&a){if(null===s.stateNode)throw Error(p(162));u=s.stateNode,_=s.memoizedProps;try{u.nodeValue=_}catch(o){W(s,s.return,o)}}break;case 3:if(ck(o,s),ek(s),4&a&&null!==i&&i.memoizedState.isDehydrated)try{bd(o.containerInfo)}catch(o){W(s,s.return,o)}break;case 4:default:ck(o,s),ek(s);break;case 13:ck(o,s),ek(s),8192&(u=s.child).flags&&(_=null!==u.memoizedState,u.stateNode.isHidden=_,!_||null!==u.alternate&&null!==u.alternate.memoizedState||(Ys=ht())),4&a&&ak(s);break;case 22:if(L=null!==i&&null!==i.memoizedState,1&s.mode?(Os=(j=Os)||L,ck(o,s),Os=j):ck(o,s),ek(s),8192&a){if(j=null!==s.memoizedState,(s.stateNode.isHidden=j)&&!L&&1&s.mode)for(Cs=s,L=s.child;null!==L;){for(B=Cs=L;null!==Cs;){switch(U=($=Cs).child,$.tag){case 0:case 11:case 14:case 15:Pj(4,$,$.return);break;case 1:Lj($,$.return);var V=$.stateNode;if("function"==typeof V.componentWillUnmount){a=$,i=$.return;try{o=a,V.props=o.memoizedProps,V.state=o.memoizedState,V.componentWillUnmount()}catch(s){W(a,i,s)}}break;case 5:Lj($,$.return);break;case 22:if(null!==$.memoizedState){gk(B);continue}}null!==U?(U.return=$,Cs=U):gk(B)}L=L.sibling}e:for(L=null,B=s;;){if(5===B.tag){if(null===L){L=B;try{u=B.stateNode,j?"function"==typeof(_=u.style).setProperty?_.setProperty("display","none","important"):_.display="none":(x=B.stateNode,w=null!=(C=B.memoizedProps.style)&&C.hasOwnProperty("display")?C.display:null,x.style.display=rb("display",w))}catch(o){W(s,s.return,o)}}}else if(6===B.tag){if(null===L)try{B.stateNode.nodeValue=j?"":B.memoizedProps}catch(o){W(s,s.return,o)}}else if((22!==B.tag&&23!==B.tag||null===B.memoizedState||B===s)&&null!==B.child){B.child.return=B,B=B.child;continue}if(B===s)break e;for(;null===B.sibling;){if(null===B.return||B.return===s)break e;L===B&&(L=null),B=B.return}L===B&&(L=null),B.sibling.return=B.return,B=B.sibling}}break;case 19:ck(o,s),ek(s),4&a&&ak(s);case 21:}}function ek(s){var o=s.flags;if(2&o){try{e:{for(var i=s.return;null!==i;){if(Tj(i)){var a=i;break e}i=i.return}throw Error(p(160))}switch(a.tag){case 5:var u=a.stateNode;32&a.flags&&(ob(u,""),a.flags&=-33),Wj(s,Uj(s),u);break;case 3:case 4:var _=a.stateNode.containerInfo;Vj(s,Uj(s),_);break;default:throw Error(p(161))}}catch(o){W(s,s.return,o)}s.flags&=-3}4096&o&&(s.flags&=-4097)}function hk(s,o,i){Cs=s,ik(s,o,i)}function ik(s,o,i){for(var a=!!(1&s.mode);null!==Cs;){var u=Cs,_=u.child;if(22===u.tag&&a){var w=null!==u.memoizedState||ks;if(!w){var x=u.alternate,C=null!==x&&null!==x.memoizedState||Os;x=ks;var j=Os;if(ks=w,(Os=C)&&!j)for(Cs=u;null!==Cs;)C=(w=Cs).child,22===w.tag&&null!==w.memoizedState?jk(u):null!==C?(C.return=w,Cs=C):jk(u);for(;null!==_;)Cs=_,ik(_,o,i),_=_.sibling;Cs=u,ks=x,Os=j}kk(s)}else 8772&u.subtreeFlags&&null!==_?(_.return=u,Cs=_):kk(s)}}function kk(s){for(;null!==Cs;){var o=Cs;if(8772&o.flags){var i=o.alternate;try{if(8772&o.flags)switch(o.tag){case 0:case 11:case 15:Os||Qj(5,o);break;case 1:var a=o.stateNode;if(4&o.flags&&!Os)if(null===i)a.componentDidMount();else{var u=o.elementType===o.type?i.memoizedProps:Ci(o.type,i.memoizedProps);a.componentDidUpdate(u,i.memoizedState,a.__reactInternalSnapshotBeforeUpdate)}var _=o.updateQueue;null!==_&&sh(o,_,a);break;case 3:var w=o.updateQueue;if(null!==w){if(i=null,null!==o.child)switch(o.child.tag){case 5:case 1:i=o.child.stateNode}sh(o,w,i)}break;case 5:var x=o.stateNode;if(null===i&&4&o.flags){i=x;var C=o.memoizedProps;switch(o.type){case"button":case"input":case"select":case"textarea":C.autoFocus&&i.focus();break;case"img":C.src&&(i.src=C.src)}}break;case 6:case 4:case 12:case 19:case 17:case 21:case 22:case 23:case 25:break;case 13:if(null===o.memoizedState){var j=o.alternate;if(null!==j){var L=j.memoizedState;if(null!==L){var B=L.dehydrated;null!==B&&bd(B)}}}break;default:throw Error(p(163))}Os||512&o.flags&&Rj(o)}catch(s){W(o,o.return,s)}}if(o===s){Cs=null;break}if(null!==(i=o.sibling)){i.return=o.return,Cs=i;break}Cs=o.return}}function gk(s){for(;null!==Cs;){var o=Cs;if(o===s){Cs=null;break}var i=o.sibling;if(null!==i){i.return=o.return,Cs=i;break}Cs=o.return}}function jk(s){for(;null!==Cs;){var o=Cs;try{switch(o.tag){case 0:case 11:case 15:var i=o.return;try{Qj(4,o)}catch(s){W(o,i,s)}break;case 1:var a=o.stateNode;if("function"==typeof a.componentDidMount){var u=o.return;try{a.componentDidMount()}catch(s){W(o,u,s)}}var _=o.return;try{Rj(o)}catch(s){W(o,_,s)}break;case 5:var w=o.return;try{Rj(o)}catch(s){W(o,w,s)}}}catch(s){W(o,o.return,s)}if(o===s){Cs=null;break}var x=o.sibling;if(null!==x){x.return=o.return,Cs=x;break}Cs=o.return}}var Ts,Ns=Math.ceil,Ms=V.ReactCurrentDispatcher,Rs=V.ReactCurrentOwner,Ds=V.ReactCurrentBatchConfig,Ls=0,Fs=null,Bs=null,$s=0,qs=0,Us=Uf(0),Vs=0,zs=null,Ws=0,Js=0,Hs=0,Ks=null,Gs=null,Ys=0,Xs=1/0,Qs=null,Zs=!1,eo=null,to=null,ro=!1,no=null,so=0,oo=0,io=null,ao=-1,co=0;function R(){return 6&Ls?ht():-1!==ao?ao:ao=ht()}function yi(s){return 1&s.mode?2&Ls&&0!==$s?$s&-$s:null!==$n.transition?(0===co&&(co=yc()),co):0!==(s=At)?s:s=void 0===(s=window.event)?16:jd(s.type):1}function gi(s,o,i,a){if(50<oo)throw oo=0,io=null,Error(p(185));Ac(s,i,a),2&Ls&&s===Fs||(s===Fs&&(!(2&Ls)&&(Js|=i),4===Vs&&Ck(s,$s)),Dk(s,a),1===i&&0===Ls&&!(1&o.mode)&&(Xs=ht()+500,kn&&jg()))}function Dk(s,o){var i=s.callbackNode;!function wc(s,o){for(var i=s.suspendedLanes,a=s.pingedLanes,u=s.expirationTimes,_=s.pendingLanes;0<_;){var w=31-Et(_),x=1<<w,C=u[w];-1===C?x&i&&!(x&a)||(u[w]=vc(x,o)):C<=o&&(s.expiredLanes|=x),_&=~x}}(s,o);var a=uc(s,s===Fs?$s:0);if(0===a)null!==i&&lt(i),s.callbackNode=null,s.callbackPriority=0;else if(o=a&-a,s.callbackPriority!==o){if(null!=i&&lt(i),1===o)0===s.tag?function ig(s){kn=!0,hg(s)}(Ek.bind(null,s)):hg(Ek.bind(null,s)),un((function(){!(6&Ls)&&jg()})),i=null;else{switch(Dc(a)){case 1:i=mt;break;case 4:i=gt;break;case 16:default:i=yt;break;case 536870912:i=bt}i=Fk(i,Gk.bind(null,s))}s.callbackPriority=o,s.callbackNode=i}}function Gk(s,o){if(ao=-1,co=0,6&Ls)throw Error(p(327));var i=s.callbackNode;if(Hk()&&s.callbackNode!==i)return null;var a=uc(s,s===Fs?$s:0);if(0===a)return null;if(30&a||a&s.expiredLanes||o)o=Ik(s,a);else{o=a;var u=Ls;Ls|=2;var _=Jk();for(Fs===s&&$s===o||(Qs=null,Xs=ht()+500,Kk(s,o));;)try{Lk();break}catch(o){Mk(s,o)}$g(),Ms.current=_,Ls=u,null!==Bs?o=0:(Fs=null,$s=0,o=Vs)}if(0!==o){if(2===o&&(0!==(u=xc(s))&&(a=u,o=Nk(s,u))),1===o)throw i=zs,Kk(s,0),Ck(s,a),Dk(s,ht()),i;if(6===o)Ck(s,a);else{if(u=s.current.alternate,!(30&a||function Ok(s){for(var o=s;;){if(16384&o.flags){var i=o.updateQueue;if(null!==i&&null!==(i=i.stores))for(var a=0;a<i.length;a++){var u=i[a],_=u.getSnapshot;u=u.value;try{if(!Dr(_(),u))return!1}catch(s){return!1}}}if(i=o.child,16384&o.subtreeFlags&&null!==i)i.return=o,o=i;else{if(o===s)break;for(;null===o.sibling;){if(null===o.return||o.return===s)return!0;o=o.return}o.sibling.return=o.return,o=o.sibling}}return!0}(u)||(o=Ik(s,a),2===o&&(_=xc(s),0!==_&&(a=_,o=Nk(s,_))),1!==o)))throw i=zs,Kk(s,0),Ck(s,a),Dk(s,ht()),i;switch(s.finishedWork=u,s.finishedLanes=a,o){case 0:case 1:throw Error(p(345));case 2:case 5:Pk(s,Gs,Qs);break;case 3:if(Ck(s,a),(130023424&a)===a&&10<(o=Ys+500-ht())){if(0!==uc(s,0))break;if(((u=s.suspendedLanes)&a)!==a){R(),s.pingedLanes|=s.suspendedLanes&u;break}s.timeoutHandle=an(Pk.bind(null,s,Gs,Qs),o);break}Pk(s,Gs,Qs);break;case 4:if(Ck(s,a),(4194240&a)===a)break;for(o=s.eventTimes,u=-1;0<a;){var w=31-Et(a);_=1<<w,(w=o[w])>u&&(u=w),a&=~_}if(a=u,10<(a=(120>(a=ht()-a)?120:480>a?480:1080>a?1080:1920>a?1920:3e3>a?3e3:4320>a?4320:1960*Ns(a/1960))-a)){s.timeoutHandle=an(Pk.bind(null,s,Gs,Qs),a);break}Pk(s,Gs,Qs);break;default:throw Error(p(329))}}}return Dk(s,ht()),s.callbackNode===i?Gk.bind(null,s):null}function Nk(s,o){var i=Ks;return s.current.memoizedState.isDehydrated&&(Kk(s,o).flags|=256),2!==(s=Ik(s,o))&&(o=Gs,Gs=i,null!==o&&Fj(o)),s}function Fj(s){null===Gs?Gs=s:Gs.push.apply(Gs,s)}function Ck(s,o){for(o&=~Hs,o&=~Js,s.suspendedLanes|=o,s.pingedLanes&=~o,s=s.expirationTimes;0<o;){var i=31-Et(o),a=1<<i;s[i]=-1,o&=~a}}function Ek(s){if(6&Ls)throw Error(p(327));Hk();var o=uc(s,0);if(!(1&o))return Dk(s,ht()),null;var i=Ik(s,o);if(0!==s.tag&&2===i){var a=xc(s);0!==a&&(o=a,i=Nk(s,a))}if(1===i)throw i=zs,Kk(s,0),Ck(s,o),Dk(s,ht()),i;if(6===i)throw Error(p(345));return s.finishedWork=s.current.alternate,s.finishedLanes=o,Pk(s,Gs,Qs),Dk(s,ht()),null}function Qk(s,o){var i=Ls;Ls|=1;try{return s(o)}finally{0===(Ls=i)&&(Xs=ht()+500,kn&&jg())}}function Rk(s){null!==no&&0===no.tag&&!(6&Ls)&&Hk();var o=Ls;Ls|=1;var i=Ds.transition,a=At;try{if(Ds.transition=null,At=1,s)return s()}finally{At=a,Ds.transition=i,!(6&(Ls=o))&&jg()}}function Hj(){qs=Us.current,E(Us)}function Kk(s,o){s.finishedWork=null,s.finishedLanes=0;var i=s.timeoutHandle;if(-1!==i&&(s.timeoutHandle=-1,cn(i)),null!==Bs)for(i=Bs.return;null!==i;){var a=i;switch(wg(a),a.tag){case 1:null!=(a=a.type.childContextTypes)&&$f();break;case 3:zh(),E(En),E(Sn),Eh();break;case 5:Bh(a);break;case 4:zh();break;case 13:case 19:E(Zn);break;case 10:ah(a.type._context);break;case 22:case 23:Hj()}i=i.return}if(Fs=s,Bs=s=Pg(s.current,null),$s=qs=o,Vs=0,zs=null,Hs=Js=Ws=0,Gs=Ks=null,null!==Hn){for(o=0;o<Hn.length;o++)if(null!==(a=(i=Hn[o]).interleaved)){i.interleaved=null;var u=a.next,_=i.pending;if(null!==_){var w=_.next;_.next=u,a.next=w}i.pending=a}Hn=null}return s}function Mk(s,o){for(;;){var i=Bs;try{if($g(),ts.current=hs,cs){for(var a=ss.memoizedState;null!==a;){var u=a.queue;null!==u&&(u.pending=null),a=a.next}cs=!1}if(ns=0,as=os=ss=null,ls=!1,us=0,Rs.current=null,null===i||null===i.return){Vs=1,zs=o,Bs=null;break}e:{var _=s,w=i.return,x=i,C=o;if(o=$s,x.flags|=32768,null!==C&&"object"==typeof C&&"function"==typeof C.then){var j=C,L=x,B=L.tag;if(!(1&L.mode||0!==B&&11!==B&&15!==B)){var $=L.alternate;$?(L.updateQueue=$.updateQueue,L.memoizedState=$.memoizedState,L.lanes=$.lanes):(L.updateQueue=null,L.memoizedState=null)}var U=Ui(w);if(null!==U){U.flags&=-257,Vi(U,w,x,0,o),1&U.mode&&Si(_,j,o),C=j;var V=(o=U).updateQueue;if(null===V){var z=new Set;z.add(C),o.updateQueue=z}else V.add(C);break e}if(!(1&o)){Si(_,j,o),tj();break e}C=Error(p(426))}else if(Fn&&1&x.mode){var Y=Ui(w);if(null!==Y){!(65536&Y.flags)&&(Y.flags|=256),Vi(Y,w,x,0,o),Jg(Ji(C,x));break e}}_=C=Ji(C,x),4!==Vs&&(Vs=2),null===Ks?Ks=[_]:Ks.push(_),_=w;do{switch(_.tag){case 3:_.flags|=65536,o&=-o,_.lanes|=o,ph(_,Ni(0,C,o));break e;case 1:x=C;var Z=_.type,ee=_.stateNode;if(!(128&_.flags||"function"!=typeof Z.getDerivedStateFromError&&(null===ee||"function"!=typeof ee.componentDidCatch||null!==to&&to.has(ee)))){_.flags|=65536,o&=-o,_.lanes|=o,ph(_,Qi(_,x,o));break e}}_=_.return}while(null!==_)}Sk(i)}catch(s){o=s,Bs===i&&null!==i&&(Bs=i=i.return);continue}break}}function Jk(){var s=Ms.current;return Ms.current=hs,null===s?hs:s}function tj(){0!==Vs&&3!==Vs&&2!==Vs||(Vs=4),null===Fs||!(268435455&Ws)&&!(268435455&Js)||Ck(Fs,$s)}function Ik(s,o){var i=Ls;Ls|=2;var a=Jk();for(Fs===s&&$s===o||(Qs=null,Kk(s,o));;)try{Tk();break}catch(o){Mk(s,o)}if($g(),Ls=i,Ms.current=a,null!==Bs)throw Error(p(261));return Fs=null,$s=0,Vs}function Tk(){for(;null!==Bs;)Uk(Bs)}function Lk(){for(;null!==Bs&&!ut();)Uk(Bs)}function Uk(s){var o=Ts(s.alternate,s,qs);s.memoizedProps=s.pendingProps,null===o?Sk(s):Bs=o,Rs.current=null}function Sk(s){var o=s;do{var i=o.alternate;if(s=o.return,32768&o.flags){if(null!==(i=Ij(i,o)))return i.flags&=32767,void(Bs=i);if(null===s)return Vs=6,void(Bs=null);s.flags|=32768,s.subtreeFlags=0,s.deletions=null}else if(null!==(i=Ej(i,o,qs)))return void(Bs=i);if(null!==(o=o.sibling))return void(Bs=o);Bs=o=s}while(null!==o);0===Vs&&(Vs=5)}function Pk(s,o,i){var a=At,u=Ds.transition;try{Ds.transition=null,At=1,function Wk(s,o,i,a){do{Hk()}while(null!==no);if(6&Ls)throw Error(p(327));i=s.finishedWork;var u=s.finishedLanes;if(null===i)return null;if(s.finishedWork=null,s.finishedLanes=0,i===s.current)throw Error(p(177));s.callbackNode=null,s.callbackPriority=0;var _=i.lanes|i.childLanes;if(function Bc(s,o){var i=s.pendingLanes&~o;s.pendingLanes=o,s.suspendedLanes=0,s.pingedLanes=0,s.expiredLanes&=o,s.mutableReadLanes&=o,s.entangledLanes&=o,o=s.entanglements;var a=s.eventTimes;for(s=s.expirationTimes;0<i;){var u=31-Et(i),_=1<<u;o[u]=0,a[u]=-1,s[u]=-1,i&=~_}}(s,_),s===Fs&&(Bs=Fs=null,$s=0),!(2064&i.subtreeFlags)&&!(2064&i.flags)||ro||(ro=!0,Fk(yt,(function(){return Hk(),null}))),_=!!(15990&i.flags),!!(15990&i.subtreeFlags)||_){_=Ds.transition,Ds.transition=null;var w=At;At=1;var x=Ls;Ls|=4,Rs.current=null,function Oj(s,o){if(sn=Vt,Ne(s=Me())){if("selectionStart"in s)var i={start:s.selectionStart,end:s.selectionEnd};else e:{var a=(i=(i=s.ownerDocument)&&i.defaultView||window).getSelection&&i.getSelection();if(a&&0!==a.rangeCount){i=a.anchorNode;var u=a.anchorOffset,_=a.focusNode;a=a.focusOffset;try{i.nodeType,_.nodeType}catch(s){i=null;break e}var w=0,x=-1,C=-1,j=0,L=0,B=s,$=null;t:for(;;){for(var U;B!==i||0!==u&&3!==B.nodeType||(x=w+u),B!==_||0!==a&&3!==B.nodeType||(C=w+a),3===B.nodeType&&(w+=B.nodeValue.length),null!==(U=B.firstChild);)$=B,B=U;for(;;){if(B===s)break t;if($===i&&++j===u&&(x=w),$===_&&++L===a&&(C=w),null!==(U=B.nextSibling))break;$=(B=$).parentNode}B=U}i=-1===x||-1===C?null:{start:x,end:C}}else i=null}i=i||{start:0,end:0}}else i=null;for(on={focusedElem:s,selectionRange:i},Vt=!1,Cs=o;null!==Cs;)if(s=(o=Cs).child,1028&o.subtreeFlags&&null!==s)s.return=o,Cs=s;else for(;null!==Cs;){o=Cs;try{var V=o.alternate;if(1024&o.flags)switch(o.tag){case 0:case 11:case 15:case 5:case 6:case 4:case 17:break;case 1:if(null!==V){var z=V.memoizedProps,Y=V.memoizedState,Z=o.stateNode,ee=Z.getSnapshotBeforeUpdate(o.elementType===o.type?z:Ci(o.type,z),Y);Z.__reactInternalSnapshotBeforeUpdate=ee}break;case 3:var ie=o.stateNode.containerInfo;1===ie.nodeType?ie.textContent="":9===ie.nodeType&&ie.documentElement&&ie.removeChild(ie.documentElement);break;default:throw Error(p(163))}}catch(s){W(o,o.return,s)}if(null!==(s=o.sibling)){s.return=o.return,Cs=s;break}Cs=o.return}return V=js,js=!1,V}(s,i),dk(i,s),Oe(on),Vt=!!sn,on=sn=null,s.current=i,hk(i,s,u),pt(),Ls=x,At=w,Ds.transition=_}else s.current=i;if(ro&&(ro=!1,no=s,so=u),_=s.pendingLanes,0===_&&(to=null),function mc(s){if(St&&"function"==typeof St.onCommitFiberRoot)try{St.onCommitFiberRoot(_t,s,void 0,!(128&~s.current.flags))}catch(s){}}(i.stateNode),Dk(s,ht()),null!==o)for(a=s.onRecoverableError,i=0;i<o.length;i++)u=o[i],a(u.value,{componentStack:u.stack,digest:u.digest});if(Zs)throw Zs=!1,s=eo,eo=null,s;return!!(1&so)&&0!==s.tag&&Hk(),_=s.pendingLanes,1&_?s===io?oo++:(oo=0,io=s):oo=0,jg(),null}(s,o,i,a)}finally{Ds.transition=u,At=a}return null}function Hk(){if(null!==no){var s=Dc(so),o=Ds.transition,i=At;try{if(Ds.transition=null,At=16>s?16:s,null===no)var a=!1;else{if(s=no,no=null,so=0,6&Ls)throw Error(p(331));var u=Ls;for(Ls|=4,Cs=s.current;null!==Cs;){var _=Cs,w=_.child;if(16&Cs.flags){var x=_.deletions;if(null!==x){for(var C=0;C<x.length;C++){var j=x[C];for(Cs=j;null!==Cs;){var L=Cs;switch(L.tag){case 0:case 11:case 15:Pj(8,L,_)}var B=L.child;if(null!==B)B.return=L,Cs=B;else for(;null!==Cs;){var $=(L=Cs).sibling,U=L.return;if(Sj(L),L===j){Cs=null;break}if(null!==$){$.return=U,Cs=$;break}Cs=U}}}var V=_.alternate;if(null!==V){var z=V.child;if(null!==z){V.child=null;do{var Y=z.sibling;z.sibling=null,z=Y}while(null!==z)}}Cs=_}}if(2064&_.subtreeFlags&&null!==w)w.return=_,Cs=w;else e:for(;null!==Cs;){if(2048&(_=Cs).flags)switch(_.tag){case 0:case 11:case 15:Pj(9,_,_.return)}var Z=_.sibling;if(null!==Z){Z.return=_.return,Cs=Z;break e}Cs=_.return}}var ee=s.current;for(Cs=ee;null!==Cs;){var ie=(w=Cs).child;if(2064&w.subtreeFlags&&null!==ie)ie.return=w,Cs=ie;else e:for(w=ee;null!==Cs;){if(2048&(x=Cs).flags)try{switch(x.tag){case 0:case 11:case 15:Qj(9,x)}}catch(s){W(x,x.return,s)}if(x===w){Cs=null;break e}var ae=x.sibling;if(null!==ae){ae.return=x.return,Cs=ae;break e}Cs=x.return}}if(Ls=u,jg(),St&&"function"==typeof St.onPostCommitFiberRoot)try{St.onPostCommitFiberRoot(_t,s)}catch(s){}a=!0}return a}finally{At=i,Ds.transition=o}}return!1}function Xk(s,o,i){s=nh(s,o=Ni(0,o=Ji(i,o),1),1),o=R(),null!==s&&(Ac(s,1,o),Dk(s,o))}function W(s,o,i){if(3===s.tag)Xk(s,s,i);else for(;null!==o;){if(3===o.tag){Xk(o,s,i);break}if(1===o.tag){var a=o.stateNode;if("function"==typeof o.type.getDerivedStateFromError||"function"==typeof a.componentDidCatch&&(null===to||!to.has(a))){o=nh(o,s=Qi(o,s=Ji(i,s),1),1),s=R(),null!==o&&(Ac(o,1,s),Dk(o,s));break}}o=o.return}}function Ti(s,o,i){var a=s.pingCache;null!==a&&a.delete(o),o=R(),s.pingedLanes|=s.suspendedLanes&i,Fs===s&&($s&i)===i&&(4===Vs||3===Vs&&(130023424&$s)===$s&&500>ht()-Ys?Kk(s,0):Hs|=i),Dk(s,o)}function Yk(s,o){0===o&&(1&s.mode?(o=Ot,!(130023424&(Ot<<=1))&&(Ot=4194304)):o=1);var i=R();null!==(s=ih(s,o))&&(Ac(s,o,i),Dk(s,i))}function uj(s){var o=s.memoizedState,i=0;null!==o&&(i=o.retryLane),Yk(s,i)}function bk(s,o){var i=0;switch(s.tag){case 13:var a=s.stateNode,u=s.memoizedState;null!==u&&(i=u.retryLane);break;case 19:a=s.stateNode;break;default:throw Error(p(314))}null!==a&&a.delete(o),Yk(s,i)}function Fk(s,o){return ct(s,o)}function $k(s,o,i,a){this.tag=s,this.key=i,this.sibling=this.child=this.return=this.stateNode=this.type=this.elementType=null,this.index=0,this.ref=null,this.pendingProps=o,this.dependencies=this.memoizedState=this.updateQueue=this.memoizedProps=null,this.mode=a,this.subtreeFlags=this.flags=0,this.deletions=null,this.childLanes=this.lanes=0,this.alternate=null}function Bg(s,o,i,a){return new $k(s,o,i,a)}function aj(s){return!(!(s=s.prototype)||!s.isReactComponent)}function Pg(s,o){var i=s.alternate;return null===i?((i=Bg(s.tag,o,s.key,s.mode)).elementType=s.elementType,i.type=s.type,i.stateNode=s.stateNode,i.alternate=s,s.alternate=i):(i.pendingProps=o,i.type=s.type,i.flags=0,i.subtreeFlags=0,i.deletions=null),i.flags=14680064&s.flags,i.childLanes=s.childLanes,i.lanes=s.lanes,i.child=s.child,i.memoizedProps=s.memoizedProps,i.memoizedState=s.memoizedState,i.updateQueue=s.updateQueue,o=s.dependencies,i.dependencies=null===o?null:{lanes:o.lanes,firstContext:o.firstContext},i.sibling=s.sibling,i.index=s.index,i.ref=s.ref,i}function Rg(s,o,i,a,u,_){var w=2;if(a=s,"function"==typeof s)aj(s)&&(w=1);else if("string"==typeof s)w=5;else e:switch(s){case Z:return Tg(i.children,u,_,o);case ee:w=8,u|=8;break;case ie:return(s=Bg(12,i,o,2|u)).elementType=ie,s.lanes=_,s;case pe:return(s=Bg(13,i,o,u)).elementType=pe,s.lanes=_,s;case de:return(s=Bg(19,i,o,u)).elementType=de,s.lanes=_,s;case be:return pj(i,u,_,o);default:if("object"==typeof s&&null!==s)switch(s.$$typeof){case ae:w=10;break e;case ce:w=9;break e;case le:w=11;break e;case fe:w=14;break e;case ye:w=16,a=null;break e}throw Error(p(130,null==s?s:typeof s,""))}return(o=Bg(w,i,o,u)).elementType=s,o.type=a,o.lanes=_,o}function Tg(s,o,i,a){return(s=Bg(7,s,a,o)).lanes=i,s}function pj(s,o,i,a){return(s=Bg(22,s,a,o)).elementType=be,s.lanes=i,s.stateNode={isHidden:!1},s}function Qg(s,o,i){return(s=Bg(6,s,null,o)).lanes=i,s}function Sg(s,o,i){return(o=Bg(4,null!==s.children?s.children:[],s.key,o)).lanes=i,o.stateNode={containerInfo:s.containerInfo,pendingChildren:null,implementation:s.implementation},o}function al(s,o,i,a,u){this.tag=o,this.containerInfo=s,this.finishedWork=this.pingCache=this.current=this.pendingChildren=null,this.timeoutHandle=-1,this.callbackNode=this.pendingContext=this.context=null,this.callbackPriority=0,this.eventTimes=zc(0),this.expirationTimes=zc(-1),this.entangledLanes=this.finishedLanes=this.mutableReadLanes=this.expiredLanes=this.pingedLanes=this.suspendedLanes=this.pendingLanes=0,this.entanglements=zc(0),this.identifierPrefix=a,this.onRecoverableError=u,this.mutableSourceEagerHydrationData=null}function bl(s,o,i,a,u,_,w,x,C){return s=new al(s,o,i,x,C),1===o?(o=1,!0===_&&(o|=8)):o=0,_=Bg(3,null,null,o),s.current=_,_.stateNode=s,_.memoizedState={element:a,isDehydrated:i,cache:null,transitions:null,pendingSuspenseBoundaries:null},kh(_),s}function dl(s){if(!s)return _n;e:{if(Vb(s=s._reactInternals)!==s||1!==s.tag)throw Error(p(170));var o=s;do{switch(o.tag){case 3:o=o.stateNode.context;break e;case 1:if(Zf(o.type)){o=o.stateNode.__reactInternalMemoizedMergedChildContext;break e}}o=o.return}while(null!==o);throw Error(p(171))}if(1===s.tag){var i=s.type;if(Zf(i))return bg(s,i,o)}return o}function el(s,o,i,a,u,_,w,x,C){return(s=bl(i,a,!0,s,0,_,0,x,C)).context=dl(null),i=s.current,(_=mh(a=R(),u=yi(i))).callback=null!=o?o:null,nh(i,_,u),s.current.lanes=u,Ac(s,u,a),Dk(s,a),s}function fl(s,o,i,a){var u=o.current,_=R(),w=yi(u);return i=dl(i),null===o.context?o.context=i:o.pendingContext=i,(o=mh(_,w)).payload={element:s},null!==(a=void 0===a?null:a)&&(o.callback=a),null!==(s=nh(u,o,w))&&(gi(s,u,w,_),oh(s,u,w)),w}function gl(s){return(s=s.current).child?(s.child.tag,s.child.stateNode):null}function hl(s,o){if(null!==(s=s.memoizedState)&&null!==s.dehydrated){var i=s.retryLane;s.retryLane=0!==i&&i<o?i:o}}function il(s,o){hl(s,o),(s=s.alternate)&&hl(s,o)}Ts=function(s,o,i){if(null!==s)if(s.memoizedProps!==o.pendingProps||En.current)bs=!0;else{if(!(s.lanes&i||128&o.flags))return bs=!1,function yj(s,o,i){switch(o.tag){case 3:kj(o),Ig();break;case 5:Ah(o);break;case 1:Zf(o.type)&&cg(o);break;case 4:yh(o,o.stateNode.containerInfo);break;case 10:var a=o.type._context,u=o.memoizedProps.value;G(Vn,a._currentValue),a._currentValue=u;break;case 13:if(null!==(a=o.memoizedState))return null!==a.dehydrated?(G(Zn,1&Zn.current),o.flags|=128,null):i&o.child.childLanes?oj(s,o,i):(G(Zn,1&Zn.current),null!==(s=Zi(s,o,i))?s.sibling:null);G(Zn,1&Zn.current);break;case 19:if(a=!!(i&o.childLanes),128&s.flags){if(a)return xj(s,o,i);o.flags|=128}if(null!==(u=o.memoizedState)&&(u.rendering=null,u.tail=null,u.lastEffect=null),G(Zn,Zn.current),a)break;return null;case 22:case 23:return o.lanes=0,dj(s,o,i)}return Zi(s,o,i)}(s,o,i);bs=!!(131072&s.flags)}else bs=!1,Fn&&1048576&o.flags&&ug(o,Pn,o.index);switch(o.lanes=0,o.tag){case 2:var a=o.type;ij(s,o),s=o.pendingProps;var u=Yf(o,Sn.current);ch(o,i),u=Nh(null,o,a,s,u,i);var _=Sh();return o.flags|=1,"object"==typeof u&&null!==u&&"function"==typeof u.render&&void 0===u.$$typeof?(o.tag=1,o.memoizedState=null,o.updateQueue=null,Zf(a)?(_=!0,cg(o)):_=!1,o.memoizedState=null!==u.state&&void 0!==u.state?u.state:null,kh(o),u.updater=gs,o.stateNode=u,u._reactInternals=o,Ii(o,a,s,i),o=jj(null,o,a,!0,_,i)):(o.tag=0,Fn&&_&&vg(o),Xi(null,o,u,i),o=o.child),o;case 16:a=o.elementType;e:{switch(ij(s,o),s=o.pendingProps,a=(u=a._init)(a._payload),o.type=a,u=o.tag=function Zk(s){if("function"==typeof s)return aj(s)?1:0;if(null!=s){if((s=s.$$typeof)===le)return 11;if(s===fe)return 14}return 2}(a),s=Ci(a,s),u){case 0:o=cj(null,o,a,s,i);break e;case 1:o=hj(null,o,a,s,i);break e;case 11:o=Yi(null,o,a,s,i);break e;case 14:o=$i(null,o,a,Ci(a.type,s),i);break e}throw Error(p(306,a,""))}return o;case 0:return a=o.type,u=o.pendingProps,cj(s,o,a,u=o.elementType===a?u:Ci(a,u),i);case 1:return a=o.type,u=o.pendingProps,hj(s,o,a,u=o.elementType===a?u:Ci(a,u),i);case 3:e:{if(kj(o),null===s)throw Error(p(387));a=o.pendingProps,u=(_=o.memoizedState).element,lh(s,o),qh(o,a,null,i);var w=o.memoizedState;if(a=w.element,_.isDehydrated){if(_={element:a,isDehydrated:!1,cache:w.cache,pendingSuspenseBoundaries:w.pendingSuspenseBoundaries,transitions:w.transitions},o.updateQueue.baseState=_,o.memoizedState=_,256&o.flags){o=lj(s,o,a,i,u=Ji(Error(p(423)),o));break e}if(a!==u){o=lj(s,o,a,i,u=Ji(Error(p(424)),o));break e}for(Ln=Lf(o.stateNode.containerInfo.firstChild),Dn=o,Fn=!0,Bn=null,i=Un(o,null,a,i),o.child=i;i;)i.flags=-3&i.flags|4096,i=i.sibling}else{if(Ig(),a===u){o=Zi(s,o,i);break e}Xi(s,o,a,i)}o=o.child}return o;case 5:return Ah(o),null===s&&Eg(o),a=o.type,u=o.pendingProps,_=null!==s?s.memoizedProps:null,w=u.children,Ef(a,u)?w=null:null!==_&&Ef(a,_)&&(o.flags|=32),gj(s,o),Xi(s,o,w,i),o.child;case 6:return null===s&&Eg(o),null;case 13:return oj(s,o,i);case 4:return yh(o,o.stateNode.containerInfo),a=o.pendingProps,null===s?o.child=qn(o,null,a,i):Xi(s,o,a,i),o.child;case 11:return a=o.type,u=o.pendingProps,Yi(s,o,a,u=o.elementType===a?u:Ci(a,u),i);case 7:return Xi(s,o,o.pendingProps,i),o.child;case 8:case 12:return Xi(s,o,o.pendingProps.children,i),o.child;case 10:e:{if(a=o.type._context,u=o.pendingProps,_=o.memoizedProps,w=u.value,G(Vn,a._currentValue),a._currentValue=w,null!==_)if(Dr(_.value,w)){if(_.children===u.children&&!En.current){o=Zi(s,o,i);break e}}else for(null!==(_=o.child)&&(_.return=o);null!==_;){var x=_.dependencies;if(null!==x){w=_.child;for(var C=x.firstContext;null!==C;){if(C.context===a){if(1===_.tag){(C=mh(-1,i&-i)).tag=2;var j=_.updateQueue;if(null!==j){var L=(j=j.shared).pending;null===L?C.next=C:(C.next=L.next,L.next=C),j.pending=C}}_.lanes|=i,null!==(C=_.alternate)&&(C.lanes|=i),bh(_.return,i,o),x.lanes|=i;break}C=C.next}}else if(10===_.tag)w=_.type===o.type?null:_.child;else if(18===_.tag){if(null===(w=_.return))throw Error(p(341));w.lanes|=i,null!==(x=w.alternate)&&(x.lanes|=i),bh(w,i,o),w=_.sibling}else w=_.child;if(null!==w)w.return=_;else for(w=_;null!==w;){if(w===o){w=null;break}if(null!==(_=w.sibling)){_.return=w.return,w=_;break}w=w.return}_=w}Xi(s,o,u.children,i),o=o.child}return o;case 9:return u=o.type,a=o.pendingProps.children,ch(o,i),a=a(u=eh(u)),o.flags|=1,Xi(s,o,a,i),o.child;case 14:return u=Ci(a=o.type,o.pendingProps),$i(s,o,a,u=Ci(a.type,u),i);case 15:return bj(s,o,o.type,o.pendingProps,i);case 17:return a=o.type,u=o.pendingProps,u=o.elementType===a?u:Ci(a,u),ij(s,o),o.tag=1,Zf(a)?(s=!0,cg(o)):s=!1,ch(o,i),Gi(o,a,u),Ii(o,a,u,i),jj(null,o,a,!0,s,i);case 19:return xj(s,o,i);case 22:return dj(s,o,i)}throw Error(p(156,o.tag))};var lo="function"==typeof reportError?reportError:function(s){console.error(s)};function ll(s){this._internalRoot=s}function ml(s){this._internalRoot=s}function nl(s){return!(!s||1!==s.nodeType&&9!==s.nodeType&&11!==s.nodeType)}function ol(s){return!(!s||1!==s.nodeType&&9!==s.nodeType&&11!==s.nodeType&&(8!==s.nodeType||" react-mount-point-unstable "!==s.nodeValue))}function pl(){}function rl(s,o,i,a,u){var _=i._reactRootContainer;if(_){var w=_;if("function"==typeof u){var x=u;u=function(){var s=gl(w);x.call(s)}}fl(o,w,s,u)}else w=function ql(s,o,i,a,u){if(u){if("function"==typeof a){var _=a;a=function(){var s=gl(w);_.call(s)}}var w=el(o,a,s,0,null,!1,0,"",pl);return s._reactRootContainer=w,s[fn]=w.current,sf(8===s.nodeType?s.parentNode:s),Rk(),w}for(;u=s.lastChild;)s.removeChild(u);if("function"==typeof a){var x=a;a=function(){var s=gl(C);x.call(s)}}var C=bl(s,0,!1,null,0,!1,0,"",pl);return s._reactRootContainer=C,s[fn]=C.current,sf(8===s.nodeType?s.parentNode:s),Rk((function(){fl(o,C,i,a)})),C}(i,o,s,u,a);return gl(w)}ml.prototype.render=ll.prototype.render=function(s){var o=this._internalRoot;if(null===o)throw Error(p(409));fl(s,o,null,null)},ml.prototype.unmount=ll.prototype.unmount=function(){var s=this._internalRoot;if(null!==s){this._internalRoot=null;var o=s.containerInfo;Rk((function(){fl(null,s,null,null)})),o[fn]=null}},ml.prototype.unstable_scheduleHydration=function(s){if(s){var o=It();s={blockedOn:null,target:s,priority:o};for(var i=0;i<$t.length&&0!==o&&o<$t[i].priority;i++);$t.splice(i,0,s),0===i&&Vc(s)}},Ct=function(s){switch(s.tag){case 3:var o=s.stateNode;if(o.current.memoizedState.isDehydrated){var i=tc(o.pendingLanes);0!==i&&(Cc(o,1|i),Dk(o,ht()),!(6&Ls)&&(Xs=ht()+500,jg()))}break;case 13:Rk((function(){var o=ih(s,1);if(null!==o){var i=R();gi(o,s,1,i)}})),il(s,1)}},jt=function(s){if(13===s.tag){var o=ih(s,134217728);if(null!==o)gi(o,s,134217728,R());il(s,134217728)}},Pt=function(s){if(13===s.tag){var o=yi(s),i=ih(s,o);if(null!==i)gi(i,s,o,R());il(s,o)}},It=function(){return At},Tt=function(s,o){var i=At;try{return At=s,o()}finally{At=i}},Ye=function(s,o,i){switch(o){case"input":if(bb(s,i),o=i.name,"radio"===i.type&&null!=o){for(i=s;i.parentNode;)i=i.parentNode;for(i=i.querySelectorAll("input[name="+JSON.stringify(""+o)+'][type="radio"]'),o=0;o<i.length;o++){var a=i[o];if(a!==s&&a.form===s.form){var u=Db(a);if(!u)throw Error(p(90));Wa(a),bb(a,u)}}}break;case"textarea":ib(s,i);break;case"select":null!=(o=i.value)&&fb(s,!!i.multiple,o,!1)}},Gb=Qk,Hb=Rk;var uo={usingClientEntryPoint:!1,Events:[Cb,ue,Db,Eb,Fb,Qk]},po={findFiberByHostInstance:Wc,bundleType:0,version:"18.3.1",rendererPackageName:"react-dom"},ho={bundleType:po.bundleType,version:po.version,rendererPackageName:po.rendererPackageName,rendererConfig:po.rendererConfig,overrideHookState:null,overrideHookStateDeletePath:null,overrideHookStateRenamePath:null,overrideProps:null,overridePropsDeletePath:null,overridePropsRenamePath:null,setErrorHandler:null,setSuspenseHandler:null,scheduleUpdate:null,currentDispatcherRef:V.ReactCurrentDispatcher,findHostInstanceByFiber:function(s){return null===(s=Zb(s))?null:s.stateNode},findFiberByHostInstance:po.findFiberByHostInstance||function jl(){return null},findHostInstancesForRefresh:null,scheduleRefresh:null,scheduleRoot:null,setRefreshHandler:null,getCurrentFiber:null,reconcilerVersion:"18.3.1-next-f1338f8080-20240426"};if("undefined"!=typeof __REACT_DEVTOOLS_GLOBAL_HOOK__){var fo=__REACT_DEVTOOLS_GLOBAL_HOOK__;if(!fo.isDisabled&&fo.supportsFiber)try{_t=fo.inject(ho),St=fo}catch(Re){}}o.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED=uo,o.createPortal=function(s,o){var i=2<arguments.length&&void 0!==arguments[2]?arguments[2]:null;if(!nl(o))throw Error(p(200));return function cl(s,o,i){var a=3<arguments.length&&void 0!==arguments[3]?arguments[3]:null;return{$$typeof:Y,key:null==a?null:""+a,children:s,containerInfo:o,implementation:i}}(s,o,null,i)},o.createRoot=function(s,o){if(!nl(s))throw Error(p(299));var i=!1,a="",u=lo;return null!=o&&(!0===o.unstable_strictMode&&(i=!0),void 0!==o.identifierPrefix&&(a=o.identifierPrefix),void 0!==o.onRecoverableError&&(u=o.onRecoverableError)),o=bl(s,1,!1,null,0,i,0,a,u),s[fn]=o.current,sf(8===s.nodeType?s.parentNode:s),new ll(o)},o.findDOMNode=function(s){if(null==s)return null;if(1===s.nodeType)return s;var o=s._reactInternals;if(void 0===o){if("function"==typeof s.render)throw Error(p(188));throw s=Object.keys(s).join(","),Error(p(268,s))}return s=null===(s=Zb(o))?null:s.stateNode},o.flushSync=function(s){return Rk(s)},o.hydrate=function(s,o,i){if(!ol(o))throw Error(p(200));return rl(null,s,o,!0,i)},o.hydrateRoot=function(s,o,i){if(!nl(s))throw Error(p(405));var a=null!=i&&i.hydratedSources||null,u=!1,_="",w=lo;if(null!=i&&(!0===i.unstable_strictMode&&(u=!0),void 0!==i.identifierPrefix&&(_=i.identifierPrefix),void 0!==i.onRecoverableError&&(w=i.onRecoverableError)),o=el(o,null,s,1,null!=i?i:null,u,0,_,w),s[fn]=o.current,sf(s),a)for(s=0;s<a.length;s++)u=(u=(i=a[s])._getVersion)(i._source),null==o.mutableSourceEagerHydrationData?o.mutableSourceEagerHydrationData=[i,u]:o.mutableSourceEagerHydrationData.push(i,u);return new ml(o)},o.render=function(s,o,i){if(!ol(o))throw Error(p(200));return rl(null,s,o,!1,i)},o.unmountComponentAtNode=function(s){if(!ol(s))throw Error(p(40));return!!s._reactRootContainer&&(Rk((function(){rl(null,null,s,!1,(function(){s._reactRootContainer=null,s[fn]=null}))})),!0)},o.unstable_batchedUpdates=Qk,o.unstable_renderSubtreeIntoContainer=function(s,o,i,a){if(!ol(i))throw Error(p(200));if(null==s||void 0===s._reactInternals)throw Error(p(38));return rl(s,o,i,!1,a)},o.version="18.3.1-next-f1338f8080-20240426"},22574:(s,o)=>{"use strict";var i={}.propertyIsEnumerable,a=Object.getOwnPropertyDescriptor,u=a&&!i.call({1:2},1);o.f=u?function propertyIsEnumerable(s){var o=a(this,s);return!!o&&o.enumerable}:i},23007:s=>{s.exports=function copyArray(s,o){var i=-1,a=s.length;for(o||(o=Array(a));++i<a;)o[i]=s[i];return o}},23034:(s,o,i)=>{"use strict";var a=i(88280),u=i(32567),_=Function.prototype;s.exports=function(s){var o=s.bind;return s===_||a(_,s)&&o===_.bind?u:o}},23045:(s,o,i)=>{"use strict";var a=i(1907),u=i(49724),_=i(4993),w=i(74436).indexOf,x=i(38530),C=a([].push);s.exports=function(s,o){var i,a=_(s),j=0,L=[];for(i in a)!u(x,i)&&u(a,i)&&C(L,i);for(;o.length>j;)u(a,i=o[j++])&&(~w(L,i)||C(L,i));return L}},23546:(s,o,i)=>{var a=i(72552),u=i(40346),_=i(11331);s.exports=function isError(s){if(!u(s))return!1;var o=a(s);return"[object Error]"==o||"[object DOMException]"==o||"string"==typeof s.message&&"string"==typeof s.name&&!_(s)}},23805:s=>{s.exports=function isObject(s){var o=typeof s;return null!=s&&("object"==o||"function"==o)}},23888:(s,o,i)=>{"use strict";var a=i(98828),u=i(75817);s.exports=!a((function(){var s=new Error("a");return!("stack"in s)||(Object.defineProperty(s,"stack",u(1,7)),7!==s.stack)}))},24107:(s,o,i)=>{"use strict";var a=i(56698),u=i(90392),_=i(92861).Buffer,w=[1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298],x=new Array(64);function Sha256(){this.init(),this._w=x,u.call(this,64,56)}function ch(s,o,i){return i^s&(o^i)}function maj(s,o,i){return s&o|i&(s|o)}function sigma0(s){return(s>>>2|s<<30)^(s>>>13|s<<19)^(s>>>22|s<<10)}function sigma1(s){return(s>>>6|s<<26)^(s>>>11|s<<21)^(s>>>25|s<<7)}function gamma0(s){return(s>>>7|s<<25)^(s>>>18|s<<14)^s>>>3}a(Sha256,u),Sha256.prototype.init=function(){return this._a=1779033703,this._b=3144134277,this._c=1013904242,this._d=2773480762,this._e=1359893119,this._f=2600822924,this._g=528734635,this._h=1541459225,this},Sha256.prototype._update=function(s){for(var o,i=this._w,a=0|this._a,u=0|this._b,_=0|this._c,x=0|this._d,C=0|this._e,j=0|this._f,L=0|this._g,B=0|this._h,$=0;$<16;++$)i[$]=s.readInt32BE(4*$);for(;$<64;++$)i[$]=0|(((o=i[$-2])>>>17|o<<15)^(o>>>19|o<<13)^o>>>10)+i[$-7]+gamma0(i[$-15])+i[$-16];for(var U=0;U<64;++U){var V=B+sigma1(C)+ch(C,j,L)+w[U]+i[U]|0,z=sigma0(a)+maj(a,u,_)|0;B=L,L=j,j=C,C=x+V|0,x=_,_=u,u=a,a=V+z|0}this._a=a+this._a|0,this._b=u+this._b|0,this._c=_+this._c|0,this._d=x+this._d|0,this._e=C+this._e|0,this._f=j+this._f|0,this._g=L+this._g|0,this._h=B+this._h|0},Sha256.prototype._hash=function(){var s=_.allocUnsafe(32);return s.writeInt32BE(this._a,0),s.writeInt32BE(this._b,4),s.writeInt32BE(this._c,8),s.writeInt32BE(this._d,12),s.writeInt32BE(this._e,16),s.writeInt32BE(this._f,20),s.writeInt32BE(this._g,24),s.writeInt32BE(this._h,28),s},s.exports=Sha256},24168:(s,o,i)=>{var a=i(91033),u=i(82819),_=i(9325);s.exports=function createPartial(s,o,i,w){var x=1&o,C=u(s);return function wrapper(){for(var o=-1,u=arguments.length,j=-1,L=w.length,B=Array(L+u),$=this&&this!==_&&this instanceof wrapper?C:s;++j<L;)B[j]=w[j];for(;u--;)B[j++]=arguments[++o];return a($,x?i:this,B)}}},24443:(s,o,i)=>{"use strict";var a=i(23045),u=i(80376).concat("length","prototype");o.f=Object.getOwnPropertyNames||function getOwnPropertyNames(s){return a(s,u)}},24647:(s,o,i)=>{var a=i(54552)({À:"A",Á:"A",Â:"A",Ã:"A",Ä:"A",Å:"A",à:"a",á:"a",â:"a",ã:"a",ä:"a",å:"a",Ç:"C",ç:"c",Ð:"D",ð:"d",È:"E",É:"E",Ê:"E",Ë:"E",è:"e",é:"e",ê:"e",ë:"e",Ì:"I",Í:"I",Î:"I",Ï:"I",ì:"i",í:"i",î:"i",ï:"i",Ñ:"N",ñ:"n",Ò:"O",Ó:"O",Ô:"O",Õ:"O",Ö:"O",Ø:"O",ò:"o",ó:"o",ô:"o",õ:"o",ö:"o",ø:"o",Ù:"U",Ú:"U",Û:"U",Ü:"U",ù:"u",ú:"u",û:"u",ü:"u",Ý:"Y",ý:"y",ÿ:"y",Æ:"Ae",æ:"ae",Þ:"Th",þ:"th",ß:"ss",Ā:"A",Ă:"A",Ą:"A",ā:"a",ă:"a",ą:"a",Ć:"C",Ĉ:"C",Ċ:"C",Č:"C",ć:"c",ĉ:"c",ċ:"c",č:"c",Ď:"D",Đ:"D",ď:"d",đ:"d",Ē:"E",Ĕ:"E",Ė:"E",Ę:"E",Ě:"E",ē:"e",ĕ:"e",ė:"e",ę:"e",ě:"e",Ĝ:"G",Ğ:"G",Ġ:"G",Ģ:"G",ĝ:"g",ğ:"g",ġ:"g",ģ:"g",Ĥ:"H",Ħ:"H",ĥ:"h",ħ:"h",Ĩ:"I",Ī:"I",Ĭ:"I",Į:"I",İ:"I",ĩ:"i",ī:"i",ĭ:"i",į:"i",ı:"i",Ĵ:"J",ĵ:"j",Ķ:"K",ķ:"k",ĸ:"k",Ĺ:"L",Ļ:"L",Ľ:"L",Ŀ:"L",Ł:"L",ĺ:"l",ļ:"l",ľ:"l",ŀ:"l",ł:"l",Ń:"N",Ņ:"N",Ň:"N",Ŋ:"N",ń:"n",ņ:"n",ň:"n",ŋ:"n",Ō:"O",Ŏ:"O",Ő:"O",ō:"o",ŏ:"o",ő:"o",Ŕ:"R",Ŗ:"R",Ř:"R",ŕ:"r",ŗ:"r",ř:"r",Ś:"S",Ŝ:"S",Ş:"S",Š:"S",ś:"s",ŝ:"s",ş:"s",š:"s",Ţ:"T",Ť:"T",Ŧ:"T",ţ:"t",ť:"t",ŧ:"t",Ũ:"U",Ū:"U",Ŭ:"U",Ů:"U",Ű:"U",Ų:"U",ũ:"u",ū:"u",ŭ:"u",ů:"u",ű:"u",ų:"u",Ŵ:"W",ŵ:"w",Ŷ:"Y",ŷ:"y",Ÿ:"Y",Ź:"Z",Ż:"Z",Ž:"Z",ź:"z",ż:"z",ž:"z",Ĳ:"IJ",ĳ:"ij",Œ:"Oe",œ:"oe",ŉ:"'n",ſ:"s"});s.exports=a},24677:(s,o,i)=>{"use strict";var a=i(81214).DebounceInput;a.DebounceInput=a,s.exports=a},24713:(s,o,i)=>{var a=i(2523),u=i(15389),_=i(61489),w=Math.max;s.exports=function findIndex(s,o,i){var x=null==s?0:s.length;if(!x)return-1;var C=null==i?0:_(i);return C<0&&(C=w(x+C,0)),a(s,u(o,3),C)}},24739:(s,o,i)=>{var a=i(26025);s.exports=function listCacheGet(s){var o=this.__data__,i=a(o,s);return i<0?void 0:o[i][1]}},24823:(s,o,i)=>{"use strict";var a=i(28311),u=i(13930),_=i(36624),w=i(4640),x=i(37812),C=i(20575),j=i(88280),L=i(10300),B=i(73448),$=i(40154),U=TypeError,Result=function(s,o){this.stopped=s,this.result=o},V=Result.prototype;s.exports=function(s,o,i){var z,Y,Z,ee,ie,ae,ce,le=i&&i.that,pe=!(!i||!i.AS_ENTRIES),de=!(!i||!i.IS_RECORD),fe=!(!i||!i.IS_ITERATOR),ye=!(!i||!i.INTERRUPTED),be=a(o,le),stop=function(s){return z&&$(z,"normal",s),new Result(!0,s)},callFn=function(s){return pe?(_(s),ye?be(s[0],s[1],stop):be(s[0],s[1])):ye?be(s,stop):be(s)};if(de)z=s.iterator;else if(fe)z=s;else{if(!(Y=B(s)))throw new U(w(s)+" is not iterable");if(x(Y)){for(Z=0,ee=C(s);ee>Z;Z++)if((ie=callFn(s[Z]))&&j(V,ie))return ie;return new Result(!1)}z=L(s,Y)}for(ae=de?s.next:z.next;!(ce=u(ae,z)).done;){try{ie=callFn(ce.value)}catch(s){$(z,"throw",s)}if("object"==typeof ie&&ie&&j(V,ie))return ie}return new Result(!1)}},25160:s=>{s.exports=function baseSlice(s,o,i){var a=-1,u=s.length;o<0&&(o=-o>u?0:u+o),(i=i>u?u:i)<0&&(i+=u),u=o>i?0:i-o>>>0,o>>>=0;for(var _=Array(u);++a<u;)_[a]=s[a+o];return _}},25264:(s,o,i)=>{"use strict";function _typeof(s){return _typeof="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(s){return typeof s}:function(s){return s&&"function"==typeof Symbol&&s.constructor===Symbol&&s!==Symbol.prototype?"symbol":typeof s},_typeof(s)}Object.defineProperty(o,"__esModule",{value:!0}),o.CopyToClipboard=void 0;var a=_interopRequireDefault(i(96540)),u=_interopRequireDefault(i(17965)),_=["text","onCopy","options","children"];function _interopRequireDefault(s){return s&&s.__esModule?s:{default:s}}function ownKeys(s,o){var i=Object.keys(s);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(s);o&&(a=a.filter((function(o){return Object.getOwnPropertyDescriptor(s,o).enumerable}))),i.push.apply(i,a)}return i}function _objectSpread(s){for(var o=1;o<arguments.length;o++){var i=null!=arguments[o]?arguments[o]:{};o%2?ownKeys(Object(i),!0).forEach((function(o){_defineProperty(s,o,i[o])})):Object.getOwnPropertyDescriptors?Object.defineProperties(s,Object.getOwnPropertyDescriptors(i)):ownKeys(Object(i)).forEach((function(o){Object.defineProperty(s,o,Object.getOwnPropertyDescriptor(i,o))}))}return s}function _objectWithoutProperties(s,o){if(null==s)return{};var i,a,u=function _objectWithoutPropertiesLoose(s,o){if(null==s)return{};var i,a,u={},_=Object.keys(s);for(a=0;a<_.length;a++)i=_[a],o.indexOf(i)>=0||(u[i]=s[i]);return u}(s,o);if(Object.getOwnPropertySymbols){var _=Object.getOwnPropertySymbols(s);for(a=0;a<_.length;a++)i=_[a],o.indexOf(i)>=0||Object.prototype.propertyIsEnumerable.call(s,i)&&(u[i]=s[i])}return u}function _defineProperties(s,o){for(var i=0;i<o.length;i++){var a=o[i];a.enumerable=a.enumerable||!1,a.configurable=!0,"value"in a&&(a.writable=!0),Object.defineProperty(s,a.key,a)}}function _setPrototypeOf(s,o){return _setPrototypeOf=Object.setPrototypeOf||function _setPrototypeOf(s,o){return s.__proto__=o,s},_setPrototypeOf(s,o)}function _createSuper(s){var o=function _isNativeReflectConstruct(){if("undefined"==typeof Reflect||!Reflect.construct)return!1;if(Reflect.construct.sham)return!1;if("function"==typeof Proxy)return!0;try{return Boolean.prototype.valueOf.call(Reflect.construct(Boolean,[],(function(){}))),!0}catch(s){return!1}}();return function _createSuperInternal(){var i,a=_getPrototypeOf(s);if(o){var u=_getPrototypeOf(this).constructor;i=Reflect.construct(a,arguments,u)}else i=a.apply(this,arguments);return function _possibleConstructorReturn(s,o){if(o&&("object"===_typeof(o)||"function"==typeof o))return o;if(void 0!==o)throw new TypeError("Derived constructors may only return object or undefined");return _assertThisInitialized(s)}(this,i)}}function _assertThisInitialized(s){if(void 0===s)throw new ReferenceError("this hasn't been initialised - super() hasn't been called");return s}function _getPrototypeOf(s){return _getPrototypeOf=Object.setPrototypeOf?Object.getPrototypeOf:function _getPrototypeOf(s){return s.__proto__||Object.getPrototypeOf(s)},_getPrototypeOf(s)}function _defineProperty(s,o,i){return o in s?Object.defineProperty(s,o,{value:i,enumerable:!0,configurable:!0,writable:!0}):s[o]=i,s}var w=function(s){!function _inherits(s,o){if("function"!=typeof o&&null!==o)throw new TypeError("Super expression must either be null or a function");s.prototype=Object.create(o&&o.prototype,{constructor:{value:s,writable:!0,configurable:!0}}),Object.defineProperty(s,"prototype",{writable:!1}),o&&_setPrototypeOf(s,o)}(CopyToClipboard,s);var o=_createSuper(CopyToClipboard);function CopyToClipboard(){var s;!function _classCallCheck(s,o){if(!(s instanceof o))throw new TypeError("Cannot call a class as a function")}(this,CopyToClipboard);for(var i=arguments.length,_=new Array(i),w=0;w<i;w++)_[w]=arguments[w];return _defineProperty(_assertThisInitialized(s=o.call.apply(o,[this].concat(_))),"onClick",(function(o){var i=s.props,_=i.text,w=i.onCopy,x=i.children,C=i.options,j=a.default.Children.only(x),L=(0,u.default)(_,C);w&&w(_,L),j&&j.props&&"function"==typeof j.props.onClick&&j.props.onClick(o)})),s}return function _createClass(s,o,i){return o&&_defineProperties(s.prototype,o),i&&_defineProperties(s,i),Object.defineProperty(s,"prototype",{writable:!1}),s}(CopyToClipboard,[{key:"render",value:function render(){var s=this.props,o=(s.text,s.onCopy,s.options,s.children),i=_objectWithoutProperties(s,_),u=a.default.Children.only(o);return a.default.cloneElement(u,_objectSpread(_objectSpread({},i),{},{onClick:this.onClick}))}}]),CopyToClipboard}(a.default.PureComponent);o.CopyToClipboard=w,_defineProperty(w,"defaultProps",{onCopy:void 0,options:void 0})},25382:(s,o,i)=>{"use strict";var a=i(65606),u=Object.keys||function(s){var o=[];for(var i in s)o.push(i);return o};s.exports=Duplex;var _=i(45412),w=i(16708);i(56698)(Duplex,_);for(var x=u(w.prototype),C=0;C<x.length;C++){var j=x[C];Duplex.prototype[j]||(Duplex.prototype[j]=w.prototype[j])}function Duplex(s){if(!(this instanceof Duplex))return new Duplex(s);_.call(this,s),w.call(this,s),this.allowHalfOpen=!0,s&&(!1===s.readable&&(this.readable=!1),!1===s.writable&&(this.writable=!1),!1===s.allowHalfOpen&&(this.allowHalfOpen=!1,this.once("end",onend)))}function onend(){this._writableState.ended||a.nextTick(onEndNT,this)}function onEndNT(s){s.end()}Object.defineProperty(Duplex.prototype,"writableHighWaterMark",{enumerable:!1,get:function get(){return this._writableState.highWaterMark}}),Object.defineProperty(Duplex.prototype,"writableBuffer",{enumerable:!1,get:function get(){return this._writableState&&this._writableState.getBuffer()}}),Object.defineProperty(Duplex.prototype,"writableLength",{enumerable:!1,get:function get(){return this._writableState.length}}),Object.defineProperty(Duplex.prototype,"destroyed",{enumerable:!1,get:function get(){return void 0!==this._readableState&&void 0!==this._writableState&&(this._readableState.destroyed&&this._writableState.destroyed)},set:function set(s){void 0!==this._readableState&&void 0!==this._writableState&&(this._readableState.destroyed=s,this._writableState.destroyed=s)}})},25594:(s,o,i)=>{"use strict";var a=i(85582),u=i(62250),_=i(88280),w=i(51175),x=Object;s.exports=w?function(s){return"symbol"==typeof s}:function(s){var o=a("Symbol");return u(o)&&_(o.prototype,x(s))}},25767:(s,o,i)=>{"use strict";var a=i(82682),u=i(39209),_=i(10487),w=i(36556),x=i(75795),C=w("Object.prototype.toString"),j=i(49092)(),L="undefined"==typeof globalThis?i.g:globalThis,B=u(),$=w("String.prototype.slice"),U=Object.getPrototypeOf,V=w("Array.prototype.indexOf",!0)||function indexOf(s,o){for(var i=0;i<s.length;i+=1)if(s[i]===o)return i;return-1},z={__proto__:null};a(B,j&&x&&U?function(s){var o=new L[s];if(Symbol.toStringTag in o){var i=U(o),a=x(i,Symbol.toStringTag);if(!a){var u=U(i);a=x(u,Symbol.toStringTag)}z["$"+s]=_(a.get)}}:function(s){var o=new L[s],i=o.slice||o.set;i&&(z["$"+s]=_(i))});s.exports=function whichTypedArray(s){if(!s||"object"!=typeof s)return!1;if(!j){var o=$(C(s),8,-1);return V(B,o)>-1?o:"Object"===o&&function tryAllSlices(s){var o=!1;return a(z,(function(i,a){if(!o)try{i(s),o=$(a,1)}catch(s){}})),o}(s)}return x?function tryAllTypedArrays(s){var o=!1;return a(z,(function(i,a){if(!o)try{"$"+i(s)===a&&(o=$(a,1))}catch(s){}})),o}(s):null}},25911:(s,o,i)=>{var a=i(38859),u=i(14248),_=i(19219);s.exports=function equalArrays(s,o,i,w,x,C){var j=1&i,L=s.length,B=o.length;if(L!=B&&!(j&&B>L))return!1;var $=C.get(s),U=C.get(o);if($&&U)return $==o&&U==s;var V=-1,z=!0,Y=2&i?new a:void 0;for(C.set(s,o),C.set(o,s);++V<L;){var Z=s[V],ee=o[V];if(w)var ie=j?w(ee,Z,V,o,s,C):w(Z,ee,V,s,o,C);if(void 0!==ie){if(ie)continue;z=!1;break}if(Y){if(!u(o,(function(s,o){if(!_(Y,o)&&(Z===s||x(Z,s,i,w,C)))return Y.push(o)}))){z=!1;break}}else if(Z!==ee&&!x(Z,ee,i,w,C)){z=!1;break}}return C.delete(s),C.delete(o),z}},26025:(s,o,i)=>{var a=i(75288);s.exports=function assocIndexOf(s,o){for(var i=s.length;i--;)if(a(s[i][0],o))return i;return-1}},26311:s=>{!function(){var o;function format(s){for(var o,i,a,u,_=1,w=[].slice.call(arguments),x=0,C=s.length,j="",L=!1,B=!1,nextArg=function(){return w[_++]},slurpNumber=function(){for(var i="";/\d/.test(s[x]);)i+=s[x++],o=s[x];return i.length>0?parseInt(i):null};x<C;++x)if(o=s[x],L)switch(L=!1,"."==o?(B=!1,o=s[++x]):"0"==o&&"."==s[x+1]?(B=!0,o=s[x+=2]):B=!0,u=slurpNumber(),o){case"b":j+=parseInt(nextArg(),10).toString(2);break;case"c":j+="string"==typeof(i=nextArg())||i instanceof String?i:String.fromCharCode(parseInt(i,10));break;case"d":j+=parseInt(nextArg(),10);break;case"f":a=String(parseFloat(nextArg()).toFixed(u||6)),j+=B?a:a.replace(/^0/,"");break;case"j":j+=JSON.stringify(nextArg());break;case"o":j+="0"+parseInt(nextArg(),10).toString(8);break;case"s":j+=nextArg();break;case"x":j+="0x"+parseInt(nextArg(),10).toString(16);break;case"X":j+="0x"+parseInt(nextArg(),10).toString(16).toUpperCase();break;default:j+=o}else"%"===o?L=!0:j+=o;return j}(o=s.exports=format).format=format,o.vsprintf=function vsprintf(s,o){return format.apply(null,[s].concat(o))},"undefined"!=typeof console&&"function"==typeof console.log&&(o.printf=function printf(){console.log(format.apply(null,arguments))})}()},26571:s=>{s.exports=function powershell(s){const o={$pattern:/-?[A-z\.\-]+\b/,keyword:"if else foreach return do while until elseif begin for trap data dynamicparam end break throw param continue finally in switch exit filter try process catch hidden static parameter",built_in:"ac asnp cat cd CFS chdir clc clear clhy cli clp cls clv cnsn compare copy cp cpi cpp curl cvpa dbp del diff dir dnsn ebp echo|0 epal epcsv epsn erase etsn exsn fc fhx fl ft fw gal gbp gc gcb gci gcm gcs gdr gerr ghy gi gin gjb gl gm gmo gp gps gpv group gsn gsnp gsv gtz gu gv gwmi h history icm iex ihy ii ipal ipcsv ipmo ipsn irm ise iwmi iwr kill lp ls man md measure mi mount move mp mv nal ndr ni nmo npssc nsn nv ogv oh popd ps pushd pwd r rbp rcjb rcsn rd rdr ren ri rjb rm rmdir rmo rni rnp rp rsn rsnp rujb rv rvpa rwmi sajb sal saps sasv sbp sc scb select set shcm si sl sleep sls sort sp spjb spps spsv start stz sujb sv swmi tee trcm type wget where wjb write"},i={begin:"`[\\s\\S]",relevance:0},a={className:"variable",variants:[{begin:/\$\B/},{className:"keyword",begin:/\$this/},{begin:/\$[\w\d][\w\d_:]*/}]},u={className:"string",variants:[{begin:/"/,end:/"/},{begin:/@"/,end:/^"@/}],contains:[i,a,{className:"variable",begin:/\$[A-z]/,end:/[^A-z]/}]},_={className:"string",variants:[{begin:/'/,end:/'/},{begin:/@'/,end:/^'@/}]},w=s.inherit(s.COMMENT(null,null),{variants:[{begin:/#/,end:/$/},{begin:/<#/,end:/#>/}],contains:[{className:"doctag",variants:[{begin:/\.(synopsis|description|example|inputs|outputs|notes|link|component|role|functionality)/},{begin:/\.(parameter|forwardhelptargetname|forwardhelpcategory|remotehelprunspace|externalhelp)\s+\S+/}]}]}),x={className:"built_in",variants:[{begin:"(".concat("Add|Clear|Close|Copy|Enter|Exit|Find|Format|Get|Hide|Join|Lock|Move|New|Open|Optimize|Pop|Push|Redo|Remove|Rename|Reset|Resize|Search|Select|Set|Show|Skip|Split|Step|Switch|Undo|Unlock|Watch|Backup|Checkpoint|Compare|Compress|Convert|ConvertFrom|ConvertTo|Dismount|Edit|Expand|Export|Group|Import|Initialize|Limit|Merge|Mount|Out|Publish|Restore|Save|Sync|Unpublish|Update|Approve|Assert|Build|Complete|Confirm|Deny|Deploy|Disable|Enable|Install|Invoke|Register|Request|Restart|Resume|Start|Stop|Submit|Suspend|Uninstall|Unregister|Wait|Debug|Measure|Ping|Repair|Resolve|Test|Trace|Connect|Disconnect|Read|Receive|Send|Write|Block|Grant|Protect|Revoke|Unblock|Unprotect|Use|ForEach|Sort|Tee|Where",")+(-)[\\w\\d]+")}]},C={className:"class",beginKeywords:"class enum",end:/\s*[{]/,excludeEnd:!0,relevance:0,contains:[s.TITLE_MODE]},j={className:"function",begin:/function\s+/,end:/\s*\{|$/,excludeEnd:!0,returnBegin:!0,relevance:0,contains:[{begin:"function",relevance:0,className:"keyword"},{className:"title",begin:/\w[\w\d]*((-)[\w\d]+)*/,relevance:0},{begin:/\(/,end:/\)/,className:"params",relevance:0,contains:[a]}]},L={begin:/using\s/,end:/$/,returnBegin:!0,contains:[u,_,{className:"keyword",begin:/(using|assembly|command|module|namespace|type)/}]},B={variants:[{className:"operator",begin:"(".concat("-and|-as|-band|-bnot|-bor|-bxor|-casesensitive|-ccontains|-ceq|-cge|-cgt|-cle|-clike|-clt|-cmatch|-cne|-cnotcontains|-cnotlike|-cnotmatch|-contains|-creplace|-csplit|-eq|-exact|-f|-file|-ge|-gt|-icontains|-ieq|-ige|-igt|-ile|-ilike|-ilt|-imatch|-in|-ine|-inotcontains|-inotlike|-inotmatch|-ireplace|-is|-isnot|-isplit|-join|-le|-like|-lt|-match|-ne|-not|-notcontains|-notin|-notlike|-notmatch|-or|-regex|-replace|-shl|-shr|-split|-wildcard|-xor",")\\b")},{className:"literal",begin:/(-)[\w\d]+/,relevance:0}]},$={className:"function",begin:/\[.*\]\s*[\w]+[ ]??\(/,end:/$/,returnBegin:!0,relevance:0,contains:[{className:"keyword",begin:"(".concat(o.keyword.toString().replace(/\s/g,"|"),")\\b"),endsParent:!0,relevance:0},s.inherit(s.TITLE_MODE,{endsParent:!0})]},U=[$,w,i,s.NUMBER_MODE,u,_,x,a,{className:"literal",begin:/\$(null|true|false)\b/},{className:"selector-tag",begin:/@\B/,relevance:0}],V={begin:/\[/,end:/\]/,excludeBegin:!0,excludeEnd:!0,relevance:0,contains:[].concat("self",U,{begin:"("+["string","char","byte","int","long","bool","decimal","single","double","DateTime","xml","array","hashtable","void"].join("|")+")",className:"built_in",relevance:0},{className:"type",begin:/[\.\w\d]+/,relevance:0})};return $.contains.unshift(V),{name:"PowerShell",aliases:["ps","ps1"],case_insensitive:!0,keywords:o,contains:U.concat(C,j,L,B,V)}}},26657:(s,o,i)=>{"use strict";var a=i(75208),u=function isClosingTag(s){return/<\/+[^>]+>/.test(s)},_=function isSelfClosingTag(s){return/<[^>]+\/>/.test(s)};function getType(s){return u(s)?"ClosingTag":function isOpeningTag(s){return function isTag(s){return/<[^>!]+>/.test(s)}(s)&&!u(s)&&!_(s)}(s)?"OpeningTag":_(s)?"SelfClosingTag":"Text"}s.exports=function(s){var o=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{},i=o.indentor,u=o.textNodesOnSameLine,_=0,w=[];i=i||"    ";var x=function lexer(s){return function splitOnTags(s){return s.split(/(<\/?[^>]+>)/g).filter((function(s){return""!==s.trim()}))}(s).map((function(s){return{value:s,type:getType(s)}}))}(s).map((function(s,o,x){var C=s.value,j=s.type;"ClosingTag"===j&&_--;var L=a(i,_),B=L+C;if("OpeningTag"===j&&_++,u){var $=x[o-1],U=x[o-2];"ClosingTag"===j&&"Text"===$.type&&"OpeningTag"===U.type&&(B=""+L+U.value+$.value+C,w.push(o-2,o-1))}return B}));return w.forEach((function(s){return x[s]=null})),x.filter((function(s){return!!s})).join("\n")}},26710:(s,o,i)=>{"use strict";var a=i(56698),u=i(24107),_=i(90392),w=i(92861).Buffer,x=new Array(64);function Sha224(){this.init(),this._w=x,_.call(this,64,56)}a(Sha224,u),Sha224.prototype.init=function(){return this._a=3238371032,this._b=914150663,this._c=812702999,this._d=4144912697,this._e=4290775857,this._f=1750603025,this._g=1694076839,this._h=3204075428,this},Sha224.prototype._hash=function(){var s=w.allocUnsafe(28);return s.writeInt32BE(this._a,0),s.writeInt32BE(this._b,4),s.writeInt32BE(this._c,8),s.writeInt32BE(this._d,12),s.writeInt32BE(this._e,16),s.writeInt32BE(this._f,20),s.writeInt32BE(this._g,24),s},s.exports=Sha224},27096:(s,o,i)=>{const a=i(87586),u=i(6205),_=i(10023),w=i(8048);s.exports=s=>{var o,i,x=0,C={type:u.ROOT,stack:[]},j=C,L=C.stack,B=[],repeatErr=o=>{a.error(s,"Nothing to repeat at column "+(o-1))},$=a.strToChars(s);for(o=$.length;x<o;)switch(i=$[x++]){case"\\":switch(i=$[x++]){case"b":L.push(w.wordBoundary());break;case"B":L.push(w.nonWordBoundary());break;case"w":L.push(_.words());break;case"W":L.push(_.notWords());break;case"d":L.push(_.ints());break;case"D":L.push(_.notInts());break;case"s":L.push(_.whitespace());break;case"S":L.push(_.notWhitespace());break;default:/\d/.test(i)?L.push({type:u.REFERENCE,value:parseInt(i,10)}):L.push({type:u.CHAR,value:i.charCodeAt(0)})}break;case"^":L.push(w.begin());break;case"$":L.push(w.end());break;case"[":var U;"^"===$[x]?(U=!0,x++):U=!1;var V=a.tokenizeClass($.slice(x),s);x+=V[1],L.push({type:u.SET,set:V[0],not:U});break;case".":L.push(_.anyChar());break;case"(":var z={type:u.GROUP,stack:[],remember:!0};"?"===(i=$[x])&&(i=$[x+1],x+=2,"="===i?z.followedBy=!0:"!"===i?z.notFollowedBy=!0:":"!==i&&a.error(s,`Invalid group, character '${i}' after '?' at column `+(x-1)),z.remember=!1),L.push(z),B.push(j),j=z,L=z.stack;break;case")":0===B.length&&a.error(s,"Unmatched ) at column "+(x-1)),L=(j=B.pop()).options?j.options[j.options.length-1]:j.stack;break;case"|":j.options||(j.options=[j.stack],delete j.stack);var Y=[];j.options.push(Y),L=Y;break;case"{":var Z,ee,ie=/^(\d+)(,(\d+)?)?\}/.exec($.slice(x));null!==ie?(0===L.length&&repeatErr(x),Z=parseInt(ie[1],10),ee=ie[2]?ie[3]?parseInt(ie[3],10):1/0:Z,x+=ie[0].length,L.push({type:u.REPETITION,min:Z,max:ee,value:L.pop()})):L.push({type:u.CHAR,value:123});break;case"?":0===L.length&&repeatErr(x),L.push({type:u.REPETITION,min:0,max:1,value:L.pop()});break;case"+":0===L.length&&repeatErr(x),L.push({type:u.REPETITION,min:1,max:1/0,value:L.pop()});break;case"*":0===L.length&&repeatErr(x),L.push({type:u.REPETITION,min:0,max:1/0,value:L.pop()});break;default:L.push({type:u.CHAR,value:i.charCodeAt(0)})}return 0!==B.length&&a.error(s,"Unterminated group"),C},s.exports.types=u},27301:s=>{s.exports=function baseUnary(s){return function(o){return s(o)}}},27374:(s,o)=>{"use strict";Object.defineProperty(o,"__esModule",{value:!0}),o.default=function(s,o,i){if(void 0===s)throw new Error('Reducer "'+o+'" returned undefined when handling "'+i.type+'" action. To ignore an action, you must explicitly return the previous state.')},s.exports=o.default},27534:(s,o,i)=>{var a=i(72552),u=i(40346);s.exports=function baseIsArguments(s){return u(s)&&"[object Arguments]"==a(s)}},27816:(s,o,i)=>{"use strict";var a=i(56698),u=i(90392),_=i(92861).Buffer,w=[1518500249,1859775393,-1894007588,-899497514],x=new Array(80);function Sha(){this.init(),this._w=x,u.call(this,64,56)}function rotl30(s){return s<<30|s>>>2}function ft(s,o,i,a){return 0===s?o&i|~o&a:2===s?o&i|o&a|i&a:o^i^a}a(Sha,u),Sha.prototype.init=function(){return this._a=1732584193,this._b=4023233417,this._c=2562383102,this._d=271733878,this._e=3285377520,this},Sha.prototype._update=function(s){for(var o,i=this._w,a=0|this._a,u=0|this._b,_=0|this._c,x=0|this._d,C=0|this._e,j=0;j<16;++j)i[j]=s.readInt32BE(4*j);for(;j<80;++j)i[j]=i[j-3]^i[j-8]^i[j-14]^i[j-16];for(var L=0;L<80;++L){var B=~~(L/20),$=0|((o=a)<<5|o>>>27)+ft(B,u,_,x)+C+i[L]+w[B];C=x,x=_,_=rotl30(u),u=a,a=$}this._a=a+this._a|0,this._b=u+this._b|0,this._c=_+this._c|0,this._d=x+this._d|0,this._e=C+this._e|0},Sha.prototype._hash=function(){var s=_.allocUnsafe(20);return s.writeInt32BE(0|this._a,0),s.writeInt32BE(0|this._b,4),s.writeInt32BE(0|this._c,8),s.writeInt32BE(0|this._d,12),s.writeInt32BE(0|this._e,16),s},s.exports=Sha},28077:s=>{s.exports=function baseHasIn(s,o){return null!=s&&o in Object(s)}},28303:(s,o,i)=>{var a=i(56110)(i(9325),"WeakMap");s.exports=a},28311:(s,o,i)=>{"use strict";var a=i(92361),u=i(82159),_=i(41505),w=a(a.bind);s.exports=function(s,o){return u(s),void 0===o?s:_?w(s,o):function(){return s.apply(o,arguments)}}},28586:(s,o,i)=>{var a=i(56449),u=i(44394),_=/\.|\[(?:[^[\]]*|(["'])(?:(?!\1)[^\\]|\\.)*?\1)\]/,w=/^\w*$/;s.exports=function isKey(s,o){if(a(s))return!1;var i=typeof s;return!("number"!=i&&"symbol"!=i&&"boolean"!=i&&null!=s&&!u(s))||(w.test(s)||!_.test(s)||null!=o&&s in Object(o))}},28754:(s,o,i)=>{var a=i(25160);s.exports=function castSlice(s,o,i){var u=s.length;return i=void 0===i?u:i,!o&&i>=u?s:a(s,o,i)}},28879:(s,o,i)=>{var a=i(74335)(Object.getPrototypeOf,Object);s.exports=a},29172:(s,o,i)=>{var a=i(5861),u=i(40346);s.exports=function baseIsMap(s){return u(s)&&"[object Map]"==a(s)}},29367:(s,o,i)=>{"use strict";var a=i(82159),u=i(87136);s.exports=function(s,o){var i=s[o];return u(i)?void 0:a(i)}},29538:(s,o,i)=>{"use strict";var a=i(39447),u=i(1907),_=i(13930),w=i(98828),x=i(2875),C=i(87170),j=i(22574),L=i(39298),B=i(16946),$=Object.assign,U=Object.defineProperty,V=u([].concat);s.exports=!$||w((function(){if(a&&1!==$({b:1},$(U({},"a",{enumerable:!0,get:function(){U(this,"b",{value:3,enumerable:!1})}}),{b:2})).b)return!0;var s={},o={},i=Symbol("assign detection"),u="abcdefghijklmnopqrst";return s[i]=7,u.split("").forEach((function(s){o[s]=s})),7!==$({},s)[i]||x($({},o)).join("")!==u}))?function assign(s,o){for(var i=L(s),u=arguments.length,w=1,$=C.f,U=j.f;u>w;)for(var z,Y=B(arguments[w++]),Z=$?V(x(Y),$(Y)):x(Y),ee=Z.length,ie=0;ee>ie;)z=Z[ie++],a&&!_(U,Y,z)||(i[z]=Y[z]);return i}:$},29817:s=>{s.exports=function stackHas(s){return this.__data__.has(s)}},29844:(s,o)=>{"use strict";function f(s,o){var i=s.length;s.push(o);e:for(;0<i;){var a=i-1>>>1,u=s[a];if(!(0<g(u,o)))break e;s[a]=o,s[i]=u,i=a}}function h(s){return 0===s.length?null:s[0]}function k(s){if(0===s.length)return null;var o=s[0],i=s.pop();if(i!==o){s[0]=i;e:for(var a=0,u=s.length,_=u>>>1;a<_;){var w=2*(a+1)-1,x=s[w],C=w+1,j=s[C];if(0>g(x,i))C<u&&0>g(j,x)?(s[a]=j,s[C]=i,a=C):(s[a]=x,s[w]=i,a=w);else{if(!(C<u&&0>g(j,i)))break e;s[a]=j,s[C]=i,a=C}}}return o}function g(s,o){var i=s.sortIndex-o.sortIndex;return 0!==i?i:s.id-o.id}if("object"==typeof performance&&"function"==typeof performance.now){var i=performance;o.unstable_now=function(){return i.now()}}else{var a=Date,u=a.now();o.unstable_now=function(){return a.now()-u}}var _=[],w=[],x=1,C=null,j=3,L=!1,B=!1,$=!1,U="function"==typeof setTimeout?setTimeout:null,V="function"==typeof clearTimeout?clearTimeout:null,z="undefined"!=typeof setImmediate?setImmediate:null;function G(s){for(var o=h(w);null!==o;){if(null===o.callback)k(w);else{if(!(o.startTime<=s))break;k(w),o.sortIndex=o.expirationTime,f(_,o)}o=h(w)}}function H(s){if($=!1,G(s),!B)if(null!==h(_))B=!0,I(J);else{var o=h(w);null!==o&&K(H,o.startTime-s)}}function J(s,i){B=!1,$&&($=!1,V(ie),ie=-1),L=!0;var a=j;try{for(G(i),C=h(_);null!==C&&(!(C.expirationTime>i)||s&&!M());){var u=C.callback;if("function"==typeof u){C.callback=null,j=C.priorityLevel;var x=u(C.expirationTime<=i);i=o.unstable_now(),"function"==typeof x?C.callback=x:C===h(_)&&k(_),G(i)}else k(_);C=h(_)}if(null!==C)var U=!0;else{var z=h(w);null!==z&&K(H,z.startTime-i),U=!1}return U}finally{C=null,j=a,L=!1}}"undefined"!=typeof navigator&&void 0!==navigator.scheduling&&void 0!==navigator.scheduling.isInputPending&&navigator.scheduling.isInputPending.bind(navigator.scheduling);var Y,Z=!1,ee=null,ie=-1,ae=5,ce=-1;function M(){return!(o.unstable_now()-ce<ae)}function R(){if(null!==ee){var s=o.unstable_now();ce=s;var i=!0;try{i=ee(!0,s)}finally{i?Y():(Z=!1,ee=null)}}else Z=!1}if("function"==typeof z)Y=function(){z(R)};else if("undefined"!=typeof MessageChannel){var le=new MessageChannel,pe=le.port2;le.port1.onmessage=R,Y=function(){pe.postMessage(null)}}else Y=function(){U(R,0)};function I(s){ee=s,Z||(Z=!0,Y())}function K(s,i){ie=U((function(){s(o.unstable_now())}),i)}o.unstable_IdlePriority=5,o.unstable_ImmediatePriority=1,o.unstable_LowPriority=4,o.unstable_NormalPriority=3,o.unstable_Profiling=null,o.unstable_UserBlockingPriority=2,o.unstable_cancelCallback=function(s){s.callback=null},o.unstable_continueExecution=function(){B||L||(B=!0,I(J))},o.unstable_forceFrameRate=function(s){0>s||125<s?console.error("forceFrameRate takes a positive int between 0 and 125, forcing frame rates higher than 125 fps is not supported"):ae=0<s?Math.floor(1e3/s):5},o.unstable_getCurrentPriorityLevel=function(){return j},o.unstable_getFirstCallbackNode=function(){return h(_)},o.unstable_next=function(s){switch(j){case 1:case 2:case 3:var o=3;break;default:o=j}var i=j;j=o;try{return s()}finally{j=i}},o.unstable_pauseExecution=function(){},o.unstable_requestPaint=function(){},o.unstable_runWithPriority=function(s,o){switch(s){case 1:case 2:case 3:case 4:case 5:break;default:s=3}var i=j;j=s;try{return o()}finally{j=i}},o.unstable_scheduleCallback=function(s,i,a){var u=o.unstable_now();switch("object"==typeof a&&null!==a?a="number"==typeof(a=a.delay)&&0<a?u+a:u:a=u,s){case 1:var C=-1;break;case 2:C=250;break;case 5:C=1073741823;break;case 4:C=1e4;break;default:C=5e3}return s={id:x++,callback:i,priorityLevel:s,startTime:a,expirationTime:C=a+C,sortIndex:-1},a>u?(s.sortIndex=a,f(w,s),null===h(_)&&s===h(w)&&($?(V(ie),ie=-1):$=!0,K(H,a-u))):(s.sortIndex=C,f(_,s),B||L||(B=!0,I(J))),s},o.unstable_shouldYield=M,o.unstable_wrapCallback=function(s){var o=j;return function(){var i=j;j=o;try{return s.apply(this,arguments)}finally{j=i}}}},30041:(s,o,i)=>{"use strict";var a=i(30655),u=i(58068),_=i(69675),w=i(75795);s.exports=function defineDataProperty(s,o,i){if(!s||"object"!=typeof s&&"function"!=typeof s)throw new _("`obj` must be an object or a function`");if("string"!=typeof o&&"symbol"!=typeof o)throw new _("`property` must be a string or a symbol`");if(arguments.length>3&&"boolean"!=typeof arguments[3]&&null!==arguments[3])throw new _("`nonEnumerable`, if provided, must be a boolean or null");if(arguments.length>4&&"boolean"!=typeof arguments[4]&&null!==arguments[4])throw new _("`nonWritable`, if provided, must be a boolean or null");if(arguments.length>5&&"boolean"!=typeof arguments[5]&&null!==arguments[5])throw new _("`nonConfigurable`, if provided, must be a boolean or null");if(arguments.length>6&&"boolean"!=typeof arguments[6])throw new _("`loose`, if provided, must be a boolean");var x=arguments.length>3?arguments[3]:null,C=arguments.length>4?arguments[4]:null,j=arguments.length>5?arguments[5]:null,L=arguments.length>6&&arguments[6],B=!!w&&w(s,o);if(a)a(s,o,{configurable:null===j&&B?B.configurable:!j,enumerable:null===x&&B?B.enumerable:!x,value:i,writable:null===C&&B?B.writable:!C});else{if(!L&&(x||C||j))throw new u("This environment does not support defining a property as non-configurable, non-writable, or non-enumerable.");s[o]=i}}},30294:s=>{s.exports=function isLength(s){return"number"==typeof s&&s>-1&&s%1==0&&s<=9007199254740991}},30361:s=>{var o=/^(?:0|[1-9]\d*)$/;s.exports=function isIndex(s,i){var a=typeof s;return!!(i=null==i?9007199254740991:i)&&("number"==a||"symbol"!=a&&o.test(s))&&s>-1&&s%1==0&&s<i}},30592:(s,o,i)=>{"use strict";var a=i(30655),u=function hasPropertyDescriptors(){return!!a};u.hasArrayLengthDefineBug=function hasArrayLengthDefineBug(){if(!a)return null;try{return 1!==a([],"length",{value:1}).length}catch(s){return!0}},s.exports=u},30641:(s,o,i)=>{var a=i(86649),u=i(95950);s.exports=function baseForOwn(s,o){return s&&a(s,o,u)}},30655:s=>{"use strict";var o=Object.defineProperty||!1;if(o)try{o({},"a",{value:1})}catch(s){o=!1}s.exports=o},30756:(s,o,i)=>{var a=i(23805);s.exports=function isStrictComparable(s){return s==s&&!a(s)}},30980:(s,o,i)=>{var a=i(39344),u=i(94033);function LazyWrapper(s){this.__wrapped__=s,this.__actions__=[],this.__dir__=1,this.__filtered__=!1,this.__iteratees__=[],this.__takeCount__=4294967295,this.__views__=[]}LazyWrapper.prototype=a(u.prototype),LazyWrapper.prototype.constructor=LazyWrapper,s.exports=LazyWrapper},31175:(s,o,i)=>{var a=i(26025);s.exports=function listCacheSet(s,o){var i=this.__data__,u=a(i,s);return u<0?(++this.size,i.push([s,o])):i[u][1]=o,this}},31380:s=>{s.exports=function setCacheAdd(s){return this.__data__.set(s,"__lodash_hash_undefined__"),this}},31499:s=>{var o={"&":"&amp;",'"':"&quot;","'":"&apos;","<":"&lt;",">":"&gt;"};s.exports=function escapeForXML(s){return s&&s.replace?s.replace(/([&"<>'])/g,(function(s,i){return o[i]})):s}},31769:(s,o,i)=>{var a=i(56449),u=i(28586),_=i(61802),w=i(13222);s.exports=function castPath(s,o){return a(s)?s:u(s,o)?[s]:_(w(s))}},31800:s=>{var o=/\s/;s.exports=function trimmedEndIndex(s){for(var i=s.length;i--&&o.test(s.charAt(i)););return i}},32096:(s,o,i)=>{"use strict";var a=i(90160);s.exports=function(s,o){return void 0===s?arguments.length<2?"":o:a(s)}},32567:(s,o,i)=>{"use strict";i(79307);var a=i(61747);s.exports=a("Function","bind")},32629:(s,o,i)=>{var a=i(9999);s.exports=function clone(s){return a(s,4)}},32804:(s,o,i)=>{var a=i(56110)(i(9325),"Promise");s.exports=a},32827:(s,o,i)=>{"use strict";var a=i(56698),u=i(82890),_=i(90392),w=i(92861).Buffer,x=new Array(160);function Sha384(){this.init(),this._w=x,_.call(this,128,112)}a(Sha384,u),Sha384.prototype.init=function(){return this._ah=3418070365,this._bh=1654270250,this._ch=2438529370,this._dh=355462360,this._eh=1731405415,this._fh=2394180231,this._gh=3675008525,this._hh=1203062813,this._al=3238371032,this._bl=914150663,this._cl=812702999,this._dl=4144912697,this._el=4290775857,this._fl=1750603025,this._gl=1694076839,this._hl=3204075428,this},Sha384.prototype._hash=function(){var s=w.allocUnsafe(48);function writeInt64BE(o,i,a){s.writeInt32BE(o,a),s.writeInt32BE(i,a+4)}return writeInt64BE(this._ah,this._al,0),writeInt64BE(this._bh,this._bl,8),writeInt64BE(this._ch,this._cl,16),writeInt64BE(this._dh,this._dl,24),writeInt64BE(this._eh,this._el,32),writeInt64BE(this._fh,this._fl,40),s},s.exports=Sha384},32865:(s,o,i)=>{var a=i(19570),u=i(51811)(a);s.exports=u},33855:(s,o,i)=>{var a=i(9999),u=i(15389);s.exports=function iteratee(s){return u("function"==typeof s?s:a(s,1))}},34035:(s,o,i)=>{const a=i(3110),u=i(86804);o.g$=a,o.KeyValuePair=i(55973),o.G6=u.ArraySlice,o.ot=u.ObjectSlice,o.Hg=u.Element,o.Om=u.StringElement,o.kT=u.NumberElement,o.bd=u.BooleanElement,o.Os=u.NullElement,o.wE=u.ArrayElement,o.Sh=u.ObjectElement,o.Pr=u.MemberElement,o.sI=u.RefElement,o.Ft=u.LinkElement,o.e=u.refract,i(85105),i(75147)},34084:(s,o,i)=>{"use strict";var a=i(62250),u=i(46285),_=i(79192);s.exports=function(s,o,i){var w,x;return _&&a(w=o.constructor)&&w!==i&&u(x=w.prototype)&&x!==i.prototype&&_(s,x),s}},34840:(s,o,i)=>{var a="object"==typeof i.g&&i.g&&i.g.Object===Object&&i.g;s.exports=a},34849:(s,o,i)=>{"use strict";var a=i(65482),u=Math.max,_=Math.min;s.exports=function(s,o){var i=a(s);return i<0?u(i+o,0):_(i,o)}},34932:s=>{s.exports=function arrayMap(s,o){for(var i=-1,a=null==s?0:s.length,u=Array(a);++i<a;)u[i]=o(s[i],i,s);return u}},35344:s=>{function concat(...s){return s.map((s=>function source(s){return s?"string"==typeof s?s:s.source:null}(s))).join("")}s.exports=function bash(s){const o={},i={begin:/\$\{/,end:/\}/,contains:["self",{begin:/:-/,contains:[o]}]};Object.assign(o,{className:"variable",variants:[{begin:concat(/\$[\w\d#@][\w\d_]*/,"(?![\\w\\d])(?![$])")},i]});const a={className:"subst",begin:/\$\(/,end:/\)/,contains:[s.BACKSLASH_ESCAPE]},u={begin:/<<-?\s*(?=\w+)/,starts:{contains:[s.END_SAME_AS_BEGIN({begin:/(\w+)/,end:/(\w+)/,className:"string"})]}},_={className:"string",begin:/"/,end:/"/,contains:[s.BACKSLASH_ESCAPE,o,a]};a.contains.push(_);const w={begin:/\$\(\(/,end:/\)\)/,contains:[{begin:/\d+#[0-9a-f]+/,className:"number"},s.NUMBER_MODE,o]},x=s.SHEBANG({binary:`(${["fish","bash","zsh","sh","csh","ksh","tcsh","dash","scsh"].join("|")})`,relevance:10}),C={className:"function",begin:/\w[\w\d_]*\s*\(\s*\)\s*\{/,returnBegin:!0,contains:[s.inherit(s.TITLE_MODE,{begin:/\w[\w\d_]*/})],relevance:0};return{name:"Bash",aliases:["sh","zsh"],keywords:{$pattern:/\b[a-z._-]+\b/,keyword:"if then else elif fi for while in do done case esac function",literal:"true false",built_in:"break cd continue eval exec exit export getopts hash pwd readonly return shift test times trap umask unset alias bind builtin caller command declare echo enable help let local logout mapfile printf read readarray source type typeset ulimit unalias set shopt autoload bg bindkey bye cap chdir clone comparguments compcall compctl compdescribe compfiles compgroups compquote comptags comptry compvalues dirs disable disown echotc echoti emulate fc fg float functions getcap getln history integer jobs kill limit log noglob popd print pushd pushln rehash sched setcap setopt stat suspend ttyctl unfunction unhash unlimit unsetopt vared wait whence where which zcompile zformat zftp zle zmodload zparseopts zprof zpty zregexparse zsocket zstyle ztcp"},contains:[x,s.SHEBANG(),C,w,s.HASH_COMMENT_MODE,u,_,{className:"",begin:/\\"/},{className:"string",begin:/'/,end:/'/},o]}}},35345:s=>{"use strict";s.exports=URIError},35529:(s,o,i)=>{var a=i(39344),u=i(28879),_=i(55527);s.exports=function initCloneObject(s){return"function"!=typeof s.constructor||_(s)?{}:a(u(s))}},35680:(s,o,i)=>{"use strict";var a=i(25767);s.exports=function isTypedArray(s){return!!a(s)}},35749:(s,o,i)=>{var a=i(81042);s.exports=function hashSet(s,o){var i=this.__data__;return this.size+=this.has(s)?0:1,i[s]=a&&void 0===o?"__lodash_hash_undefined__":o,this}},35970:(s,o,i)=>{var a=i(83120);s.exports=function flatten(s){return(null==s?0:s.length)?a(s,1):[]}},36128:(s,o,i)=>{"use strict";var a=i(7376),u=i(45951),_=i(2532),w="__core-js_shared__",x=s.exports=u[w]||_(w,{});(x.versions||(x.versions=[])).push({version:"3.40.0",mode:a?"pure":"global",copyright:"© 2014-2025 Denis Pushkarev (zloirock.ru)",license:"https://github.com/zloirock/core-js/blob/v3.40.0/LICENSE",source:"https://github.com/zloirock/core-js"})},36306:s=>{var o="__lodash_placeholder__";s.exports=function replaceHolders(s,i){for(var a=-1,u=s.length,_=0,w=[];++a<u;){var x=s[a];x!==i&&x!==o||(s[a]=o,w[_++]=a)}return w}},36371:(s,o,i)=>{"use strict";var a=i(11091),u=i(85582),_=i(76024),w=i(98828),x=i(19358),C="AggregateError",j=u(C),L=!w((function(){return 1!==j([1]).errors[0]}))&&w((function(){return 7!==j([1],C,{cause:7}).cause}));a({global:!0,constructor:!0,arity:2,forced:L},{AggregateError:x(C,(function(s){return function AggregateError(o,i){return _(s,this,arguments)}}),L,!0)})},36556:(s,o,i)=>{"use strict";var a=i(70453),u=i(73126),_=u([a("%String.prototype.indexOf%")]);s.exports=function callBoundIntrinsic(s,o){var i=a(s,!!o);return"function"==typeof i&&_(s,".prototype.")>-1?u([i]):i}},36624:(s,o,i)=>{"use strict";var a=i(46285),u=String,_=TypeError;s.exports=function(s){if(a(s))return s;throw new _(u(s)+" is not an object")}},36800:(s,o,i)=>{var a=i(75288),u=i(64894),_=i(30361),w=i(23805);s.exports=function isIterateeCall(s,o,i){if(!w(i))return!1;var x=typeof o;return!!("number"==x?u(i)&&_(o,i.length):"string"==x&&o in i)&&a(i[o],s)}},36833:(s,o,i)=>{"use strict";var a=i(39447),u=i(49724),_=Function.prototype,w=a&&Object.getOwnPropertyDescriptor,x=u(_,"name"),C=x&&"something"===function something(){}.name,j=x&&(!a||a&&w(_,"name").configurable);s.exports={EXISTS:x,PROPER:C,CONFIGURABLE:j}},37007:s=>{"use strict";var o,i="object"==typeof Reflect?Reflect:null,a=i&&"function"==typeof i.apply?i.apply:function ReflectApply(s,o,i){return Function.prototype.apply.call(s,o,i)};o=i&&"function"==typeof i.ownKeys?i.ownKeys:Object.getOwnPropertySymbols?function ReflectOwnKeys(s){return Object.getOwnPropertyNames(s).concat(Object.getOwnPropertySymbols(s))}:function ReflectOwnKeys(s){return Object.getOwnPropertyNames(s)};var u=Number.isNaN||function NumberIsNaN(s){return s!=s};function EventEmitter(){EventEmitter.init.call(this)}s.exports=EventEmitter,s.exports.once=function once(s,o){return new Promise((function(i,a){function errorListener(i){s.removeListener(o,resolver),a(i)}function resolver(){"function"==typeof s.removeListener&&s.removeListener("error",errorListener),i([].slice.call(arguments))}eventTargetAgnosticAddListener(s,o,resolver,{once:!0}),"error"!==o&&function addErrorHandlerIfEventEmitter(s,o,i){"function"==typeof s.on&&eventTargetAgnosticAddListener(s,"error",o,i)}(s,errorListener,{once:!0})}))},EventEmitter.EventEmitter=EventEmitter,EventEmitter.prototype._events=void 0,EventEmitter.prototype._eventsCount=0,EventEmitter.prototype._maxListeners=void 0;var _=10;function checkListener(s){if("function"!=typeof s)throw new TypeError('The "listener" argument must be of type Function. Received type '+typeof s)}function _getMaxListeners(s){return void 0===s._maxListeners?EventEmitter.defaultMaxListeners:s._maxListeners}function _addListener(s,o,i,a){var u,_,w;if(checkListener(i),void 0===(_=s._events)?(_=s._events=Object.create(null),s._eventsCount=0):(void 0!==_.newListener&&(s.emit("newListener",o,i.listener?i.listener:i),_=s._events),w=_[o]),void 0===w)w=_[o]=i,++s._eventsCount;else if("function"==typeof w?w=_[o]=a?[i,w]:[w,i]:a?w.unshift(i):w.push(i),(u=_getMaxListeners(s))>0&&w.length>u&&!w.warned){w.warned=!0;var x=new Error("Possible EventEmitter memory leak detected. "+w.length+" "+String(o)+" listeners added. Use emitter.setMaxListeners() to increase limit");x.name="MaxListenersExceededWarning",x.emitter=s,x.type=o,x.count=w.length,function ProcessEmitWarning(s){console&&console.warn&&console.warn(s)}(x)}return s}function onceWrapper(){if(!this.fired)return this.target.removeListener(this.type,this.wrapFn),this.fired=!0,0===arguments.length?this.listener.call(this.target):this.listener.apply(this.target,arguments)}function _onceWrap(s,o,i){var a={fired:!1,wrapFn:void 0,target:s,type:o,listener:i},u=onceWrapper.bind(a);return u.listener=i,a.wrapFn=u,u}function _listeners(s,o,i){var a=s._events;if(void 0===a)return[];var u=a[o];return void 0===u?[]:"function"==typeof u?i?[u.listener||u]:[u]:i?function unwrapListeners(s){for(var o=new Array(s.length),i=0;i<o.length;++i)o[i]=s[i].listener||s[i];return o}(u):arrayClone(u,u.length)}function listenerCount(s){var o=this._events;if(void 0!==o){var i=o[s];if("function"==typeof i)return 1;if(void 0!==i)return i.length}return 0}function arrayClone(s,o){for(var i=new Array(o),a=0;a<o;++a)i[a]=s[a];return i}function eventTargetAgnosticAddListener(s,o,i,a){if("function"==typeof s.on)a.once?s.once(o,i):s.on(o,i);else{if("function"!=typeof s.addEventListener)throw new TypeError('The "emitter" argument must be of type EventEmitter. Received type '+typeof s);s.addEventListener(o,(function wrapListener(u){a.once&&s.removeEventListener(o,wrapListener),i(u)}))}}Object.defineProperty(EventEmitter,"defaultMaxListeners",{enumerable:!0,get:function(){return _},set:function(s){if("number"!=typeof s||s<0||u(s))throw new RangeError('The value of "defaultMaxListeners" is out of range. It must be a non-negative number. Received '+s+".");_=s}}),EventEmitter.init=function(){void 0!==this._events&&this._events!==Object.getPrototypeOf(this)._events||(this._events=Object.create(null),this._eventsCount=0),this._maxListeners=this._maxListeners||void 0},EventEmitter.prototype.setMaxListeners=function setMaxListeners(s){if("number"!=typeof s||s<0||u(s))throw new RangeError('The value of "n" is out of range. It must be a non-negative number. Received '+s+".");return this._maxListeners=s,this},EventEmitter.prototype.getMaxListeners=function getMaxListeners(){return _getMaxListeners(this)},EventEmitter.prototype.emit=function emit(s){for(var o=[],i=1;i<arguments.length;i++)o.push(arguments[i]);var u="error"===s,_=this._events;if(void 0!==_)u=u&&void 0===_.error;else if(!u)return!1;if(u){var w;if(o.length>0&&(w=o[0]),w instanceof Error)throw w;var x=new Error("Unhandled error."+(w?" ("+w.message+")":""));throw x.context=w,x}var C=_[s];if(void 0===C)return!1;if("function"==typeof C)a(C,this,o);else{var j=C.length,L=arrayClone(C,j);for(i=0;i<j;++i)a(L[i],this,o)}return!0},EventEmitter.prototype.addListener=function addListener(s,o){return _addListener(this,s,o,!1)},EventEmitter.prototype.on=EventEmitter.prototype.addListener,EventEmitter.prototype.prependListener=function prependListener(s,o){return _addListener(this,s,o,!0)},EventEmitter.prototype.once=function once(s,o){return checkListener(o),this.on(s,_onceWrap(this,s,o)),this},EventEmitter.prototype.prependOnceListener=function prependOnceListener(s,o){return checkListener(o),this.prependListener(s,_onceWrap(this,s,o)),this},EventEmitter.prototype.removeListener=function removeListener(s,o){var i,a,u,_,w;if(checkListener(o),void 0===(a=this._events))return this;if(void 0===(i=a[s]))return this;if(i===o||i.listener===o)0==--this._eventsCount?this._events=Object.create(null):(delete a[s],a.removeListener&&this.emit("removeListener",s,i.listener||o));else if("function"!=typeof i){for(u=-1,_=i.length-1;_>=0;_--)if(i[_]===o||i[_].listener===o){w=i[_].listener,u=_;break}if(u<0)return this;0===u?i.shift():function spliceOne(s,o){for(;o+1<s.length;o++)s[o]=s[o+1];s.pop()}(i,u),1===i.length&&(a[s]=i[0]),void 0!==a.removeListener&&this.emit("removeListener",s,w||o)}return this},EventEmitter.prototype.off=EventEmitter.prototype.removeListener,EventEmitter.prototype.removeAllListeners=function removeAllListeners(s){var o,i,a;if(void 0===(i=this._events))return this;if(void 0===i.removeListener)return 0===arguments.length?(this._events=Object.create(null),this._eventsCount=0):void 0!==i[s]&&(0==--this._eventsCount?this._events=Object.create(null):delete i[s]),this;if(0===arguments.length){var u,_=Object.keys(i);for(a=0;a<_.length;++a)"removeListener"!==(u=_[a])&&this.removeAllListeners(u);return this.removeAllListeners("removeListener"),this._events=Object.create(null),this._eventsCount=0,this}if("function"==typeof(o=i[s]))this.removeListener(s,o);else if(void 0!==o)for(a=o.length-1;a>=0;a--)this.removeListener(s,o[a]);return this},EventEmitter.prototype.listeners=function listeners(s){return _listeners(this,s,!0)},EventEmitter.prototype.rawListeners=function rawListeners(s){return _listeners(this,s,!1)},EventEmitter.listenerCount=function(s,o){return"function"==typeof s.listenerCount?s.listenerCount(o):listenerCount.call(s,o)},EventEmitter.prototype.listenerCount=listenerCount,EventEmitter.prototype.eventNames=function eventNames(){return this._eventsCount>0?o(this._events):[]}},37167:(s,o,i)=>{var a=i(4901),u=i(27301),_=i(86009),w=_&&_.isTypedArray,x=w?u(w):a;s.exports=x},37217:(s,o,i)=>{var a=i(80079),u=i(51420),_=i(90938),w=i(63605),x=i(29817),C=i(80945);function Stack(s){var o=this.__data__=new a(s);this.size=o.size}Stack.prototype.clear=u,Stack.prototype.delete=_,Stack.prototype.get=w,Stack.prototype.has=x,Stack.prototype.set=C,s.exports=Stack},37241:(s,o,i)=>{var a=i(70695),u=i(72903),_=i(64894);s.exports=function keysIn(s){return _(s)?a(s,!0):u(s)}},37257:(s,o,i)=>{"use strict";i(96605),i(64502),i(36371),i(99363),i(7057);var a=i(92046);s.exports=a.AggregateError},37334:s=>{s.exports=function constant(s){return function(){return s}}},37381:(s,o,i)=>{var a=i(48152),u=i(63950),_=a?function(s){return a.get(s)}:u;s.exports=_},37471:(s,o,i)=>{var a=i(91596),u=i(53320),_=i(58523),w=i(82819),x=i(18073),C=i(11287),j=i(68294),L=i(36306),B=i(9325);s.exports=function createHybrid(s,o,i,$,U,V,z,Y,Z,ee){var ie=128&o,ae=1&o,ce=2&o,le=24&o,pe=512&o,de=ce?void 0:w(s);return function wrapper(){for(var fe=arguments.length,ye=Array(fe),be=fe;be--;)ye[be]=arguments[be];if(le)var _e=C(wrapper),Se=_(ye,_e);if($&&(ye=a(ye,$,U,le)),V&&(ye=u(ye,V,z,le)),fe-=Se,le&&fe<ee){var we=L(ye,_e);return x(s,o,createHybrid,wrapper.placeholder,i,ye,we,Y,Z,ee-fe)}var xe=ae?i:this,Pe=ce?xe[s]:s;return fe=ye.length,Y?ye=j(ye,Y):pe&&fe>1&&ye.reverse(),ie&&Z<fe&&(ye.length=Z),this&&this!==B&&this instanceof wrapper&&(Pe=de||w(Pe)),Pe.apply(xe,ye)}}},37812:(s,o,i)=>{"use strict";var a=i(76264),u=i(93742),_=a("iterator"),w=Array.prototype;s.exports=function(s){return void 0!==s&&(u.Array===s||w[_]===s)}},37828:(s,o,i)=>{var a=i(9325).Uint8Array;s.exports=a},38221:(s,o,i)=>{var a=i(23805),u=i(10124),_=i(99374),w=Math.max,x=Math.min;s.exports=function debounce(s,o,i){var C,j,L,B,$,U,V=0,z=!1,Y=!1,Z=!0;if("function"!=typeof s)throw new TypeError("Expected a function");function invokeFunc(o){var i=C,a=j;return C=j=void 0,V=o,B=s.apply(a,i)}function shouldInvoke(s){var i=s-U;return void 0===U||i>=o||i<0||Y&&s-V>=L}function timerExpired(){var s=u();if(shouldInvoke(s))return trailingEdge(s);$=setTimeout(timerExpired,function remainingWait(s){var i=o-(s-U);return Y?x(i,L-(s-V)):i}(s))}function trailingEdge(s){return $=void 0,Z&&C?invokeFunc(s):(C=j=void 0,B)}function debounced(){var s=u(),i=shouldInvoke(s);if(C=arguments,j=this,U=s,i){if(void 0===$)return function leadingEdge(s){return V=s,$=setTimeout(timerExpired,o),z?invokeFunc(s):B}(U);if(Y)return clearTimeout($),$=setTimeout(timerExpired,o),invokeFunc(U)}return void 0===$&&($=setTimeout(timerExpired,o)),B}return o=_(o)||0,a(i)&&(z=!!i.leading,L=(Y="maxWait"in i)?w(_(i.maxWait)||0,o):L,Z="trailing"in i?!!i.trailing:Z),debounced.cancel=function cancel(){void 0!==$&&clearTimeout($),V=0,C=U=j=$=void 0},debounced.flush=function flush(){return void 0===$?B:trailingEdge(u())},debounced}},38329:(s,o,i)=>{var a=i(64894);s.exports=function createBaseEach(s,o){return function(i,u){if(null==i)return i;if(!a(i))return s(i,u);for(var _=i.length,w=o?_:-1,x=Object(i);(o?w--:++w<_)&&!1!==u(x[w],w,x););return i}}},38440:(s,o,i)=>{var a=i(16038),u=i(27301),_=i(86009),w=_&&_.isSet,x=w?u(w):a;s.exports=x},38530:s=>{"use strict";s.exports={}},38816:(s,o,i)=>{var a=i(35970),u=i(56757),_=i(32865);s.exports=function flatRest(s){return _(u(s,void 0,a),s+"")}},38859:(s,o,i)=>{var a=i(53661),u=i(31380),_=i(51459);function SetCache(s){var o=-1,i=null==s?0:s.length;for(this.__data__=new a;++o<i;)this.add(s[o])}SetCache.prototype.add=SetCache.prototype.push=u,SetCache.prototype.has=_,s.exports=SetCache},39209:(s,o,i)=>{"use strict";var a=i(76578),u="undefined"==typeof globalThis?i.g:globalThis;s.exports=function availableTypedArrays(){for(var s=[],o=0;o<a.length;o++)"function"==typeof u[a[o]]&&(s[s.length]=a[o]);return s}},39259:(s,o,i)=>{"use strict";var a=i(46285),u=i(61626);s.exports=function(s,o){a(o)&&"cause"in o&&u(s,"cause",o.cause)}},39298:(s,o,i)=>{"use strict";var a=i(74239),u=Object;s.exports=function(s){return u(a(s))}},39344:(s,o,i)=>{var a=i(23805),u=Object.create,_=function(){function object(){}return function(s){if(!a(s))return{};if(u)return u(s);object.prototype=s;var o=new object;return object.prototype=void 0,o}}();s.exports=_},39447:(s,o,i)=>{"use strict";var a=i(98828);s.exports=!a((function(){return 7!==Object.defineProperty({},1,{get:function(){return 7}})[1]}))},40154:(s,o,i)=>{"use strict";var a=i(13930),u=i(36624),_=i(29367);s.exports=function(s,o,i){var w,x;u(s);try{if(!(w=_(s,"return"))){if("throw"===o)throw i;return i}w=a(w,s)}catch(s){x=!0,w=s}if("throw"===o)throw i;if(x)throw w;return u(w),i}},40239:(s,o,i)=>{const a=i(10316);s.exports=class NumberElement extends a{constructor(s,o,i){super(s,o,i),this.element="number"}primitive(){return"number"}}},40345:(s,o,i)=>{s.exports=i(37007).EventEmitter},40346:s=>{s.exports=function isObjectLike(s){return null!=s&&"object"==typeof s}},40551:(s,o,i)=>{"use strict";var a=i(45951),u=i(62250),_=a.WeakMap;s.exports=u(_)&&/native code/.test(String(_))},40860:(s,o,i)=>{var a=i(40882),u=i(80909),_=i(15389),w=i(85558),x=i(56449);s.exports=function reduce(s,o,i){var C=x(s)?a:w,j=arguments.length<3;return C(s,_(o,4),i,j,u)}},40882:s=>{s.exports=function arrayReduce(s,o,i,a){var u=-1,_=null==s?0:s.length;for(a&&_&&(i=s[++u]);++u<_;)i=o(i,s[u],u,s);return i}},40961:(s,o,i)=>{"use strict";!function checkDCE(){if("undefined"!=typeof __REACT_DEVTOOLS_GLOBAL_HOOK__&&"function"==typeof __REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE)try{__REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE(checkDCE)}catch(s){console.error(s)}}(),s.exports=i(22551)},40975:(s,o,i)=>{"use strict";var a=i(9748);s.exports=a},41067:(s,o,i)=>{const a=i(10316);s.exports=class NullElement extends a{constructor(s,o,i){super(s||null,o,i),this.element="null"}primitive(){return"null"}set(){return new Error("Cannot set the value of null")}}},41176:s=>{"use strict";var o=Math.ceil,i=Math.floor;s.exports=Math.trunc||function trunc(s){var a=+s;return(a>0?i:o)(a)}},41237:s=>{"use strict";s.exports=EvalError},41333:s=>{"use strict";s.exports=function hasSymbols(){if("function"!=typeof Symbol||"function"!=typeof Object.getOwnPropertySymbols)return!1;if("symbol"==typeof Symbol.iterator)return!0;var s={},o=Symbol("test"),i=Object(o);if("string"==typeof o)return!1;if("[object Symbol]"!==Object.prototype.toString.call(o))return!1;if("[object Symbol]"!==Object.prototype.toString.call(i))return!1;for(var a in s[o]=42,s)return!1;if("function"==typeof Object.keys&&0!==Object.keys(s).length)return!1;if("function"==typeof Object.getOwnPropertyNames&&0!==Object.getOwnPropertyNames(s).length)return!1;var u=Object.getOwnPropertySymbols(s);if(1!==u.length||u[0]!==o)return!1;if(!Object.prototype.propertyIsEnumerable.call(s,o))return!1;if("function"==typeof Object.getOwnPropertyDescriptor){var _=Object.getOwnPropertyDescriptor(s,o);if(42!==_.value||!0!==_.enumerable)return!1}return!0}},41505:(s,o,i)=>{"use strict";var a=i(98828);s.exports=!a((function(){var s=function(){}.bind();return"function"!=typeof s||s.hasOwnProperty("prototype")}))},41799:(s,o,i)=>{var a=i(37217),u=i(60270);s.exports=function baseIsMatch(s,o,i,_){var w=i.length,x=w,C=!_;if(null==s)return!x;for(s=Object(s);w--;){var j=i[w];if(C&&j[2]?j[1]!==s[j[0]]:!(j[0]in s))return!1}for(;++w<x;){var L=(j=i[w])[0],B=s[L],$=j[1];if(C&&j[2]){if(void 0===B&&!(L in s))return!1}else{var U=new a;if(_)var V=_(B,$,L,s,o,U);if(!(void 0===V?u($,B,3,_,U):V))return!1}}return!0}},41859:(s,o,i)=>{const a=i(27096),u=i(78004),_=a.types;s.exports=class RandExp{constructor(s,o){if(this._setDefaults(s),s instanceof RegExp)this.ignoreCase=s.ignoreCase,this.multiline=s.multiline,s=s.source;else{if("string"!=typeof s)throw new Error("Expected a regexp or string");this.ignoreCase=o&&-1!==o.indexOf("i"),this.multiline=o&&-1!==o.indexOf("m")}this.tokens=a(s)}_setDefaults(s){this.max=null!=s.max?s.max:null!=RandExp.prototype.max?RandExp.prototype.max:100,this.defaultRange=s.defaultRange?s.defaultRange:this.defaultRange.clone(),s.randInt&&(this.randInt=s.randInt)}gen(){return this._gen(this.tokens,[])}_gen(s,o){var i,a,u,w,x;switch(s.type){case _.ROOT:case _.GROUP:if(s.followedBy||s.notFollowedBy)return"";for(s.remember&&void 0===s.groupNumber&&(s.groupNumber=o.push(null)-1),a="",w=0,x=(i=s.options?this._randSelect(s.options):s.stack).length;w<x;w++)a+=this._gen(i[w],o);return s.remember&&(o[s.groupNumber]=a),a;case _.POSITION:return"";case _.SET:var C=this._expand(s);return C.length?String.fromCharCode(this._randSelect(C)):"";case _.REPETITION:for(u=this.randInt(s.min,s.max===1/0?s.min+this.max:s.max),a="",w=0;w<u;w++)a+=this._gen(s.value,o);return a;case _.REFERENCE:return o[s.value-1]||"";case _.CHAR:var j=this.ignoreCase&&this._randBool()?this._toOtherCase(s.value):s.value;return String.fromCharCode(j)}}_toOtherCase(s){return s+(97<=s&&s<=122?-32:65<=s&&s<=90?32:0)}_randBool(){return!this.randInt(0,1)}_randSelect(s){return s instanceof u?s.index(this.randInt(0,s.length-1)):s[this.randInt(0,s.length-1)]}_expand(s){if(s.type===a.types.CHAR)return new u(s.value);if(s.type===a.types.RANGE)return new u(s.from,s.to);{let o=new u;for(let i=0;i<s.set.length;i++){let a=this._expand(s.set[i]);if(o.add(a),this.ignoreCase)for(let s=0;s<a.length;s++){let i=a.index(s),u=this._toOtherCase(i);i!==u&&o.add(u)}}return s.not?this.defaultRange.clone().subtract(o):this.defaultRange.clone().intersect(o)}}randInt(s,o){return s+Math.floor(Math.random()*(1+o-s))}get defaultRange(){return this._range=this._range||new u(32,126)}set defaultRange(s){this._range=s}static randexp(s,o){var i;return"string"==typeof s&&(s=new RegExp(s,o)),void 0===s._randexp?(i=new RandExp(s,o),s._randexp=i):(i=s._randexp)._setDefaults(s),i.gen()}static sugar(){RegExp.prototype.gen=function(){return RandExp.randexp(this)}}}},42054:s=>{var o="\\ud800-\\udfff",i="["+o+"]",a="[\\u0300-\\u036f\\ufe20-\\ufe2f\\u20d0-\\u20ff]",u="\\ud83c[\\udffb-\\udfff]",_="[^"+o+"]",w="(?:\\ud83c[\\udde6-\\uddff]){2}",x="[\\ud800-\\udbff][\\udc00-\\udfff]",C="(?:"+a+"|"+u+")"+"?",j="[\\ufe0e\\ufe0f]?",L=j+C+("(?:\\u200d(?:"+[_,w,x].join("|")+")"+j+C+")*"),B="(?:"+[_+a+"?",a,w,x,i].join("|")+")",$=RegExp(u+"(?="+u+")|"+B+L,"g");s.exports=function unicodeToArray(s){return s.match($)||[]}},42072:(s,o,i)=>{var a=i(34932),u=i(23007),_=i(56449),w=i(44394),x=i(61802),C=i(77797),j=i(13222);s.exports=function toPath(s){return _(s)?a(s,C):w(s)?[s]:u(x(j(s)))}},42156:s=>{"use strict";s.exports=function(){}},42220:(s,o,i)=>{"use strict";var a=i(39447),u=i(58661),_=i(74284),w=i(36624),x=i(4993),C=i(2875);o.f=a&&!u?Object.defineProperties:function defineProperties(s,o){w(s);for(var i,a=x(o),u=C(o),j=u.length,L=0;j>L;)_.f(s,i=u[L++],a[i]);return s}},42426:(s,o,i)=>{var a=i(14248),u=i(15389),_=i(90916),w=i(56449),x=i(36800);s.exports=function some(s,o,i){var C=w(s)?a:_;return i&&x(s,o,i)&&(o=void 0),C(s,u(o,3))}},42824:(s,o,i)=>{var a=i(87805),u=i(93290),_=i(71961),w=i(23007),x=i(35529),C=i(72428),j=i(56449),L=i(83693),B=i(3656),$=i(1882),U=i(23805),V=i(11331),z=i(37167),Y=i(14974),Z=i(69884);s.exports=function baseMergeDeep(s,o,i,ee,ie,ae,ce){var le=Y(s,i),pe=Y(o,i),de=ce.get(pe);if(de)a(s,i,de);else{var fe=ae?ae(le,pe,i+"",s,o,ce):void 0,ye=void 0===fe;if(ye){var be=j(pe),_e=!be&&B(pe),Se=!be&&!_e&&z(pe);fe=pe,be||_e||Se?j(le)?fe=le:L(le)?fe=w(le):_e?(ye=!1,fe=u(pe,!0)):Se?(ye=!1,fe=_(pe,!0)):fe=[]:V(pe)||C(pe)?(fe=le,C(le)?fe=Z(le):U(le)&&!$(le)||(fe=x(pe))):ye=!1}ye&&(ce.set(pe,fe),ie(fe,pe,ee,ae,ce),ce.delete(pe)),a(s,i,fe)}}},43360:(s,o,i)=>{var a=i(93243);s.exports=function baseAssignValue(s,o,i){"__proto__"==o&&a?a(s,o,{configurable:!0,enumerable:!0,value:i,writable:!0}):s[o]=i}},43768:(s,o,i)=>{"use strict";var a=i(45981),u=i(85587);o.highlight=highlight,o.highlightAuto=function highlightAuto(s,o){var i,w,x,C,j=o||{},L=j.subset||a.listLanguages(),B=j.prefix,$=L.length,U=-1;null==B&&(B=_);if("string"!=typeof s)throw u("Expected `string` for value, got `%s`",s);w={relevance:0,language:null,value:[]},i={relevance:0,language:null,value:[]};for(;++U<$;)C=L[U],a.getLanguage(C)&&((x=highlight(C,s,o)).language=C,x.relevance>w.relevance&&(w=x),x.relevance>i.relevance&&(w=i,i=x));w.language&&(i.secondBest=w);return i},o.registerLanguage=function registerLanguage(s,o){a.registerLanguage(s,o)},o.listLanguages=function listLanguages(){return a.listLanguages()},o.registerAlias=function registerAlias(s,o){var i,u=s;o&&((u={})[s]=o);for(i in u)a.registerAliases(u[i],{languageName:i})},Emitter.prototype.addText=function text(s){var o,i,a=this.stack;if(""===s)return;o=a[a.length-1],(i=o.children[o.children.length-1])&&"text"===i.type?i.value+=s:o.children.push({type:"text",value:s})},Emitter.prototype.addKeyword=function addKeyword(s,o){this.openNode(o),this.addText(s),this.closeNode()},Emitter.prototype.addSublanguage=function addSublanguage(s,o){var i=this.stack,a=i[i.length-1],u=s.rootNode.children,_=o?{type:"element",tagName:"span",properties:{className:[o]},children:u}:u;a.children=a.children.concat(_)},Emitter.prototype.openNode=function open(s){var o=this.stack,i=this.options.classPrefix+s,a=o[o.length-1],u={type:"element",tagName:"span",properties:{className:[i]},children:[]};a.children.push(u),o.push(u)},Emitter.prototype.closeNode=function close(){this.stack.pop()},Emitter.prototype.closeAllNodes=noop,Emitter.prototype.finalize=noop,Emitter.prototype.toHTML=function toHtmlNoop(){return""};var _="hljs-";function highlight(s,o,i){var w,x=a.configure({}),C=(i||{}).prefix;if("string"!=typeof s)throw u("Expected `string` for name, got `%s`",s);if(!a.getLanguage(s))throw u("Unknown language: `%s` is not registered",s);if("string"!=typeof o)throw u("Expected `string` for value, got `%s`",o);if(null==C&&(C=_),a.configure({__emitter:Emitter,classPrefix:C}),w=a.highlight(o,{language:s,ignoreIllegals:!0}),a.configure(x||{}),w.errorRaised)throw w.errorRaised;return{relevance:w.relevance,language:w.language,value:w.emitter.rootNode.children}}function Emitter(s){this.options=s,this.rootNode={children:[]},this.stack=[this.rootNode]}function noop(){}},43838:(s,o,i)=>{var a=i(21791),u=i(37241);s.exports=function baseAssignIn(s,o){return s&&a(o,u(o),s)}},44394:(s,o,i)=>{var a=i(72552),u=i(40346);s.exports=function isSymbol(s){return"symbol"==typeof s||u(s)&&"[object Symbol]"==a(s)}},44673:(s,o,i)=>{"use strict";var a=i(1907),u=i(82159),_=i(46285),w=i(49724),x=i(93427),C=i(41505),j=Function,L=a([].concat),B=a([].join),$={};s.exports=C?j.bind:function bind(s){var o=u(this),i=o.prototype,a=x(arguments,1),C=function bound(){var i=L(a,x(arguments));return this instanceof C?function(s,o,i){if(!w($,o)){for(var a=[],u=0;u<o;u++)a[u]="a["+u+"]";$[o]=j("C,a","return new C("+B(a,",")+")")}return $[o](s,i)}(o,i.length,i):o.apply(s,i)};return _(i)&&(C.prototype=i),C}},45083:(s,o,i)=>{var a=i(1882),u=i(87296),_=i(23805),w=i(47473),x=/^\[object .+?Constructor\]$/,C=Function.prototype,j=Object.prototype,L=C.toString,B=j.hasOwnProperty,$=RegExp("^"+L.call(B).replace(/[\\^$.*+?()[\]{}|]/g,"\\$&").replace(/hasOwnProperty|(function).*?(?=\\\()| for .+?(?=\\\])/g,"$1.*?")+"$");s.exports=function baseIsNative(s){return!(!_(s)||u(s))&&(a(s)?$:x).test(w(s))}},45412:(s,o,i)=>{"use strict";var a,u=i(65606);s.exports=Readable,Readable.ReadableState=ReadableState;i(37007).EventEmitter;var _=function EElistenerCount(s,o){return s.listeners(o).length},w=i(40345),x=i(48287).Buffer,C=(void 0!==i.g?i.g:"undefined"!=typeof window?window:"undefined"!=typeof self?self:{}).Uint8Array||function(){};var j,L=i(79838);j=L&&L.debuglog?L.debuglog("stream"):function debug(){};var B,$,U,V=i(80345),z=i(75896),Y=i(65291).getHighWaterMark,Z=i(86048).F,ee=Z.ERR_INVALID_ARG_TYPE,ie=Z.ERR_STREAM_PUSH_AFTER_EOF,ae=Z.ERR_METHOD_NOT_IMPLEMENTED,ce=Z.ERR_STREAM_UNSHIFT_AFTER_END_EVENT;i(56698)(Readable,w);var le=z.errorOrDestroy,pe=["error","close","destroy","pause","resume"];function ReadableState(s,o,u){a=a||i(25382),s=s||{},"boolean"!=typeof u&&(u=o instanceof a),this.objectMode=!!s.objectMode,u&&(this.objectMode=this.objectMode||!!s.readableObjectMode),this.highWaterMark=Y(this,s,"readableHighWaterMark",u),this.buffer=new V,this.length=0,this.pipes=null,this.pipesCount=0,this.flowing=null,this.ended=!1,this.endEmitted=!1,this.reading=!1,this.sync=!0,this.needReadable=!1,this.emittedReadable=!1,this.readableListening=!1,this.resumeScheduled=!1,this.paused=!0,this.emitClose=!1!==s.emitClose,this.autoDestroy=!!s.autoDestroy,this.destroyed=!1,this.defaultEncoding=s.defaultEncoding||"utf8",this.awaitDrain=0,this.readingMore=!1,this.decoder=null,this.encoding=null,s.encoding&&(B||(B=i(83141).I),this.decoder=new B(s.encoding),this.encoding=s.encoding)}function Readable(s){if(a=a||i(25382),!(this instanceof Readable))return new Readable(s);var o=this instanceof a;this._readableState=new ReadableState(s,this,o),this.readable=!0,s&&("function"==typeof s.read&&(this._read=s.read),"function"==typeof s.destroy&&(this._destroy=s.destroy)),w.call(this)}function readableAddChunk(s,o,i,a,u){j("readableAddChunk",o);var _,w=s._readableState;if(null===o)w.reading=!1,function onEofChunk(s,o){if(j("onEofChunk"),o.ended)return;if(o.decoder){var i=o.decoder.end();i&&i.length&&(o.buffer.push(i),o.length+=o.objectMode?1:i.length)}o.ended=!0,o.sync?emitReadable(s):(o.needReadable=!1,o.emittedReadable||(o.emittedReadable=!0,emitReadable_(s)))}(s,w);else if(u||(_=function chunkInvalid(s,o){var i;(function _isUint8Array(s){return x.isBuffer(s)||s instanceof C})(o)||"string"==typeof o||void 0===o||s.objectMode||(i=new ee("chunk",["string","Buffer","Uint8Array"],o));return i}(w,o)),_)le(s,_);else if(w.objectMode||o&&o.length>0)if("string"==typeof o||w.objectMode||Object.getPrototypeOf(o)===x.prototype||(o=function _uint8ArrayToBuffer(s){return x.from(s)}(o)),a)w.endEmitted?le(s,new ce):addChunk(s,w,o,!0);else if(w.ended)le(s,new ie);else{if(w.destroyed)return!1;w.reading=!1,w.decoder&&!i?(o=w.decoder.write(o),w.objectMode||0!==o.length?addChunk(s,w,o,!1):maybeReadMore(s,w)):addChunk(s,w,o,!1)}else a||(w.reading=!1,maybeReadMore(s,w));return!w.ended&&(w.length<w.highWaterMark||0===w.length)}function addChunk(s,o,i,a){o.flowing&&0===o.length&&!o.sync?(o.awaitDrain=0,s.emit("data",i)):(o.length+=o.objectMode?1:i.length,a?o.buffer.unshift(i):o.buffer.push(i),o.needReadable&&emitReadable(s)),maybeReadMore(s,o)}Object.defineProperty(Readable.prototype,"destroyed",{enumerable:!1,get:function get(){return void 0!==this._readableState&&this._readableState.destroyed},set:function set(s){this._readableState&&(this._readableState.destroyed=s)}}),Readable.prototype.destroy=z.destroy,Readable.prototype._undestroy=z.undestroy,Readable.prototype._destroy=function(s,o){o(s)},Readable.prototype.push=function(s,o){var i,a=this._readableState;return a.objectMode?i=!0:"string"==typeof s&&((o=o||a.defaultEncoding)!==a.encoding&&(s=x.from(s,o),o=""),i=!0),readableAddChunk(this,s,o,!1,i)},Readable.prototype.unshift=function(s){return readableAddChunk(this,s,null,!0,!1)},Readable.prototype.isPaused=function(){return!1===this._readableState.flowing},Readable.prototype.setEncoding=function(s){B||(B=i(83141).I);var o=new B(s);this._readableState.decoder=o,this._readableState.encoding=this._readableState.decoder.encoding;for(var a=this._readableState.buffer.head,u="";null!==a;)u+=o.write(a.data),a=a.next;return this._readableState.buffer.clear(),""!==u&&this._readableState.buffer.push(u),this._readableState.length=u.length,this};var de=1073741824;function howMuchToRead(s,o){return s<=0||0===o.length&&o.ended?0:o.objectMode?1:s!=s?o.flowing&&o.length?o.buffer.head.data.length:o.length:(s>o.highWaterMark&&(o.highWaterMark=function computeNewHighWaterMark(s){return s>=de?s=de:(s--,s|=s>>>1,s|=s>>>2,s|=s>>>4,s|=s>>>8,s|=s>>>16,s++),s}(s)),s<=o.length?s:o.ended?o.length:(o.needReadable=!0,0))}function emitReadable(s){var o=s._readableState;j("emitReadable",o.needReadable,o.emittedReadable),o.needReadable=!1,o.emittedReadable||(j("emitReadable",o.flowing),o.emittedReadable=!0,u.nextTick(emitReadable_,s))}function emitReadable_(s){var o=s._readableState;j("emitReadable_",o.destroyed,o.length,o.ended),o.destroyed||!o.length&&!o.ended||(s.emit("readable"),o.emittedReadable=!1),o.needReadable=!o.flowing&&!o.ended&&o.length<=o.highWaterMark,flow(s)}function maybeReadMore(s,o){o.readingMore||(o.readingMore=!0,u.nextTick(maybeReadMore_,s,o))}function maybeReadMore_(s,o){for(;!o.reading&&!o.ended&&(o.length<o.highWaterMark||o.flowing&&0===o.length);){var i=o.length;if(j("maybeReadMore read 0"),s.read(0),i===o.length)break}o.readingMore=!1}function updateReadableListening(s){var o=s._readableState;o.readableListening=s.listenerCount("readable")>0,o.resumeScheduled&&!o.paused?o.flowing=!0:s.listenerCount("data")>0&&s.resume()}function nReadingNextTick(s){j("readable nexttick read 0"),s.read(0)}function resume_(s,o){j("resume",o.reading),o.reading||s.read(0),o.resumeScheduled=!1,s.emit("resume"),flow(s),o.flowing&&!o.reading&&s.read(0)}function flow(s){var o=s._readableState;for(j("flow",o.flowing);o.flowing&&null!==s.read(););}function fromList(s,o){return 0===o.length?null:(o.objectMode?i=o.buffer.shift():!s||s>=o.length?(i=o.decoder?o.buffer.join(""):1===o.buffer.length?o.buffer.first():o.buffer.concat(o.length),o.buffer.clear()):i=o.buffer.consume(s,o.decoder),i);var i}function endReadable(s){var o=s._readableState;j("endReadable",o.endEmitted),o.endEmitted||(o.ended=!0,u.nextTick(endReadableNT,o,s))}function endReadableNT(s,o){if(j("endReadableNT",s.endEmitted,s.length),!s.endEmitted&&0===s.length&&(s.endEmitted=!0,o.readable=!1,o.emit("end"),s.autoDestroy)){var i=o._writableState;(!i||i.autoDestroy&&i.finished)&&o.destroy()}}function indexOf(s,o){for(var i=0,a=s.length;i<a;i++)if(s[i]===o)return i;return-1}Readable.prototype.read=function(s){j("read",s),s=parseInt(s,10);var o=this._readableState,i=s;if(0!==s&&(o.emittedReadable=!1),0===s&&o.needReadable&&((0!==o.highWaterMark?o.length>=o.highWaterMark:o.length>0)||o.ended))return j("read: emitReadable",o.length,o.ended),0===o.length&&o.ended?endReadable(this):emitReadable(this),null;if(0===(s=howMuchToRead(s,o))&&o.ended)return 0===o.length&&endReadable(this),null;var a,u=o.needReadable;return j("need readable",u),(0===o.length||o.length-s<o.highWaterMark)&&j("length less than watermark",u=!0),o.ended||o.reading?j("reading or ended",u=!1):u&&(j("do read"),o.reading=!0,o.sync=!0,0===o.length&&(o.needReadable=!0),this._read(o.highWaterMark),o.sync=!1,o.reading||(s=howMuchToRead(i,o))),null===(a=s>0?fromList(s,o):null)?(o.needReadable=o.length<=o.highWaterMark,s=0):(o.length-=s,o.awaitDrain=0),0===o.length&&(o.ended||(o.needReadable=!0),i!==s&&o.ended&&endReadable(this)),null!==a&&this.emit("data",a),a},Readable.prototype._read=function(s){le(this,new ae("_read()"))},Readable.prototype.pipe=function(s,o){var i=this,a=this._readableState;switch(a.pipesCount){case 0:a.pipes=s;break;case 1:a.pipes=[a.pipes,s];break;default:a.pipes.push(s)}a.pipesCount+=1,j("pipe count=%d opts=%j",a.pipesCount,o);var w=(!o||!1!==o.end)&&s!==u.stdout&&s!==u.stderr?onend:unpipe;function onunpipe(o,u){j("onunpipe"),o===i&&u&&!1===u.hasUnpiped&&(u.hasUnpiped=!0,function cleanup(){j("cleanup"),s.removeListener("close",onclose),s.removeListener("finish",onfinish),s.removeListener("drain",x),s.removeListener("error",onerror),s.removeListener("unpipe",onunpipe),i.removeListener("end",onend),i.removeListener("end",unpipe),i.removeListener("data",ondata),C=!0,!a.awaitDrain||s._writableState&&!s._writableState.needDrain||x()}())}function onend(){j("onend"),s.end()}a.endEmitted?u.nextTick(w):i.once("end",w),s.on("unpipe",onunpipe);var x=function pipeOnDrain(s){return function pipeOnDrainFunctionResult(){var o=s._readableState;j("pipeOnDrain",o.awaitDrain),o.awaitDrain&&o.awaitDrain--,0===o.awaitDrain&&_(s,"data")&&(o.flowing=!0,flow(s))}}(i);s.on("drain",x);var C=!1;function ondata(o){j("ondata");var u=s.write(o);j("dest.write",u),!1===u&&((1===a.pipesCount&&a.pipes===s||a.pipesCount>1&&-1!==indexOf(a.pipes,s))&&!C&&(j("false write response, pause",a.awaitDrain),a.awaitDrain++),i.pause())}function onerror(o){j("onerror",o),unpipe(),s.removeListener("error",onerror),0===_(s,"error")&&le(s,o)}function onclose(){s.removeListener("finish",onfinish),unpipe()}function onfinish(){j("onfinish"),s.removeListener("close",onclose),unpipe()}function unpipe(){j("unpipe"),i.unpipe(s)}return i.on("data",ondata),function prependListener(s,o,i){if("function"==typeof s.prependListener)return s.prependListener(o,i);s._events&&s._events[o]?Array.isArray(s._events[o])?s._events[o].unshift(i):s._events[o]=[i,s._events[o]]:s.on(o,i)}(s,"error",onerror),s.once("close",onclose),s.once("finish",onfinish),s.emit("pipe",i),a.flowing||(j("pipe resume"),i.resume()),s},Readable.prototype.unpipe=function(s){var o=this._readableState,i={hasUnpiped:!1};if(0===o.pipesCount)return this;if(1===o.pipesCount)return s&&s!==o.pipes||(s||(s=o.pipes),o.pipes=null,o.pipesCount=0,o.flowing=!1,s&&s.emit("unpipe",this,i)),this;if(!s){var a=o.pipes,u=o.pipesCount;o.pipes=null,o.pipesCount=0,o.flowing=!1;for(var _=0;_<u;_++)a[_].emit("unpipe",this,{hasUnpiped:!1});return this}var w=indexOf(o.pipes,s);return-1===w||(o.pipes.splice(w,1),o.pipesCount-=1,1===o.pipesCount&&(o.pipes=o.pipes[0]),s.emit("unpipe",this,i)),this},Readable.prototype.on=function(s,o){var i=w.prototype.on.call(this,s,o),a=this._readableState;return"data"===s?(a.readableListening=this.listenerCount("readable")>0,!1!==a.flowing&&this.resume()):"readable"===s&&(a.endEmitted||a.readableListening||(a.readableListening=a.needReadable=!0,a.flowing=!1,a.emittedReadable=!1,j("on readable",a.length,a.reading),a.length?emitReadable(this):a.reading||u.nextTick(nReadingNextTick,this))),i},Readable.prototype.addListener=Readable.prototype.on,Readable.prototype.removeListener=function(s,o){var i=w.prototype.removeListener.call(this,s,o);return"readable"===s&&u.nextTick(updateReadableListening,this),i},Readable.prototype.removeAllListeners=function(s){var o=w.prototype.removeAllListeners.apply(this,arguments);return"readable"!==s&&void 0!==s||u.nextTick(updateReadableListening,this),o},Readable.prototype.resume=function(){var s=this._readableState;return s.flowing||(j("resume"),s.flowing=!s.readableListening,function resume(s,o){o.resumeScheduled||(o.resumeScheduled=!0,u.nextTick(resume_,s,o))}(this,s)),s.paused=!1,this},Readable.prototype.pause=function(){return j("call pause flowing=%j",this._readableState.flowing),!1!==this._readableState.flowing&&(j("pause"),this._readableState.flowing=!1,this.emit("pause")),this._readableState.paused=!0,this},Readable.prototype.wrap=function(s){var o=this,i=this._readableState,a=!1;for(var u in s.on("end",(function(){if(j("wrapped end"),i.decoder&&!i.ended){var s=i.decoder.end();s&&s.length&&o.push(s)}o.push(null)})),s.on("data",(function(u){(j("wrapped data"),i.decoder&&(u=i.decoder.write(u)),i.objectMode&&null==u)||(i.objectMode||u&&u.length)&&(o.push(u)||(a=!0,s.pause()))})),s)void 0===this[u]&&"function"==typeof s[u]&&(this[u]=function methodWrap(o){return function methodWrapReturnFunction(){return s[o].apply(s,arguments)}}(u));for(var _=0;_<pe.length;_++)s.on(pe[_],this.emit.bind(this,pe[_]));return this._read=function(o){j("wrapped _read",o),a&&(a=!1,s.resume())},this},"function"==typeof Symbol&&(Readable.prototype[Symbol.asyncIterator]=function(){return void 0===$&&($=i(2955)),$(this)}),Object.defineProperty(Readable.prototype,"readableHighWaterMark",{enumerable:!1,get:function get(){return this._readableState.highWaterMark}}),Object.defineProperty(Readable.prototype,"readableBuffer",{enumerable:!1,get:function get(){return this._readableState&&this._readableState.buffer}}),Object.defineProperty(Readable.prototype,"readableFlowing",{enumerable:!1,get:function get(){return this._readableState.flowing},set:function set(s){this._readableState&&(this._readableState.flowing=s)}}),Readable._fromList=fromList,Object.defineProperty(Readable.prototype,"readableLength",{enumerable:!1,get:function get(){return this._readableState.length}}),"function"==typeof Symbol&&(Readable.from=function(s,o){return void 0===U&&(U=i(55157)),U(Readable,s,o)})},45434:s=>{var o=/[a-z][A-Z]|[A-Z]{2}[a-z]|[0-9][a-zA-Z]|[a-zA-Z][0-9]|[^a-zA-Z0-9 ]/;s.exports=function hasUnicodeWord(s){return o.test(s)}},45539:(s,o,i)=>{var a=i(40882),u=i(50828),_=i(66645),w=RegExp("['’]","g");s.exports=function createCompounder(s){return function(o){return a(_(u(o).replace(w,"")),s,"")}}},45807:(s,o,i)=>{"use strict";var a=i(1907),u=a({}.toString),_=a("".slice);s.exports=function(s){return _(u(s),8,-1)}},45891:(s,o,i)=>{var a=i(51873),u=i(72428),_=i(56449),w=a?a.isConcatSpreadable:void 0;s.exports=function isFlattenable(s){return _(s)||u(s)||!!(w&&s&&s[w])}},45951:function(s,o,i){"use strict";var check=function(s){return s&&s.Math===Math&&s};s.exports=check("object"==typeof globalThis&&globalThis)||check("object"==typeof window&&window)||check("object"==typeof self&&self)||check("object"==typeof i.g&&i.g)||check("object"==typeof this&&this)||function(){return this}()||Function("return this")()},45981:s=>{function deepFreeze(s){return s instanceof Map?s.clear=s.delete=s.set=function(){throw new Error("map is read-only")}:s instanceof Set&&(s.add=s.clear=s.delete=function(){throw new Error("set is read-only")}),Object.freeze(s),Object.getOwnPropertyNames(s).forEach((function(o){var i=s[o];"object"!=typeof i||Object.isFrozen(i)||deepFreeze(i)})),s}var o=deepFreeze,i=deepFreeze;o.default=i;class Response{constructor(s){void 0===s.data&&(s.data={}),this.data=s.data,this.isMatchIgnored=!1}ignoreMatch(){this.isMatchIgnored=!0}}function escapeHTML(s){return s.replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;").replace(/'/g,"&#x27;")}function inherit(s,...o){const i=Object.create(null);for(const o in s)i[o]=s[o];return o.forEach((function(s){for(const o in s)i[o]=s[o]})),i}const emitsWrappingTags=s=>!!s.kind;class HTMLRenderer{constructor(s,o){this.buffer="",this.classPrefix=o.classPrefix,s.walk(this)}addText(s){this.buffer+=escapeHTML(s)}openNode(s){if(!emitsWrappingTags(s))return;let o=s.kind;s.sublanguage||(o=`${this.classPrefix}${o}`),this.span(o)}closeNode(s){emitsWrappingTags(s)&&(this.buffer+="</span>")}value(){return this.buffer}span(s){this.buffer+=`<span class="${s}">`}}class TokenTree{constructor(){this.rootNode={children:[]},this.stack=[this.rootNode]}get top(){return this.stack[this.stack.length-1]}get root(){return this.rootNode}add(s){this.top.children.push(s)}openNode(s){const o={kind:s,children:[]};this.add(o),this.stack.push(o)}closeNode(){if(this.stack.length>1)return this.stack.pop()}closeAllNodes(){for(;this.closeNode(););}toJSON(){return JSON.stringify(this.rootNode,null,4)}walk(s){return this.constructor._walk(s,this.rootNode)}static _walk(s,o){return"string"==typeof o?s.addText(o):o.children&&(s.openNode(o),o.children.forEach((o=>this._walk(s,o))),s.closeNode(o)),s}static _collapse(s){"string"!=typeof s&&s.children&&(s.children.every((s=>"string"==typeof s))?s.children=[s.children.join("")]:s.children.forEach((s=>{TokenTree._collapse(s)})))}}class TokenTreeEmitter extends TokenTree{constructor(s){super(),this.options=s}addKeyword(s,o){""!==s&&(this.openNode(o),this.addText(s),this.closeNode())}addText(s){""!==s&&this.add(s)}addSublanguage(s,o){const i=s.root;i.kind=o,i.sublanguage=!0,this.add(i)}toHTML(){return new HTMLRenderer(this,this.options).value()}finalize(){return!0}}function source(s){return s?"string"==typeof s?s:s.source:null}const a=/\[(?:[^\\\]]|\\.)*\]|\(\??|\\([1-9][0-9]*)|\\./;const u="[a-zA-Z]\\w*",_="[a-zA-Z_]\\w*",w="\\b\\d+(\\.\\d+)?",x="(-?)(\\b0[xX][a-fA-F0-9]+|(\\b\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)",C="\\b(0b[01]+)",j={begin:"\\\\[\\s\\S]",relevance:0},L={className:"string",begin:"'",end:"'",illegal:"\\n",contains:[j]},B={className:"string",begin:'"',end:'"',illegal:"\\n",contains:[j]},$={begin:/\b(a|an|the|are|I'm|isn't|don't|doesn't|won't|but|just|should|pretty|simply|enough|gonna|going|wtf|so|such|will|you|your|they|like|more)\b/},COMMENT=function(s,o,i={}){const a=inherit({className:"comment",begin:s,end:o,contains:[]},i);return a.contains.push($),a.contains.push({className:"doctag",begin:"(?:TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):",relevance:0}),a},U=COMMENT("//","$"),V=COMMENT("/\\*","\\*/"),z=COMMENT("#","$"),Y={className:"number",begin:w,relevance:0},Z={className:"number",begin:x,relevance:0},ee={className:"number",begin:C,relevance:0},ie={className:"number",begin:w+"(%|em|ex|ch|rem|vw|vh|vmin|vmax|cm|mm|in|pt|pc|px|deg|grad|rad|turn|s|ms|Hz|kHz|dpi|dpcm|dppx)?",relevance:0},ae={begin:/(?=\/[^/\n]*\/)/,contains:[{className:"regexp",begin:/\//,end:/\/[gimuy]*/,illegal:/\n/,contains:[j,{begin:/\[/,end:/\]/,relevance:0,contains:[j]}]}]},ce={className:"title",begin:u,relevance:0},le={className:"title",begin:_,relevance:0},pe={begin:"\\.\\s*"+_,relevance:0};var de=Object.freeze({__proto__:null,MATCH_NOTHING_RE:/\b\B/,IDENT_RE:u,UNDERSCORE_IDENT_RE:_,NUMBER_RE:w,C_NUMBER_RE:x,BINARY_NUMBER_RE:C,RE_STARTERS_RE:"!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|-|-=|/=|/|:|;|<<|<<=|<=|<|===|==|=|>>>=|>>=|>=|>>>|>>|>|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~",SHEBANG:(s={})=>{const o=/^#![ ]*\//;return s.binary&&(s.begin=function concat(...s){return s.map((s=>source(s))).join("")}(o,/.*\b/,s.binary,/\b.*/)),inherit({className:"meta",begin:o,end:/$/,relevance:0,"on:begin":(s,o)=>{0!==s.index&&o.ignoreMatch()}},s)},BACKSLASH_ESCAPE:j,APOS_STRING_MODE:L,QUOTE_STRING_MODE:B,PHRASAL_WORDS_MODE:$,COMMENT,C_LINE_COMMENT_MODE:U,C_BLOCK_COMMENT_MODE:V,HASH_COMMENT_MODE:z,NUMBER_MODE:Y,C_NUMBER_MODE:Z,BINARY_NUMBER_MODE:ee,CSS_NUMBER_MODE:ie,REGEXP_MODE:ae,TITLE_MODE:ce,UNDERSCORE_TITLE_MODE:le,METHOD_GUARD:pe,END_SAME_AS_BEGIN:function(s){return Object.assign(s,{"on:begin":(s,o)=>{o.data._beginMatch=s[1]},"on:end":(s,o)=>{o.data._beginMatch!==s[1]&&o.ignoreMatch()}})}});function skipIfhasPrecedingDot(s,o){"."===s.input[s.index-1]&&o.ignoreMatch()}function beginKeywords(s,o){o&&s.beginKeywords&&(s.begin="\\b("+s.beginKeywords.split(" ").join("|")+")(?!\\.)(?=\\b|\\s)",s.__beforeBegin=skipIfhasPrecedingDot,s.keywords=s.keywords||s.beginKeywords,delete s.beginKeywords,void 0===s.relevance&&(s.relevance=0))}function compileIllegal(s,o){Array.isArray(s.illegal)&&(s.illegal=function either(...s){return"("+s.map((s=>source(s))).join("|")+")"}(...s.illegal))}function compileMatch(s,o){if(s.match){if(s.begin||s.end)throw new Error("begin & end are not supported with match");s.begin=s.match,delete s.match}}function compileRelevance(s,o){void 0===s.relevance&&(s.relevance=1)}const fe=["of","and","for","in","not","or","if","then","parent","list","value"];function compileKeywords(s,o,i="keyword"){const a={};return"string"==typeof s?compileList(i,s.split(" ")):Array.isArray(s)?compileList(i,s):Object.keys(s).forEach((function(i){Object.assign(a,compileKeywords(s[i],o,i))})),a;function compileList(s,i){o&&(i=i.map((s=>s.toLowerCase()))),i.forEach((function(o){const i=o.split("|");a[i[0]]=[s,scoreForKeyword(i[0],i[1])]}))}}function scoreForKeyword(s,o){return o?Number(o):function commonKeyword(s){return fe.includes(s.toLowerCase())}(s)?0:1}function compileLanguage(s,{plugins:o}){function langRe(o,i){return new RegExp(source(o),"m"+(s.case_insensitive?"i":"")+(i?"g":""))}class MultiRegex{constructor(){this.matchIndexes={},this.regexes=[],this.matchAt=1,this.position=0}addRule(s,o){o.position=this.position++,this.matchIndexes[this.matchAt]=o,this.regexes.push([o,s]),this.matchAt+=function countMatchGroups(s){return new RegExp(s.toString()+"|").exec("").length-1}(s)+1}compile(){0===this.regexes.length&&(this.exec=()=>null);const s=this.regexes.map((s=>s[1]));this.matcherRe=langRe(function join(s,o="|"){let i=0;return s.map((s=>{i+=1;const o=i;let u=source(s),_="";for(;u.length>0;){const s=a.exec(u);if(!s){_+=u;break}_+=u.substring(0,s.index),u=u.substring(s.index+s[0].length),"\\"===s[0][0]&&s[1]?_+="\\"+String(Number(s[1])+o):(_+=s[0],"("===s[0]&&i++)}return _})).map((s=>`(${s})`)).join(o)}(s),!0),this.lastIndex=0}exec(s){this.matcherRe.lastIndex=this.lastIndex;const o=this.matcherRe.exec(s);if(!o)return null;const i=o.findIndex(((s,o)=>o>0&&void 0!==s)),a=this.matchIndexes[i];return o.splice(0,i),Object.assign(o,a)}}class ResumableMultiRegex{constructor(){this.rules=[],this.multiRegexes=[],this.count=0,this.lastIndex=0,this.regexIndex=0}getMatcher(s){if(this.multiRegexes[s])return this.multiRegexes[s];const o=new MultiRegex;return this.rules.slice(s).forEach((([s,i])=>o.addRule(s,i))),o.compile(),this.multiRegexes[s]=o,o}resumingScanAtSamePosition(){return 0!==this.regexIndex}considerAll(){this.regexIndex=0}addRule(s,o){this.rules.push([s,o]),"begin"===o.type&&this.count++}exec(s){const o=this.getMatcher(this.regexIndex);o.lastIndex=this.lastIndex;let i=o.exec(s);if(this.resumingScanAtSamePosition())if(i&&i.index===this.lastIndex);else{const o=this.getMatcher(0);o.lastIndex=this.lastIndex+1,i=o.exec(s)}return i&&(this.regexIndex+=i.position+1,this.regexIndex===this.count&&this.considerAll()),i}}if(s.compilerExtensions||(s.compilerExtensions=[]),s.contains&&s.contains.includes("self"))throw new Error("ERR: contains `self` is not supported at the top-level of a language.  See documentation.");return s.classNameAliases=inherit(s.classNameAliases||{}),function compileMode(o,i){const a=o;if(o.isCompiled)return a;[compileMatch].forEach((s=>s(o,i))),s.compilerExtensions.forEach((s=>s(o,i))),o.__beforeBegin=null,[beginKeywords,compileIllegal,compileRelevance].forEach((s=>s(o,i))),o.isCompiled=!0;let u=null;if("object"==typeof o.keywords&&(u=o.keywords.$pattern,delete o.keywords.$pattern),o.keywords&&(o.keywords=compileKeywords(o.keywords,s.case_insensitive)),o.lexemes&&u)throw new Error("ERR: Prefer `keywords.$pattern` to `mode.lexemes`, BOTH are not allowed. (see mode reference) ");return u=u||o.lexemes||/\w+/,a.keywordPatternRe=langRe(u,!0),i&&(o.begin||(o.begin=/\B|\b/),a.beginRe=langRe(o.begin),o.endSameAsBegin&&(o.end=o.begin),o.end||o.endsWithParent||(o.end=/\B|\b/),o.end&&(a.endRe=langRe(o.end)),a.terminatorEnd=source(o.end)||"",o.endsWithParent&&i.terminatorEnd&&(a.terminatorEnd+=(o.end?"|":"")+i.terminatorEnd)),o.illegal&&(a.illegalRe=langRe(o.illegal)),o.contains||(o.contains=[]),o.contains=[].concat(...o.contains.map((function(s){return function expandOrCloneMode(s){s.variants&&!s.cachedVariants&&(s.cachedVariants=s.variants.map((function(o){return inherit(s,{variants:null},o)})));if(s.cachedVariants)return s.cachedVariants;if(dependencyOnParent(s))return inherit(s,{starts:s.starts?inherit(s.starts):null});if(Object.isFrozen(s))return inherit(s);return s}("self"===s?o:s)}))),o.contains.forEach((function(s){compileMode(s,a)})),o.starts&&compileMode(o.starts,i),a.matcher=function buildModeRegex(s){const o=new ResumableMultiRegex;return s.contains.forEach((s=>o.addRule(s.begin,{rule:s,type:"begin"}))),s.terminatorEnd&&o.addRule(s.terminatorEnd,{type:"end"}),s.illegal&&o.addRule(s.illegal,{type:"illegal"}),o}(a),a}(s)}function dependencyOnParent(s){return!!s&&(s.endsWithParent||dependencyOnParent(s.starts))}function BuildVuePlugin(s){const o={props:["language","code","autodetect"],data:function(){return{detectedLanguage:"",unknownLanguage:!1}},computed:{className(){return this.unknownLanguage?"":"hljs "+this.detectedLanguage},highlighted(){if(!this.autoDetect&&!s.getLanguage(this.language))return console.warn(`The language "${this.language}" you specified could not be found.`),this.unknownLanguage=!0,escapeHTML(this.code);let o={};return this.autoDetect?(o=s.highlightAuto(this.code),this.detectedLanguage=o.language):(o=s.highlight(this.language,this.code,this.ignoreIllegals),this.detectedLanguage=this.language),o.value},autoDetect(){return!this.language||function hasValueOrEmptyAttribute(s){return Boolean(s||""===s)}(this.autodetect)},ignoreIllegals:()=>!0},render(s){return s("pre",{},[s("code",{class:this.className,domProps:{innerHTML:this.highlighted}})])}};return{Component:o,VuePlugin:{install(s){s.component("highlightjs",o)}}}}const ye={"after:highlightElement":({el:s,result:o,text:i})=>{const a=nodeStream(s);if(!a.length)return;const u=document.createElement("div");u.innerHTML=o.value,o.value=function mergeStreams(s,o,i){let a=0,u="";const _=[];function selectStream(){return s.length&&o.length?s[0].offset!==o[0].offset?s[0].offset<o[0].offset?s:o:"start"===o[0].event?s:o:s.length?s:o}function open(s){function attributeString(s){return" "+s.nodeName+'="'+escapeHTML(s.value)+'"'}u+="<"+tag(s)+[].map.call(s.attributes,attributeString).join("")+">"}function close(s){u+="</"+tag(s)+">"}function render(s){("start"===s.event?open:close)(s.node)}for(;s.length||o.length;){let o=selectStream();if(u+=escapeHTML(i.substring(a,o[0].offset)),a=o[0].offset,o===s){_.reverse().forEach(close);do{render(o.splice(0,1)[0]),o=selectStream()}while(o===s&&o.length&&o[0].offset===a);_.reverse().forEach(open)}else"start"===o[0].event?_.push(o[0].node):_.pop(),render(o.splice(0,1)[0])}return u+escapeHTML(i.substr(a))}(a,nodeStream(u),i)}};function tag(s){return s.nodeName.toLowerCase()}function nodeStream(s){const o=[];return function _nodeStream(s,i){for(let a=s.firstChild;a;a=a.nextSibling)3===a.nodeType?i+=a.nodeValue.length:1===a.nodeType&&(o.push({event:"start",offset:i,node:a}),i=_nodeStream(a,i),tag(a).match(/br|hr|img|input/)||o.push({event:"stop",offset:i,node:a}));return i}(s,0),o}const be={},error=s=>{console.error(s)},warn=(s,...o)=>{console.log(`WARN: ${s}`,...o)},deprecated=(s,o)=>{be[`${s}/${o}`]||(console.log(`Deprecated as of ${s}. ${o}`),be[`${s}/${o}`]=!0)},_e=escapeHTML,Se=inherit,we=Symbol("nomatch");var xe=function(s){const i=Object.create(null),a=Object.create(null),u=[];let _=!0;const w=/(^(<[^>]+>|\t|)+|\n)/gm,x="Could not find the language '{}', did you forget to load/include a language module?",C={disableAutodetect:!0,name:"Plain text",contains:[]};let j={noHighlightRe:/^(no-?highlight)$/i,languageDetectRe:/\blang(?:uage)?-([\w-]+)\b/i,classPrefix:"hljs-",tabReplace:null,useBR:!1,languages:null,__emitter:TokenTreeEmitter};function shouldNotHighlight(s){return j.noHighlightRe.test(s)}function highlight(s,o,i,a){let u="",_="";"object"==typeof o?(u=s,i=o.ignoreIllegals,_=o.language,a=void 0):(deprecated("10.7.0","highlight(lang, code, ...args) has been deprecated."),deprecated("10.7.0","Please use highlight(code, options) instead.\nhttps://github.com/highlightjs/highlight.js/issues/2277"),_=s,u=o);const w={code:u,language:_};fire("before:highlight",w);const x=w.result?w.result:_highlight(w.language,w.code,i,a);return x.code=w.code,fire("after:highlight",x),x}function _highlight(s,o,a,w){function keywordData(s,o){const i=L.case_insensitive?o[0].toLowerCase():o[0];return Object.prototype.hasOwnProperty.call(s.keywords,i)&&s.keywords[i]}function processBuffer(){null!=U.subLanguage?function processSubLanguage(){if(""===Y)return;let s=null;if("string"==typeof U.subLanguage){if(!i[U.subLanguage])return void z.addText(Y);s=_highlight(U.subLanguage,Y,!0,V[U.subLanguage]),V[U.subLanguage]=s.top}else s=highlightAuto(Y,U.subLanguage.length?U.subLanguage:null);U.relevance>0&&(Z+=s.relevance),z.addSublanguage(s.emitter,s.language)}():function processKeywords(){if(!U.keywords)return void z.addText(Y);let s=0;U.keywordPatternRe.lastIndex=0;let o=U.keywordPatternRe.exec(Y),i="";for(;o;){i+=Y.substring(s,o.index);const a=keywordData(U,o);if(a){const[s,u]=a;if(z.addText(i),i="",Z+=u,s.startsWith("_"))i+=o[0];else{const i=L.classNameAliases[s]||s;z.addKeyword(o[0],i)}}else i+=o[0];s=U.keywordPatternRe.lastIndex,o=U.keywordPatternRe.exec(Y)}i+=Y.substr(s),z.addText(i)}(),Y=""}function startNewMode(s){return s.className&&z.openNode(L.classNameAliases[s.className]||s.className),U=Object.create(s,{parent:{value:U}}),U}function endOfMode(s,o,i){let a=function startsWith(s,o){const i=s&&s.exec(o);return i&&0===i.index}(s.endRe,i);if(a){if(s["on:end"]){const i=new Response(s);s["on:end"](o,i),i.isMatchIgnored&&(a=!1)}if(a){for(;s.endsParent&&s.parent;)s=s.parent;return s}}if(s.endsWithParent)return endOfMode(s.parent,o,i)}function doIgnore(s){return 0===U.matcher.regexIndex?(Y+=s[0],1):(ae=!0,0)}function doBeginMatch(s){const o=s[0],i=s.rule,a=new Response(i),u=[i.__beforeBegin,i["on:begin"]];for(const i of u)if(i&&(i(s,a),a.isMatchIgnored))return doIgnore(o);return i&&i.endSameAsBegin&&(i.endRe=function escape(s){return new RegExp(s.replace(/[-/\\^$*+?.()|[\]{}]/g,"\\$&"),"m")}(o)),i.skip?Y+=o:(i.excludeBegin&&(Y+=o),processBuffer(),i.returnBegin||i.excludeBegin||(Y=o)),startNewMode(i),i.returnBegin?0:o.length}function doEndMatch(s){const i=s[0],a=o.substr(s.index),u=endOfMode(U,s,a);if(!u)return we;const _=U;_.skip?Y+=i:(_.returnEnd||_.excludeEnd||(Y+=i),processBuffer(),_.excludeEnd&&(Y=i));do{U.className&&z.closeNode(),U.skip||U.subLanguage||(Z+=U.relevance),U=U.parent}while(U!==u.parent);return u.starts&&(u.endSameAsBegin&&(u.starts.endRe=u.endRe),startNewMode(u.starts)),_.returnEnd?0:i.length}let C={};function processLexeme(i,u){const w=u&&u[0];if(Y+=i,null==w)return processBuffer(),0;if("begin"===C.type&&"end"===u.type&&C.index===u.index&&""===w){if(Y+=o.slice(u.index,u.index+1),!_){const o=new Error("0 width match regex");throw o.languageName=s,o.badRule=C.rule,o}return 1}if(C=u,"begin"===u.type)return doBeginMatch(u);if("illegal"===u.type&&!a){const s=new Error('Illegal lexeme "'+w+'" for mode "'+(U.className||"<unnamed>")+'"');throw s.mode=U,s}if("end"===u.type){const s=doEndMatch(u);if(s!==we)return s}if("illegal"===u.type&&""===w)return 1;if(ie>1e5&&ie>3*u.index){throw new Error("potential infinite loop, way more iterations than matches")}return Y+=w,w.length}const L=getLanguage(s);if(!L)throw error(x.replace("{}",s)),new Error('Unknown language: "'+s+'"');const B=compileLanguage(L,{plugins:u});let $="",U=w||B;const V={},z=new j.__emitter(j);!function processContinuations(){const s=[];for(let o=U;o!==L;o=o.parent)o.className&&s.unshift(o.className);s.forEach((s=>z.openNode(s)))}();let Y="",Z=0,ee=0,ie=0,ae=!1;try{for(U.matcher.considerAll();;){ie++,ae?ae=!1:U.matcher.considerAll(),U.matcher.lastIndex=ee;const s=U.matcher.exec(o);if(!s)break;const i=processLexeme(o.substring(ee,s.index),s);ee=s.index+i}return processLexeme(o.substr(ee)),z.closeAllNodes(),z.finalize(),$=z.toHTML(),{relevance:Math.floor(Z),value:$,language:s,illegal:!1,emitter:z,top:U}}catch(i){if(i.message&&i.message.includes("Illegal"))return{illegal:!0,illegalBy:{msg:i.message,context:o.slice(ee-100,ee+100),mode:i.mode},sofar:$,relevance:0,value:_e(o),emitter:z};if(_)return{illegal:!1,relevance:0,value:_e(o),emitter:z,language:s,top:U,errorRaised:i};throw i}}function highlightAuto(s,o){o=o||j.languages||Object.keys(i);const a=function justTextHighlightResult(s){const o={relevance:0,emitter:new j.__emitter(j),value:_e(s),illegal:!1,top:C};return o.emitter.addText(s),o}(s),u=o.filter(getLanguage).filter(autoDetection).map((o=>_highlight(o,s,!1)));u.unshift(a);const _=u.sort(((s,o)=>{if(s.relevance!==o.relevance)return o.relevance-s.relevance;if(s.language&&o.language){if(getLanguage(s.language).supersetOf===o.language)return 1;if(getLanguage(o.language).supersetOf===s.language)return-1}return 0})),[w,x]=_,L=w;return L.second_best=x,L}const L={"before:highlightElement":({el:s})=>{j.useBR&&(s.innerHTML=s.innerHTML.replace(/\n/g,"").replace(/<br[ /]*>/g,"\n"))},"after:highlightElement":({result:s})=>{j.useBR&&(s.value=s.value.replace(/\n/g,"<br>"))}},B=/^(<[^>]+>|\t)+/gm,$={"after:highlightElement":({result:s})=>{j.tabReplace&&(s.value=s.value.replace(B,(s=>s.replace(/\t/g,j.tabReplace))))}};function highlightElement(s){let o=null;const i=function blockLanguage(s){let o=s.className+" ";o+=s.parentNode?s.parentNode.className:"";const i=j.languageDetectRe.exec(o);if(i){const o=getLanguage(i[1]);return o||(warn(x.replace("{}",i[1])),warn("Falling back to no-highlight mode for this block.",s)),o?i[1]:"no-highlight"}return o.split(/\s+/).find((s=>shouldNotHighlight(s)||getLanguage(s)))}(s);if(shouldNotHighlight(i))return;fire("before:highlightElement",{el:s,language:i}),o=s;const u=o.textContent,_=i?highlight(u,{language:i,ignoreIllegals:!0}):highlightAuto(u);fire("after:highlightElement",{el:s,result:_,text:u}),s.innerHTML=_.value,function updateClassName(s,o,i){const u=o?a[o]:i;s.classList.add("hljs"),u&&s.classList.add(u)}(s,i,_.language),s.result={language:_.language,re:_.relevance,relavance:_.relevance},_.second_best&&(s.second_best={language:_.second_best.language,re:_.second_best.relevance,relavance:_.second_best.relevance})}const initHighlighting=()=>{if(initHighlighting.called)return;initHighlighting.called=!0,deprecated("10.6.0","initHighlighting() is deprecated.  Use highlightAll() instead.");document.querySelectorAll("pre code").forEach(highlightElement)};let U=!1;function highlightAll(){if("loading"===document.readyState)return void(U=!0);document.querySelectorAll("pre code").forEach(highlightElement)}function getLanguage(s){return s=(s||"").toLowerCase(),i[s]||i[a[s]]}function registerAliases(s,{languageName:o}){"string"==typeof s&&(s=[s]),s.forEach((s=>{a[s.toLowerCase()]=o}))}function autoDetection(s){const o=getLanguage(s);return o&&!o.disableAutodetect}function fire(s,o){const i=s;u.forEach((function(s){s[i]&&s[i](o)}))}"undefined"!=typeof window&&window.addEventListener&&window.addEventListener("DOMContentLoaded",(function boot(){U&&highlightAll()}),!1),Object.assign(s,{highlight,highlightAuto,highlightAll,fixMarkup:function deprecateFixMarkup(s){return deprecated("10.2.0","fixMarkup will be removed entirely in v11.0"),deprecated("10.2.0","Please see https://github.com/highlightjs/highlight.js/issues/2534"),function fixMarkup(s){return j.tabReplace||j.useBR?s.replace(w,(s=>"\n"===s?j.useBR?"<br>":s:j.tabReplace?s.replace(/\t/g,j.tabReplace):s)):s}(s)},highlightElement,highlightBlock:function deprecateHighlightBlock(s){return deprecated("10.7.0","highlightBlock will be removed entirely in v12.0"),deprecated("10.7.0","Please use highlightElement now."),highlightElement(s)},configure:function configure(s){s.useBR&&(deprecated("10.3.0","'useBR' will be removed entirely in v11.0"),deprecated("10.3.0","Please see https://github.com/highlightjs/highlight.js/issues/2559")),j=Se(j,s)},initHighlighting,initHighlightingOnLoad:function initHighlightingOnLoad(){deprecated("10.6.0","initHighlightingOnLoad() is deprecated.  Use highlightAll() instead."),U=!0},registerLanguage:function registerLanguage(o,a){let u=null;try{u=a(s)}catch(s){if(error("Language definition for '{}' could not be registered.".replace("{}",o)),!_)throw s;error(s),u=C}u.name||(u.name=o),i[o]=u,u.rawDefinition=a.bind(null,s),u.aliases&&registerAliases(u.aliases,{languageName:o})},unregisterLanguage:function unregisterLanguage(s){delete i[s];for(const o of Object.keys(a))a[o]===s&&delete a[o]},listLanguages:function listLanguages(){return Object.keys(i)},getLanguage,registerAliases,requireLanguage:function requireLanguage(s){deprecated("10.4.0","requireLanguage will be removed entirely in v11."),deprecated("10.4.0","Please see https://github.com/highlightjs/highlight.js/pull/2844");const o=getLanguage(s);if(o)return o;throw new Error("The '{}' language is required, but not loaded.".replace("{}",s))},autoDetection,inherit:Se,addPlugin:function addPlugin(s){!function upgradePluginAPI(s){s["before:highlightBlock"]&&!s["before:highlightElement"]&&(s["before:highlightElement"]=o=>{s["before:highlightBlock"](Object.assign({block:o.el},o))}),s["after:highlightBlock"]&&!s["after:highlightElement"]&&(s["after:highlightElement"]=o=>{s["after:highlightBlock"](Object.assign({block:o.el},o))})}(s),u.push(s)},vuePlugin:BuildVuePlugin(s).VuePlugin}),s.debugMode=function(){_=!1},s.safeMode=function(){_=!0},s.versionString="10.7.3";for(const s in de)"object"==typeof de[s]&&o(de[s]);return Object.assign(s,de),s.addPlugin(L),s.addPlugin(ye),s.addPlugin($),s}({});s.exports=xe},46028:(s,o,i)=>{"use strict";var a=i(13930),u=i(46285),_=i(25594),w=i(29367),x=i(60581),C=i(76264),j=TypeError,L=C("toPrimitive");s.exports=function(s,o){if(!u(s)||_(s))return s;var i,C=w(s,L);if(C){if(void 0===o&&(o="default"),i=a(C,s,o),!u(i)||_(i))return i;throw new j("Can't convert object to primitive value")}return void 0===o&&(o="number"),x(s,o)}},46076:(s,o,i)=>{"use strict";i(91599);var a=i(68623);s.exports=a},46285:(s,o,i)=>{"use strict";var a=i(62250);s.exports=function(s){return"object"==typeof s?null!==s:a(s)}},46942:(s,o)=>{var i;!function(){"use strict";var a={}.hasOwnProperty;function classNames(){for(var s="",o=0;o<arguments.length;o++){var i=arguments[o];i&&(s=appendClass(s,parseValue(i)))}return s}function parseValue(s){if("string"==typeof s||"number"==typeof s)return s;if("object"!=typeof s)return"";if(Array.isArray(s))return classNames.apply(null,s);if(s.toString!==Object.prototype.toString&&!s.toString.toString().includes("[native code]"))return s.toString();var o="";for(var i in s)a.call(s,i)&&s[i]&&(o=appendClass(o,i));return o}function appendClass(s,o){return o?s?s+" "+o:s+o:s}s.exports?(classNames.default=classNames,s.exports=classNames):void 0===(i=function(){return classNames}.apply(o,[]))||(s.exports=i)}()},47119:s=>{"use strict";s.exports="undefined"!=typeof Reflect&&Reflect&&Reflect.apply},47181:(s,o,i)=>{"use strict";var a=i(95116).IteratorPrototype,u=i(58075),_=i(75817),w=i(14840),x=i(93742),returnThis=function(){return this};s.exports=function(s,o,i,C){var j=o+" Iterator";return s.prototype=u(a,{next:_(+!C,i)}),w(s,j,!1,!0),x[j]=returnThis,s}},47237:s=>{s.exports=function baseProperty(s){return function(o){return null==o?void 0:o[s]}}},47248:(s,o,i)=>{var a=i(16547),u=i(51234);s.exports=function zipObject(s,o){return u(s||[],o||[],a)}},47422:(s,o,i)=>{var a=i(31769),u=i(77797);s.exports=function baseGet(s,o){for(var i=0,_=(o=a(o,s)).length;null!=s&&i<_;)s=s[u(o[i++])];return i&&i==_?s:void 0}},47473:s=>{var o=Function.prototype.toString;s.exports=function toSource(s){if(null!=s){try{return o.call(s)}catch(s){}try{return s+""}catch(s){}}return""}},47886:(s,o,i)=>{var a=i(5861),u=i(40346);s.exports=function isWeakMap(s){return u(s)&&"[object WeakMap]"==a(s)}},47934:(s,o,i)=>{s.exports={ary:i(64626),assign:i(74733),clone:i(32629),curry:i(49747),forEach:i(83729),isArray:i(56449),isError:i(23546),isFunction:i(1882),isWeakMap:i(47886),iteratee:i(33855),keys:i(88984),rearg:i(84195),toInteger:i(61489),toPath:i(42072)}},48152:(s,o,i)=>{var a=i(28303),u=a&&new a;s.exports=u},48287:(s,o,i)=>{"use strict";const a=i(67526),u=i(251),_="function"==typeof Symbol&&"function"==typeof Symbol.for?Symbol.for("nodejs.util.inspect.custom"):null;o.Buffer=Buffer,o.SlowBuffer=function SlowBuffer(s){+s!=s&&(s=0);return Buffer.alloc(+s)},o.INSPECT_MAX_BYTES=50;const w=2147483647;function createBuffer(s){if(s>w)throw new RangeError('The value "'+s+'" is invalid for option "size"');const o=new Uint8Array(s);return Object.setPrototypeOf(o,Buffer.prototype),o}function Buffer(s,o,i){if("number"==typeof s){if("string"==typeof o)throw new TypeError('The "string" argument must be of type string. Received type number');return allocUnsafe(s)}return from(s,o,i)}function from(s,o,i){if("string"==typeof s)return function fromString(s,o){"string"==typeof o&&""!==o||(o="utf8");if(!Buffer.isEncoding(o))throw new TypeError("Unknown encoding: "+o);const i=0|byteLength(s,o);let a=createBuffer(i);const u=a.write(s,o);u!==i&&(a=a.slice(0,u));return a}(s,o);if(ArrayBuffer.isView(s))return function fromArrayView(s){if(isInstance(s,Uint8Array)){const o=new Uint8Array(s);return fromArrayBuffer(o.buffer,o.byteOffset,o.byteLength)}return fromArrayLike(s)}(s);if(null==s)throw new TypeError("The first argument must be one of type string, Buffer, ArrayBuffer, Array, or Array-like Object. Received type "+typeof s);if(isInstance(s,ArrayBuffer)||s&&isInstance(s.buffer,ArrayBuffer))return fromArrayBuffer(s,o,i);if("undefined"!=typeof SharedArrayBuffer&&(isInstance(s,SharedArrayBuffer)||s&&isInstance(s.buffer,SharedArrayBuffer)))return fromArrayBuffer(s,o,i);if("number"==typeof s)throw new TypeError('The "value" argument must not be of type number. Received type number');const a=s.valueOf&&s.valueOf();if(null!=a&&a!==s)return Buffer.from(a,o,i);const u=function fromObject(s){if(Buffer.isBuffer(s)){const o=0|checked(s.length),i=createBuffer(o);return 0===i.length||s.copy(i,0,0,o),i}if(void 0!==s.length)return"number"!=typeof s.length||numberIsNaN(s.length)?createBuffer(0):fromArrayLike(s);if("Buffer"===s.type&&Array.isArray(s.data))return fromArrayLike(s.data)}(s);if(u)return u;if("undefined"!=typeof Symbol&&null!=Symbol.toPrimitive&&"function"==typeof s[Symbol.toPrimitive])return Buffer.from(s[Symbol.toPrimitive]("string"),o,i);throw new TypeError("The first argument must be one of type string, Buffer, ArrayBuffer, Array, or Array-like Object. Received type "+typeof s)}function assertSize(s){if("number"!=typeof s)throw new TypeError('"size" argument must be of type number');if(s<0)throw new RangeError('The value "'+s+'" is invalid for option "size"')}function allocUnsafe(s){return assertSize(s),createBuffer(s<0?0:0|checked(s))}function fromArrayLike(s){const o=s.length<0?0:0|checked(s.length),i=createBuffer(o);for(let a=0;a<o;a+=1)i[a]=255&s[a];return i}function fromArrayBuffer(s,o,i){if(o<0||s.byteLength<o)throw new RangeError('"offset" is outside of buffer bounds');if(s.byteLength<o+(i||0))throw new RangeError('"length" is outside of buffer bounds');let a;return a=void 0===o&&void 0===i?new Uint8Array(s):void 0===i?new Uint8Array(s,o):new Uint8Array(s,o,i),Object.setPrototypeOf(a,Buffer.prototype),a}function checked(s){if(s>=w)throw new RangeError("Attempt to allocate Buffer larger than maximum size: 0x"+w.toString(16)+" bytes");return 0|s}function byteLength(s,o){if(Buffer.isBuffer(s))return s.length;if(ArrayBuffer.isView(s)||isInstance(s,ArrayBuffer))return s.byteLength;if("string"!=typeof s)throw new TypeError('The "string" argument must be one of type string, Buffer, or ArrayBuffer. Received type '+typeof s);const i=s.length,a=arguments.length>2&&!0===arguments[2];if(!a&&0===i)return 0;let u=!1;for(;;)switch(o){case"ascii":case"latin1":case"binary":return i;case"utf8":case"utf-8":return utf8ToBytes(s).length;case"ucs2":case"ucs-2":case"utf16le":case"utf-16le":return 2*i;case"hex":return i>>>1;case"base64":return base64ToBytes(s).length;default:if(u)return a?-1:utf8ToBytes(s).length;o=(""+o).toLowerCase(),u=!0}}function slowToString(s,o,i){let a=!1;if((void 0===o||o<0)&&(o=0),o>this.length)return"";if((void 0===i||i>this.length)&&(i=this.length),i<=0)return"";if((i>>>=0)<=(o>>>=0))return"";for(s||(s="utf8");;)switch(s){case"hex":return hexSlice(this,o,i);case"utf8":case"utf-8":return utf8Slice(this,o,i);case"ascii":return asciiSlice(this,o,i);case"latin1":case"binary":return latin1Slice(this,o,i);case"base64":return base64Slice(this,o,i);case"ucs2":case"ucs-2":case"utf16le":case"utf-16le":return utf16leSlice(this,o,i);default:if(a)throw new TypeError("Unknown encoding: "+s);s=(s+"").toLowerCase(),a=!0}}function swap(s,o,i){const a=s[o];s[o]=s[i],s[i]=a}function bidirectionalIndexOf(s,o,i,a,u){if(0===s.length)return-1;if("string"==typeof i?(a=i,i=0):i>2147483647?i=2147483647:i<-2147483648&&(i=-2147483648),numberIsNaN(i=+i)&&(i=u?0:s.length-1),i<0&&(i=s.length+i),i>=s.length){if(u)return-1;i=s.length-1}else if(i<0){if(!u)return-1;i=0}if("string"==typeof o&&(o=Buffer.from(o,a)),Buffer.isBuffer(o))return 0===o.length?-1:arrayIndexOf(s,o,i,a,u);if("number"==typeof o)return o&=255,"function"==typeof Uint8Array.prototype.indexOf?u?Uint8Array.prototype.indexOf.call(s,o,i):Uint8Array.prototype.lastIndexOf.call(s,o,i):arrayIndexOf(s,[o],i,a,u);throw new TypeError("val must be string, number or Buffer")}function arrayIndexOf(s,o,i,a,u){let _,w=1,x=s.length,C=o.length;if(void 0!==a&&("ucs2"===(a=String(a).toLowerCase())||"ucs-2"===a||"utf16le"===a||"utf-16le"===a)){if(s.length<2||o.length<2)return-1;w=2,x/=2,C/=2,i/=2}function read(s,o){return 1===w?s[o]:s.readUInt16BE(o*w)}if(u){let a=-1;for(_=i;_<x;_++)if(read(s,_)===read(o,-1===a?0:_-a)){if(-1===a&&(a=_),_-a+1===C)return a*w}else-1!==a&&(_-=_-a),a=-1}else for(i+C>x&&(i=x-C),_=i;_>=0;_--){let i=!0;for(let a=0;a<C;a++)if(read(s,_+a)!==read(o,a)){i=!1;break}if(i)return _}return-1}function hexWrite(s,o,i,a){i=Number(i)||0;const u=s.length-i;a?(a=Number(a))>u&&(a=u):a=u;const _=o.length;let w;for(a>_/2&&(a=_/2),w=0;w<a;++w){const a=parseInt(o.substr(2*w,2),16);if(numberIsNaN(a))return w;s[i+w]=a}return w}function utf8Write(s,o,i,a){return blitBuffer(utf8ToBytes(o,s.length-i),s,i,a)}function asciiWrite(s,o,i,a){return blitBuffer(function asciiToBytes(s){const o=[];for(let i=0;i<s.length;++i)o.push(255&s.charCodeAt(i));return o}(o),s,i,a)}function base64Write(s,o,i,a){return blitBuffer(base64ToBytes(o),s,i,a)}function ucs2Write(s,o,i,a){return blitBuffer(function utf16leToBytes(s,o){let i,a,u;const _=[];for(let w=0;w<s.length&&!((o-=2)<0);++w)i=s.charCodeAt(w),a=i>>8,u=i%256,_.push(u),_.push(a);return _}(o,s.length-i),s,i,a)}function base64Slice(s,o,i){return 0===o&&i===s.length?a.fromByteArray(s):a.fromByteArray(s.slice(o,i))}function utf8Slice(s,o,i){i=Math.min(s.length,i);const a=[];let u=o;for(;u<i;){const o=s[u];let _=null,w=o>239?4:o>223?3:o>191?2:1;if(u+w<=i){let i,a,x,C;switch(w){case 1:o<128&&(_=o);break;case 2:i=s[u+1],128==(192&i)&&(C=(31&o)<<6|63&i,C>127&&(_=C));break;case 3:i=s[u+1],a=s[u+2],128==(192&i)&&128==(192&a)&&(C=(15&o)<<12|(63&i)<<6|63&a,C>2047&&(C<55296||C>57343)&&(_=C));break;case 4:i=s[u+1],a=s[u+2],x=s[u+3],128==(192&i)&&128==(192&a)&&128==(192&x)&&(C=(15&o)<<18|(63&i)<<12|(63&a)<<6|63&x,C>65535&&C<1114112&&(_=C))}}null===_?(_=65533,w=1):_>65535&&(_-=65536,a.push(_>>>10&1023|55296),_=56320|1023&_),a.push(_),u+=w}return function decodeCodePointsArray(s){const o=s.length;if(o<=x)return String.fromCharCode.apply(String,s);let i="",a=0;for(;a<o;)i+=String.fromCharCode.apply(String,s.slice(a,a+=x));return i}(a)}o.kMaxLength=w,Buffer.TYPED_ARRAY_SUPPORT=function typedArraySupport(){try{const s=new Uint8Array(1),o={foo:function(){return 42}};return Object.setPrototypeOf(o,Uint8Array.prototype),Object.setPrototypeOf(s,o),42===s.foo()}catch(s){return!1}}(),Buffer.TYPED_ARRAY_SUPPORT||"undefined"==typeof console||"function"!=typeof console.error||console.error("This browser lacks typed array (Uint8Array) support which is required by `buffer` v5.x. Use `buffer` v4.x if you require old browser support."),Object.defineProperty(Buffer.prototype,"parent",{enumerable:!0,get:function(){if(Buffer.isBuffer(this))return this.buffer}}),Object.defineProperty(Buffer.prototype,"offset",{enumerable:!0,get:function(){if(Buffer.isBuffer(this))return this.byteOffset}}),Buffer.poolSize=8192,Buffer.from=function(s,o,i){return from(s,o,i)},Object.setPrototypeOf(Buffer.prototype,Uint8Array.prototype),Object.setPrototypeOf(Buffer,Uint8Array),Buffer.alloc=function(s,o,i){return function alloc(s,o,i){return assertSize(s),s<=0?createBuffer(s):void 0!==o?"string"==typeof i?createBuffer(s).fill(o,i):createBuffer(s).fill(o):createBuffer(s)}(s,o,i)},Buffer.allocUnsafe=function(s){return allocUnsafe(s)},Buffer.allocUnsafeSlow=function(s){return allocUnsafe(s)},Buffer.isBuffer=function isBuffer(s){return null!=s&&!0===s._isBuffer&&s!==Buffer.prototype},Buffer.compare=function compare(s,o){if(isInstance(s,Uint8Array)&&(s=Buffer.from(s,s.offset,s.byteLength)),isInstance(o,Uint8Array)&&(o=Buffer.from(o,o.offset,o.byteLength)),!Buffer.isBuffer(s)||!Buffer.isBuffer(o))throw new TypeError('The "buf1", "buf2" arguments must be one of type Buffer or Uint8Array');if(s===o)return 0;let i=s.length,a=o.length;for(let u=0,_=Math.min(i,a);u<_;++u)if(s[u]!==o[u]){i=s[u],a=o[u];break}return i<a?-1:a<i?1:0},Buffer.isEncoding=function isEncoding(s){switch(String(s).toLowerCase()){case"hex":case"utf8":case"utf-8":case"ascii":case"latin1":case"binary":case"base64":case"ucs2":case"ucs-2":case"utf16le":case"utf-16le":return!0;default:return!1}},Buffer.concat=function concat(s,o){if(!Array.isArray(s))throw new TypeError('"list" argument must be an Array of Buffers');if(0===s.length)return Buffer.alloc(0);let i;if(void 0===o)for(o=0,i=0;i<s.length;++i)o+=s[i].length;const a=Buffer.allocUnsafe(o);let u=0;for(i=0;i<s.length;++i){let o=s[i];if(isInstance(o,Uint8Array))u+o.length>a.length?(Buffer.isBuffer(o)||(o=Buffer.from(o)),o.copy(a,u)):Uint8Array.prototype.set.call(a,o,u);else{if(!Buffer.isBuffer(o))throw new TypeError('"list" argument must be an Array of Buffers');o.copy(a,u)}u+=o.length}return a},Buffer.byteLength=byteLength,Buffer.prototype._isBuffer=!0,Buffer.prototype.swap16=function swap16(){const s=this.length;if(s%2!=0)throw new RangeError("Buffer size must be a multiple of 16-bits");for(let o=0;o<s;o+=2)swap(this,o,o+1);return this},Buffer.prototype.swap32=function swap32(){const s=this.length;if(s%4!=0)throw new RangeError("Buffer size must be a multiple of 32-bits");for(let o=0;o<s;o+=4)swap(this,o,o+3),swap(this,o+1,o+2);return this},Buffer.prototype.swap64=function swap64(){const s=this.length;if(s%8!=0)throw new RangeError("Buffer size must be a multiple of 64-bits");for(let o=0;o<s;o+=8)swap(this,o,o+7),swap(this,o+1,o+6),swap(this,o+2,o+5),swap(this,o+3,o+4);return this},Buffer.prototype.toString=function toString(){const s=this.length;return 0===s?"":0===arguments.length?utf8Slice(this,0,s):slowToString.apply(this,arguments)},Buffer.prototype.toLocaleString=Buffer.prototype.toString,Buffer.prototype.equals=function equals(s){if(!Buffer.isBuffer(s))throw new TypeError("Argument must be a Buffer");return this===s||0===Buffer.compare(this,s)},Buffer.prototype.inspect=function inspect(){let s="";const i=o.INSPECT_MAX_BYTES;return s=this.toString("hex",0,i).replace(/(.{2})/g,"$1 ").trim(),this.length>i&&(s+=" ... "),"<Buffer "+s+">"},_&&(Buffer.prototype[_]=Buffer.prototype.inspect),Buffer.prototype.compare=function compare(s,o,i,a,u){if(isInstance(s,Uint8Array)&&(s=Buffer.from(s,s.offset,s.byteLength)),!Buffer.isBuffer(s))throw new TypeError('The "target" argument must be one of type Buffer or Uint8Array. Received type '+typeof s);if(void 0===o&&(o=0),void 0===i&&(i=s?s.length:0),void 0===a&&(a=0),void 0===u&&(u=this.length),o<0||i>s.length||a<0||u>this.length)throw new RangeError("out of range index");if(a>=u&&o>=i)return 0;if(a>=u)return-1;if(o>=i)return 1;if(this===s)return 0;let _=(u>>>=0)-(a>>>=0),w=(i>>>=0)-(o>>>=0);const x=Math.min(_,w),C=this.slice(a,u),j=s.slice(o,i);for(let s=0;s<x;++s)if(C[s]!==j[s]){_=C[s],w=j[s];break}return _<w?-1:w<_?1:0},Buffer.prototype.includes=function includes(s,o,i){return-1!==this.indexOf(s,o,i)},Buffer.prototype.indexOf=function indexOf(s,o,i){return bidirectionalIndexOf(this,s,o,i,!0)},Buffer.prototype.lastIndexOf=function lastIndexOf(s,o,i){return bidirectionalIndexOf(this,s,o,i,!1)},Buffer.prototype.write=function write(s,o,i,a){if(void 0===o)a="utf8",i=this.length,o=0;else if(void 0===i&&"string"==typeof o)a=o,i=this.length,o=0;else{if(!isFinite(o))throw new Error("Buffer.write(string, encoding, offset[, length]) is no longer supported");o>>>=0,isFinite(i)?(i>>>=0,void 0===a&&(a="utf8")):(a=i,i=void 0)}const u=this.length-o;if((void 0===i||i>u)&&(i=u),s.length>0&&(i<0||o<0)||o>this.length)throw new RangeError("Attempt to write outside buffer bounds");a||(a="utf8");let _=!1;for(;;)switch(a){case"hex":return hexWrite(this,s,o,i);case"utf8":case"utf-8":return utf8Write(this,s,o,i);case"ascii":case"latin1":case"binary":return asciiWrite(this,s,o,i);case"base64":return base64Write(this,s,o,i);case"ucs2":case"ucs-2":case"utf16le":case"utf-16le":return ucs2Write(this,s,o,i);default:if(_)throw new TypeError("Unknown encoding: "+a);a=(""+a).toLowerCase(),_=!0}},Buffer.prototype.toJSON=function toJSON(){return{type:"Buffer",data:Array.prototype.slice.call(this._arr||this,0)}};const x=4096;function asciiSlice(s,o,i){let a="";i=Math.min(s.length,i);for(let u=o;u<i;++u)a+=String.fromCharCode(127&s[u]);return a}function latin1Slice(s,o,i){let a="";i=Math.min(s.length,i);for(let u=o;u<i;++u)a+=String.fromCharCode(s[u]);return a}function hexSlice(s,o,i){const a=s.length;(!o||o<0)&&(o=0),(!i||i<0||i>a)&&(i=a);let u="";for(let a=o;a<i;++a)u+=L[s[a]];return u}function utf16leSlice(s,o,i){const a=s.slice(o,i);let u="";for(let s=0;s<a.length-1;s+=2)u+=String.fromCharCode(a[s]+256*a[s+1]);return u}function checkOffset(s,o,i){if(s%1!=0||s<0)throw new RangeError("offset is not uint");if(s+o>i)throw new RangeError("Trying to access beyond buffer length")}function checkInt(s,o,i,a,u,_){if(!Buffer.isBuffer(s))throw new TypeError('"buffer" argument must be a Buffer instance');if(o>u||o<_)throw new RangeError('"value" argument is out of bounds');if(i+a>s.length)throw new RangeError("Index out of range")}function wrtBigUInt64LE(s,o,i,a,u){checkIntBI(o,a,u,s,i,7);let _=Number(o&BigInt(4294967295));s[i++]=_,_>>=8,s[i++]=_,_>>=8,s[i++]=_,_>>=8,s[i++]=_;let w=Number(o>>BigInt(32)&BigInt(4294967295));return s[i++]=w,w>>=8,s[i++]=w,w>>=8,s[i++]=w,w>>=8,s[i++]=w,i}function wrtBigUInt64BE(s,o,i,a,u){checkIntBI(o,a,u,s,i,7);let _=Number(o&BigInt(4294967295));s[i+7]=_,_>>=8,s[i+6]=_,_>>=8,s[i+5]=_,_>>=8,s[i+4]=_;let w=Number(o>>BigInt(32)&BigInt(4294967295));return s[i+3]=w,w>>=8,s[i+2]=w,w>>=8,s[i+1]=w,w>>=8,s[i]=w,i+8}function checkIEEE754(s,o,i,a,u,_){if(i+a>s.length)throw new RangeError("Index out of range");if(i<0)throw new RangeError("Index out of range")}function writeFloat(s,o,i,a,_){return o=+o,i>>>=0,_||checkIEEE754(s,0,i,4),u.write(s,o,i,a,23,4),i+4}function writeDouble(s,o,i,a,_){return o=+o,i>>>=0,_||checkIEEE754(s,0,i,8),u.write(s,o,i,a,52,8),i+8}Buffer.prototype.slice=function slice(s,o){const i=this.length;(s=~~s)<0?(s+=i)<0&&(s=0):s>i&&(s=i),(o=void 0===o?i:~~o)<0?(o+=i)<0&&(o=0):o>i&&(o=i),o<s&&(o=s);const a=this.subarray(s,o);return Object.setPrototypeOf(a,Buffer.prototype),a},Buffer.prototype.readUintLE=Buffer.prototype.readUIntLE=function readUIntLE(s,o,i){s>>>=0,o>>>=0,i||checkOffset(s,o,this.length);let a=this[s],u=1,_=0;for(;++_<o&&(u*=256);)a+=this[s+_]*u;return a},Buffer.prototype.readUintBE=Buffer.prototype.readUIntBE=function readUIntBE(s,o,i){s>>>=0,o>>>=0,i||checkOffset(s,o,this.length);let a=this[s+--o],u=1;for(;o>0&&(u*=256);)a+=this[s+--o]*u;return a},Buffer.prototype.readUint8=Buffer.prototype.readUInt8=function readUInt8(s,o){return s>>>=0,o||checkOffset(s,1,this.length),this[s]},Buffer.prototype.readUint16LE=Buffer.prototype.readUInt16LE=function readUInt16LE(s,o){return s>>>=0,o||checkOffset(s,2,this.length),this[s]|this[s+1]<<8},Buffer.prototype.readUint16BE=Buffer.prototype.readUInt16BE=function readUInt16BE(s,o){return s>>>=0,o||checkOffset(s,2,this.length),this[s]<<8|this[s+1]},Buffer.prototype.readUint32LE=Buffer.prototype.readUInt32LE=function readUInt32LE(s,o){return s>>>=0,o||checkOffset(s,4,this.length),(this[s]|this[s+1]<<8|this[s+2]<<16)+16777216*this[s+3]},Buffer.prototype.readUint32BE=Buffer.prototype.readUInt32BE=function readUInt32BE(s,o){return s>>>=0,o||checkOffset(s,4,this.length),16777216*this[s]+(this[s+1]<<16|this[s+2]<<8|this[s+3])},Buffer.prototype.readBigUInt64LE=defineBigIntMethod((function readBigUInt64LE(s){validateNumber(s>>>=0,"offset");const o=this[s],i=this[s+7];void 0!==o&&void 0!==i||boundsError(s,this.length-8);const a=o+256*this[++s]+65536*this[++s]+this[++s]*2**24,u=this[++s]+256*this[++s]+65536*this[++s]+i*2**24;return BigInt(a)+(BigInt(u)<<BigInt(32))})),Buffer.prototype.readBigUInt64BE=defineBigIntMethod((function readBigUInt64BE(s){validateNumber(s>>>=0,"offset");const o=this[s],i=this[s+7];void 0!==o&&void 0!==i||boundsError(s,this.length-8);const a=o*2**24+65536*this[++s]+256*this[++s]+this[++s],u=this[++s]*2**24+65536*this[++s]+256*this[++s]+i;return(BigInt(a)<<BigInt(32))+BigInt(u)})),Buffer.prototype.readIntLE=function readIntLE(s,o,i){s>>>=0,o>>>=0,i||checkOffset(s,o,this.length);let a=this[s],u=1,_=0;for(;++_<o&&(u*=256);)a+=this[s+_]*u;return u*=128,a>=u&&(a-=Math.pow(2,8*o)),a},Buffer.prototype.readIntBE=function readIntBE(s,o,i){s>>>=0,o>>>=0,i||checkOffset(s,o,this.length);let a=o,u=1,_=this[s+--a];for(;a>0&&(u*=256);)_+=this[s+--a]*u;return u*=128,_>=u&&(_-=Math.pow(2,8*o)),_},Buffer.prototype.readInt8=function readInt8(s,o){return s>>>=0,o||checkOffset(s,1,this.length),128&this[s]?-1*(255-this[s]+1):this[s]},Buffer.prototype.readInt16LE=function readInt16LE(s,o){s>>>=0,o||checkOffset(s,2,this.length);const i=this[s]|this[s+1]<<8;return 32768&i?4294901760|i:i},Buffer.prototype.readInt16BE=function readInt16BE(s,o){s>>>=0,o||checkOffset(s,2,this.length);const i=this[s+1]|this[s]<<8;return 32768&i?4294901760|i:i},Buffer.prototype.readInt32LE=function readInt32LE(s,o){return s>>>=0,o||checkOffset(s,4,this.length),this[s]|this[s+1]<<8|this[s+2]<<16|this[s+3]<<24},Buffer.prototype.readInt32BE=function readInt32BE(s,o){return s>>>=0,o||checkOffset(s,4,this.length),this[s]<<24|this[s+1]<<16|this[s+2]<<8|this[s+3]},Buffer.prototype.readBigInt64LE=defineBigIntMethod((function readBigInt64LE(s){validateNumber(s>>>=0,"offset");const o=this[s],i=this[s+7];void 0!==o&&void 0!==i||boundsError(s,this.length-8);const a=this[s+4]+256*this[s+5]+65536*this[s+6]+(i<<24);return(BigInt(a)<<BigInt(32))+BigInt(o+256*this[++s]+65536*this[++s]+this[++s]*2**24)})),Buffer.prototype.readBigInt64BE=defineBigIntMethod((function readBigInt64BE(s){validateNumber(s>>>=0,"offset");const o=this[s],i=this[s+7];void 0!==o&&void 0!==i||boundsError(s,this.length-8);const a=(o<<24)+65536*this[++s]+256*this[++s]+this[++s];return(BigInt(a)<<BigInt(32))+BigInt(this[++s]*2**24+65536*this[++s]+256*this[++s]+i)})),Buffer.prototype.readFloatLE=function readFloatLE(s,o){return s>>>=0,o||checkOffset(s,4,this.length),u.read(this,s,!0,23,4)},Buffer.prototype.readFloatBE=function readFloatBE(s,o){return s>>>=0,o||checkOffset(s,4,this.length),u.read(this,s,!1,23,4)},Buffer.prototype.readDoubleLE=function readDoubleLE(s,o){return s>>>=0,o||checkOffset(s,8,this.length),u.read(this,s,!0,52,8)},Buffer.prototype.readDoubleBE=function readDoubleBE(s,o){return s>>>=0,o||checkOffset(s,8,this.length),u.read(this,s,!1,52,8)},Buffer.prototype.writeUintLE=Buffer.prototype.writeUIntLE=function writeUIntLE(s,o,i,a){if(s=+s,o>>>=0,i>>>=0,!a){checkInt(this,s,o,i,Math.pow(2,8*i)-1,0)}let u=1,_=0;for(this[o]=255&s;++_<i&&(u*=256);)this[o+_]=s/u&255;return o+i},Buffer.prototype.writeUintBE=Buffer.prototype.writeUIntBE=function writeUIntBE(s,o,i,a){if(s=+s,o>>>=0,i>>>=0,!a){checkInt(this,s,o,i,Math.pow(2,8*i)-1,0)}let u=i-1,_=1;for(this[o+u]=255&s;--u>=0&&(_*=256);)this[o+u]=s/_&255;return o+i},Buffer.prototype.writeUint8=Buffer.prototype.writeUInt8=function writeUInt8(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,1,255,0),this[o]=255&s,o+1},Buffer.prototype.writeUint16LE=Buffer.prototype.writeUInt16LE=function writeUInt16LE(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,2,65535,0),this[o]=255&s,this[o+1]=s>>>8,o+2},Buffer.prototype.writeUint16BE=Buffer.prototype.writeUInt16BE=function writeUInt16BE(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,2,65535,0),this[o]=s>>>8,this[o+1]=255&s,o+2},Buffer.prototype.writeUint32LE=Buffer.prototype.writeUInt32LE=function writeUInt32LE(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,4,4294967295,0),this[o+3]=s>>>24,this[o+2]=s>>>16,this[o+1]=s>>>8,this[o]=255&s,o+4},Buffer.prototype.writeUint32BE=Buffer.prototype.writeUInt32BE=function writeUInt32BE(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,4,4294967295,0),this[o]=s>>>24,this[o+1]=s>>>16,this[o+2]=s>>>8,this[o+3]=255&s,o+4},Buffer.prototype.writeBigUInt64LE=defineBigIntMethod((function writeBigUInt64LE(s,o=0){return wrtBigUInt64LE(this,s,o,BigInt(0),BigInt("0xffffffffffffffff"))})),Buffer.prototype.writeBigUInt64BE=defineBigIntMethod((function writeBigUInt64BE(s,o=0){return wrtBigUInt64BE(this,s,o,BigInt(0),BigInt("0xffffffffffffffff"))})),Buffer.prototype.writeIntLE=function writeIntLE(s,o,i,a){if(s=+s,o>>>=0,!a){const a=Math.pow(2,8*i-1);checkInt(this,s,o,i,a-1,-a)}let u=0,_=1,w=0;for(this[o]=255&s;++u<i&&(_*=256);)s<0&&0===w&&0!==this[o+u-1]&&(w=1),this[o+u]=(s/_|0)-w&255;return o+i},Buffer.prototype.writeIntBE=function writeIntBE(s,o,i,a){if(s=+s,o>>>=0,!a){const a=Math.pow(2,8*i-1);checkInt(this,s,o,i,a-1,-a)}let u=i-1,_=1,w=0;for(this[o+u]=255&s;--u>=0&&(_*=256);)s<0&&0===w&&0!==this[o+u+1]&&(w=1),this[o+u]=(s/_|0)-w&255;return o+i},Buffer.prototype.writeInt8=function writeInt8(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,1,127,-128),s<0&&(s=255+s+1),this[o]=255&s,o+1},Buffer.prototype.writeInt16LE=function writeInt16LE(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,2,32767,-32768),this[o]=255&s,this[o+1]=s>>>8,o+2},Buffer.prototype.writeInt16BE=function writeInt16BE(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,2,32767,-32768),this[o]=s>>>8,this[o+1]=255&s,o+2},Buffer.prototype.writeInt32LE=function writeInt32LE(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,4,2147483647,-2147483648),this[o]=255&s,this[o+1]=s>>>8,this[o+2]=s>>>16,this[o+3]=s>>>24,o+4},Buffer.prototype.writeInt32BE=function writeInt32BE(s,o,i){return s=+s,o>>>=0,i||checkInt(this,s,o,4,2147483647,-2147483648),s<0&&(s=4294967295+s+1),this[o]=s>>>24,this[o+1]=s>>>16,this[o+2]=s>>>8,this[o+3]=255&s,o+4},Buffer.prototype.writeBigInt64LE=defineBigIntMethod((function writeBigInt64LE(s,o=0){return wrtBigUInt64LE(this,s,o,-BigInt("0x8000000000000000"),BigInt("0x7fffffffffffffff"))})),Buffer.prototype.writeBigInt64BE=defineBigIntMethod((function writeBigInt64BE(s,o=0){return wrtBigUInt64BE(this,s,o,-BigInt("0x8000000000000000"),BigInt("0x7fffffffffffffff"))})),Buffer.prototype.writeFloatLE=function writeFloatLE(s,o,i){return writeFloat(this,s,o,!0,i)},Buffer.prototype.writeFloatBE=function writeFloatBE(s,o,i){return writeFloat(this,s,o,!1,i)},Buffer.prototype.writeDoubleLE=function writeDoubleLE(s,o,i){return writeDouble(this,s,o,!0,i)},Buffer.prototype.writeDoubleBE=function writeDoubleBE(s,o,i){return writeDouble(this,s,o,!1,i)},Buffer.prototype.copy=function copy(s,o,i,a){if(!Buffer.isBuffer(s))throw new TypeError("argument should be a Buffer");if(i||(i=0),a||0===a||(a=this.length),o>=s.length&&(o=s.length),o||(o=0),a>0&&a<i&&(a=i),a===i)return 0;if(0===s.length||0===this.length)return 0;if(o<0)throw new RangeError("targetStart out of bounds");if(i<0||i>=this.length)throw new RangeError("Index out of range");if(a<0)throw new RangeError("sourceEnd out of bounds");a>this.length&&(a=this.length),s.length-o<a-i&&(a=s.length-o+i);const u=a-i;return this===s&&"function"==typeof Uint8Array.prototype.copyWithin?this.copyWithin(o,i,a):Uint8Array.prototype.set.call(s,this.subarray(i,a),o),u},Buffer.prototype.fill=function fill(s,o,i,a){if("string"==typeof s){if("string"==typeof o?(a=o,o=0,i=this.length):"string"==typeof i&&(a=i,i=this.length),void 0!==a&&"string"!=typeof a)throw new TypeError("encoding must be a string");if("string"==typeof a&&!Buffer.isEncoding(a))throw new TypeError("Unknown encoding: "+a);if(1===s.length){const o=s.charCodeAt(0);("utf8"===a&&o<128||"latin1"===a)&&(s=o)}}else"number"==typeof s?s&=255:"boolean"==typeof s&&(s=Number(s));if(o<0||this.length<o||this.length<i)throw new RangeError("Out of range index");if(i<=o)return this;let u;if(o>>>=0,i=void 0===i?this.length:i>>>0,s||(s=0),"number"==typeof s)for(u=o;u<i;++u)this[u]=s;else{const _=Buffer.isBuffer(s)?s:Buffer.from(s,a),w=_.length;if(0===w)throw new TypeError('The value "'+s+'" is invalid for argument "value"');for(u=0;u<i-o;++u)this[u+o]=_[u%w]}return this};const C={};function E(s,o,i){C[s]=class NodeError extends i{constructor(){super(),Object.defineProperty(this,"message",{value:o.apply(this,arguments),writable:!0,configurable:!0}),this.name=`${this.name} [${s}]`,this.stack,delete this.name}get code(){return s}set code(s){Object.defineProperty(this,"code",{configurable:!0,enumerable:!0,value:s,writable:!0})}toString(){return`${this.name} [${s}]: ${this.message}`}}}function addNumericalSeparator(s){let o="",i=s.length;const a="-"===s[0]?1:0;for(;i>=a+4;i-=3)o=`_${s.slice(i-3,i)}${o}`;return`${s.slice(0,i)}${o}`}function checkIntBI(s,o,i,a,u,_){if(s>i||s<o){const a="bigint"==typeof o?"n":"";let u;throw u=_>3?0===o||o===BigInt(0)?`>= 0${a} and < 2${a} ** ${8*(_+1)}${a}`:`>= -(2${a} ** ${8*(_+1)-1}${a}) and < 2 ** ${8*(_+1)-1}${a}`:`>= ${o}${a} and <= ${i}${a}`,new C.ERR_OUT_OF_RANGE("value",u,s)}!function checkBounds(s,o,i){validateNumber(o,"offset"),void 0!==s[o]&&void 0!==s[o+i]||boundsError(o,s.length-(i+1))}(a,u,_)}function validateNumber(s,o){if("number"!=typeof s)throw new C.ERR_INVALID_ARG_TYPE(o,"number",s)}function boundsError(s,o,i){if(Math.floor(s)!==s)throw validateNumber(s,i),new C.ERR_OUT_OF_RANGE(i||"offset","an integer",s);if(o<0)throw new C.ERR_BUFFER_OUT_OF_BOUNDS;throw new C.ERR_OUT_OF_RANGE(i||"offset",`>= ${i?1:0} and <= ${o}`,s)}E("ERR_BUFFER_OUT_OF_BOUNDS",(function(s){return s?`${s} is outside of buffer bounds`:"Attempt to access memory outside buffer bounds"}),RangeError),E("ERR_INVALID_ARG_TYPE",(function(s,o){return`The "${s}" argument must be of type number. Received type ${typeof o}`}),TypeError),E("ERR_OUT_OF_RANGE",(function(s,o,i){let a=`The value of "${s}" is out of range.`,u=i;return Number.isInteger(i)&&Math.abs(i)>2**32?u=addNumericalSeparator(String(i)):"bigint"==typeof i&&(u=String(i),(i>BigInt(2)**BigInt(32)||i<-(BigInt(2)**BigInt(32)))&&(u=addNumericalSeparator(u)),u+="n"),a+=` It must be ${o}. Received ${u}`,a}),RangeError);const j=/[^+/0-9A-Za-z-_]/g;function utf8ToBytes(s,o){let i;o=o||1/0;const a=s.length;let u=null;const _=[];for(let w=0;w<a;++w){if(i=s.charCodeAt(w),i>55295&&i<57344){if(!u){if(i>56319){(o-=3)>-1&&_.push(239,191,189);continue}if(w+1===a){(o-=3)>-1&&_.push(239,191,189);continue}u=i;continue}if(i<56320){(o-=3)>-1&&_.push(239,191,189),u=i;continue}i=65536+(u-55296<<10|i-56320)}else u&&(o-=3)>-1&&_.push(239,191,189);if(u=null,i<128){if((o-=1)<0)break;_.push(i)}else if(i<2048){if((o-=2)<0)break;_.push(i>>6|192,63&i|128)}else if(i<65536){if((o-=3)<0)break;_.push(i>>12|224,i>>6&63|128,63&i|128)}else{if(!(i<1114112))throw new Error("Invalid code point");if((o-=4)<0)break;_.push(i>>18|240,i>>12&63|128,i>>6&63|128,63&i|128)}}return _}function base64ToBytes(s){return a.toByteArray(function base64clean(s){if((s=(s=s.split("=")[0]).trim().replace(j,"")).length<2)return"";for(;s.length%4!=0;)s+="=";return s}(s))}function blitBuffer(s,o,i,a){let u;for(u=0;u<a&&!(u+i>=o.length||u>=s.length);++u)o[u+i]=s[u];return u}function isInstance(s,o){return s instanceof o||null!=s&&null!=s.constructor&&null!=s.constructor.name&&s.constructor.name===o.name}function numberIsNaN(s){return s!=s}const L=function(){const s="0123456789abcdef",o=new Array(256);for(let i=0;i<16;++i){const a=16*i;for(let u=0;u<16;++u)o[a+u]=s[i]+s[u]}return o}();function defineBigIntMethod(s){return"undefined"==typeof BigInt?BufferBigIntNotDefined:s}function BufferBigIntNotDefined(){throw new Error("BigInt not supported")}},48590:(s,o)=>{"use strict";Object.defineProperty(o,"__esModule",{value:!0}),o.default=function(s){return s&&"@@redux/INIT"===s.type?"initialState argument passed to createStore":"previous state received by the reducer"},s.exports=o.default},48648:s=>{"use strict";s.exports="undefined"!=typeof Reflect&&Reflect.getPrototypeOf||null},48655:(s,o,i)=>{var a=i(26025);s.exports=function listCacheHas(s){return a(this.__data__,s)>-1}},48675:(s,o,i)=>{s.exports=i(20850)},48948:(s,o,i)=>{var a=i(21791),u=i(86375);s.exports=function copySymbolsIn(s,o){return a(s,u(s),o)}},49092:(s,o,i)=>{"use strict";var a=i(41333);s.exports=function hasToStringTagShams(){return a()&&!!Symbol.toStringTag}},49326:(s,o,i)=>{var a=i(31769),u=i(72428),_=i(56449),w=i(30361),x=i(30294),C=i(77797);s.exports=function hasPath(s,o,i){for(var j=-1,L=(o=a(o,s)).length,B=!1;++j<L;){var $=C(o[j]);if(!(B=null!=s&&i(s,$)))break;s=s[$]}return B||++j!=L?B:!!(L=null==s?0:s.length)&&x(L)&&w($,L)&&(_(s)||u(s))}},49552:(s,o,i)=>{"use strict";var a=i(45951),u=i(46285),_=a.document,w=u(_)&&u(_.createElement);s.exports=function(s){return w?_.createElement(s):{}}},49653:(s,o,i)=>{var a=i(37828);s.exports=function cloneArrayBuffer(s){var o=new s.constructor(s.byteLength);return new a(o).set(new a(s)),o}},49698:s=>{var o=RegExp("[\\u200d\\ud800-\\udfff\\u0300-\\u036f\\ufe20-\\ufe2f\\u20d0-\\u20ff\\ufe0e\\ufe0f]");s.exports=function hasUnicode(s){return o.test(s)}},49724:(s,o,i)=>{"use strict";var a=i(1907),u=i(39298),_=a({}.hasOwnProperty);s.exports=Object.hasOwn||function hasOwn(s,o){return _(u(s),o)}},49747:(s,o,i)=>{var a=i(66977);function curry(s,o,i){var u=a(s,8,void 0,void 0,void 0,void 0,void 0,o=i?void 0:o);return u.placeholder=curry.placeholder,u}curry.placeholder={},s.exports=curry},50002:(s,o,i)=>{var a=i(82199),u=i(4664),_=i(95950);s.exports=function getAllKeys(s){return a(s,_,u)}},50104:(s,o,i)=>{var a=i(53661);function memoize(s,o){if("function"!=typeof s||null!=o&&"function"!=typeof o)throw new TypeError("Expected a function");var memoized=function(){var i=arguments,a=o?o.apply(this,i):i[0],u=memoized.cache;if(u.has(a))return u.get(a);var _=s.apply(this,i);return memoized.cache=u.set(a,_)||u,_};return memoized.cache=new(memoize.Cache||a),memoized}memoize.Cache=a,s.exports=memoize},50583:(s,o,i)=>{var a=i(47237),u=i(17255),_=i(28586),w=i(77797);s.exports=function property(s){return _(s)?a(w(s)):u(s)}},50689:(s,o,i)=>{var a=i(50002),u=Object.prototype.hasOwnProperty;s.exports=function equalObjects(s,o,i,_,w,x){var C=1&i,j=a(s),L=j.length;if(L!=a(o).length&&!C)return!1;for(var B=L;B--;){var $=j[B];if(!(C?$ in o:u.call(o,$)))return!1}var U=x.get(s),V=x.get(o);if(U&&V)return U==o&&V==s;var z=!0;x.set(s,o),x.set(o,s);for(var Y=C;++B<L;){var Z=s[$=j[B]],ee=o[$];if(_)var ie=C?_(ee,Z,$,o,s,x):_(Z,ee,$,s,o,x);if(!(void 0===ie?Z===ee||w(Z,ee,i,_,x):ie)){z=!1;break}Y||(Y="constructor"==$)}if(z&&!Y){var ae=s.constructor,ce=o.constructor;ae==ce||!("constructor"in s)||!("constructor"in o)||"function"==typeof ae&&ae instanceof ae&&"function"==typeof ce&&ce instanceof ce||(z=!1)}return x.delete(s),x.delete(o),z}},50828:(s,o,i)=>{var a=i(24647),u=i(13222),_=/[\xc0-\xd6\xd8-\xf6\xf8-\xff\u0100-\u017f]/g,w=RegExp("[\\u0300-\\u036f\\ufe20-\\ufe2f\\u20d0-\\u20ff]","g");s.exports=function deburr(s){return(s=u(s))&&s.replace(_,a).replace(w,"")}},51175:(s,o,i)=>{"use strict";var a=i(19846);s.exports=a&&!Symbol.sham&&"symbol"==typeof Symbol.iterator},51234:s=>{s.exports=function baseZipObject(s,o,i){for(var a=-1,u=s.length,_=o.length,w={};++a<u;){var x=a<_?o[a]:void 0;i(w,s[a],x)}return w}},51420:(s,o,i)=>{var a=i(80079);s.exports=function stackClear(){this.__data__=new a,this.size=0}},51459:s=>{s.exports=function setCacheHas(s){return this.__data__.has(s)}},51811:s=>{var o=Date.now;s.exports=function shortOut(s){var i=0,a=0;return function(){var u=o(),_=16-(u-a);if(a=u,_>0){if(++i>=800)return arguments[0]}else i=0;return s.apply(void 0,arguments)}}},51871:(s,o,i)=>{"use strict";var a=i(1907),u=i(82159);s.exports=function(s,o,i){try{return a(u(Object.getOwnPropertyDescriptor(s,o)[i]))}catch(s){}}},51873:(s,o,i)=>{var a=i(9325).Symbol;s.exports=a},52623:(s,o,i)=>{"use strict";var a={};a[i(76264)("toStringTag")]="z",s.exports="[object z]"===String(a)},53138:(s,o,i)=>{var a=i(11331);s.exports=function customOmitClone(s){return a(s)?void 0:s}},53209:(s,o,i)=>{"use strict";var a=i(65606),u=65536,_=4294967295;var w=i(92861).Buffer,x=i.g.crypto||i.g.msCrypto;x&&x.getRandomValues?s.exports=function randomBytes(s,o){if(s>_)throw new RangeError("requested too many random bytes");var i=w.allocUnsafe(s);if(s>0)if(s>u)for(var C=0;C<s;C+=u)x.getRandomValues(i.slice(C,C+u));else x.getRandomValues(i);if("function"==typeof o)return a.nextTick((function(){o(null,i)}));return i}:s.exports=function oldBrowser(){throw new Error("Secure random number generation is not supported by this browser.\nUse Chrome, Firefox or Internet Explorer 11")}},53320:s=>{var o=Math.max;s.exports=function composeArgsRight(s,i,a,u){for(var _=-1,w=s.length,x=-1,C=a.length,j=-1,L=i.length,B=o(w-C,0),$=Array(B+L),U=!u;++_<B;)$[_]=s[_];for(var V=_;++j<L;)$[V+j]=i[j];for(;++x<C;)(U||_<w)&&($[V+a[x]]=s[_++]);return $}},53375:(s,o,i)=>{"use strict";var a=i(93700);s.exports=a},53661:(s,o,i)=>{var a=i(63040),u=i(17670),_=i(90289),w=i(4509),x=i(72949);function MapCache(s){var o=-1,i=null==s?0:s.length;for(this.clear();++o<i;){var a=s[o];this.set(a[0],a[1])}}MapCache.prototype.clear=a,MapCache.prototype.delete=u,MapCache.prototype.get=_,MapCache.prototype.has=w,MapCache.prototype.set=x,s.exports=MapCache},53758:(s,o,i)=>{var a=i(30980),u=i(56017),_=i(94033),w=i(56449),x=i(40346),C=i(80257),j=Object.prototype.hasOwnProperty;function lodash(s){if(x(s)&&!w(s)&&!(s instanceof a)){if(s instanceof u)return s;if(j.call(s,"__wrapped__"))return C(s)}return new u(s)}lodash.prototype=_.prototype,lodash.prototype.constructor=lodash,s.exports=lodash},53812:(s,o,i)=>{var a=i(72552),u=i(40346);s.exports=function isBoolean(s){return!0===s||!1===s||u(s)&&"[object Boolean]"==a(s)}},54018:(s,o,i)=>{"use strict";var a=i(46285);s.exports=function(s){return a(s)||null===s}},54128:(s,o,i)=>{var a=i(31800),u=/^\s+/;s.exports=function baseTrim(s){return s?s.slice(0,a(s)+1).replace(u,""):s}},54552:s=>{s.exports=function basePropertyOf(s){return function(o){return null==s?void 0:s[o]}}},54641:(s,o,i)=>{var a=i(68882),u=i(51811)(a);s.exports=u},54829:(s,o,i)=>{"use strict";var a=i(74284).f;s.exports=function(s,o,i){i in s||a(s,i,{configurable:!0,get:function(){return o[i]},set:function(s){o[i]=s}})}},54878:(s,o,i)=>{"use strict";var a=i(52623),u=i(73948);s.exports=a?{}.toString:function toString(){return"[object "+u(this)+"]"}},55157:s=>{s.exports=function(){throw new Error("Readable.from is not available in the browser")}},55364:(s,o,i)=>{var a=i(85250),u=i(20999)((function(s,o,i){a(s,o,i)}));s.exports=u},55481:(s,o,i)=>{var a=i(9325)["__core-js_shared__"];s.exports=a},55527:s=>{var o=Object.prototype;s.exports=function isPrototype(s){var i=s&&s.constructor;return s===("function"==typeof i&&i.prototype||o)}},55580:(s,o,i)=>{var a=i(56110)(i(9325),"DataView");s.exports=a},55674:(s,o,i)=>{"use strict";Object.defineProperty(o,"__esModule",{value:!0}),o.validateNextState=o.getUnexpectedInvocationParameterMessage=o.getStateName=void 0;var a=_interopRequireDefault(i(48590)),u=_interopRequireDefault(i(82261)),_=_interopRequireDefault(i(27374));function _interopRequireDefault(s){return s&&s.__esModule?s:{default:s}}o.getStateName=a.default,o.getUnexpectedInvocationParameterMessage=u.default,o.validateNextState=_.default},55808:(s,o,i)=>{var a=i(12507)("toUpperCase");s.exports=a},55973:s=>{class KeyValuePair{constructor(s,o){this.key=s,this.value=o}clone(){const s=new KeyValuePair;return this.key&&(s.key=this.key.clone()),this.value&&(s.value=this.value.clone()),s}}s.exports=KeyValuePair},56017:(s,o,i)=>{var a=i(39344),u=i(94033);function LodashWrapper(s,o){this.__wrapped__=s,this.__actions__=[],this.__chain__=!!o,this.__index__=0,this.__values__=void 0}LodashWrapper.prototype=a(u.prototype),LodashWrapper.prototype.constructor=LodashWrapper,s.exports=LodashWrapper},56110:(s,o,i)=>{var a=i(45083),u=i(10392);s.exports=function getNative(s,o){var i=u(s,o);return a(i)?i:void 0}},56367:(s,o,i)=>{s.exports=i(77731)},56449:s=>{var o=Array.isArray;s.exports=o},56698:s=>{"function"==typeof Object.create?s.exports=function inherits(s,o){o&&(s.super_=o,s.prototype=Object.create(o.prototype,{constructor:{value:s,enumerable:!1,writable:!0,configurable:!0}}))}:s.exports=function inherits(s,o){if(o){s.super_=o;var TempCtor=function(){};TempCtor.prototype=o.prototype,s.prototype=new TempCtor,s.prototype.constructor=s}}},56757:(s,o,i)=>{var a=i(91033),u=Math.max;s.exports=function overRest(s,o,i){return o=u(void 0===o?s.length-1:o,0),function(){for(var _=arguments,w=-1,x=u(_.length-o,0),C=Array(x);++w<x;)C[w]=_[o+w];w=-1;for(var j=Array(o+1);++w<o;)j[w]=_[w];return j[o]=i(C),a(s,this,j)}}},57382:(s,o,i)=>{"use strict";var a=i(98828);s.exports=!a((function(){function F(){}return F.prototype.constructor=null,Object.getPrototypeOf(new F)!==F.prototype}))},57758:(s,o,i)=>{"use strict";var a;var u=i(86048).F,_=u.ERR_MISSING_ARGS,w=u.ERR_STREAM_DESTROYED;function noop(s){if(s)throw s}function call(s){s()}function pipe(s,o){return s.pipe(o)}s.exports=function pipeline(){for(var s=arguments.length,o=new Array(s),u=0;u<s;u++)o[u]=arguments[u];var x,C=function popCallback(s){return s.length?"function"!=typeof s[s.length-1]?noop:s.pop():noop}(o);if(Array.isArray(o[0])&&(o=o[0]),o.length<2)throw new _("streams");var j=o.map((function(s,u){var _=u<o.length-1;return function destroyer(s,o,u,_){_=function once(s){var o=!1;return function(){o||(o=!0,s.apply(void 0,arguments))}}(_);var x=!1;s.on("close",(function(){x=!0})),void 0===a&&(a=i(86238)),a(s,{readable:o,writable:u},(function(s){if(s)return _(s);x=!0,_()}));var C=!1;return function(o){if(!x&&!C)return C=!0,function isRequest(s){return s.setHeader&&"function"==typeof s.abort}(s)?s.abort():"function"==typeof s.destroy?s.destroy():void _(o||new w("pipe"))}}(s,_,u>0,(function(s){x||(x=s),s&&j.forEach(call),_||(j.forEach(call),C(x))}))}));return o.reduce(pipe)}},58068:s=>{"use strict";s.exports=SyntaxError},58075:(s,o,i)=>{"use strict";var a,u=i(36624),_=i(42220),w=i(80376),x=i(38530),C=i(62416),j=i(49552),L=i(92522),B="prototype",$="script",U=L("IE_PROTO"),EmptyConstructor=function(){},scriptTag=function(s){return"<"+$+">"+s+"</"+$+">"},NullProtoObjectViaActiveX=function(s){s.write(scriptTag("")),s.close();var o=s.parentWindow.Object;return s=null,o},NullProtoObject=function(){try{a=new ActiveXObject("htmlfile")}catch(s){}var s,o,i;NullProtoObject="undefined"!=typeof document?document.domain&&a?NullProtoObjectViaActiveX(a):(o=j("iframe"),i="java"+$+":",o.style.display="none",C.appendChild(o),o.src=String(i),(s=o.contentWindow.document).open(),s.write(scriptTag("document.F=Object")),s.close(),s.F):NullProtoObjectViaActiveX(a);for(var u=w.length;u--;)delete NullProtoObject[B][w[u]];return NullProtoObject()};x[U]=!0,s.exports=Object.create||function create(s,o){var i;return null!==s?(EmptyConstructor[B]=u(s),i=new EmptyConstructor,EmptyConstructor[B]=null,i[U]=s):i=NullProtoObject(),void 0===o?i:_.f(i,o)}},58156:(s,o,i)=>{var a=i(47422);s.exports=function get(s,o,i){var u=null==s?void 0:a(s,o);return void 0===u?i:u}},58523:s=>{s.exports=function countHolders(s,o){for(var i=s.length,a=0;i--;)s[i]===o&&++a;return a}},58661:(s,o,i)=>{"use strict";var a=i(39447),u=i(98828);s.exports=a&&u((function(){return 42!==Object.defineProperty((function(){}),"prototype",{value:42,writable:!1}).prototype}))},58968:s=>{"use strict";s.exports=Math.floor},59350:s=>{var o=Object.prototype.toString;s.exports=function objectToString(s){return o.call(s)}},59399:(s,o,i)=>{"use strict";var a=i(25264).CopyToClipboard;a.CopyToClipboard=a,s.exports=a},59550:s=>{"use strict";s.exports=function(s,o){return{value:s,done:o}}},60183:(s,o,i)=>{"use strict";var a=i(11091),u=i(13930),_=i(7376),w=i(36833),x=i(62250),C=i(47181),j=i(15972),L=i(79192),B=i(14840),$=i(61626),U=i(68055),V=i(76264),z=i(93742),Y=i(95116),Z=w.PROPER,ee=w.CONFIGURABLE,ie=Y.IteratorPrototype,ae=Y.BUGGY_SAFARI_ITERATORS,ce=V("iterator"),le="keys",pe="values",de="entries",returnThis=function(){return this};s.exports=function(s,o,i,w,V,Y,fe){C(i,o,w);var ye,be,_e,getIterationMethod=function(s){if(s===V&&Te)return Te;if(!ae&&s&&s in xe)return xe[s];switch(s){case le:return function keys(){return new i(this,s)};case pe:return function values(){return new i(this,s)};case de:return function entries(){return new i(this,s)}}return function(){return new i(this)}},Se=o+" Iterator",we=!1,xe=s.prototype,Pe=xe[ce]||xe["@@iterator"]||V&&xe[V],Te=!ae&&Pe||getIterationMethod(V),Re="Array"===o&&xe.entries||Pe;if(Re&&(ye=j(Re.call(new s)))!==Object.prototype&&ye.next&&(_||j(ye)===ie||(L?L(ye,ie):x(ye[ce])||U(ye,ce,returnThis)),B(ye,Se,!0,!0),_&&(z[Se]=returnThis)),Z&&V===pe&&Pe&&Pe.name!==pe&&(!_&&ee?$(xe,"name",pe):(we=!0,Te=function values(){return u(Pe,this)})),V)if(be={values:getIterationMethod(pe),keys:Y?Te:getIterationMethod(le),entries:getIterationMethod(de)},fe)for(_e in be)(ae||we||!(_e in xe))&&U(xe,_e,be[_e]);else a({target:o,proto:!0,forced:ae||we},be);return _&&!fe||xe[ce]===Te||U(xe,ce,Te,{name:V}),z[o]=Te,be}},60270:(s,o,i)=>{var a=i(87068),u=i(40346);s.exports=function baseIsEqual(s,o,i,_,w){return s===o||(null==s||null==o||!u(s)&&!u(o)?s!=s&&o!=o:a(s,o,i,_,baseIsEqual,w))}},60581:(s,o,i)=>{"use strict";var a=i(13930),u=i(62250),_=i(46285),w=TypeError;s.exports=function(s,o){var i,x;if("string"===o&&u(i=s.toString)&&!_(x=a(i,s)))return x;if(u(i=s.valueOf)&&!_(x=a(i,s)))return x;if("string"!==o&&u(i=s.toString)&&!_(x=a(i,s)))return x;throw new w("Can't convert object to primitive value")}},60680:(s,o,i)=>{var a=i(13222),u=/[\\^$.*+?()[\]{}|]/g,_=RegExp(u.source);s.exports=function escapeRegExp(s){return(s=a(s))&&_.test(s)?s.replace(u,"\\$&"):s}},61045:(s,o,i)=>{const a=i(6048),u=i(23805),_=i(6233),w=i(87726),x=i(10866);s.exports=class ObjectElement extends _{constructor(s,o,i){super(s||[],o,i),this.element="object"}primitive(){return"object"}toValue(){return this.content.reduce(((s,o)=>(s[o.key.toValue()]=o.value?o.value.toValue():void 0,s)),{})}get(s){const o=this.getMember(s);if(o)return o.value}getMember(s){if(void 0!==s)return this.content.find((o=>o.key.toValue()===s))}remove(s){let o=null;return this.content=this.content.filter((i=>i.key.toValue()!==s||(o=i,!1))),o}getKey(s){const o=this.getMember(s);if(o)return o.key}set(s,o){if(u(s))return Object.keys(s).forEach((o=>{this.set(o,s[o])})),this;const i=s,a=this.getMember(i);return a?a.value=o:this.content.push(new w(i,o)),this}keys(){return this.content.map((s=>s.key.toValue()))}values(){return this.content.map((s=>s.value.toValue()))}hasKey(s){return this.content.some((o=>o.key.equals(s)))}items(){return this.content.map((s=>[s.key.toValue(),s.value.toValue()]))}map(s,o){return this.content.map((i=>s.bind(o)(i.value,i.key,i)))}compactMap(s,o){const i=[];return this.forEach(((a,u,_)=>{const w=s.bind(o)(a,u,_);w&&i.push(w)})),i}filter(s,o){return new x(this.content).filter(s,o)}reject(s,o){return this.filter(a(s),o)}forEach(s,o){return this.content.forEach((i=>s.bind(o)(i.value,i.key,i)))}}},61074:s=>{s.exports=function asciiToArray(s){return s.split("")}},61160:(s,o,i)=>{"use strict";var a=i(92063),u=i(73992),_=/^[\x00-\x20\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]+/,w=/[\n\r\t]/g,x=/^[A-Za-z][A-Za-z0-9+-.]*:\/\//,C=/:\d+$/,j=/^([a-z][a-z0-9.+-]*:)?(\/\/)?([\\/]+)?([\S\s]*)/i,L=/^[a-zA-Z]:/;function trimLeft(s){return(s||"").toString().replace(_,"")}var B=[["#","hash"],["?","query"],function sanitize(s,o){return isSpecial(o.protocol)?s.replace(/\\/g,"/"):s},["/","pathname"],["@","auth",1],[NaN,"host",void 0,1,1],[/:(\d*)$/,"port",void 0,1],[NaN,"hostname",void 0,1,1]],$={hash:1,query:1};function lolcation(s){var o,a=("undefined"!=typeof window?window:void 0!==i.g?i.g:"undefined"!=typeof self?self:{}).location||{},u={},_=typeof(s=s||a);if("blob:"===s.protocol)u=new Url(unescape(s.pathname),{});else if("string"===_)for(o in u=new Url(s,{}),$)delete u[o];else if("object"===_){for(o in s)o in $||(u[o]=s[o]);void 0===u.slashes&&(u.slashes=x.test(s.href))}return u}function isSpecial(s){return"file:"===s||"ftp:"===s||"http:"===s||"https:"===s||"ws:"===s||"wss:"===s}function extractProtocol(s,o){s=(s=trimLeft(s)).replace(w,""),o=o||{};var i,a=j.exec(s),u=a[1]?a[1].toLowerCase():"",_=!!a[2],x=!!a[3],C=0;return _?x?(i=a[2]+a[3]+a[4],C=a[2].length+a[3].length):(i=a[2]+a[4],C=a[2].length):x?(i=a[3]+a[4],C=a[3].length):i=a[4],"file:"===u?C>=2&&(i=i.slice(2)):isSpecial(u)?i=a[4]:u?_&&(i=i.slice(2)):C>=2&&isSpecial(o.protocol)&&(i=a[4]),{protocol:u,slashes:_||isSpecial(u),slashesCount:C,rest:i}}function Url(s,o,i){if(s=(s=trimLeft(s)).replace(w,""),!(this instanceof Url))return new Url(s,o,i);var _,x,C,j,$,U,V=B.slice(),z=typeof o,Y=this,Z=0;for("object"!==z&&"string"!==z&&(i=o,o=null),i&&"function"!=typeof i&&(i=u.parse),_=!(x=extractProtocol(s||"",o=lolcation(o))).protocol&&!x.slashes,Y.slashes=x.slashes||_&&o.slashes,Y.protocol=x.protocol||o.protocol||"",s=x.rest,("file:"===x.protocol&&(2!==x.slashesCount||L.test(s))||!x.slashes&&(x.protocol||x.slashesCount<2||!isSpecial(Y.protocol)))&&(V[3]=[/(.*)/,"pathname"]);Z<V.length;Z++)"function"!=typeof(j=V[Z])?(C=j[0],U=j[1],C!=C?Y[U]=s:"string"==typeof C?~($="@"===C?s.lastIndexOf(C):s.indexOf(C))&&("number"==typeof j[2]?(Y[U]=s.slice(0,$),s=s.slice($+j[2])):(Y[U]=s.slice($),s=s.slice(0,$))):($=C.exec(s))&&(Y[U]=$[1],s=s.slice(0,$.index)),Y[U]=Y[U]||_&&j[3]&&o[U]||"",j[4]&&(Y[U]=Y[U].toLowerCase())):s=j(s,Y);i&&(Y.query=i(Y.query)),_&&o.slashes&&"/"!==Y.pathname.charAt(0)&&(""!==Y.pathname||""!==o.pathname)&&(Y.pathname=function resolve(s,o){if(""===s)return o;for(var i=(o||"/").split("/").slice(0,-1).concat(s.split("/")),a=i.length,u=i[a-1],_=!1,w=0;a--;)"."===i[a]?i.splice(a,1):".."===i[a]?(i.splice(a,1),w++):w&&(0===a&&(_=!0),i.splice(a,1),w--);return _&&i.unshift(""),"."!==u&&".."!==u||i.push(""),i.join("/")}(Y.pathname,o.pathname)),"/"!==Y.pathname.charAt(0)&&isSpecial(Y.protocol)&&(Y.pathname="/"+Y.pathname),a(Y.port,Y.protocol)||(Y.host=Y.hostname,Y.port=""),Y.username=Y.password="",Y.auth&&(~($=Y.auth.indexOf(":"))?(Y.username=Y.auth.slice(0,$),Y.username=encodeURIComponent(decodeURIComponent(Y.username)),Y.password=Y.auth.slice($+1),Y.password=encodeURIComponent(decodeURIComponent(Y.password))):Y.username=encodeURIComponent(decodeURIComponent(Y.auth)),Y.auth=Y.password?Y.username+":"+Y.password:Y.username),Y.origin="file:"!==Y.protocol&&isSpecial(Y.protocol)&&Y.host?Y.protocol+"//"+Y.host:"null",Y.href=Y.toString()}Url.prototype={set:function set(s,o,i){var _=this;switch(s){case"query":"string"==typeof o&&o.length&&(o=(i||u.parse)(o)),_[s]=o;break;case"port":_[s]=o,a(o,_.protocol)?o&&(_.host=_.hostname+":"+o):(_.host=_.hostname,_[s]="");break;case"hostname":_[s]=o,_.port&&(o+=":"+_.port),_.host=o;break;case"host":_[s]=o,C.test(o)?(o=o.split(":"),_.port=o.pop(),_.hostname=o.join(":")):(_.hostname=o,_.port="");break;case"protocol":_.protocol=o.toLowerCase(),_.slashes=!i;break;case"pathname":case"hash":if(o){var w="pathname"===s?"/":"#";_[s]=o.charAt(0)!==w?w+o:o}else _[s]=o;break;case"username":case"password":_[s]=encodeURIComponent(o);break;case"auth":var x=o.indexOf(":");~x?(_.username=o.slice(0,x),_.username=encodeURIComponent(decodeURIComponent(_.username)),_.password=o.slice(x+1),_.password=encodeURIComponent(decodeURIComponent(_.password))):_.username=encodeURIComponent(decodeURIComponent(o))}for(var j=0;j<B.length;j++){var L=B[j];L[4]&&(_[L[1]]=_[L[1]].toLowerCase())}return _.auth=_.password?_.username+":"+_.password:_.username,_.origin="file:"!==_.protocol&&isSpecial(_.protocol)&&_.host?_.protocol+"//"+_.host:"null",_.href=_.toString(),_},toString:function toString(s){s&&"function"==typeof s||(s=u.stringify);var o,i=this,a=i.host,_=i.protocol;_&&":"!==_.charAt(_.length-1)&&(_+=":");var w=_+(i.protocol&&i.slashes||isSpecial(i.protocol)?"//":"");return i.username?(w+=i.username,i.password&&(w+=":"+i.password),w+="@"):i.password?(w+=":"+i.password,w+="@"):"file:"!==i.protocol&&isSpecial(i.protocol)&&!a&&"/"!==i.pathname&&(w+="@"),(":"===a[a.length-1]||C.test(i.hostname)&&!i.port)&&(a+=":"),w+=a+i.pathname,(o="object"==typeof i.query?s(i.query):i.query)&&(w+="?"!==o.charAt(0)?"?"+o:o),i.hash&&(w+=i.hash),w}},Url.extractProtocol=extractProtocol,Url.location=lolcation,Url.trimLeft=trimLeft,Url.qs=u,s.exports=Url},61448:(s,o,i)=>{var a=i(20426),u=i(49326);s.exports=function has(s,o){return null!=s&&u(s,o,a)}},61489:(s,o,i)=>{var a=i(17400);s.exports=function toInteger(s){var o=a(s),i=o%1;return o==o?i?o-i:o:0}},61626:(s,o,i)=>{"use strict";var a=i(39447),u=i(74284),_=i(75817);s.exports=a?function(s,o,i){return u.f(s,o,_(1,i))}:function(s,o,i){return s[o]=i,s}},61747:(s,o,i)=>{"use strict";var a=i(45951),u=i(92046);s.exports=function(s,o){var i=u[s+"Prototype"],_=i&&i[o];if(_)return _;var w=a[s],x=w&&w.prototype;return x&&x[o]}},61802:(s,o,i)=>{var a=i(62224),u=/[^.[\]]+|\[(?:(-?\d+(?:\.\d+)?)|(["'])((?:(?!\2)[^\\]|\\.)*?)\2)\]|(?=(?:\.|\[\])(?:\.|\[\]|$))/g,_=/\\(\\)?/g,w=a((function(s){var o=[];return 46===s.charCodeAt(0)&&o.push(""),s.replace(u,(function(s,i,a,u){o.push(a?u.replace(_,"$1"):i||s)})),o}));s.exports=w},62006:(s,o,i)=>{var a=i(15389),u=i(64894),_=i(95950);s.exports=function createFind(s){return function(o,i,w){var x=Object(o);if(!u(o)){var C=a(i,3);o=_(o),i=function(s){return C(x[s],s,x)}}var j=s(o,i,w);return j>-1?x[C?o[j]:j]:void 0}}},62060:s=>{var o=/\{(?:\n\/\* \[wrapped with .+\] \*\/)?\n?/;s.exports=function insertWrapDetails(s,i){var a=i.length;if(!a)return s;var u=a-1;return i[u]=(a>1?"& ":"")+i[u],i=i.join(a>2?", ":" "),s.replace(o,"{\n/* [wrapped with "+i+"] */\n")}},62193:(s,o,i)=>{var a=i(88984),u=i(5861),_=i(72428),w=i(56449),x=i(64894),C=i(3656),j=i(55527),L=i(37167),B=Object.prototype.hasOwnProperty;s.exports=function isEmpty(s){if(null==s)return!0;if(x(s)&&(w(s)||"string"==typeof s||"function"==typeof s.splice||C(s)||L(s)||_(s)))return!s.length;var o=u(s);if("[object Map]"==o||"[object Set]"==o)return!s.size;if(j(s))return!a(s).length;for(var i in s)if(B.call(s,i))return!1;return!0}},62224:(s,o,i)=>{var a=i(50104);s.exports=function memoizeCapped(s){var o=a(s,(function(s){return 500===i.size&&i.clear(),s})),i=o.cache;return o}},62250:s=>{"use strict";var o="object"==typeof document&&document.all;s.exports=void 0===o&&void 0!==o?function(s){return"function"==typeof s||s===o}:function(s){return"function"==typeof s}},62284:(s,o,i)=>{var a=i(84629),u=Object.prototype.hasOwnProperty;s.exports=function getFuncName(s){for(var o=s.name+"",i=a[o],_=u.call(a,o)?i.length:0;_--;){var w=i[_],x=w.func;if(null==x||x==s)return w.name}return o}},62416:(s,o,i)=>{"use strict";var a=i(85582);s.exports=a("document","documentElement")},62802:(s,o,i)=>{"use strict";s.exports=function SHA(o){var i=o.toLowerCase(),a=s.exports[i];if(!a)throw new Error(i+" is not supported (we accept pull requests)");return new a},s.exports.sha=i(27816),s.exports.sha1=i(63737),s.exports.sha224=i(26710),s.exports.sha256=i(24107),s.exports.sha384=i(32827),s.exports.sha512=i(82890)},63040:(s,o,i)=>{var a=i(21549),u=i(80079),_=i(68223);s.exports=function mapCacheClear(){this.size=0,this.__data__={hash:new a,map:new(_||u),string:new a}}},63345:s=>{s.exports=function stubArray(){return[]}},63560:(s,o,i)=>{var a=i(73170);s.exports=function set(s,o,i){return null==s?s:a(s,o,i)}},63600:(s,o,i)=>{"use strict";s.exports=PassThrough;var a=i(74610);function PassThrough(s){if(!(this instanceof PassThrough))return new PassThrough(s);a.call(this,s)}i(56698)(PassThrough,a),PassThrough.prototype._transform=function(s,o,i){i(null,s)}},63605:s=>{s.exports=function stackGet(s){return this.__data__.get(s)}},63702:s=>{s.exports=function listCacheClear(){this.__data__=[],this.size=0}},63737:(s,o,i)=>{"use strict";var a=i(56698),u=i(90392),_=i(92861).Buffer,w=[1518500249,1859775393,-1894007588,-899497514],x=new Array(80);function Sha1(){this.init(),this._w=x,u.call(this,64,56)}function rotl5(s){return s<<5|s>>>27}function rotl30(s){return s<<30|s>>>2}function ft(s,o,i,a){return 0===s?o&i|~o&a:2===s?o&i|o&a|i&a:o^i^a}a(Sha1,u),Sha1.prototype.init=function(){return this._a=1732584193,this._b=4023233417,this._c=2562383102,this._d=271733878,this._e=3285377520,this},Sha1.prototype._update=function(s){for(var o,i=this._w,a=0|this._a,u=0|this._b,_=0|this._c,x=0|this._d,C=0|this._e,j=0;j<16;++j)i[j]=s.readInt32BE(4*j);for(;j<80;++j)i[j]=(o=i[j-3]^i[j-8]^i[j-14]^i[j-16])<<1|o>>>31;for(var L=0;L<80;++L){var B=~~(L/20),$=rotl5(a)+ft(B,u,_,x)+C+i[L]+w[B]|0;C=x,x=_,_=rotl30(u),u=a,a=$}this._a=a+this._a|0,this._b=u+this._b|0,this._c=_+this._c|0,this._d=x+this._d|0,this._e=C+this._e|0},Sha1.prototype._hash=function(){var s=_.allocUnsafe(20);return s.writeInt32BE(0|this._a,0),s.writeInt32BE(0|this._b,4),s.writeInt32BE(0|this._c,8),s.writeInt32BE(0|this._d,12),s.writeInt32BE(0|this._e,16),s},s.exports=Sha1},63862:s=>{s.exports=function hashDelete(s){var o=this.has(s)&&delete this.__data__[s];return this.size-=o?1:0,o}},63912:(s,o,i)=>{var a=i(61074),u=i(49698),_=i(42054);s.exports=function stringToArray(s){return u(s)?_(s):a(s)}},63950:s=>{s.exports=function noop(){}},64039:(s,o,i)=>{"use strict";var a="undefined"!=typeof Symbol&&Symbol,u=i(41333);s.exports=function hasNativeSymbols(){return"function"==typeof a&&("function"==typeof Symbol&&("symbol"==typeof a("foo")&&("symbol"==typeof Symbol("bar")&&u())))}},64502:(s,o,i)=>{"use strict";i(82048)},64626:(s,o,i)=>{var a=i(66977);s.exports=function ary(s,o,i){return o=i?void 0:o,o=s&&null==o?s.length:o,a(s,128,void 0,void 0,void 0,void 0,o)}},64634:s=>{var o={}.toString;s.exports=Array.isArray||function(s){return"[object Array]"==o.call(s)}},64894:(s,o,i)=>{var a=i(1882),u=i(30294);s.exports=function isArrayLike(s){return null!=s&&u(s.length)&&!a(s)}},64932:(s,o,i)=>{"use strict";var a,u,_,w=i(40551),x=i(45951),C=i(46285),j=i(61626),L=i(49724),B=i(36128),$=i(92522),U=i(38530),V="Object already initialized",z=x.TypeError,Y=x.WeakMap;if(w||B.state){var Z=B.state||(B.state=new Y);Z.get=Z.get,Z.has=Z.has,Z.set=Z.set,a=function(s,o){if(Z.has(s))throw new z(V);return o.facade=s,Z.set(s,o),o},u=function(s){return Z.get(s)||{}},_=function(s){return Z.has(s)}}else{var ee=$("state");U[ee]=!0,a=function(s,o){if(L(s,ee))throw new z(V);return o.facade=s,j(s,ee,o),o},u=function(s){return L(s,ee)?s[ee]:{}},_=function(s){return L(s,ee)}}s.exports={set:a,get:u,has:_,enforce:function(s){return _(s)?u(s):a(s,{})},getterFor:function(s){return function(o){var i;if(!C(o)||(i=u(o)).type!==s)throw new z("Incompatible receiver, "+s+" required");return i}}}},65291:(s,o,i)=>{"use strict";var a=i(86048).F.ERR_INVALID_OPT_VALUE;s.exports={getHighWaterMark:function getHighWaterMark(s,o,i,u){var _=function highWaterMarkFrom(s,o,i){return null!=s.highWaterMark?s.highWaterMark:o?s[i]:null}(o,u,i);if(null!=_){if(!isFinite(_)||Math.floor(_)!==_||_<0)throw new a(u?i:"highWaterMark",_);return Math.floor(_)}return s.objectMode?16:16384}}},65482:(s,o,i)=>{"use strict";var a=i(41176);s.exports=function(s){var o=+s;return o!=o||0===o?0:a(o)}},65606:s=>{var o,i,a=s.exports={};function defaultSetTimout(){throw new Error("setTimeout has not been defined")}function defaultClearTimeout(){throw new Error("clearTimeout has not been defined")}function runTimeout(s){if(o===setTimeout)return setTimeout(s,0);if((o===defaultSetTimout||!o)&&setTimeout)return o=setTimeout,setTimeout(s,0);try{return o(s,0)}catch(i){try{return o.call(null,s,0)}catch(i){return o.call(this,s,0)}}}!function(){try{o="function"==typeof setTimeout?setTimeout:defaultSetTimout}catch(s){o=defaultSetTimout}try{i="function"==typeof clearTimeout?clearTimeout:defaultClearTimeout}catch(s){i=defaultClearTimeout}}();var u,_=[],w=!1,x=-1;function cleanUpNextTick(){w&&u&&(w=!1,u.length?_=u.concat(_):x=-1,_.length&&drainQueue())}function drainQueue(){if(!w){var s=runTimeout(cleanUpNextTick);w=!0;for(var o=_.length;o;){for(u=_,_=[];++x<o;)u&&u[x].run();x=-1,o=_.length}u=null,w=!1,function runClearTimeout(s){if(i===clearTimeout)return clearTimeout(s);if((i===defaultClearTimeout||!i)&&clearTimeout)return i=clearTimeout,clearTimeout(s);try{return i(s)}catch(o){try{return i.call(null,s)}catch(o){return i.call(this,s)}}}(s)}}function Item(s,o){this.fun=s,this.array=o}function noop(){}a.nextTick=function(s){var o=new Array(arguments.length-1);if(arguments.length>1)for(var i=1;i<arguments.length;i++)o[i-1]=arguments[i];_.push(new Item(s,o)),1!==_.length||w||runTimeout(drainQueue)},Item.prototype.run=function(){this.fun.apply(null,this.array)},a.title="browser",a.browser=!0,a.env={},a.argv=[],a.version="",a.versions={},a.on=noop,a.addListener=noop,a.once=noop,a.off=noop,a.removeListener=noop,a.removeAllListeners=noop,a.emit=noop,a.prependListener=noop,a.prependOnceListener=noop,a.listeners=function(s){return[]},a.binding=function(s){throw new Error("process.binding is not supported")},a.cwd=function(){return"/"},a.chdir=function(s){throw new Error("process.chdir is not supported")},a.umask=function(){return 0}},65772:s=>{s.exports=function json(s){const o={literal:"true false null"},i=[s.C_LINE_COMMENT_MODE,s.C_BLOCK_COMMENT_MODE],a=[s.QUOTE_STRING_MODE,s.C_NUMBER_MODE],u={end:",",endsWithParent:!0,excludeEnd:!0,contains:a,keywords:o},_={begin:/\{/,end:/\}/,contains:[{className:"attr",begin:/"/,end:/"/,contains:[s.BACKSLASH_ESCAPE],illegal:"\\n"},s.inherit(u,{begin:/:/})].concat(i),illegal:"\\S"},w={begin:"\\[",end:"\\]",contains:[s.inherit(u)],illegal:"\\S"};return a.push(_,w),i.forEach((function(s){a.push(s)})),{name:"JSON",contains:a,keywords:o,illegal:"\\S"}}},66645:(s,o,i)=>{var a=i(1733),u=i(45434),_=i(13222),w=i(22225);s.exports=function words(s,o,i){return s=_(s),void 0===(o=i?void 0:o)?u(s)?w(s):a(s):s.match(o)||[]}},66721:(s,o,i)=>{var a=i(81042),u=Object.prototype.hasOwnProperty;s.exports=function hashGet(s){var o=this.__data__;if(a){var i=o[s];return"__lodash_hash_undefined__"===i?void 0:i}return u.call(o,s)?o[s]:void 0}},66743:(s,o,i)=>{"use strict";var a=i(89353);s.exports=Function.prototype.bind||a},66977:(s,o,i)=>{var a=i(68882),u=i(11842),_=i(77078),w=i(37471),x=i(24168),C=i(37381),j=i(3209),L=i(54641),B=i(70981),$=i(61489),U=Math.max;s.exports=function createWrap(s,o,i,V,z,Y,Z,ee){var ie=2&o;if(!ie&&"function"!=typeof s)throw new TypeError("Expected a function");var ae=V?V.length:0;if(ae||(o&=-97,V=z=void 0),Z=void 0===Z?Z:U($(Z),0),ee=void 0===ee?ee:$(ee),ae-=z?z.length:0,64&o){var ce=V,le=z;V=z=void 0}var pe=ie?void 0:C(s),de=[s,o,i,V,z,ce,le,Y,Z,ee];if(pe&&j(de,pe),s=de[0],o=de[1],i=de[2],V=de[3],z=de[4],!(ee=de[9]=void 0===de[9]?ie?0:s.length:U(de[9]-ae,0))&&24&o&&(o&=-25),o&&1!=o)fe=8==o||16==o?_(s,o,ee):32!=o&&33!=o||z.length?w.apply(void 0,de):x(s,o,i,V);else var fe=u(s,o,i);return B((pe?a:L)(fe,de),s,o)}},67197:s=>{s.exports=function matchesStrictComparable(s,o){return function(i){return null!=i&&(i[s]===o&&(void 0!==o||s in Object(i)))}}},67526:(s,o)=>{"use strict";o.byteLength=function byteLength(s){var o=getLens(s),i=o[0],a=o[1];return 3*(i+a)/4-a},o.toByteArray=function toByteArray(s){var o,i,_=getLens(s),w=_[0],x=_[1],C=new u(function _byteLength(s,o,i){return 3*(o+i)/4-i}(0,w,x)),j=0,L=x>0?w-4:w;for(i=0;i<L;i+=4)o=a[s.charCodeAt(i)]<<18|a[s.charCodeAt(i+1)]<<12|a[s.charCodeAt(i+2)]<<6|a[s.charCodeAt(i+3)],C[j++]=o>>16&255,C[j++]=o>>8&255,C[j++]=255&o;2===x&&(o=a[s.charCodeAt(i)]<<2|a[s.charCodeAt(i+1)]>>4,C[j++]=255&o);1===x&&(o=a[s.charCodeAt(i)]<<10|a[s.charCodeAt(i+1)]<<4|a[s.charCodeAt(i+2)]>>2,C[j++]=o>>8&255,C[j++]=255&o);return C},o.fromByteArray=function fromByteArray(s){for(var o,a=s.length,u=a%3,_=[],w=16383,x=0,C=a-u;x<C;x+=w)_.push(encodeChunk(s,x,x+w>C?C:x+w));1===u?(o=s[a-1],_.push(i[o>>2]+i[o<<4&63]+"==")):2===u&&(o=(s[a-2]<<8)+s[a-1],_.push(i[o>>10]+i[o>>4&63]+i[o<<2&63]+"="));return _.join("")};for(var i=[],a=[],u="undefined"!=typeof Uint8Array?Uint8Array:Array,_="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/",w=0;w<64;++w)i[w]=_[w],a[_.charCodeAt(w)]=w;function getLens(s){var o=s.length;if(o%4>0)throw new Error("Invalid string. Length must be a multiple of 4");var i=s.indexOf("=");return-1===i&&(i=o),[i,i===o?0:4-i%4]}function encodeChunk(s,o,a){for(var u,_,w=[],x=o;x<a;x+=3)u=(s[x]<<16&16711680)+(s[x+1]<<8&65280)+(255&s[x+2]),w.push(i[(_=u)>>18&63]+i[_>>12&63]+i[_>>6&63]+i[63&_]);return w.join("")}a["-".charCodeAt(0)]=62,a["_".charCodeAt(0)]=63},68002:s=>{"use strict";s.exports=Math.min},68055:(s,o,i)=>{"use strict";var a=i(61626);s.exports=function(s,o,i,u){return u&&u.enumerable?s[o]=i:a(s,o,i),s}},68090:s=>{s.exports=function last(s){var o=null==s?0:s.length;return o?s[o-1]:void 0}},68223:(s,o,i)=>{var a=i(56110)(i(9325),"Map");s.exports=a},68294:(s,o,i)=>{var a=i(23007),u=i(30361),_=Math.min;s.exports=function reorder(s,o){for(var i=s.length,w=_(o.length,i),x=a(s);w--;){var C=o[w];s[w]=u(C,i)?x[C]:void 0}return s}},68623:(s,o,i)=>{"use strict";var a=i(694);s.exports=a},68882:(s,o,i)=>{var a=i(83488),u=i(48152),_=u?function(s,o){return u.set(s,o),s}:a;s.exports=_},68969:(s,o,i)=>{var a=i(47422),u=i(25160);s.exports=function parent(s,o){return o.length<2?s:a(s,u(o,0,-1))}},69302:(s,o,i)=>{var a=i(83488),u=i(56757),_=i(32865);s.exports=function baseRest(s,o){return _(u(s,o,a),s+"")}},69383:s=>{"use strict";s.exports=Error},69600:s=>{"use strict";var o,i,a=Function.prototype.toString,u="object"==typeof Reflect&&null!==Reflect&&Reflect.apply;if("function"==typeof u&&"function"==typeof Object.defineProperty)try{o=Object.defineProperty({},"length",{get:function(){throw i}}),i={},u((function(){throw 42}),null,o)}catch(s){s!==i&&(u=null)}else u=null;var _=/^\s*class\b/,w=function isES6ClassFunction(s){try{var o=a.call(s);return _.test(o)}catch(s){return!1}},x=function tryFunctionToStr(s){try{return!w(s)&&(a.call(s),!0)}catch(s){return!1}},C=Object.prototype.toString,j="function"==typeof Symbol&&!!Symbol.toStringTag,L=!(0 in[,]),B=function isDocumentDotAll(){return!1};if("object"==typeof document){var $=document.all;C.call($)===C.call(document.all)&&(B=function isDocumentDotAll(s){if((L||!s)&&(void 0===s||"object"==typeof s))try{var o=C.call(s);return("[object HTMLAllCollection]"===o||"[object HTML document.all class]"===o||"[object HTMLCollection]"===o||"[object Object]"===o)&&null==s("")}catch(s){}return!1})}s.exports=u?function isCallable(s){if(B(s))return!0;if(!s)return!1;if("function"!=typeof s&&"object"!=typeof s)return!1;try{u(s,null,o)}catch(s){if(s!==i)return!1}return!w(s)&&x(s)}:function isCallable(s){if(B(s))return!0;if(!s)return!1;if("function"!=typeof s&&"object"!=typeof s)return!1;if(j)return x(s);if(w(s))return!1;var o=C.call(s);return!("[object Function]"!==o&&"[object GeneratorFunction]"!==o&&!/^\[object HTML/.test(o))&&x(s)}},69675:s=>{"use strict";s.exports=TypeError},69884:(s,o,i)=>{var a=i(21791),u=i(37241);s.exports=function toPlainObject(s){return a(s,u(s))}},69982:(s,o,i)=>{"use strict";s.exports=i(29844)},70080:(s,o,i)=>{var a=i(26025),u=Array.prototype.splice;s.exports=function listCacheDelete(s){var o=this.__data__,i=a(o,s);return!(i<0)&&(i==o.length-1?o.pop():u.call(o,i,1),--this.size,!0)}},70414:s=>{"use strict";s.exports=Math.round},70453:(s,o,i)=>{"use strict";var a,u=i(79612),_=i(69383),w=i(41237),x=i(79290),C=i(79538),j=i(58068),L=i(69675),B=i(35345),$=i(71514),U=i(58968),V=i(6188),z=i(68002),Y=i(75880),Z=i(70414),ee=i(73093),ie=Function,getEvalledConstructor=function(s){try{return ie('"use strict"; return ('+s+").constructor;")()}catch(s){}},ae=i(75795),ce=i(30655),throwTypeError=function(){throw new L},le=ae?function(){try{return throwTypeError}catch(s){try{return ae(arguments,"callee").get}catch(s){return throwTypeError}}}():throwTypeError,pe=i(64039)(),de=i(93628),fe=i(71064),ye=i(48648),be=i(11002),_e=i(10076),Se={},we="undefined"!=typeof Uint8Array&&de?de(Uint8Array):a,xe={__proto__:null,"%AggregateError%":"undefined"==typeof AggregateError?a:AggregateError,"%Array%":Array,"%ArrayBuffer%":"undefined"==typeof ArrayBuffer?a:ArrayBuffer,"%ArrayIteratorPrototype%":pe&&de?de([][Symbol.iterator]()):a,"%AsyncFromSyncIteratorPrototype%":a,"%AsyncFunction%":Se,"%AsyncGenerator%":Se,"%AsyncGeneratorFunction%":Se,"%AsyncIteratorPrototype%":Se,"%Atomics%":"undefined"==typeof Atomics?a:Atomics,"%BigInt%":"undefined"==typeof BigInt?a:BigInt,"%BigInt64Array%":"undefined"==typeof BigInt64Array?a:BigInt64Array,"%BigUint64Array%":"undefined"==typeof BigUint64Array?a:BigUint64Array,"%Boolean%":Boolean,"%DataView%":"undefined"==typeof DataView?a:DataView,"%Date%":Date,"%decodeURI%":decodeURI,"%decodeURIComponent%":decodeURIComponent,"%encodeURI%":encodeURI,"%encodeURIComponent%":encodeURIComponent,"%Error%":_,"%eval%":eval,"%EvalError%":w,"%Float32Array%":"undefined"==typeof Float32Array?a:Float32Array,"%Float64Array%":"undefined"==typeof Float64Array?a:Float64Array,"%FinalizationRegistry%":"undefined"==typeof FinalizationRegistry?a:FinalizationRegistry,"%Function%":ie,"%GeneratorFunction%":Se,"%Int8Array%":"undefined"==typeof Int8Array?a:Int8Array,"%Int16Array%":"undefined"==typeof Int16Array?a:Int16Array,"%Int32Array%":"undefined"==typeof Int32Array?a:Int32Array,"%isFinite%":isFinite,"%isNaN%":isNaN,"%IteratorPrototype%":pe&&de?de(de([][Symbol.iterator]())):a,"%JSON%":"object"==typeof JSON?JSON:a,"%Map%":"undefined"==typeof Map?a:Map,"%MapIteratorPrototype%":"undefined"!=typeof Map&&pe&&de?de((new Map)[Symbol.iterator]()):a,"%Math%":Math,"%Number%":Number,"%Object%":u,"%Object.getOwnPropertyDescriptor%":ae,"%parseFloat%":parseFloat,"%parseInt%":parseInt,"%Promise%":"undefined"==typeof Promise?a:Promise,"%Proxy%":"undefined"==typeof Proxy?a:Proxy,"%RangeError%":x,"%ReferenceError%":C,"%Reflect%":"undefined"==typeof Reflect?a:Reflect,"%RegExp%":RegExp,"%Set%":"undefined"==typeof Set?a:Set,"%SetIteratorPrototype%":"undefined"!=typeof Set&&pe&&de?de((new Set)[Symbol.iterator]()):a,"%SharedArrayBuffer%":"undefined"==typeof SharedArrayBuffer?a:SharedArrayBuffer,"%String%":String,"%StringIteratorPrototype%":pe&&de?de(""[Symbol.iterator]()):a,"%Symbol%":pe?Symbol:a,"%SyntaxError%":j,"%ThrowTypeError%":le,"%TypedArray%":we,"%TypeError%":L,"%Uint8Array%":"undefined"==typeof Uint8Array?a:Uint8Array,"%Uint8ClampedArray%":"undefined"==typeof Uint8ClampedArray?a:Uint8ClampedArray,"%Uint16Array%":"undefined"==typeof Uint16Array?a:Uint16Array,"%Uint32Array%":"undefined"==typeof Uint32Array?a:Uint32Array,"%URIError%":B,"%WeakMap%":"undefined"==typeof WeakMap?a:WeakMap,"%WeakRef%":"undefined"==typeof WeakRef?a:WeakRef,"%WeakSet%":"undefined"==typeof WeakSet?a:WeakSet,"%Function.prototype.call%":_e,"%Function.prototype.apply%":be,"%Object.defineProperty%":ce,"%Object.getPrototypeOf%":fe,"%Math.abs%":$,"%Math.floor%":U,"%Math.max%":V,"%Math.min%":z,"%Math.pow%":Y,"%Math.round%":Z,"%Math.sign%":ee,"%Reflect.getPrototypeOf%":ye};if(de)try{null.error}catch(s){var Pe=de(de(s));xe["%Error.prototype%"]=Pe}var Te=function doEval(s){var o;if("%AsyncFunction%"===s)o=getEvalledConstructor("async function () {}");else if("%GeneratorFunction%"===s)o=getEvalledConstructor("function* () {}");else if("%AsyncGeneratorFunction%"===s)o=getEvalledConstructor("async function* () {}");else if("%AsyncGenerator%"===s){var i=doEval("%AsyncGeneratorFunction%");i&&(o=i.prototype)}else if("%AsyncIteratorPrototype%"===s){var a=doEval("%AsyncGenerator%");a&&de&&(o=de(a.prototype))}return xe[s]=o,o},Re={__proto__:null,"%ArrayBufferPrototype%":["ArrayBuffer","prototype"],"%ArrayPrototype%":["Array","prototype"],"%ArrayProto_entries%":["Array","prototype","entries"],"%ArrayProto_forEach%":["Array","prototype","forEach"],"%ArrayProto_keys%":["Array","prototype","keys"],"%ArrayProto_values%":["Array","prototype","values"],"%AsyncFunctionPrototype%":["AsyncFunction","prototype"],"%AsyncGenerator%":["AsyncGeneratorFunction","prototype"],"%AsyncGeneratorPrototype%":["AsyncGeneratorFunction","prototype","prototype"],"%BooleanPrototype%":["Boolean","prototype"],"%DataViewPrototype%":["DataView","prototype"],"%DatePrototype%":["Date","prototype"],"%ErrorPrototype%":["Error","prototype"],"%EvalErrorPrototype%":["EvalError","prototype"],"%Float32ArrayPrototype%":["Float32Array","prototype"],"%Float64ArrayPrototype%":["Float64Array","prototype"],"%FunctionPrototype%":["Function","prototype"],"%Generator%":["GeneratorFunction","prototype"],"%GeneratorPrototype%":["GeneratorFunction","prototype","prototype"],"%Int8ArrayPrototype%":["Int8Array","prototype"],"%Int16ArrayPrototype%":["Int16Array","prototype"],"%Int32ArrayPrototype%":["Int32Array","prototype"],"%JSONParse%":["JSON","parse"],"%JSONStringify%":["JSON","stringify"],"%MapPrototype%":["Map","prototype"],"%NumberPrototype%":["Number","prototype"],"%ObjectPrototype%":["Object","prototype"],"%ObjProto_toString%":["Object","prototype","toString"],"%ObjProto_valueOf%":["Object","prototype","valueOf"],"%PromisePrototype%":["Promise","prototype"],"%PromiseProto_then%":["Promise","prototype","then"],"%Promise_all%":["Promise","all"],"%Promise_reject%":["Promise","reject"],"%Promise_resolve%":["Promise","resolve"],"%RangeErrorPrototype%":["RangeError","prototype"],"%ReferenceErrorPrototype%":["ReferenceError","prototype"],"%RegExpPrototype%":["RegExp","prototype"],"%SetPrototype%":["Set","prototype"],"%SharedArrayBufferPrototype%":["SharedArrayBuffer","prototype"],"%StringPrototype%":["String","prototype"],"%SymbolPrototype%":["Symbol","prototype"],"%SyntaxErrorPrototype%":["SyntaxError","prototype"],"%TypedArrayPrototype%":["TypedArray","prototype"],"%TypeErrorPrototype%":["TypeError","prototype"],"%Uint8ArrayPrototype%":["Uint8Array","prototype"],"%Uint8ClampedArrayPrototype%":["Uint8ClampedArray","prototype"],"%Uint16ArrayPrototype%":["Uint16Array","prototype"],"%Uint32ArrayPrototype%":["Uint32Array","prototype"],"%URIErrorPrototype%":["URIError","prototype"],"%WeakMapPrototype%":["WeakMap","prototype"],"%WeakSetPrototype%":["WeakSet","prototype"]},$e=i(66743),qe=i(9957),ze=$e.call(_e,Array.prototype.concat),We=$e.call(be,Array.prototype.splice),He=$e.call(_e,String.prototype.replace),Ye=$e.call(_e,String.prototype.slice),Xe=$e.call(_e,RegExp.prototype.exec),Qe=/[^%.[\]]+|\[(?:(-?\d+(?:\.\d+)?)|(["'])((?:(?!\2)[^\\]|\\.)*?)\2)\]|(?=(?:\.|\[\])(?:\.|\[\]|%$))/g,et=/\\(\\)?/g,tt=function getBaseIntrinsic(s,o){var i,a=s;if(qe(Re,a)&&(a="%"+(i=Re[a])[0]+"%"),qe(xe,a)){var u=xe[a];if(u===Se&&(u=Te(a)),void 0===u&&!o)throw new L("intrinsic "+s+" exists, but is not available. Please file an issue!");return{alias:i,name:a,value:u}}throw new j("intrinsic "+s+" does not exist!")};s.exports=function GetIntrinsic(s,o){if("string"!=typeof s||0===s.length)throw new L("intrinsic name must be a non-empty string");if(arguments.length>1&&"boolean"!=typeof o)throw new L('"allowMissing" argument must be a boolean');if(null===Xe(/^%?[^%]*%?$/,s))throw new j("`%` may not be present anywhere but at the beginning and end of the intrinsic name");var i=function stringToPath(s){var o=Ye(s,0,1),i=Ye(s,-1);if("%"===o&&"%"!==i)throw new j("invalid intrinsic syntax, expected closing `%`");if("%"===i&&"%"!==o)throw new j("invalid intrinsic syntax, expected opening `%`");var a=[];return He(s,Qe,(function(s,o,i,u){a[a.length]=i?He(u,et,"$1"):o||s})),a}(s),a=i.length>0?i[0]:"",u=tt("%"+a+"%",o),_=u.name,w=u.value,x=!1,C=u.alias;C&&(a=C[0],We(i,ze([0,1],C)));for(var B=1,$=!0;B<i.length;B+=1){var U=i[B],V=Ye(U,0,1),z=Ye(U,-1);if(('"'===V||"'"===V||"`"===V||'"'===z||"'"===z||"`"===z)&&V!==z)throw new j("property names with quotes must have matching quotes");if("constructor"!==U&&$||(x=!0),qe(xe,_="%"+(a+="."+U)+"%"))w=xe[_];else if(null!=w){if(!(U in w)){if(!o)throw new L("base intrinsic for "+s+" exists, but the property is not available.");return}if(ae&&B+1>=i.length){var Y=ae(w,U);w=($=!!Y)&&"get"in Y&&!("originalValue"in Y.get)?Y.get:w[U]}else $=qe(w,U),w=w[U];$&&!x&&(xe[_]=w)}}return w}},70470:(s,o,i)=>{"use strict";var a=i(46028),u=i(25594);s.exports=function(s){var o=a(s,"string");return u(o)?o:o+""}},70695:(s,o,i)=>{var a=i(78096),u=i(72428),_=i(56449),w=i(3656),x=i(30361),C=i(37167),j=Object.prototype.hasOwnProperty;s.exports=function arrayLikeKeys(s,o){var i=_(s),L=!i&&u(s),B=!i&&!L&&w(s),$=!i&&!L&&!B&&C(s),U=i||L||B||$,V=U?a(s.length,String):[],z=V.length;for(var Y in s)!o&&!j.call(s,Y)||U&&("length"==Y||B&&("offset"==Y||"parent"==Y)||$&&("buffer"==Y||"byteLength"==Y||"byteOffset"==Y)||x(Y,z))||V.push(Y);return V}},70981:(s,o,i)=>{var a=i(75251),u=i(62060),_=i(32865),w=i(75948);s.exports=function setWrapToString(s,o,i){var x=o+"";return _(s,u(x,w(a(x),i)))}},71064:(s,o,i)=>{"use strict";var a=i(79612);s.exports=a.getPrototypeOf||null},71167:(s,o,i)=>{const a=i(10316);s.exports=class StringElement extends a{constructor(s,o,i){super(s,o,i),this.element="string"}primitive(){return"string"}get length(){return this.content.length}}},71340:(s,o,i)=>{"use strict";var a=i(11091),u=i(29538);a({target:"Object",stat:!0,arity:2,forced:Object.assign!==u},{assign:u})},71514:s=>{"use strict";s.exports=Math.abs},71961:(s,o,i)=>{var a=i(49653);s.exports=function cloneTypedArray(s,o){var i=o?a(s.buffer):s.buffer;return new s.constructor(i,s.byteOffset,s.length)}},72428:(s,o,i)=>{var a=i(27534),u=i(40346),_=Object.prototype,w=_.hasOwnProperty,x=_.propertyIsEnumerable,C=a(function(){return arguments}())?a:function(s){return u(s)&&w.call(s,"callee")&&!x.call(s,"callee")};s.exports=C},72552:(s,o,i)=>{var a=i(51873),u=i(659),_=i(59350),w=a?a.toStringTag:void 0;s.exports=function baseGetTag(s){return null==s?void 0===s?"[object Undefined]":"[object Null]":w&&w in Object(s)?u(s):_(s)}},72903:(s,o,i)=>{var a=i(23805),u=i(55527),_=i(90181),w=Object.prototype.hasOwnProperty;s.exports=function baseKeysIn(s){if(!a(s))return _(s);var o=u(s),i=[];for(var x in s)("constructor"!=x||!o&&w.call(s,x))&&i.push(x);return i}},72949:(s,o,i)=>{var a=i(12651);s.exports=function mapCacheSet(s,o){var i=a(this,s),u=i.size;return i.set(s,o),this.size+=i.size==u?0:1,this}},73093:(s,o,i)=>{"use strict";var a=i(94459);s.exports=function sign(s){return a(s)||0===s?s:s<0?-1:1}},73126:(s,o,i)=>{"use strict";var a=i(66743),u=i(69675),_=i(10076),w=i(13144);s.exports=function callBindBasic(s){if(s.length<1||"function"!=typeof s[0])throw new u("a function is required");return w(a,_,s)}},73170:(s,o,i)=>{var a=i(16547),u=i(31769),_=i(30361),w=i(23805),x=i(77797);s.exports=function baseSet(s,o,i,C){if(!w(s))return s;for(var j=-1,L=(o=u(o,s)).length,B=L-1,$=s;null!=$&&++j<L;){var U=x(o[j]),V=i;if("__proto__"===U||"constructor"===U||"prototype"===U)return s;if(j!=B){var z=$[U];void 0===(V=C?C(z,U,$):void 0)&&(V=w(z)?z:_(o[j+1])?[]:{})}a($,U,V),$=$[U]}return s}},73201:s=>{var o=/\w*$/;s.exports=function cloneRegExp(s){var i=new s.constructor(s.source,o.exec(s));return i.lastIndex=s.lastIndex,i}},73402:s=>{function concat(...s){return s.map((s=>function source(s){return s?"string"==typeof s?s:s.source:null}(s))).join("")}s.exports=function http(s){const o="HTTP/(2|1\\.[01])",i={className:"attribute",begin:concat("^",/[A-Za-z][A-Za-z0-9-]*/,"(?=\\:\\s)"),starts:{contains:[{className:"punctuation",begin:/: /,relevance:0,starts:{end:"$",relevance:0}}]}},a=[i,{begin:"\\n\\n",starts:{subLanguage:[],endsWithParent:!0}}];return{name:"HTTP",aliases:["https"],illegal:/\S/,contains:[{begin:"^(?="+o+" \\d{3})",end:/$/,contains:[{className:"meta",begin:o},{className:"number",begin:"\\b\\d{3}\\b"}],starts:{end:/\b\B/,illegal:/\S/,contains:a}},{begin:"(?=^[A-Z]+ (.*?) "+o+"$)",end:/$/,contains:[{className:"string",begin:" ",end:" ",excludeBegin:!0,excludeEnd:!0},{className:"meta",begin:o},{className:"keyword",begin:"[A-Z]+"}],starts:{end:/\b\B/,illegal:/\S/,contains:a}},s.inherit(i,{relevance:0})]}}},73424:(s,o,i)=>{var a=i(16962),u=i(2874),_=Array.prototype.push;function baseAry(s,o){return 2==o?function(o,i){return s(o,i)}:function(o){return s(o)}}function cloneArray(s){for(var o=s?s.length:0,i=Array(o);o--;)i[o]=s[o];return i}function wrapImmutable(s,o){return function(){var i=arguments.length;if(i){for(var a=Array(i);i--;)a[i]=arguments[i];var u=a[0]=o.apply(void 0,a);return s.apply(void 0,a),u}}}s.exports=function baseConvert(s,o,i,w){var x="function"==typeof o,C=o===Object(o);if(C&&(w=i,i=o,o=void 0),null==i)throw new TypeError;w||(w={});var j=!("cap"in w)||w.cap,L=!("curry"in w)||w.curry,B=!("fixed"in w)||w.fixed,$=!("immutable"in w)||w.immutable,U=!("rearg"in w)||w.rearg,V=x?i:u,z="curry"in w&&w.curry,Y="fixed"in w&&w.fixed,Z="rearg"in w&&w.rearg,ee=x?i.runInContext():void 0,ie=x?i:{ary:s.ary,assign:s.assign,clone:s.clone,curry:s.curry,forEach:s.forEach,isArray:s.isArray,isError:s.isError,isFunction:s.isFunction,isWeakMap:s.isWeakMap,iteratee:s.iteratee,keys:s.keys,rearg:s.rearg,toInteger:s.toInteger,toPath:s.toPath},ae=ie.ary,ce=ie.assign,le=ie.clone,pe=ie.curry,de=ie.forEach,fe=ie.isArray,ye=ie.isError,be=ie.isFunction,_e=ie.isWeakMap,Se=ie.keys,we=ie.rearg,xe=ie.toInteger,Pe=ie.toPath,Te=Se(a.aryMethod),Re={castArray:function(s){return function(){var o=arguments[0];return fe(o)?s(cloneArray(o)):s.apply(void 0,arguments)}},iteratee:function(s){return function(){var o=arguments[1],i=s(arguments[0],o),a=i.length;return j&&"number"==typeof o?(o=o>2?o-2:1,a&&a<=o?i:baseAry(i,o)):i}},mixin:function(s){return function(o){var i=this;if(!be(i))return s(i,Object(o));var a=[];return de(Se(o),(function(s){be(o[s])&&a.push([s,i.prototype[s]])})),s(i,Object(o)),de(a,(function(s){var o=s[1];be(o)?i.prototype[s[0]]=o:delete i.prototype[s[0]]})),i}},nthArg:function(s){return function(o){var i=o<0?1:xe(o)+1;return pe(s(o),i)}},rearg:function(s){return function(o,i){var a=i?i.length:0;return pe(s(o,i),a)}},runInContext:function(o){return function(i){return baseConvert(s,o(i),w)}}};function castCap(s,o){if(j){var i=a.iterateeRearg[s];if(i)return function iterateeRearg(s,o){return overArg(s,(function(s){var i=o.length;return function baseArity(s,o){return 2==o?function(o,i){return s.apply(void 0,arguments)}:function(o){return s.apply(void 0,arguments)}}(we(baseAry(s,i),o),i)}))}(o,i);var u=!x&&a.iterateeAry[s];if(u)return function iterateeAry(s,o){return overArg(s,(function(s){return"function"==typeof s?baseAry(s,o):s}))}(o,u)}return o}function castFixed(s,o,i){if(B&&(Y||!a.skipFixed[s])){var u=a.methodSpread[s],w=u&&u.start;return void 0===w?ae(o,i):function flatSpread(s,o){return function(){for(var i=arguments.length,a=i-1,u=Array(i);i--;)u[i]=arguments[i];var w=u[o],x=u.slice(0,o);return w&&_.apply(x,w),o!=a&&_.apply(x,u.slice(o+1)),s.apply(this,x)}}(o,w)}return o}function castRearg(s,o,i){return U&&i>1&&(Z||!a.skipRearg[s])?we(o,a.methodRearg[s]||a.aryRearg[i]):o}function cloneByPath(s,o){for(var i=-1,a=(o=Pe(o)).length,u=a-1,_=le(Object(s)),w=_;null!=w&&++i<a;){var x=o[i],C=w[x];null==C||be(C)||ye(C)||_e(C)||(w[x]=le(i==u?C:Object(C))),w=w[x]}return _}function createConverter(s,o){var i=a.aliasToReal[s]||s,u=a.remap[i]||i,_=w;return function(s){var a=x?ee:ie,w=x?ee[u]:o,C=ce(ce({},_),s);return baseConvert(a,i,w,C)}}function overArg(s,o){return function(){var i=arguments.length;if(!i)return s();for(var a=Array(i);i--;)a[i]=arguments[i];var u=U?0:i-1;return a[u]=o(a[u]),s.apply(void 0,a)}}function wrap(s,o,i){var u,_=a.aliasToReal[s]||s,w=o,x=Re[_];return x?w=x(o):$&&(a.mutate.array[_]?w=wrapImmutable(o,cloneArray):a.mutate.object[_]?w=wrapImmutable(o,function createCloner(s){return function(o){return s({},o)}}(o)):a.mutate.set[_]&&(w=wrapImmutable(o,cloneByPath))),de(Te,(function(s){return de(a.aryMethod[s],(function(o){if(_==o){var i=a.methodSpread[_],x=i&&i.afterRearg;return u=x?castFixed(_,castRearg(_,w,s),s):castRearg(_,castFixed(_,w,s),s),u=function castCurry(s,o,i){return z||L&&i>1?pe(o,i):o}(0,u=castCap(_,u),s),!1}})),!u})),u||(u=w),u==o&&(u=z?pe(u,1):function(){return o.apply(this,arguments)}),u.convert=createConverter(_,o),u.placeholder=o.placeholder=i,u}if(!C)return wrap(o,i,V);var $e=i,qe=[];return de(Te,(function(s){de(a.aryMethod[s],(function(s){var o=$e[a.remap[s]||s];o&&qe.push([s,wrap(s,o,$e)])}))})),de(Se($e),(function(s){var o=$e[s];if("function"==typeof o){for(var i=qe.length;i--;)if(qe[i][0]==s)return;o.convert=createConverter(s,o),qe.push([s,o])}})),de(qe,(function(s){$e[s[0]]=s[1]})),$e.convert=function convertLib(s){return $e.runInContext.convert(s)(void 0)},$e.placeholder=$e,de(Se($e),(function(s){de(a.realToAlias[s]||[],(function(o){$e[o]=$e[s]}))})),$e}},73448:(s,o,i)=>{"use strict";var a=i(73948),u=i(29367),_=i(87136),w=i(93742),x=i(76264)("iterator");s.exports=function(s){if(!_(s))return u(s,x)||u(s,"@@iterator")||w[a(s)]}},73648:(s,o,i)=>{"use strict";var a=i(39447),u=i(98828),_=i(49552);s.exports=!a&&!u((function(){return 7!==Object.defineProperty(_("div"),"a",{get:function(){return 7}}).a}))},73948:(s,o,i)=>{"use strict";var a=i(52623),u=i(62250),_=i(45807),w=i(76264)("toStringTag"),x=Object,C="Arguments"===_(function(){return arguments}());s.exports=a?_:function(s){var o,i,a;return void 0===s?"Undefined":null===s?"Null":"string"==typeof(i=function(s,o){try{return s[o]}catch(s){}}(o=x(s),w))?i:C?_(o):"Object"===(a=_(o))&&u(o.callee)?"Arguments":a}},73992:(s,o)=>{"use strict";var i=Object.prototype.hasOwnProperty;function decode(s){try{return decodeURIComponent(s.replace(/\+/g," "))}catch(s){return null}}function encode(s){try{return encodeURIComponent(s)}catch(s){return null}}o.stringify=function querystringify(s,o){o=o||"";var a,u,_=[];for(u in"string"!=typeof o&&(o="?"),s)if(i.call(s,u)){if((a=s[u])||null!=a&&!isNaN(a)||(a=""),u=encode(u),a=encode(a),null===u||null===a)continue;_.push(u+"="+a)}return _.length?o+_.join("&"):""},o.parse=function querystring(s){for(var o,i=/([^=?#&]+)=?([^&]*)/g,a={};o=i.exec(s);){var u=decode(o[1]),_=decode(o[2]);null===u||null===_||u in a||(a[u]=_)}return a}},74218:s=>{s.exports=function isKeyable(s){var o=typeof s;return"string"==o||"number"==o||"symbol"==o||"boolean"==o?"__proto__"!==s:null===s}},74239:(s,o,i)=>{"use strict";var a=i(87136),u=TypeError;s.exports=function(s){if(a(s))throw new u("Can't call method on "+s);return s}},74284:(s,o,i)=>{"use strict";var a=i(39447),u=i(73648),_=i(58661),w=i(36624),x=i(70470),C=TypeError,j=Object.defineProperty,L=Object.getOwnPropertyDescriptor,B="enumerable",$="configurable",U="writable";o.f=a?_?function defineProperty(s,o,i){if(w(s),o=x(o),w(i),"function"==typeof s&&"prototype"===o&&"value"in i&&U in i&&!i[U]){var a=L(s,o);a&&a[U]&&(s[o]=i.value,i={configurable:$ in i?i[$]:a[$],enumerable:B in i?i[B]:a[B],writable:!1})}return j(s,o,i)}:j:function defineProperty(s,o,i){if(w(s),o=x(o),w(i),u)try{return j(s,o,i)}catch(s){}if("get"in i||"set"in i)throw new C("Accessors not supported");return"value"in i&&(s[o]=i.value),s}},74335:s=>{s.exports=function overArg(s,o){return function(i){return s(o(i))}}},74372:(s,o,i)=>{"use strict";var a=i(69675),u=i(36556)("TypedArray.prototype.buffer",!0),_=i(35680);s.exports=u||function typedArrayBuffer(s){if(!_(s))throw new a("Not a Typed Array");return s.buffer}},74436:(s,o,i)=>{"use strict";var a=i(4993),u=i(34849),_=i(20575),createMethod=function(s){return function(o,i,w){var x=a(o),C=_(x);if(0===C)return!s&&-1;var j,L=u(w,C);if(s&&i!=i){for(;C>L;)if((j=x[L++])!=j)return!0}else for(;C>L;L++)if((s||L in x)&&x[L]===i)return s||L||0;return!s&&-1}};s.exports={includes:createMethod(!0),indexOf:createMethod(!1)}},74610:(s,o,i)=>{"use strict";s.exports=Transform;var a=i(86048).F,u=a.ERR_METHOD_NOT_IMPLEMENTED,_=a.ERR_MULTIPLE_CALLBACK,w=a.ERR_TRANSFORM_ALREADY_TRANSFORMING,x=a.ERR_TRANSFORM_WITH_LENGTH_0,C=i(25382);function afterTransform(s,o){var i=this._transformState;i.transforming=!1;var a=i.writecb;if(null===a)return this.emit("error",new _);i.writechunk=null,i.writecb=null,null!=o&&this.push(o),a(s);var u=this._readableState;u.reading=!1,(u.needReadable||u.length<u.highWaterMark)&&this._read(u.highWaterMark)}function Transform(s){if(!(this instanceof Transform))return new Transform(s);C.call(this,s),this._transformState={afterTransform:afterTransform.bind(this),needTransform:!1,transforming:!1,writecb:null,writechunk:null,writeencoding:null},this._readableState.needReadable=!0,this._readableState.sync=!1,s&&("function"==typeof s.transform&&(this._transform=s.transform),"function"==typeof s.flush&&(this._flush=s.flush)),this.on("prefinish",prefinish)}function prefinish(){var s=this;"function"!=typeof this._flush||this._readableState.destroyed?done(this,null,null):this._flush((function(o,i){done(s,o,i)}))}function done(s,o,i){if(o)return s.emit("error",o);if(null!=i&&s.push(i),s._writableState.length)throw new x;if(s._transformState.transforming)throw new w;return s.push(null)}i(56698)(Transform,C),Transform.prototype.push=function(s,o){return this._transformState.needTransform=!1,C.prototype.push.call(this,s,o)},Transform.prototype._transform=function(s,o,i){i(new u("_transform()"))},Transform.prototype._write=function(s,o,i){var a=this._transformState;if(a.writecb=i,a.writechunk=s,a.writeencoding=o,!a.transforming){var u=this._readableState;(a.needTransform||u.needReadable||u.length<u.highWaterMark)&&this._read(u.highWaterMark)}},Transform.prototype._read=function(s){var o=this._transformState;null===o.writechunk||o.transforming?o.needTransform=!0:(o.transforming=!0,this._transform(o.writechunk,o.writeencoding,o.afterTransform))},Transform.prototype._destroy=function(s,o){C.prototype._destroy.call(this,s,(function(s){o(s)}))}},74733:(s,o,i)=>{var a=i(21791),u=i(95950);s.exports=function baseAssign(s,o){return s&&a(o,u(o),s)}},75147:(s,o,i)=>{const a=i(85105);s.exports=class JSON06Serialiser extends a{serialise(s){if(!(s instanceof this.namespace.elements.Element))throw new TypeError(`Given element \`${s}\` is not an Element instance`);let o;s._attributes&&s.attributes.get("variable")&&(o=s.attributes.get("variable"));const i={element:s.element};s._meta&&s._meta.length>0&&(i.meta=this.serialiseObject(s.meta));const a="enum"===s.element||-1!==s.attributes.keys().indexOf("enumerations");if(a){const o=this.enumSerialiseAttributes(s);o&&(i.attributes=o)}else if(s._attributes&&s._attributes.length>0){let{attributes:a}=s;a.get("metadata")&&(a=a.clone(),a.set("meta",a.get("metadata")),a.remove("metadata")),"member"===s.element&&o&&(a=a.clone(),a.remove("variable")),a.length>0&&(i.attributes=this.serialiseObject(a))}if(a)i.content=this.enumSerialiseContent(s,i);else if(this[`${s.element}SerialiseContent`])i.content=this[`${s.element}SerialiseContent`](s,i);else if(void 0!==s.content){let a;o&&s.content.key?(a=s.content.clone(),a.key.attributes.set("variable",o),a=this.serialiseContent(a)):a=this.serialiseContent(s.content),this.shouldSerialiseContent(s,a)&&(i.content=a)}else this.shouldSerialiseContent(s,s.content)&&s instanceof this.namespace.elements.Array&&(i.content=[]);return i}shouldSerialiseContent(s,o){return"parseResult"===s.element||"httpRequest"===s.element||"httpResponse"===s.element||"category"===s.element||"link"===s.element||void 0!==o&&(!Array.isArray(o)||0!==o.length)}refSerialiseContent(s,o){return delete o.attributes,{href:s.toValue(),path:s.path.toValue()}}sourceMapSerialiseContent(s){return s.toValue()}dataStructureSerialiseContent(s){return[this.serialiseContent(s.content)]}enumSerialiseAttributes(s){const o=s.attributes.clone(),i=o.remove("enumerations")||new this.namespace.elements.Array([]),a=o.get("default");let u=o.get("samples")||new this.namespace.elements.Array([]);if(a&&a.content&&(a.content.attributes&&a.content.attributes.remove("typeAttributes"),o.set("default",new this.namespace.elements.Array([a.content]))),u.forEach((s=>{s.content&&s.content.element&&s.content.attributes.remove("typeAttributes")})),s.content&&0!==i.length&&u.unshift(s.content),u=u.map((s=>s instanceof this.namespace.elements.Array?[s]:new this.namespace.elements.Array([s.content]))),u.length&&o.set("samples",u),o.length>0)return this.serialiseObject(o)}enumSerialiseContent(s){if(s._attributes){const o=s.attributes.get("enumerations");if(o&&o.length>0)return o.content.map((s=>{const o=s.clone();return o.attributes.remove("typeAttributes"),this.serialise(o)}))}if(s.content){const o=s.content.clone();return o.attributes.remove("typeAttributes"),[this.serialise(o)]}return[]}deserialise(s){if("string"==typeof s)return new this.namespace.elements.String(s);if("number"==typeof s)return new this.namespace.elements.Number(s);if("boolean"==typeof s)return new this.namespace.elements.Boolean(s);if(null===s)return new this.namespace.elements.Null;if(Array.isArray(s))return new this.namespace.elements.Array(s.map(this.deserialise,this));const o=this.namespace.getElementClass(s.element),i=new o;i.element!==s.element&&(i.element=s.element),s.meta&&this.deserialiseObject(s.meta,i.meta),s.attributes&&this.deserialiseObject(s.attributes,i.attributes);const a=this.deserialiseContent(s.content);if(void 0===a&&null!==i.content||(i.content=a),"enum"===i.element){i.content&&i.attributes.set("enumerations",i.content);let s=i.attributes.get("samples");if(i.attributes.remove("samples"),s){const a=s;s=new this.namespace.elements.Array,a.forEach((a=>{a.forEach((a=>{const u=new o(a);u.element=i.element,s.push(u)}))}));const u=s.shift();i.content=u?u.content:void 0,i.attributes.set("samples",s)}else i.content=void 0;let a=i.attributes.get("default");if(a&&a.length>0){a=a.get(0);const s=new o(a);s.element=i.element,i.attributes.set("default",s)}}else if("dataStructure"===i.element&&Array.isArray(i.content))[i.content]=i.content;else if("category"===i.element){const s=i.attributes.get("meta");s&&(i.attributes.set("metadata",s),i.attributes.remove("meta"))}else"member"===i.element&&i.key&&i.key._attributes&&i.key._attributes.getValue("variable")&&(i.attributes.set("variable",i.key.attributes.get("variable")),i.key.attributes.remove("variable"));return i}serialiseContent(s){if(s instanceof this.namespace.elements.Element)return this.serialise(s);if(s instanceof this.namespace.KeyValuePair){const o={key:this.serialise(s.key)};return s.value&&(o.value=this.serialise(s.value)),o}return s&&s.map?s.map(this.serialise,this):s}deserialiseContent(s){if(s){if(s.element)return this.deserialise(s);if(s.key){const o=new this.namespace.KeyValuePair(this.deserialise(s.key));return s.value&&(o.value=this.deserialise(s.value)),o}if(s.map)return s.map(this.deserialise,this)}return s}shouldRefract(s){return!!(s._attributes&&s.attributes.keys().length||s._meta&&s.meta.keys().length)||"enum"!==s.element&&(s.element!==s.primitive()||"member"===s.element)}convertKeyToRefract(s,o){return this.shouldRefract(o)?this.serialise(o):"enum"===o.element?this.serialiseEnum(o):"array"===o.element?o.map((o=>this.shouldRefract(o)||"default"===s?this.serialise(o):"array"===o.element||"object"===o.element||"enum"===o.element?o.children.map((s=>this.serialise(s))):o.toValue())):"object"===o.element?(o.content||[]).map(this.serialise,this):o.toValue()}serialiseEnum(s){return s.children.map((s=>this.serialise(s)))}serialiseObject(s){const o={};return s.forEach(((s,i)=>{if(s){const a=i.toValue();o[a]=this.convertKeyToRefract(a,s)}})),o}deserialiseObject(s,o){Object.keys(s).forEach((i=>{o.set(i,this.deserialise(s[i]))}))}}},75208:s=>{"use strict";var o,i="";s.exports=function repeat(s,a){if("string"!=typeof s)throw new TypeError("expected a string");if(1===a)return s;if(2===a)return s+s;var u=s.length*a;if(o!==s||void 0===o)o=s,i="";else if(i.length>=u)return i.substr(0,u);for(;u>i.length&&a>1;)1&a&&(i+=s),a>>=1,s+=s;return i=(i+=s).substr(0,u)}},75251:s=>{var o=/\{\n\/\* \[wrapped with (.+)\] \*/,i=/,? & /;s.exports=function getWrapDetails(s){var a=s.match(o);return a?a[1].split(i):[]}},75288:s=>{s.exports=function eq(s,o){return s===o||s!=s&&o!=o}},75795:(s,o,i)=>{"use strict";var a=i(6549);if(a)try{a([],"length")}catch(s){a=null}s.exports=a},75817:s=>{"use strict";s.exports=function(s,o){return{enumerable:!(1&s),configurable:!(2&s),writable:!(4&s),value:o}}},75880:s=>{"use strict";s.exports=Math.pow},75896:(s,o,i)=>{"use strict";var a=i(65606);function emitErrorAndCloseNT(s,o){emitErrorNT(s,o),emitCloseNT(s)}function emitCloseNT(s){s._writableState&&!s._writableState.emitClose||s._readableState&&!s._readableState.emitClose||s.emit("close")}function emitErrorNT(s,o){s.emit("error",o)}s.exports={destroy:function destroy(s,o){var i=this,u=this._readableState&&this._readableState.destroyed,_=this._writableState&&this._writableState.destroyed;return u||_?(o?o(s):s&&(this._writableState?this._writableState.errorEmitted||(this._writableState.errorEmitted=!0,a.nextTick(emitErrorNT,this,s)):a.nextTick(emitErrorNT,this,s)),this):(this._readableState&&(this._readableState.destroyed=!0),this._writableState&&(this._writableState.destroyed=!0),this._destroy(s||null,(function(s){!o&&s?i._writableState?i._writableState.errorEmitted?a.nextTick(emitCloseNT,i):(i._writableState.errorEmitted=!0,a.nextTick(emitErrorAndCloseNT,i,s)):a.nextTick(emitErrorAndCloseNT,i,s):o?(a.nextTick(emitCloseNT,i),o(s)):a.nextTick(emitCloseNT,i)})),this)},undestroy:function undestroy(){this._readableState&&(this._readableState.destroyed=!1,this._readableState.reading=!1,this._readableState.ended=!1,this._readableState.endEmitted=!1),this._writableState&&(this._writableState.destroyed=!1,this._writableState.ended=!1,this._writableState.ending=!1,this._writableState.finalCalled=!1,this._writableState.prefinished=!1,this._writableState.finished=!1,this._writableState.errorEmitted=!1)},errorOrDestroy:function errorOrDestroy(s,o){var i=s._readableState,a=s._writableState;i&&i.autoDestroy||a&&a.autoDestroy?s.destroy(o):s.emit("error",o)}}},75948:(s,o,i)=>{var a=i(83729),u=i(15325),_=[["ary",128],["bind",1],["bindKey",2],["curry",8],["curryRight",16],["flip",512],["partial",32],["partialRight",64],["rearg",256]];s.exports=function updateWrapDetails(s,o){return a(_,(function(i){var a="_."+i[0];o&i[1]&&!u(s,a)&&s.push(a)})),s.sort()}},76024:(s,o,i)=>{"use strict";var a=i(41505),u=Function.prototype,_=u.apply,w=u.call;s.exports="object"==typeof Reflect&&Reflect.apply||(a?w.bind(_):function(){return w.apply(_,arguments)})},76169:(s,o,i)=>{var a=i(49653);s.exports=function cloneDataView(s,o){var i=o?a(s.buffer):s.buffer;return new s.constructor(i,s.byteOffset,s.byteLength)}},76189:s=>{var o=Object.prototype.hasOwnProperty;s.exports=function initCloneArray(s){var i=s.length,a=new s.constructor(i);return i&&"string"==typeof s[0]&&o.call(s,"index")&&(a.index=s.index,a.input=s.input),a}},76264:(s,o,i)=>{"use strict";var a=i(45951),u=i(85816),_=i(49724),w=i(6499),x=i(19846),C=i(51175),j=a.Symbol,L=u("wks"),B=C?j.for||j:j&&j.withoutSetter||w;s.exports=function(s){return _(L,s)||(L[s]=x&&_(j,s)?j[s]:B("Symbol."+s)),L[s]}},76545:(s,o,i)=>{var a=i(56110)(i(9325),"Set");s.exports=a},76578:s=>{"use strict";s.exports=["Float16Array","Float32Array","Float64Array","Int8Array","Int16Array","Int32Array","Uint8Array","Uint8ClampedArray","Uint16Array","Uint32Array","BigInt64Array","BigUint64Array"]},76959:s=>{s.exports=function strictIndexOf(s,o,i){for(var a=i-1,u=s.length;++a<u;)if(s[a]===o)return a;return-1}},77078:(s,o,i)=>{var a=i(91033),u=i(82819),_=i(37471),w=i(18073),x=i(11287),C=i(36306),j=i(9325);s.exports=function createCurry(s,o,i){var L=u(s);return function wrapper(){for(var u=arguments.length,B=Array(u),$=u,U=x(wrapper);$--;)B[$]=arguments[$];var V=u<3&&B[0]!==U&&B[u-1]!==U?[]:C(B,U);return(u-=V.length)<i?w(s,o,_,wrapper.placeholder,void 0,B,V,void 0,void 0,i-u):a(this&&this!==j&&this instanceof wrapper?L:s,this,B)}}},77199:(s,o,i)=>{var a=i(49653),u=i(76169),_=i(73201),w=i(93736),x=i(71961);s.exports=function initCloneByTag(s,o,i){var C=s.constructor;switch(o){case"[object ArrayBuffer]":return a(s);case"[object Boolean]":case"[object Date]":return new C(+s);case"[object DataView]":return u(s,i);case"[object Float32Array]":case"[object Float64Array]":case"[object Int8Array]":case"[object Int16Array]":case"[object Int32Array]":case"[object Uint8Array]":case"[object Uint8ClampedArray]":case"[object Uint16Array]":case"[object Uint32Array]":return x(s,i);case"[object Map]":case"[object Set]":return new C;case"[object Number]":case"[object String]":return new C(s);case"[object RegExp]":return _(s);case"[object Symbol]":return w(s)}}},77556:(s,o,i)=>{var a=i(51873),u=i(34932),_=i(56449),w=i(44394),x=a?a.prototype:void 0,C=x?x.toString:void 0;s.exports=function baseToString(s){if("string"==typeof s)return s;if(_(s))return u(s,baseToString)+"";if(w(s))return C?C.call(s):"";var o=s+"";return"0"==o&&1/s==-1/0?"-0":o}},77731:(s,o,i)=>{var a=i(79920)("set",i(63560));a.placeholder=i(2874),s.exports=a},77797:(s,o,i)=>{var a=i(44394);s.exports=function toKey(s){if("string"==typeof s||a(s))return s;var o=s+"";return"0"==o&&1/s==-1/0?"-0":o}},78004:s=>{"use strict";class SubRange{constructor(s,o){this.low=s,this.high=o,this.length=1+o-s}overlaps(s){return!(this.high<s.low||this.low>s.high)}touches(s){return!(this.high+1<s.low||this.low-1>s.high)}add(s){return new SubRange(Math.min(this.low,s.low),Math.max(this.high,s.high))}subtract(s){return s.low<=this.low&&s.high>=this.high?[]:s.low>this.low&&s.high<this.high?[new SubRange(this.low,s.low-1),new SubRange(s.high+1,this.high)]:s.low<=this.low?[new SubRange(s.high+1,this.high)]:[new SubRange(this.low,s.low-1)]}toString(){return this.low==this.high?this.low.toString():this.low+"-"+this.high}}class DRange{constructor(s,o){this.ranges=[],this.length=0,null!=s&&this.add(s,o)}_update_length(){this.length=this.ranges.reduce(((s,o)=>s+o.length),0)}add(s,o){var _add=s=>{for(var o=0;o<this.ranges.length&&!s.touches(this.ranges[o]);)o++;for(var i=this.ranges.slice(0,o);o<this.ranges.length&&s.touches(this.ranges[o]);)s=s.add(this.ranges[o]),o++;i.push(s),this.ranges=i.concat(this.ranges.slice(o)),this._update_length()};return s instanceof DRange?s.ranges.forEach(_add):(null==o&&(o=s),_add(new SubRange(s,o))),this}subtract(s,o){var _subtract=s=>{for(var o=0;o<this.ranges.length&&!s.overlaps(this.ranges[o]);)o++;for(var i=this.ranges.slice(0,o);o<this.ranges.length&&s.overlaps(this.ranges[o]);)i=i.concat(this.ranges[o].subtract(s)),o++;this.ranges=i.concat(this.ranges.slice(o)),this._update_length()};return s instanceof DRange?s.ranges.forEach(_subtract):(null==o&&(o=s),_subtract(new SubRange(s,o))),this}intersect(s,o){var i=[],_intersect=s=>{for(var o=0;o<this.ranges.length&&!s.overlaps(this.ranges[o]);)o++;for(;o<this.ranges.length&&s.overlaps(this.ranges[o]);){var a=Math.max(this.ranges[o].low,s.low),u=Math.min(this.ranges[o].high,s.high);i.push(new SubRange(a,u)),o++}};return s instanceof DRange?s.ranges.forEach(_intersect):(null==o&&(o=s),_intersect(new SubRange(s,o))),this.ranges=i,this._update_length(),this}index(s){for(var o=0;o<this.ranges.length&&this.ranges[o].length<=s;)s-=this.ranges[o].length,o++;return this.ranges[o].low+s}toString(){return"[ "+this.ranges.join(", ")+" ]"}clone(){return new DRange(this)}numbers(){return this.ranges.reduce(((s,o)=>{for(var i=o.low;i<=o.high;)s.push(i),i++;return s}),[])}subranges(){return this.ranges.map((s=>({low:s.low,high:s.high,length:1+s.high-s.low})))}}s.exports=DRange},78096:s=>{s.exports=function baseTimes(s,o){for(var i=-1,a=Array(s);++i<s;)a[i]=o(i);return a}},78418:(s,o,i)=>{"use strict";i(85160)},79192:(s,o,i)=>{"use strict";var a=i(51871),u=i(46285),_=i(74239),w=i(10043);s.exports=Object.setPrototypeOf||("__proto__"in{}?function(){var s,o=!1,i={};try{(s=a(Object.prototype,"__proto__","set"))(i,[]),o=i instanceof Array}catch(s){}return function setPrototypeOf(i,a){return _(i),w(a),u(i)?(o?s(i,a):i.__proto__=a,i):i}}():void 0)},79290:s=>{"use strict";s.exports=RangeError},79307:(s,o,i)=>{"use strict";var a=i(11091),u=i(44673);a({target:"Function",proto:!0,forced:Function.bind!==u},{bind:u})},79538:s=>{"use strict";s.exports=ReferenceError},79612:s=>{"use strict";s.exports=Object},79770:s=>{s.exports=function arrayFilter(s,o){for(var i=-1,a=null==s?0:s.length,u=0,_=[];++i<a;){var w=s[i];o(w,i,s)&&(_[u++]=w)}return _}},79838:()=>{},79920:(s,o,i)=>{var a=i(73424),u=i(47934);s.exports=function convert(s,o,i){return a(u,s,o,i)}},80079:(s,o,i)=>{var a=i(63702),u=i(70080),_=i(24739),w=i(48655),x=i(31175);function ListCache(s){var o=-1,i=null==s?0:s.length;for(this.clear();++o<i;){var a=s[o];this.set(a[0],a[1])}}ListCache.prototype.clear=a,ListCache.prototype.delete=u,ListCache.prototype.get=_,ListCache.prototype.has=w,ListCache.prototype.set=x,s.exports=ListCache},80218:(s,o,i)=>{var a=i(13222);s.exports=function toLower(s){return a(s).toLowerCase()}},80257:(s,o,i)=>{var a=i(30980),u=i(56017),_=i(23007);s.exports=function wrapperClone(s){if(s instanceof a)return s.clone();var o=new u(s.__wrapped__,s.__chain__);return o.__actions__=_(s.__actions__),o.__index__=s.__index__,o.__values__=s.__values__,o}},80345:(s,o,i)=>{"use strict";function ownKeys(s,o){var i=Object.keys(s);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(s);o&&(a=a.filter((function(o){return Object.getOwnPropertyDescriptor(s,o).enumerable}))),i.push.apply(i,a)}return i}function _objectSpread(s){for(var o=1;o<arguments.length;o++){var i=null!=arguments[o]?arguments[o]:{};o%2?ownKeys(Object(i),!0).forEach((function(o){_defineProperty(s,o,i[o])})):Object.getOwnPropertyDescriptors?Object.defineProperties(s,Object.getOwnPropertyDescriptors(i)):ownKeys(Object(i)).forEach((function(o){Object.defineProperty(s,o,Object.getOwnPropertyDescriptor(i,o))}))}return s}function _defineProperty(s,o,i){return(o=_toPropertyKey(o))in s?Object.defineProperty(s,o,{value:i,enumerable:!0,configurable:!0,writable:!0}):s[o]=i,s}function _defineProperties(s,o){for(var i=0;i<o.length;i++){var a=o[i];a.enumerable=a.enumerable||!1,a.configurable=!0,"value"in a&&(a.writable=!0),Object.defineProperty(s,_toPropertyKey(a.key),a)}}function _toPropertyKey(s){var o=function _toPrimitive(s,o){if("object"!=typeof s||null===s)return s;var i=s[Symbol.toPrimitive];if(void 0!==i){var a=i.call(s,o||"default");if("object"!=typeof a)return a;throw new TypeError("@@toPrimitive must return a primitive value.")}return("string"===o?String:Number)(s)}(s,"string");return"symbol"==typeof o?o:String(o)}var a=i(48287).Buffer,u=i(15340).inspect,_=u&&u.custom||"inspect";s.exports=function(){function BufferList(){!function _classCallCheck(s,o){if(!(s instanceof o))throw new TypeError("Cannot call a class as a function")}(this,BufferList),this.head=null,this.tail=null,this.length=0}return function _createClass(s,o,i){return o&&_defineProperties(s.prototype,o),i&&_defineProperties(s,i),Object.defineProperty(s,"prototype",{writable:!1}),s}(BufferList,[{key:"push",value:function push(s){var o={data:s,next:null};this.length>0?this.tail.next=o:this.head=o,this.tail=o,++this.length}},{key:"unshift",value:function unshift(s){var o={data:s,next:this.head};0===this.length&&(this.tail=o),this.head=o,++this.length}},{key:"shift",value:function shift(){if(0!==this.length){var s=this.head.data;return 1===this.length?this.head=this.tail=null:this.head=this.head.next,--this.length,s}}},{key:"clear",value:function clear(){this.head=this.tail=null,this.length=0}},{key:"join",value:function join(s){if(0===this.length)return"";for(var o=this.head,i=""+o.data;o=o.next;)i+=s+o.data;return i}},{key:"concat",value:function concat(s){if(0===this.length)return a.alloc(0);for(var o,i,u,_=a.allocUnsafe(s>>>0),w=this.head,x=0;w;)o=w.data,i=_,u=x,a.prototype.copy.call(o,i,u),x+=w.data.length,w=w.next;return _}},{key:"consume",value:function consume(s,o){var i;return s<this.head.data.length?(i=this.head.data.slice(0,s),this.head.data=this.head.data.slice(s)):i=s===this.head.data.length?this.shift():o?this._getString(s):this._getBuffer(s),i}},{key:"first",value:function first(){return this.head.data}},{key:"_getString",value:function _getString(s){var o=this.head,i=1,a=o.data;for(s-=a.length;o=o.next;){var u=o.data,_=s>u.length?u.length:s;if(_===u.length?a+=u:a+=u.slice(0,s),0===(s-=_)){_===u.length?(++i,o.next?this.head=o.next:this.head=this.tail=null):(this.head=o,o.data=u.slice(_));break}++i}return this.length-=i,a}},{key:"_getBuffer",value:function _getBuffer(s){var o=a.allocUnsafe(s),i=this.head,u=1;for(i.data.copy(o),s-=i.data.length;i=i.next;){var _=i.data,w=s>_.length?_.length:s;if(_.copy(o,o.length-s,0,w),0===(s-=w)){w===_.length?(++u,i.next?this.head=i.next:this.head=this.tail=null):(this.head=i,i.data=_.slice(w));break}++u}return this.length-=u,o}},{key:_,value:function value(s,o){return u(this,_objectSpread(_objectSpread({},o),{},{depth:0,customInspect:!1}))}}]),BufferList}()},80376:s=>{"use strict";s.exports=["constructor","hasOwnProperty","isPrototypeOf","propertyIsEnumerable","toLocaleString","toString","valueOf"]},80631:(s,o,i)=>{var a=i(28077),u=i(49326);s.exports=function hasIn(s,o){return null!=s&&u(s,o,a)}},80909:(s,o,i)=>{var a=i(30641),u=i(38329)(a);s.exports=u},80945:(s,o,i)=>{var a=i(80079),u=i(68223),_=i(53661);s.exports=function stackSet(s,o){var i=this.__data__;if(i instanceof a){var w=i.__data__;if(!u||w.length<199)return w.push([s,o]),this.size=++i.size,this;i=this.__data__=new _(w)}return i.set(s,o),this.size=i.size,this}},81042:(s,o,i)=>{var a=i(56110)(Object,"create");s.exports=a},81214:(s,o,i)=>{"use strict";function _typeof(s){return _typeof="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(s){return typeof s}:function(s){return s&&"function"==typeof Symbol&&s.constructor===Symbol&&s!==Symbol.prototype?"symbol":typeof s},_typeof(s)}Object.defineProperty(o,"__esModule",{value:!0}),o.DebounceInput=void 0;var a=_interopRequireDefault(i(96540)),u=_interopRequireDefault(i(20181)),_=["element","onChange","value","minLength","debounceTimeout","forceNotifyByEnter","forceNotifyOnBlur","onKeyDown","onBlur","inputRef"];function _interopRequireDefault(s){return s&&s.__esModule?s:{default:s}}function _objectWithoutProperties(s,o){if(null==s)return{};var i,a,u=function _objectWithoutPropertiesLoose(s,o){if(null==s)return{};var i,a,u={},_=Object.keys(s);for(a=0;a<_.length;a++)i=_[a],o.indexOf(i)>=0||(u[i]=s[i]);return u}(s,o);if(Object.getOwnPropertySymbols){var _=Object.getOwnPropertySymbols(s);for(a=0;a<_.length;a++)i=_[a],o.indexOf(i)>=0||Object.prototype.propertyIsEnumerable.call(s,i)&&(u[i]=s[i])}return u}function ownKeys(s,o){var i=Object.keys(s);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(s);o&&(a=a.filter((function(o){return Object.getOwnPropertyDescriptor(s,o).enumerable}))),i.push.apply(i,a)}return i}function _objectSpread(s){for(var o=1;o<arguments.length;o++){var i=null!=arguments[o]?arguments[o]:{};o%2?ownKeys(Object(i),!0).forEach((function(o){_defineProperty(s,o,i[o])})):Object.getOwnPropertyDescriptors?Object.defineProperties(s,Object.getOwnPropertyDescriptors(i)):ownKeys(Object(i)).forEach((function(o){Object.defineProperty(s,o,Object.getOwnPropertyDescriptor(i,o))}))}return s}function _defineProperties(s,o){for(var i=0;i<o.length;i++){var a=o[i];a.enumerable=a.enumerable||!1,a.configurable=!0,"value"in a&&(a.writable=!0),Object.defineProperty(s,a.key,a)}}function _setPrototypeOf(s,o){return _setPrototypeOf=Object.setPrototypeOf||function _setPrototypeOf(s,o){return s.__proto__=o,s},_setPrototypeOf(s,o)}function _createSuper(s){var o=function _isNativeReflectConstruct(){if("undefined"==typeof Reflect||!Reflect.construct)return!1;if(Reflect.construct.sham)return!1;if("function"==typeof Proxy)return!0;try{return Boolean.prototype.valueOf.call(Reflect.construct(Boolean,[],(function(){}))),!0}catch(s){return!1}}();return function _createSuperInternal(){var i,a=_getPrototypeOf(s);if(o){var u=_getPrototypeOf(this).constructor;i=Reflect.construct(a,arguments,u)}else i=a.apply(this,arguments);return function _possibleConstructorReturn(s,o){if(o&&("object"===_typeof(o)||"function"==typeof o))return o;if(void 0!==o)throw new TypeError("Derived constructors may only return object or undefined");return _assertThisInitialized(s)}(this,i)}}function _assertThisInitialized(s){if(void 0===s)throw new ReferenceError("this hasn't been initialised - super() hasn't been called");return s}function _getPrototypeOf(s){return _getPrototypeOf=Object.setPrototypeOf?Object.getPrototypeOf:function _getPrototypeOf(s){return s.__proto__||Object.getPrototypeOf(s)},_getPrototypeOf(s)}function _defineProperty(s,o,i){return o in s?Object.defineProperty(s,o,{value:i,enumerable:!0,configurable:!0,writable:!0}):s[o]=i,s}var w=function(s){!function _inherits(s,o){if("function"!=typeof o&&null!==o)throw new TypeError("Super expression must either be null or a function");s.prototype=Object.create(o&&o.prototype,{constructor:{value:s,writable:!0,configurable:!0}}),Object.defineProperty(s,"prototype",{writable:!1}),o&&_setPrototypeOf(s,o)}(DebounceInput,s);var o=_createSuper(DebounceInput);function DebounceInput(s){var i;!function _classCallCheck(s,o){if(!(s instanceof o))throw new TypeError("Cannot call a class as a function")}(this,DebounceInput),_defineProperty(_assertThisInitialized(i=o.call(this,s)),"onChange",(function(s){s.persist();var o=i.state.value,a=i.props.minLength;i.setState({value:s.target.value},(function(){var u=i.state.value;u.length>=a?i.notify(s):o.length>u.length&&i.notify(_objectSpread(_objectSpread({},s),{},{target:_objectSpread(_objectSpread({},s.target),{},{value:""})}))}))})),_defineProperty(_assertThisInitialized(i),"onKeyDown",(function(s){"Enter"===s.key&&i.forceNotify(s);var o=i.props.onKeyDown;o&&(s.persist(),o(s))})),_defineProperty(_assertThisInitialized(i),"onBlur",(function(s){i.forceNotify(s);var o=i.props.onBlur;o&&(s.persist(),o(s))})),_defineProperty(_assertThisInitialized(i),"createNotifier",(function(s){if(s<0)i.notify=function(){return null};else if(0===s)i.notify=i.doNotify;else{var o=(0,u.default)((function(s){i.isDebouncing=!1,i.doNotify(s)}),s);i.notify=function(s){i.isDebouncing=!0,o(s)},i.flush=function(){return o.flush()},i.cancel=function(){i.isDebouncing=!1,o.cancel()}}})),_defineProperty(_assertThisInitialized(i),"doNotify",(function(){i.props.onChange.apply(void 0,arguments)})),_defineProperty(_assertThisInitialized(i),"forceNotify",(function(s){var o=i.props.debounceTimeout;if(i.isDebouncing||!(o>0)){i.cancel&&i.cancel();var a=i.state.value,u=i.props.minLength;a.length>=u?i.doNotify(s):i.doNotify(_objectSpread(_objectSpread({},s),{},{target:_objectSpread(_objectSpread({},s.target),{},{value:a})}))}})),i.isDebouncing=!1,i.state={value:void 0===s.value||null===s.value?"":s.value};var a=i.props.debounceTimeout;return i.createNotifier(a),i}return function _createClass(s,o,i){return o&&_defineProperties(s.prototype,o),i&&_defineProperties(s,i),Object.defineProperty(s,"prototype",{writable:!1}),s}(DebounceInput,[{key:"componentDidUpdate",value:function componentDidUpdate(s){if(!this.isDebouncing){var o=this.props,i=o.value,a=o.debounceTimeout,u=s.debounceTimeout,_=s.value,w=this.state.value;void 0!==i&&_!==i&&w!==i&&this.setState({value:i}),a!==u&&this.createNotifier(a)}}},{key:"componentWillUnmount",value:function componentWillUnmount(){this.flush&&this.flush()}},{key:"render",value:function render(){var s,o,i=this.props,u=i.element,w=(i.onChange,i.value,i.minLength,i.debounceTimeout,i.forceNotifyByEnter),x=i.forceNotifyOnBlur,C=i.onKeyDown,j=i.onBlur,L=i.inputRef,B=_objectWithoutProperties(i,_),$=this.state.value;s=w?{onKeyDown:this.onKeyDown}:C?{onKeyDown:C}:{},o=x?{onBlur:this.onBlur}:j?{onBlur:j}:{};var U=L?{ref:L}:{};return a.default.createElement(u,_objectSpread(_objectSpread(_objectSpread(_objectSpread({},B),{},{onChange:this.onChange,value:$},s),o),U))}}]),DebounceInput}(a.default.PureComponent);o.DebounceInput=w,_defineProperty(w,"defaultProps",{element:"input",type:"text",onKeyDown:void 0,onBlur:void 0,value:void 0,minLength:0,debounceTimeout:100,forceNotifyByEnter:!0,forceNotifyOnBlur:!0,inputRef:void 0})},81919:(s,o,i)=>{"use strict";var a=i(48287).Buffer;function isSpecificValue(s){return s instanceof a||s instanceof Date||s instanceof RegExp}function cloneSpecificValue(s){if(s instanceof a){var o=a.alloc?a.alloc(s.length):new a(s.length);return s.copy(o),o}if(s instanceof Date)return new Date(s.getTime());if(s instanceof RegExp)return new RegExp(s);throw new Error("Unexpected situation")}function deepCloneArray(s){var o=[];return s.forEach((function(s,i){"object"==typeof s&&null!==s?Array.isArray(s)?o[i]=deepCloneArray(s):isSpecificValue(s)?o[i]=cloneSpecificValue(s):o[i]=u({},s):o[i]=s})),o}function safeGetProperty(s,o){return"__proto__"===o?void 0:s[o]}var u=s.exports=function(){if(arguments.length<1||"object"!=typeof arguments[0])return!1;if(arguments.length<2)return arguments[0];var s,o,i=arguments[0];return Array.prototype.slice.call(arguments,1).forEach((function(a){"object"!=typeof a||null===a||Array.isArray(a)||Object.keys(a).forEach((function(_){return o=safeGetProperty(i,_),(s=safeGetProperty(a,_))===i?void 0:"object"!=typeof s||null===s?void(i[_]=s):Array.isArray(s)?void(i[_]=deepCloneArray(s)):isSpecificValue(s)?void(i[_]=cloneSpecificValue(s)):"object"!=typeof o||null===o||Array.isArray(o)?void(i[_]=u({},s)):void(i[_]=u(o,s))}))})),i}},82048:(s,o,i)=>{"use strict";var a=i(11091),u=i(88280),_=i(15972),w=i(79192),x=i(19595),C=i(58075),j=i(61626),L=i(75817),B=i(39259),$=i(85884),U=i(24823),V=i(32096),z=i(76264)("toStringTag"),Y=Error,Z=[].push,ee=function AggregateError(s,o){var i,a=u(ie,this);w?i=w(new Y,a?_(this):ie):(i=a?this:C(ie),j(i,z,"Error")),void 0!==o&&j(i,"message",V(o)),$(i,ee,i.stack,1),arguments.length>2&&B(i,arguments[2]);var x=[];return U(s,Z,{that:x}),j(i,"errors",x),i};w?w(ee,Y):x(ee,Y,{name:!0});var ie=ee.prototype=C(Y.prototype,{constructor:L(1,ee),message:L(1,""),name:L(1,"AggregateError")});a({global:!0,constructor:!0,arity:2},{AggregateError:ee})},82159:(s,o,i)=>{"use strict";var a=i(62250),u=i(4640),_=TypeError;s.exports=function(s){if(a(s))return s;throw new _(u(s)+" is not a function")}},82199:(s,o,i)=>{var a=i(14528),u=i(56449);s.exports=function baseGetAllKeys(s,o,i){var _=o(s);return u(s)?_:a(_,i(s))}},82261:(s,o,i)=>{"use strict";Object.defineProperty(o,"__esModule",{value:!0});var a=_interopRequireDefault(i(9404)),u=_interopRequireDefault(i(48590));function _interopRequireDefault(s){return s&&s.__esModule?s:{default:s}}o.default=function(s,o,i){var _=Object.keys(o);if(!_.length)return"Store does not have a valid reducer. Make sure the argument passed to combineReducers is an object whose values are reducers.";var w=(0,u.default)(i);if(a.default.isImmutable?!a.default.isImmutable(s):!a.default.Iterable.isIterable(s))return"The "+w+' is of unexpected type. Expected argument to be an instance of Immutable.Collection or Immutable.Record with the following properties: "'+_.join('", "')+'".';var x=s.toSeq().keySeq().toArray().filter((function(s){return!o.hasOwnProperty(s)}));return x.length>0?"Unexpected "+(1===x.length?"property":"properties")+' "'+x.join('", "')+'" found in '+w+'. Expected to find one of the known reducer property names instead: "'+_.join('", "')+'". Unexpected properties will be ignored.':null},s.exports=o.default},82682:(s,o,i)=>{"use strict";var a=i(69600),u=Object.prototype.toString,_=Object.prototype.hasOwnProperty;s.exports=function forEach(s,o,i){if(!a(o))throw new TypeError("iterator must be a function");var w;arguments.length>=3&&(w=i),function isArray(s){return"[object Array]"===u.call(s)}(s)?function forEachArray(s,o,i){for(var a=0,u=s.length;a<u;a++)_.call(s,a)&&(null==i?o(s[a],a,s):o.call(i,s[a],a,s))}(s,o,w):"string"==typeof s?function forEachString(s,o,i){for(var a=0,u=s.length;a<u;a++)null==i?o(s.charAt(a),a,s):o.call(i,s.charAt(a),a,s)}(s,o,w):function forEachObject(s,o,i){for(var a in s)_.call(s,a)&&(null==i?o(s[a],a,s):o.call(i,s[a],a,s))}(s,o,w)}},82819:(s,o,i)=>{var a=i(39344),u=i(23805);s.exports=function createCtor(s){return function(){var o=arguments;switch(o.length){case 0:return new s;case 1:return new s(o[0]);case 2:return new s(o[0],o[1]);case 3:return new s(o[0],o[1],o[2]);case 4:return new s(o[0],o[1],o[2],o[3]);case 5:return new s(o[0],o[1],o[2],o[3],o[4]);case 6:return new s(o[0],o[1],o[2],o[3],o[4],o[5]);case 7:return new s(o[0],o[1],o[2],o[3],o[4],o[5],o[6])}var i=a(s.prototype),_=s.apply(i,o);return u(_)?_:i}}},82890:(s,o,i)=>{"use strict";var a=i(56698),u=i(90392),_=i(92861).Buffer,w=[1116352408,3609767458,1899447441,602891725,3049323471,3964484399,3921009573,2173295548,961987163,4081628472,1508970993,3053834265,2453635748,2937671579,2870763221,3664609560,3624381080,2734883394,310598401,1164996542,607225278,1323610764,1426881987,3590304994,1925078388,4068182383,2162078206,991336113,2614888103,633803317,3248222580,3479774868,3835390401,2666613458,4022224774,944711139,264347078,2341262773,604807628,2007800933,770255983,1495990901,1249150122,1856431235,1555081692,3175218132,1996064986,2198950837,2554220882,3999719339,2821834349,766784016,2952996808,2566594879,3210313671,3203337956,3336571891,1034457026,3584528711,2466948901,113926993,3758326383,338241895,168717936,666307205,1188179964,773529912,1546045734,1294757372,1522805485,1396182291,2643833823,1695183700,2343527390,1986661051,1014477480,2177026350,1206759142,2456956037,344077627,2730485921,1290863460,2820302411,3158454273,3259730800,3505952657,3345764771,106217008,3516065817,3606008344,3600352804,1432725776,4094571909,1467031594,275423344,851169720,430227734,3100823752,506948616,1363258195,659060556,3750685593,883997877,3785050280,958139571,3318307427,1322822218,3812723403,1537002063,2003034995,1747873779,3602036899,1955562222,1575990012,2024104815,1125592928,2227730452,2716904306,2361852424,442776044,2428436474,593698344,2756734187,3733110249,3204031479,2999351573,3329325298,3815920427,3391569614,3928383900,3515267271,566280711,3940187606,3454069534,4118630271,4000239992,116418474,1914138554,174292421,2731055270,289380356,3203993006,460393269,320620315,685471733,587496836,852142971,1086792851,1017036298,365543100,1126000580,2618297676,1288033470,3409855158,1501505948,4234509866,1607167915,987167468,1816402316,1246189591],x=new Array(160);function Sha512(){this.init(),this._w=x,u.call(this,128,112)}function Ch(s,o,i){return i^s&(o^i)}function maj(s,o,i){return s&o|i&(s|o)}function sigma0(s,o){return(s>>>28|o<<4)^(o>>>2|s<<30)^(o>>>7|s<<25)}function sigma1(s,o){return(s>>>14|o<<18)^(s>>>18|o<<14)^(o>>>9|s<<23)}function Gamma0(s,o){return(s>>>1|o<<31)^(s>>>8|o<<24)^s>>>7}function Gamma0l(s,o){return(s>>>1|o<<31)^(s>>>8|o<<24)^(s>>>7|o<<25)}function Gamma1(s,o){return(s>>>19|o<<13)^(o>>>29|s<<3)^s>>>6}function Gamma1l(s,o){return(s>>>19|o<<13)^(o>>>29|s<<3)^(s>>>6|o<<26)}function getCarry(s,o){return s>>>0<o>>>0?1:0}a(Sha512,u),Sha512.prototype.init=function(){return this._ah=1779033703,this._bh=3144134277,this._ch=1013904242,this._dh=2773480762,this._eh=1359893119,this._fh=2600822924,this._gh=528734635,this._hh=1541459225,this._al=4089235720,this._bl=2227873595,this._cl=4271175723,this._dl=1595750129,this._el=2917565137,this._fl=725511199,this._gl=4215389547,this._hl=327033209,this},Sha512.prototype._update=function(s){for(var o=this._w,i=0|this._ah,a=0|this._bh,u=0|this._ch,_=0|this._dh,x=0|this._eh,C=0|this._fh,j=0|this._gh,L=0|this._hh,B=0|this._al,$=0|this._bl,U=0|this._cl,V=0|this._dl,z=0|this._el,Y=0|this._fl,Z=0|this._gl,ee=0|this._hl,ie=0;ie<32;ie+=2)o[ie]=s.readInt32BE(4*ie),o[ie+1]=s.readInt32BE(4*ie+4);for(;ie<160;ie+=2){var ae=o[ie-30],ce=o[ie-30+1],le=Gamma0(ae,ce),pe=Gamma0l(ce,ae),de=Gamma1(ae=o[ie-4],ce=o[ie-4+1]),fe=Gamma1l(ce,ae),ye=o[ie-14],be=o[ie-14+1],_e=o[ie-32],Se=o[ie-32+1],we=pe+be|0,xe=le+ye+getCarry(we,pe)|0;xe=(xe=xe+de+getCarry(we=we+fe|0,fe)|0)+_e+getCarry(we=we+Se|0,Se)|0,o[ie]=xe,o[ie+1]=we}for(var Pe=0;Pe<160;Pe+=2){xe=o[Pe],we=o[Pe+1];var Te=maj(i,a,u),Re=maj(B,$,U),$e=sigma0(i,B),qe=sigma0(B,i),ze=sigma1(x,z),We=sigma1(z,x),He=w[Pe],Ye=w[Pe+1],Xe=Ch(x,C,j),Qe=Ch(z,Y,Z),et=ee+We|0,tt=L+ze+getCarry(et,ee)|0;tt=(tt=(tt=tt+Xe+getCarry(et=et+Qe|0,Qe)|0)+He+getCarry(et=et+Ye|0,Ye)|0)+xe+getCarry(et=et+we|0,we)|0;var rt=qe+Re|0,nt=$e+Te+getCarry(rt,qe)|0;L=j,ee=Z,j=C,Z=Y,C=x,Y=z,x=_+tt+getCarry(z=V+et|0,V)|0,_=u,V=U,u=a,U=$,a=i,$=B,i=tt+nt+getCarry(B=et+rt|0,et)|0}this._al=this._al+B|0,this._bl=this._bl+$|0,this._cl=this._cl+U|0,this._dl=this._dl+V|0,this._el=this._el+z|0,this._fl=this._fl+Y|0,this._gl=this._gl+Z|0,this._hl=this._hl+ee|0,this._ah=this._ah+i+getCarry(this._al,B)|0,this._bh=this._bh+a+getCarry(this._bl,$)|0,this._ch=this._ch+u+getCarry(this._cl,U)|0,this._dh=this._dh+_+getCarry(this._dl,V)|0,this._eh=this._eh+x+getCarry(this._el,z)|0,this._fh=this._fh+C+getCarry(this._fl,Y)|0,this._gh=this._gh+j+getCarry(this._gl,Z)|0,this._hh=this._hh+L+getCarry(this._hl,ee)|0},Sha512.prototype._hash=function(){var s=_.allocUnsafe(64);function writeInt64BE(o,i,a){s.writeInt32BE(o,a),s.writeInt32BE(i,a+4)}return writeInt64BE(this._ah,this._al,0),writeInt64BE(this._bh,this._bl,8),writeInt64BE(this._ch,this._cl,16),writeInt64BE(this._dh,this._dl,24),writeInt64BE(this._eh,this._el,32),writeInt64BE(this._fh,this._fl,40),writeInt64BE(this._gh,this._gl,48),writeInt64BE(this._hh,this._hl,56),s},s.exports=Sha512},83120:(s,o,i)=>{var a=i(14528),u=i(45891);s.exports=function baseFlatten(s,o,i,_,w){var x=-1,C=s.length;for(i||(i=u),w||(w=[]);++x<C;){var j=s[x];o>0&&i(j)?o>1?baseFlatten(j,o-1,i,_,w):a(w,j):_||(w[w.length]=j)}return w}},83141:(s,o,i)=>{"use strict";var a=i(92861).Buffer,u=a.isEncoding||function(s){switch((s=""+s)&&s.toLowerCase()){case"hex":case"utf8":case"utf-8":case"ascii":case"binary":case"base64":case"ucs2":case"ucs-2":case"utf16le":case"utf-16le":case"raw":return!0;default:return!1}};function StringDecoder(s){var o;switch(this.encoding=function normalizeEncoding(s){var o=function _normalizeEncoding(s){if(!s)return"utf8";for(var o;;)switch(s){case"utf8":case"utf-8":return"utf8";case"ucs2":case"ucs-2":case"utf16le":case"utf-16le":return"utf16le";case"latin1":case"binary":return"latin1";case"base64":case"ascii":case"hex":return s;default:if(o)return;s=(""+s).toLowerCase(),o=!0}}(s);if("string"!=typeof o&&(a.isEncoding===u||!u(s)))throw new Error("Unknown encoding: "+s);return o||s}(s),this.encoding){case"utf16le":this.text=utf16Text,this.end=utf16End,o=4;break;case"utf8":this.fillLast=utf8FillLast,o=4;break;case"base64":this.text=base64Text,this.end=base64End,o=3;break;default:return this.write=simpleWrite,void(this.end=simpleEnd)}this.lastNeed=0,this.lastTotal=0,this.lastChar=a.allocUnsafe(o)}function utf8CheckByte(s){return s<=127?0:s>>5==6?2:s>>4==14?3:s>>3==30?4:s>>6==2?-1:-2}function utf8FillLast(s){var o=this.lastTotal-this.lastNeed,i=function utf8CheckExtraBytes(s,o,i){if(128!=(192&o[0]))return s.lastNeed=0,"�";if(s.lastNeed>1&&o.length>1){if(128!=(192&o[1]))return s.lastNeed=1,"�";if(s.lastNeed>2&&o.length>2&&128!=(192&o[2]))return s.lastNeed=2,"�"}}(this,s);return void 0!==i?i:this.lastNeed<=s.length?(s.copy(this.lastChar,o,0,this.lastNeed),this.lastChar.toString(this.encoding,0,this.lastTotal)):(s.copy(this.lastChar,o,0,s.length),void(this.lastNeed-=s.length))}function utf16Text(s,o){if((s.length-o)%2==0){var i=s.toString("utf16le",o);if(i){var a=i.charCodeAt(i.length-1);if(a>=55296&&a<=56319)return this.lastNeed=2,this.lastTotal=4,this.lastChar[0]=s[s.length-2],this.lastChar[1]=s[s.length-1],i.slice(0,-1)}return i}return this.lastNeed=1,this.lastTotal=2,this.lastChar[0]=s[s.length-1],s.toString("utf16le",o,s.length-1)}function utf16End(s){var o=s&&s.length?this.write(s):"";if(this.lastNeed){var i=this.lastTotal-this.lastNeed;return o+this.lastChar.toString("utf16le",0,i)}return o}function base64Text(s,o){var i=(s.length-o)%3;return 0===i?s.toString("base64",o):(this.lastNeed=3-i,this.lastTotal=3,1===i?this.lastChar[0]=s[s.length-1]:(this.lastChar[0]=s[s.length-2],this.lastChar[1]=s[s.length-1]),s.toString("base64",o,s.length-i))}function base64End(s){var o=s&&s.length?this.write(s):"";return this.lastNeed?o+this.lastChar.toString("base64",0,3-this.lastNeed):o}function simpleWrite(s){return s.toString(this.encoding)}function simpleEnd(s){return s&&s.length?this.write(s):""}o.I=StringDecoder,StringDecoder.prototype.write=function(s){if(0===s.length)return"";var o,i;if(this.lastNeed){if(void 0===(o=this.fillLast(s)))return"";i=this.lastNeed,this.lastNeed=0}else i=0;return i<s.length?o?o+this.text(s,i):this.text(s,i):o||""},StringDecoder.prototype.end=function utf8End(s){var o=s&&s.length?this.write(s):"";return this.lastNeed?o+"�":o},StringDecoder.prototype.text=function utf8Text(s,o){var i=function utf8CheckIncomplete(s,o,i){var a=o.length-1;if(a<i)return 0;var u=utf8CheckByte(o[a]);if(u>=0)return u>0&&(s.lastNeed=u-1),u;if(--a<i||-2===u)return 0;if(u=utf8CheckByte(o[a]),u>=0)return u>0&&(s.lastNeed=u-2),u;if(--a<i||-2===u)return 0;if(u=utf8CheckByte(o[a]),u>=0)return u>0&&(2===u?u=0:s.lastNeed=u-3),u;return 0}(this,s,o);if(!this.lastNeed)return s.toString("utf8",o);this.lastTotal=i;var a=s.length-(i-this.lastNeed);return s.copy(this.lastChar,0,a),s.toString("utf8",o,a)},StringDecoder.prototype.fillLast=function(s){if(this.lastNeed<=s.length)return s.copy(this.lastChar,this.lastTotal-this.lastNeed,0,this.lastNeed),this.lastChar.toString(this.encoding,0,this.lastTotal);s.copy(this.lastChar,this.lastTotal-this.lastNeed,0,s.length),this.lastNeed-=s.length}},83221:s=>{s.exports=function createBaseFor(s){return function(o,i,a){for(var u=-1,_=Object(o),w=a(o),x=w.length;x--;){var C=w[s?x:++u];if(!1===i(_[C],C,_))break}return o}}},83349:(s,o,i)=>{var a=i(82199),u=i(86375),_=i(37241);s.exports=function getAllKeysIn(s){return a(s,_,u)}},83488:s=>{s.exports=function identity(s){return s}},83693:(s,o,i)=>{var a=i(64894),u=i(40346);s.exports=function isArrayLikeObject(s){return u(s)&&a(s)}},83729:s=>{s.exports=function arrayEach(s,o){for(var i=-1,a=null==s?0:s.length;++i<a&&!1!==o(s[i],i,s););return s}},84058:(s,o,i)=>{var a=i(14792),u=i(45539)((function(s,o,i){return o=o.toLowerCase(),s+(i?a(o):o)}));s.exports=u},84195:(s,o,i)=>{var a=i(66977),u=i(38816),_=u((function(s,o){return a(s,256,void 0,void 0,void 0,o)}));s.exports=_},84247:s=>{s.exports=function setToArray(s){var o=-1,i=Array(s.size);return s.forEach((function(s){i[++o]=s})),i}},84629:s=>{s.exports={}},84851:(s,o,i)=>{"use strict";s.exports=i(85401)},84977:(s,o,i)=>{"use strict";Object.defineProperty(o,"__esModule",{value:!0});var a=function _interopRequireDefault(s){return s&&s.__esModule?s:{default:s}}(i(9404)),u=i(55674);o.default=function(s){var o=arguments.length>1&&void 0!==arguments[1]?arguments[1]:a.default.Map,i=Object.keys(s);return function(){var a=arguments.length>0&&void 0!==arguments[0]?arguments[0]:o(),_=arguments[1];return a.withMutations((function(o){i.forEach((function(i){var a=(0,s[i])(o.get(i),_);(0,u.validateNextState)(a,i,_),o.set(i,a)}))}))}},s.exports=o.default},85015:(s,o,i)=>{var a=i(72552),u=i(56449),_=i(40346);s.exports=function isString(s){return"string"==typeof s||!u(s)&&_(s)&&"[object String]"==a(s)}},85087:(s,o,i)=>{var a=i(30980),u=i(37381),_=i(62284),w=i(53758);s.exports=function isLaziable(s){var o=_(s),i=w[o];if("function"!=typeof i||!(o in a.prototype))return!1;if(s===i)return!0;var x=u(i);return!!x&&s===x[0]}},85105:s=>{s.exports=class JSONSerialiser{constructor(s){this.namespace=s||new this.Namespace}serialise(s){if(!(s instanceof this.namespace.elements.Element))throw new TypeError(`Given element \`${s}\` is not an Element instance`);const o={element:s.element};s._meta&&s._meta.length>0&&(o.meta=this.serialiseObject(s.meta)),s._attributes&&s._attributes.length>0&&(o.attributes=this.serialiseObject(s.attributes));const i=this.serialiseContent(s.content);return void 0!==i&&(o.content=i),o}deserialise(s){if(!s.element)throw new Error("Given value is not an object containing an element name");const o=new(this.namespace.getElementClass(s.element));o.element!==s.element&&(o.element=s.element),s.meta&&this.deserialiseObject(s.meta,o.meta),s.attributes&&this.deserialiseObject(s.attributes,o.attributes);const i=this.deserialiseContent(s.content);return void 0===i&&null!==o.content||(o.content=i),o}serialiseContent(s){if(s instanceof this.namespace.elements.Element)return this.serialise(s);if(s instanceof this.namespace.KeyValuePair){const o={key:this.serialise(s.key)};return s.value&&(o.value=this.serialise(s.value)),o}if(s&&s.map){if(0===s.length)return;return s.map(this.serialise,this)}return s}deserialiseContent(s){if(s){if(s.element)return this.deserialise(s);if(s.key){const o=new this.namespace.KeyValuePair(this.deserialise(s.key));return s.value&&(o.value=this.deserialise(s.value)),o}if(s.map)return s.map(this.deserialise,this)}return s}serialiseObject(s){const o={};if(s.forEach(((s,i)=>{s&&(o[i.toValue()]=this.serialise(s))})),0!==Object.keys(o).length)return o}deserialiseObject(s,o){Object.keys(s).forEach((i=>{o.set(i,this.deserialise(s[i]))}))}}},85160:(s,o,i)=>{"use strict";var a=i(96540);var u="function"==typeof Object.is?Object.is:function is(s,o){return s===o&&(0!==s||1/s==1/o)||s!=s&&o!=o},_=a.useSyncExternalStore,w=a.useRef,x=a.useEffect,C=a.useMemo,j=a.useDebugValue},85250:(s,o,i)=>{var a=i(37217),u=i(87805),_=i(86649),w=i(42824),x=i(23805),C=i(37241),j=i(14974);s.exports=function baseMerge(s,o,i,L,B){s!==o&&_(o,(function(_,C){if(B||(B=new a),x(_))w(s,o,C,i,baseMerge,L,B);else{var $=L?L(j(s,C),_,C+"",s,o,B):void 0;void 0===$&&($=_),u(s,C,$)}}),C)}},85401:(s,o,i)=>{"use strict";var a=i(462);s.exports=a},85463:s=>{s.exports=function baseIsNaN(s){return s!=s}},85558:s=>{s.exports=function baseReduce(s,o,i,a,u){return u(s,(function(s,u,_){i=a?(a=!1,s):o(i,s,u,_)})),i}},85582:(s,o,i)=>{"use strict";var a=i(92046),u=i(45951),_=i(62250),aFunction=function(s){return _(s)?s:void 0};s.exports=function(s,o){return arguments.length<2?aFunction(a[s])||aFunction(u[s]):a[s]&&a[s][o]||u[s]&&u[s][o]}},85587:(s,o,i)=>{"use strict";var a=i(26311),u=create(Error);function create(s){return FormattedError.displayName=s.displayName||s.name,FormattedError;function FormattedError(o){return o&&(o=a.apply(null,arguments)),new s(o)}}s.exports=u,u.eval=create(EvalError),u.range=create(RangeError),u.reference=create(ReferenceError),u.syntax=create(SyntaxError),u.type=create(TypeError),u.uri=create(URIError),u.create=create},85762:(s,o,i)=>{"use strict";var a=i(1907),u=Error,_=a("".replace),w=String(new u("zxcasd").stack),x=/\n\s*at [^:]*:[^\n]*/,C=x.test(w);s.exports=function(s,o){if(C&&"string"==typeof s&&!u.prepareStackTrace)for(;o--;)s=_(s,x,"");return s}},85816:(s,o,i)=>{"use strict";var a=i(36128);s.exports=function(s,o){return a[s]||(a[s]=o||{})}},85884:(s,o,i)=>{"use strict";var a=i(61626),u=i(85762),_=i(23888),w=Error.captureStackTrace;s.exports=function(s,o,i,x){_&&(w?w(s,o):a(s,"stack",u(i,x)))}},86009:(s,o,i)=>{s=i.nmd(s);var a=i(34840),u=o&&!o.nodeType&&o,_=u&&s&&!s.nodeType&&s,w=_&&_.exports===u&&a.process,x=function(){try{var s=_&&_.require&&_.require("util").types;return s||w&&w.binding&&w.binding("util")}catch(s){}}();s.exports=x},86048:s=>{"use strict";var o={};function createErrorType(s,i,a){a||(a=Error);var u=function(s){function NodeError(o,a,u){return s.call(this,function getMessage(s,o,a){return"string"==typeof i?i:i(s,o,a)}(o,a,u))||this}return function _inheritsLoose(s,o){s.prototype=Object.create(o.prototype),s.prototype.constructor=s,s.__proto__=o}(NodeError,s),NodeError}(a);u.prototype.name=a.name,u.prototype.code=s,o[s]=u}function oneOf(s,o){if(Array.isArray(s)){var i=s.length;return s=s.map((function(s){return String(s)})),i>2?"one of ".concat(o," ").concat(s.slice(0,i-1).join(", "),", or ")+s[i-1]:2===i?"one of ".concat(o," ").concat(s[0]," or ").concat(s[1]):"of ".concat(o," ").concat(s[0])}return"of ".concat(o," ").concat(String(s))}createErrorType("ERR_INVALID_OPT_VALUE",(function(s,o){return'The value "'+o+'" is invalid for option "'+s+'"'}),TypeError),createErrorType("ERR_INVALID_ARG_TYPE",(function(s,o,i){var a,u;if("string"==typeof o&&function startsWith(s,o,i){return s.substr(!i||i<0?0:+i,o.length)===o}(o,"not ")?(a="must not be",o=o.replace(/^not /,"")):a="must be",function endsWith(s,o,i){return(void 0===i||i>s.length)&&(i=s.length),s.substring(i-o.length,i)===o}(s," argument"))u="The ".concat(s," ").concat(a," ").concat(oneOf(o,"type"));else{var _=function includes(s,o,i){return"number"!=typeof i&&(i=0),!(i+o.length>s.length)&&-1!==s.indexOf(o,i)}(s,".")?"property":"argument";u='The "'.concat(s,'" ').concat(_," ").concat(a," ").concat(oneOf(o,"type"))}return u+=". Received type ".concat(typeof i)}),TypeError),createErrorType("ERR_STREAM_PUSH_AFTER_EOF","stream.push() after EOF"),createErrorType("ERR_METHOD_NOT_IMPLEMENTED",(function(s){return"The "+s+" method is not implemented"})),createErrorType("ERR_STREAM_PREMATURE_CLOSE","Premature close"),createErrorType("ERR_STREAM_DESTROYED",(function(s){return"Cannot call "+s+" after a stream was destroyed"})),createErrorType("ERR_MULTIPLE_CALLBACK","Callback called multiple times"),createErrorType("ERR_STREAM_CANNOT_PIPE","Cannot pipe, not readable"),createErrorType("ERR_STREAM_WRITE_AFTER_END","write after end"),createErrorType("ERR_STREAM_NULL_VALUES","May not write null values to stream",TypeError),createErrorType("ERR_UNKNOWN_ENCODING",(function(s){return"Unknown encoding: "+s}),TypeError),createErrorType("ERR_STREAM_UNSHIFT_AFTER_END_EVENT","stream.unshift() after end event"),s.exports.F=o},86215:function(s,o){var i,a,u;a=[],i=function(){"use strict";var isNativeSmoothScrollEnabledOn=function(s){return s&&"getComputedStyle"in window&&"smooth"===window.getComputedStyle(s)["scroll-behavior"]};if("undefined"==typeof window||!("document"in window))return{};var makeScroller=function(s,o,i){var a;o=o||999,i||0===i||(i=9);var setScrollTimeoutId=function(s){a=s},stopScroll=function(){clearTimeout(a),setScrollTimeoutId(0)},getTopWithEdgeOffset=function(o){return Math.max(0,s.getTopOf(o)-i)},scrollToY=function(i,a,u){if(stopScroll(),0===a||a&&a<0||isNativeSmoothScrollEnabledOn(s.body))s.toY(i),u&&u();else{var _=s.getY(),w=Math.max(0,i)-_,x=(new Date).getTime();a=a||Math.min(Math.abs(w),o),function loopScroll(){setScrollTimeoutId(setTimeout((function(){var o=Math.min(1,((new Date).getTime()-x)/a),i=Math.max(0,Math.floor(_+w*(o<.5?2*o*o:o*(4-2*o)-1)));s.toY(i),o<1&&s.getHeight()+i<s.body.scrollHeight?loopScroll():(setTimeout(stopScroll,99),u&&u())}),9))}()}},scrollToElem=function(s,o,i){scrollToY(getTopWithEdgeOffset(s),o,i)},scrollIntoView=function(o,a,u){var _=o.getBoundingClientRect().height,w=s.getTopOf(o)+_,x=s.getHeight(),C=s.getY(),j=C+x;getTopWithEdgeOffset(o)<C||_+i>x?scrollToElem(o,a,u):w+i>j?scrollToY(w-x+i,a,u):u&&u()},scrollToCenterOf=function(o,i,a,u){scrollToY(Math.max(0,s.getTopOf(o)-s.getHeight()/2+(a||o.getBoundingClientRect().height/2)),i,u)};return{setup:function(s,a){return(0===s||s)&&(o=s),(0===a||a)&&(i=a),{defaultDuration:o,edgeOffset:i}},to:scrollToElem,toY:scrollToY,intoView:scrollIntoView,center:scrollToCenterOf,stop:stopScroll,moving:function(){return!!a},getY:s.getY,getTopOf:s.getTopOf}},s=document.documentElement,getDocY=function(){return window.scrollY||s.scrollTop},o=makeScroller({body:document.scrollingElement||document.body,toY:function(s){window.scrollTo(0,s)},getY:getDocY,getHeight:function(){return window.innerHeight||s.clientHeight},getTopOf:function(o){return o.getBoundingClientRect().top+getDocY()-s.offsetTop}});if(o.createScroller=function(o,i,a){return makeScroller({body:o,toY:function(s){o.scrollTop=s},getY:function(){return o.scrollTop},getHeight:function(){return Math.min(o.clientHeight,window.innerHeight||s.clientHeight)},getTopOf:function(s){return s.offsetTop}},i,a)},"addEventListener"in window&&!window.noZensmooth&&!isNativeSmoothScrollEnabledOn(document.body)){var i="history"in window&&"pushState"in history,a=i&&"scrollRestoration"in history;a&&(history.scrollRestoration="auto"),window.addEventListener("load",(function(){a&&(setTimeout((function(){history.scrollRestoration="manual"}),9),window.addEventListener("popstate",(function(s){s.state&&"zenscrollY"in s.state&&o.toY(s.state.zenscrollY)}),!1)),window.location.hash&&setTimeout((function(){var s=o.setup().edgeOffset;if(s){var i=document.getElementById(window.location.href.split("#")[1]);if(i){var a=Math.max(0,o.getTopOf(i)-s),u=o.getY()-a;0<=u&&u<9&&window.scrollTo(0,a)}}}),9)}),!1);var u=new RegExp("(^|\\s)noZensmooth(\\s|$)");window.addEventListener("click",(function(s){for(var _=s.target;_&&"A"!==_.tagName;)_=_.parentNode;if(!(!_||1!==s.which||s.shiftKey||s.metaKey||s.ctrlKey||s.altKey)){if(a){var w=history.state&&"object"==typeof history.state?history.state:{};w.zenscrollY=o.getY();try{history.replaceState(w,"")}catch(s){}}var x=_.getAttribute("href")||"";if(0===x.indexOf("#")&&!u.test(_.className)){var C=0,j=document.getElementById(x.substring(1));if("#"!==x){if(!j)return;C=o.getTopOf(j)}s.preventDefault();var onDone=function(){window.location=x},L=o.setup().edgeOffset;L&&(C=Math.max(0,C-L),i&&(onDone=function(){history.pushState({},"",x)})),o.toY(C,null,onDone)}}}),!1)}return o}(),void 0===(u="function"==typeof i?i.apply(o,a):i)||(s.exports=u)},86238:(s,o,i)=>{"use strict";var a=i(86048).F.ERR_STREAM_PREMATURE_CLOSE;function noop(){}s.exports=function eos(s,o,i){if("function"==typeof o)return eos(s,null,o);o||(o={}),i=function once(s){var o=!1;return function(){if(!o){o=!0;for(var i=arguments.length,a=new Array(i),u=0;u<i;u++)a[u]=arguments[u];s.apply(this,a)}}}(i||noop);var u=o.readable||!1!==o.readable&&s.readable,_=o.writable||!1!==o.writable&&s.writable,w=function onlegacyfinish(){s.writable||C()},x=s._writableState&&s._writableState.finished,C=function onfinish(){_=!1,x=!0,u||i.call(s)},j=s._readableState&&s._readableState.endEmitted,L=function onend(){u=!1,j=!0,_||i.call(s)},B=function onerror(o){i.call(s,o)},$=function onclose(){var o;return u&&!j?(s._readableState&&s._readableState.ended||(o=new a),i.call(s,o)):_&&!x?(s._writableState&&s._writableState.ended||(o=new a),i.call(s,o)):void 0},U=function onrequest(){s.req.on("finish",C)};return!function isRequest(s){return s.setHeader&&"function"==typeof s.abort}(s)?_&&!s._writableState&&(s.on("end",w),s.on("close",w)):(s.on("complete",C),s.on("abort",$),s.req?U():s.on("request",U)),s.on("end",L),s.on("finish",C),!1!==o.error&&s.on("error",B),s.on("close",$),function(){s.removeListener("complete",C),s.removeListener("abort",$),s.removeListener("request",U),s.req&&s.req.removeListener("finish",C),s.removeListener("end",w),s.removeListener("close",w),s.removeListener("finish",C),s.removeListener("end",L),s.removeListener("error",B),s.removeListener("close",$)}}},86303:(s,o,i)=>{const a=i(10316);s.exports=class LinkElement extends a{constructor(s,o,i){super(s||[],o,i),this.element="link"}get relation(){return this.attributes.get("relation")}set relation(s){this.attributes.set("relation",s)}get href(){return this.attributes.get("href")}set href(s){this.attributes.set("href",s)}}},86375:(s,o,i)=>{var a=i(14528),u=i(28879),_=i(4664),w=i(63345),x=Object.getOwnPropertySymbols?function(s){for(var o=[];s;)a(o,_(s)),s=u(s);return o}:w;s.exports=x},86649:(s,o,i)=>{var a=i(83221)();s.exports=a},86804:(s,o,i)=>{const a=i(10316),u=i(41067),_=i(71167),w=i(40239),x=i(12242),C=i(6233),j=i(87726),L=i(61045),B=i(86303),$=i(14540),U=i(92340),V=i(10866),z=i(55973);function refract(s){if(s instanceof a)return s;if("string"==typeof s)return new _(s);if("number"==typeof s)return new w(s);if("boolean"==typeof s)return new x(s);if(null===s)return new u;if(Array.isArray(s))return new C(s.map(refract));if("object"==typeof s){return new L(s)}return s}a.prototype.ObjectElement=L,a.prototype.RefElement=$,a.prototype.MemberElement=j,a.prototype.refract=refract,U.prototype.refract=refract,s.exports={Element:a,NullElement:u,StringElement:_,NumberElement:w,BooleanElement:x,ArrayElement:C,MemberElement:j,ObjectElement:L,LinkElement:B,RefElement:$,refract,ArraySlice:U,ObjectSlice:V,KeyValuePair:z}},87068:(s,o,i)=>{var a=i(37217),u=i(25911),_=i(21986),w=i(50689),x=i(5861),C=i(56449),j=i(3656),L=i(37167),B="[object Arguments]",$="[object Array]",U="[object Object]",V=Object.prototype.hasOwnProperty;s.exports=function baseIsEqualDeep(s,o,i,z,Y,Z){var ee=C(s),ie=C(o),ae=ee?$:x(s),ce=ie?$:x(o),le=(ae=ae==B?U:ae)==U,pe=(ce=ce==B?U:ce)==U,de=ae==ce;if(de&&j(s)){if(!j(o))return!1;ee=!0,le=!1}if(de&&!le)return Z||(Z=new a),ee||L(s)?u(s,o,i,z,Y,Z):_(s,o,ae,i,z,Y,Z);if(!(1&i)){var fe=le&&V.call(s,"__wrapped__"),ye=pe&&V.call(o,"__wrapped__");if(fe||ye){var be=fe?s.value():s,_e=ye?o.value():o;return Z||(Z=new a),Y(be,_e,i,z,Z)}}return!!de&&(Z||(Z=new a),w(s,o,i,z,Y,Z))}},87136:s=>{"use strict";s.exports=function(s){return null==s}},87170:(s,o)=>{"use strict";o.f=Object.getOwnPropertySymbols},87296:(s,o,i)=>{var a,u=i(55481),_=(a=/[^.]+$/.exec(u&&u.keys&&u.keys.IE_PROTO||""))?"Symbol(src)_1."+a:"";s.exports=function isMasked(s){return!!_&&_ in s}},87586:(s,o,i)=>{const a=i(6205),u=i(10023),_={0:0,t:9,n:10,v:11,f:12,r:13};o.strToChars=function(s){return s=s.replace(/(\[\\b\])|(\\)?\\(?:u([A-F0-9]{4})|x([A-F0-9]{2})|(0?[0-7]{2})|c([@A-Z[\\\]^?])|([0tnvfr]))/g,(function(s,o,i,a,u,w,x,C){if(i)return s;var j=o?8:a?parseInt(a,16):u?parseInt(u,16):w?parseInt(w,8):x?"@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^ ?".indexOf(x):_[C],L=String.fromCharCode(j);return/[[\]{}^$.|?*+()]/.test(L)&&(L="\\"+L),L}))},o.tokenizeClass=(s,i)=>{for(var _,w,x=[],C=/\\(?:(w)|(d)|(s)|(W)|(D)|(S))|((?:(?:\\)(.)|([^\]\\]))-(?:\\)?([^\]]))|(\])|(?:\\)?([^])/g;null!=(_=C.exec(s));)if(_[1])x.push(u.words());else if(_[2])x.push(u.ints());else if(_[3])x.push(u.whitespace());else if(_[4])x.push(u.notWords());else if(_[5])x.push(u.notInts());else if(_[6])x.push(u.notWhitespace());else if(_[7])x.push({type:a.RANGE,from:(_[8]||_[9]).charCodeAt(0),to:_[10].charCodeAt(0)});else{if(!(w=_[12]))return[x,C.lastIndex];x.push({type:a.CHAR,value:w.charCodeAt(0)})}o.error(i,"Unterminated character class")},o.error=(s,o)=>{throw new SyntaxError("Invalid regular expression: /"+s+"/: "+o)}},87726:(s,o,i)=>{const a=i(55973),u=i(10316);s.exports=class MemberElement extends u{constructor(s,o,i,u){super(new a,i,u),this.element="member",this.key=s,this.value=o}get key(){return this.content.key}set key(s){this.content.key=this.refract(s)}get value(){return this.content.value}set value(s){this.content.value=this.refract(s)}}},87730:(s,o,i)=>{var a=i(29172),u=i(27301),_=i(86009),w=_&&_.isMap,x=w?u(w):a;s.exports=x},87805:(s,o,i)=>{var a=i(43360),u=i(75288);s.exports=function assignMergeValue(s,o,i){(void 0!==i&&!u(s[o],i)||void 0===i&&!(o in s))&&a(s,o,i)}},87978:(s,o,i)=>{var a=i(60270),u=i(58156),_=i(80631),w=i(28586),x=i(30756),C=i(67197),j=i(77797);s.exports=function baseMatchesProperty(s,o){return w(s)&&x(o)?C(j(s),o):function(i){var w=u(i,s);return void 0===w&&w===o?_(i,s):a(o,w,3)}}},88280:(s,o,i)=>{"use strict";var a=i(1907);s.exports=a({}.isPrototypeOf)},88310:(s,o,i)=>{s.exports=Stream;var a=i(37007).EventEmitter;function Stream(){a.call(this)}i(56698)(Stream,a),Stream.Readable=i(45412),Stream.Writable=i(16708),Stream.Duplex=i(25382),Stream.Transform=i(74610),Stream.PassThrough=i(63600),Stream.finished=i(86238),Stream.pipeline=i(57758),Stream.Stream=Stream,Stream.prototype.pipe=function(s,o){var i=this;function ondata(o){s.writable&&!1===s.write(o)&&i.pause&&i.pause()}function ondrain(){i.readable&&i.resume&&i.resume()}i.on("data",ondata),s.on("drain",ondrain),s._isStdio||o&&!1===o.end||(i.on("end",onend),i.on("close",onclose));var u=!1;function onend(){u||(u=!0,s.end())}function onclose(){u||(u=!0,"function"==typeof s.destroy&&s.destroy())}function onerror(s){if(cleanup(),0===a.listenerCount(this,"error"))throw s}function cleanup(){i.removeListener("data",ondata),s.removeListener("drain",ondrain),i.removeListener("end",onend),i.removeListener("close",onclose),i.removeListener("error",onerror),s.removeListener("error",onerror),i.removeListener("end",cleanup),i.removeListener("close",cleanup),s.removeListener("close",cleanup)}return i.on("error",onerror),s.on("error",onerror),i.on("end",cleanup),i.on("close",cleanup),s.on("close",cleanup),s.emit("pipe",i),s}},88984:(s,o,i)=>{var a=i(55527),u=i(3650),_=Object.prototype.hasOwnProperty;s.exports=function baseKeys(s){if(!a(s))return u(s);var o=[];for(var i in Object(s))_.call(s,i)&&"constructor"!=i&&o.push(i);return o}},89353:s=>{"use strict";var o=Object.prototype.toString,i=Math.max,a=function concatty(s,o){for(var i=[],a=0;a<s.length;a+=1)i[a]=s[a];for(var u=0;u<o.length;u+=1)i[u+s.length]=o[u];return i};s.exports=function bind(s){var u=this;if("function"!=typeof u||"[object Function]"!==o.apply(u))throw new TypeError("Function.prototype.bind called on incompatible "+u);for(var _,w=function slicy(s,o){for(var i=[],a=o||0,u=0;a<s.length;a+=1,u+=1)i[u]=s[a];return i}(arguments,1),x=i(0,u.length-w.length),C=[],j=0;j<x;j++)C[j]="$"+j;if(_=Function("binder","return function ("+function(s,o){for(var i="",a=0;a<s.length;a+=1)i+=s[a],a+1<s.length&&(i+=o);return i}(C,",")+"){ return binder.apply(this,arguments); }")((function(){if(this instanceof _){var o=u.apply(this,a(w,arguments));return Object(o)===o?o:this}return u.apply(s,a(w,arguments))})),u.prototype){var L=function Empty(){};L.prototype=u.prototype,_.prototype=new L,L.prototype=null}return _}},89593:(s,o,i)=>{"use strict";o.H=void 0;var a=function _interopRequireDefault(s){return s&&s.__esModule?s:{default:s}}(i(84977));o.H=a.default},89935:s=>{s.exports=function stubFalse(){return!1}},90160:(s,o,i)=>{"use strict";var a=i(73948),u=String;s.exports=function(s){if("Symbol"===a(s))throw new TypeError("Cannot convert a Symbol value to a string");return u(s)}},90179:(s,o,i)=>{var a=i(34932),u=i(9999),_=i(19931),w=i(31769),x=i(21791),C=i(53138),j=i(38816),L=i(83349),B=j((function(s,o){var i={};if(null==s)return i;var j=!1;o=a(o,(function(o){return o=w(o,s),j||(j=o.length>1),o})),x(s,L(s),i),j&&(i=u(i,7,C));for(var B=o.length;B--;)_(i,o[B]);return i}));s.exports=B},90181:s=>{s.exports=function nativeKeysIn(s){var o=[];if(null!=s)for(var i in Object(s))o.push(i);return o}},90289:(s,o,i)=>{var a=i(12651);s.exports=function mapCacheGet(s){return a(this,s).get(s)}},90392:(s,o,i)=>{"use strict";var a=i(92861).Buffer,u=i(15377);function Hash(s,o){this._block=a.alloc(s),this._finalSize=o,this._blockSize=s,this._len=0}Hash.prototype.update=function(s,o){s=u(s,o||"utf8");for(var i=this._block,a=this._blockSize,_=s.length,w=this._len,x=0;x<_;){for(var C=w%a,j=Math.min(_-x,a-C),L=0;L<j;L++)i[C+L]=s[x+L];x+=j,(w+=j)%a==0&&this._update(i)}return this._len+=_,this},Hash.prototype.digest=function(s){var o=this._len%this._blockSize;this._block[o]=128,this._block.fill(0,o+1),o>=this._finalSize&&(this._update(this._block),this._block.fill(0));var i=8*this._len;if(i<=4294967295)this._block.writeUInt32BE(i,this._blockSize-4);else{var a=(4294967295&i)>>>0,u=(i-a)/4294967296;this._block.writeUInt32BE(u,this._blockSize-8),this._block.writeUInt32BE(a,this._blockSize-4)}this._update(this._block);var _=this._hash();return s?_.toString(s):_},Hash.prototype._update=function(){throw new Error("_update must be implemented by subclass")},s.exports=Hash},90916:(s,o,i)=>{var a=i(80909);s.exports=function baseSome(s,o){var i;return a(s,(function(s,a,u){return!(i=o(s,a,u))})),!!i}},90938:s=>{s.exports=function stackDelete(s){var o=this.__data__,i=o.delete(s);return this.size=o.size,i}},91033:s=>{s.exports=function apply(s,o,i){switch(i.length){case 0:return s.call(o);case 1:return s.call(o,i[0]);case 2:return s.call(o,i[0],i[1]);case 3:return s.call(o,i[0],i[1],i[2])}return s.apply(o,i)}},91596:s=>{var o=Math.max;s.exports=function composeArgs(s,i,a,u){for(var _=-1,w=s.length,x=a.length,C=-1,j=i.length,L=o(w-x,0),B=Array(j+L),$=!u;++C<j;)B[C]=i[C];for(;++_<x;)($||_<w)&&(B[a[_]]=s[_]);for(;L--;)B[C++]=s[_++];return B}},91599:(s,o,i)=>{"use strict";i(64502)},92046:s=>{"use strict";s.exports={}},92063:s=>{"use strict";s.exports=function required(s,o){if(o=o.split(":")[0],!(s=+s))return!1;switch(o){case"http":case"ws":return 80!==s;case"https":case"wss":return 443!==s;case"ftp":return 21!==s;case"gopher":return 70!==s;case"file":return!1}return 0!==s}},92271:(s,o,i)=>{var a=i(21791),u=i(4664);s.exports=function copySymbols(s,o){return a(s,u(s),o)}},92340:(s,o,i)=>{const a=i(6048);function coerceElementMatchingCallback(s){return"string"==typeof s?o=>o.element===s:s.constructor&&s.extend?o=>o instanceof s:s}class ArraySlice{constructor(s){this.elements=s||[]}toValue(){return this.elements.map((s=>s.toValue()))}map(s,o){return this.elements.map(s,o)}flatMap(s,o){return this.map(s,o).reduce(((s,o)=>s.concat(o)),[])}compactMap(s,o){const i=[];return this.forEach((a=>{const u=s.bind(o)(a);u&&i.push(u)})),i}filter(s,o){return s=coerceElementMatchingCallback(s),new ArraySlice(this.elements.filter(s,o))}reject(s,o){return s=coerceElementMatchingCallback(s),new ArraySlice(this.elements.filter(a(s),o))}find(s,o){return s=coerceElementMatchingCallback(s),this.elements.find(s,o)}forEach(s,o){this.elements.forEach(s,o)}reduce(s,o){return this.elements.reduce(s,o)}includes(s){return this.elements.some((o=>o.equals(s)))}shift(){return this.elements.shift()}unshift(s){this.elements.unshift(this.refract(s))}push(s){return this.elements.push(this.refract(s)),this}add(s){this.push(s)}get(s){return this.elements[s]}getValue(s){const o=this.elements[s];if(o)return o.toValue()}get length(){return this.elements.length}get isEmpty(){return 0===this.elements.length}get first(){return this.elements[0]}}"undefined"!=typeof Symbol&&(ArraySlice.prototype[Symbol.iterator]=function symbol(){return this.elements[Symbol.iterator]()}),s.exports=ArraySlice},92361:(s,o,i)=>{"use strict";var a=i(45807),u=i(1907);s.exports=function(s){if("Function"===a(s))return u(s)}},92522:(s,o,i)=>{"use strict";var a=i(85816),u=i(6499),_=a("keys");s.exports=function(s){return _[s]||(_[s]=u(s))}},92861:(s,o,i)=>{var a=i(48287),u=a.Buffer;function copyProps(s,o){for(var i in s)o[i]=s[i]}function SafeBuffer(s,o,i){return u(s,o,i)}u.from&&u.alloc&&u.allocUnsafe&&u.allocUnsafeSlow?s.exports=a:(copyProps(a,o),o.Buffer=SafeBuffer),SafeBuffer.prototype=Object.create(u.prototype),copyProps(u,SafeBuffer),SafeBuffer.from=function(s,o,i){if("number"==typeof s)throw new TypeError("Argument must not be a number");return u(s,o,i)},SafeBuffer.alloc=function(s,o,i){if("number"!=typeof s)throw new TypeError("Argument must be a number");var a=u(s);return void 0!==o?"string"==typeof i?a.fill(o,i):a.fill(o):a.fill(0),a},SafeBuffer.allocUnsafe=function(s){if("number"!=typeof s)throw new TypeError("Argument must be a number");return u(s)},SafeBuffer.allocUnsafeSlow=function(s){if("number"!=typeof s)throw new TypeError("Argument must be a number");return a.SlowBuffer(s)}},93243:(s,o,i)=>{var a=i(56110),u=function(){try{var s=a(Object,"defineProperty");return s({},"",{}),s}catch(s){}}();s.exports=u},93290:(s,o,i)=>{s=i.nmd(s);var a=i(9325),u=o&&!o.nodeType&&o,_=u&&s&&!s.nodeType&&s,w=_&&_.exports===u?a.Buffer:void 0,x=w?w.allocUnsafe:void 0;s.exports=function cloneBuffer(s,o){if(o)return s.slice();var i=s.length,a=x?x(i):new s.constructor(i);return s.copy(a),a}},93427:(s,o,i)=>{"use strict";var a=i(1907);s.exports=a([].slice)},93628:(s,o,i)=>{"use strict";var a=i(48648),u=i(71064),_=i(7176);s.exports=a?function getProto(s){return a(s)}:u?function getProto(s){if(!s||"object"!=typeof s&&"function"!=typeof s)throw new TypeError("getProto: not an object");return u(s)}:_?function getProto(s){return _(s)}:null},93663:(s,o,i)=>{var a=i(41799),u=i(10776),_=i(67197);s.exports=function baseMatches(s){var o=u(s);return 1==o.length&&o[0][2]?_(o[0][0],o[0][1]):function(i){return i===s||a(i,s,o)}}},93700:(s,o,i)=>{"use strict";var a=i(19709);s.exports=a},93736:(s,o,i)=>{var a=i(51873),u=a?a.prototype:void 0,_=u?u.valueOf:void 0;s.exports=function cloneSymbol(s){return _?Object(_.call(s)):{}}},93742:s=>{"use strict";s.exports={}},94033:s=>{s.exports=function baseLodash(){}},94459:s=>{"use strict";s.exports=Number.isNaN||function isNaN(s){return s!=s}},94643:(s,o,i)=>{function config(s){try{if(!i.g.localStorage)return!1}catch(s){return!1}var o=i.g.localStorage[s];return null!=o&&"true"===String(o).toLowerCase()}s.exports=function deprecate(s,o){if(config("noDeprecation"))return s;var i=!1;return function deprecated(){if(!i){if(config("throwDeprecation"))throw new Error(o);config("traceDeprecation")?console.trace(o):console.warn(o),i=!0}return s.apply(this,arguments)}}},95089:s=>{const o="[A-Za-z$_][0-9A-Za-z$_]*",i=["as","in","of","if","for","while","finally","var","new","function","do","return","void","else","break","catch","instanceof","with","throw","case","default","try","switch","continue","typeof","delete","let","yield","const","class","debugger","async","await","static","import","from","export","extends"],a=["true","false","null","undefined","NaN","Infinity"],u=[].concat(["setInterval","setTimeout","clearInterval","clearTimeout","require","exports","eval","isFinite","isNaN","parseFloat","parseInt","decodeURI","decodeURIComponent","encodeURI","encodeURIComponent","escape","unescape"],["arguments","this","super","console","window","document","localStorage","module","global"],["Intl","DataView","Number","Math","Date","String","RegExp","Object","Function","Boolean","Error","Symbol","Set","Map","WeakSet","WeakMap","Proxy","Reflect","JSON","Promise","Float64Array","Int16Array","Int32Array","Int8Array","Uint16Array","Uint32Array","Float32Array","Array","Uint8Array","Uint8ClampedArray","ArrayBuffer","BigInt64Array","BigUint64Array","BigInt"],["EvalError","InternalError","RangeError","ReferenceError","SyntaxError","TypeError","URIError"]);function lookahead(s){return concat("(?=",s,")")}function concat(...s){return s.map((s=>function source(s){return s?"string"==typeof s?s:s.source:null}(s))).join("")}s.exports=function javascript(s){const _=o,w="<>",x="</>",C={begin:/<[A-Za-z0-9\\._:-]+/,end:/\/[A-Za-z0-9\\._:-]+>|\/>/,isTrulyOpeningTag:(s,o)=>{const i=s[0].length+s.index,a=s.input[i];"<"!==a?">"===a&&(((s,{after:o})=>{const i="</"+s[0].slice(1);return-1!==s.input.indexOf(i,o)})(s,{after:i})||o.ignoreMatch()):o.ignoreMatch()}},j={$pattern:o,keyword:i,literal:a,built_in:u},L="[0-9](_?[0-9])*",B=`\\.(${L})`,$="0|[1-9](_?[0-9])*|0[0-7]*[89][0-9]*",U={className:"number",variants:[{begin:`(\\b(${$})((${B})|\\.)?|(${B}))[eE][+-]?(${L})\\b`},{begin:`\\b(${$})\\b((${B})\\b|\\.)?|(${B})\\b`},{begin:"\\b(0|[1-9](_?[0-9])*)n\\b"},{begin:"\\b0[xX][0-9a-fA-F](_?[0-9a-fA-F])*n?\\b"},{begin:"\\b0[bB][0-1](_?[0-1])*n?\\b"},{begin:"\\b0[oO][0-7](_?[0-7])*n?\\b"},{begin:"\\b0[0-7]+n?\\b"}],relevance:0},V={className:"subst",begin:"\\$\\{",end:"\\}",keywords:j,contains:[]},z={begin:"html`",end:"",starts:{end:"`",returnEnd:!1,contains:[s.BACKSLASH_ESCAPE,V],subLanguage:"xml"}},Y={begin:"css`",end:"",starts:{end:"`",returnEnd:!1,contains:[s.BACKSLASH_ESCAPE,V],subLanguage:"css"}},Z={className:"string",begin:"`",end:"`",contains:[s.BACKSLASH_ESCAPE,V]},ee={className:"comment",variants:[s.COMMENT(/\/\*\*(?!\/)/,"\\*/",{relevance:0,contains:[{className:"doctag",begin:"@[A-Za-z]+",contains:[{className:"type",begin:"\\{",end:"\\}",relevance:0},{className:"variable",begin:_+"(?=\\s*(-)|$)",endsParent:!0,relevance:0},{begin:/(?=[^\n])\s/,relevance:0}]}]}),s.C_BLOCK_COMMENT_MODE,s.C_LINE_COMMENT_MODE]},ie=[s.APOS_STRING_MODE,s.QUOTE_STRING_MODE,z,Y,Z,U,s.REGEXP_MODE];V.contains=ie.concat({begin:/\{/,end:/\}/,keywords:j,contains:["self"].concat(ie)});const ae=[].concat(ee,V.contains),ce=ae.concat([{begin:/\(/,end:/\)/,keywords:j,contains:["self"].concat(ae)}]),le={className:"params",begin:/\(/,end:/\)/,excludeBegin:!0,excludeEnd:!0,keywords:j,contains:ce};return{name:"Javascript",aliases:["js","jsx","mjs","cjs"],keywords:j,exports:{PARAMS_CONTAINS:ce},illegal:/#(?![$_A-z])/,contains:[s.SHEBANG({label:"shebang",binary:"node",relevance:5}),{label:"use_strict",className:"meta",relevance:10,begin:/^\s*['"]use (strict|asm)['"]/},s.APOS_STRING_MODE,s.QUOTE_STRING_MODE,z,Y,Z,ee,U,{begin:concat(/[{,\n]\s*/,lookahead(concat(/(((\/\/.*$)|(\/\*(\*[^/]|[^*])*\*\/))\s*)*/,_+"\\s*:"))),relevance:0,contains:[{className:"attr",begin:_+lookahead("\\s*:"),relevance:0}]},{begin:"("+s.RE_STARTERS_RE+"|\\b(case|return|throw)\\b)\\s*",keywords:"return throw case",contains:[ee,s.REGEXP_MODE,{className:"function",begin:"(\\([^()]*(\\([^()]*(\\([^()]*\\)[^()]*)*\\)[^()]*)*\\)|"+s.UNDERSCORE_IDENT_RE+")\\s*=>",returnBegin:!0,end:"\\s*=>",contains:[{className:"params",variants:[{begin:s.UNDERSCORE_IDENT_RE,relevance:0},{className:null,begin:/\(\s*\)/,skip:!0},{begin:/\(/,end:/\)/,excludeBegin:!0,excludeEnd:!0,keywords:j,contains:ce}]}]},{begin:/,/,relevance:0},{className:"",begin:/\s/,end:/\s*/,skip:!0},{variants:[{begin:w,end:x},{begin:C.begin,"on:begin":C.isTrulyOpeningTag,end:C.end}],subLanguage:"xml",contains:[{begin:C.begin,end:C.end,skip:!0,contains:["self"]}]}],relevance:0},{className:"function",beginKeywords:"function",end:/[{;]/,excludeEnd:!0,keywords:j,contains:["self",s.inherit(s.TITLE_MODE,{begin:_}),le],illegal:/%/},{beginKeywords:"while if switch catch for"},{className:"function",begin:s.UNDERSCORE_IDENT_RE+"\\([^()]*(\\([^()]*(\\([^()]*\\)[^()]*)*\\)[^()]*)*\\)\\s*\\{",returnBegin:!0,contains:[le,s.inherit(s.TITLE_MODE,{begin:_})]},{variants:[{begin:"\\."+_},{begin:"\\$"+_}],relevance:0},{className:"class",beginKeywords:"class",end:/[{;=]/,excludeEnd:!0,illegal:/[:"[\]]/,contains:[{beginKeywords:"extends"},s.UNDERSCORE_TITLE_MODE]},{begin:/\b(?=constructor)/,end:/[{;]/,excludeEnd:!0,contains:[s.inherit(s.TITLE_MODE,{begin:_}),"self",le]},{begin:"(get|set)\\s+(?="+_+"\\()",end:/\{/,keywords:"get set",contains:[s.inherit(s.TITLE_MODE,{begin:_}),{begin:/\(\)/},le]},{begin:/\$[(.]/}]}}},95116:(s,o,i)=>{"use strict";var a,u,_,w=i(98828),x=i(62250),C=i(46285),j=i(58075),L=i(15972),B=i(68055),$=i(76264),U=i(7376),V=$("iterator"),z=!1;[].keys&&("next"in(_=[].keys())?(u=L(L(_)))!==Object.prototype&&(a=u):z=!0),!C(a)||w((function(){var s={};return a[V].call(s)!==s}))?a={}:U&&(a=j(a)),x(a[V])||B(a,V,(function(){return this})),s.exports={IteratorPrototype:a,BUGGY_SAFARI_ITERATORS:z}},95950:(s,o,i)=>{var a=i(70695),u=i(88984),_=i(64894);s.exports=function keys(s){return _(s)?a(s):u(s)}},96131:(s,o,i)=>{var a=i(2523),u=i(85463),_=i(76959);s.exports=function baseIndexOf(s,o,i){return o==o?_(s,o,i):a(s,u,i)}},96540:(s,o,i)=>{"use strict";s.exports=i(15287)},96605:(s,o,i)=>{"use strict";var a=i(11091),u=i(45951),_=i(76024),w=i(19358),x="WebAssembly",C=u[x],j=7!==new Error("e",{cause:7}).cause,exportGlobalErrorCauseWrapper=function(s,o){var i={};i[s]=w(s,o,j),a({global:!0,constructor:!0,arity:1,forced:j},i)},exportWebAssemblyErrorCauseWrapper=function(s,o){if(C&&C[s]){var i={};i[s]=w(x+"."+s,o,j),a({target:x,stat:!0,constructor:!0,arity:1,forced:j},i)}};exportGlobalErrorCauseWrapper("Error",(function(s){return function Error(o){return _(s,this,arguments)}})),exportGlobalErrorCauseWrapper("EvalError",(function(s){return function EvalError(o){return _(s,this,arguments)}})),exportGlobalErrorCauseWrapper("RangeError",(function(s){return function RangeError(o){return _(s,this,arguments)}})),exportGlobalErrorCauseWrapper("ReferenceError",(function(s){return function ReferenceError(o){return _(s,this,arguments)}})),exportGlobalErrorCauseWrapper("SyntaxError",(function(s){return function SyntaxError(o){return _(s,this,arguments)}})),exportGlobalErrorCauseWrapper("TypeError",(function(s){return function TypeError(o){return _(s,this,arguments)}})),exportGlobalErrorCauseWrapper("URIError",(function(s){return function URIError(o){return _(s,this,arguments)}})),exportWebAssemblyErrorCauseWrapper("CompileError",(function(s){return function CompileError(o){return _(s,this,arguments)}})),exportWebAssemblyErrorCauseWrapper("LinkError",(function(s){return function LinkError(o){return _(s,this,arguments)}})),exportWebAssemblyErrorCauseWrapper("RuntimeError",(function(s){return function RuntimeError(o){return _(s,this,arguments)}}))},96794:(s,o,i)=>{"use strict";var a=i(45951).navigator,u=a&&a.userAgent;s.exports=u?String(u):""},96897:(s,o,i)=>{"use strict";var a=i(70453),u=i(30041),_=i(30592)(),w=i(75795),x=i(69675),C=a("%Math.floor%");s.exports=function setFunctionLength(s,o){if("function"!=typeof s)throw new x("`fn` is not a function");if("number"!=typeof o||o<0||o>4294967295||C(o)!==o)throw new x("`length` must be a positive 32-bit integer");var i=arguments.length>2&&!!arguments[2],a=!0,j=!0;if("length"in s&&w){var L=w(s,"length");L&&!L.configurable&&(a=!1),L&&!L.writable&&(j=!1)}return(a||j||!i)&&(_?u(s,"length",o,!0,!0):u(s,"length",o)),s}},98023:(s,o,i)=>{var a=i(72552),u=i(40346);s.exports=function isNumber(s){return"number"==typeof s||u(s)&&"[object Number]"==a(s)}},98828:s=>{"use strict";s.exports=function(s){try{return!!s()}catch(s){return!0}}},99363:(s,o,i)=>{"use strict";var a=i(4993),u=i(42156),_=i(93742),w=i(64932),x=i(74284).f,C=i(60183),j=i(59550),L=i(7376),B=i(39447),$="Array Iterator",U=w.set,V=w.getterFor($);s.exports=C(Array,"Array",(function(s,o){U(this,{type:$,target:a(s),index:0,kind:o})}),(function(){var s=V(this),o=s.target,i=s.index++;if(!o||i>=o.length)return s.target=null,j(void 0,!0);switch(s.kind){case"keys":return j(i,!1);case"values":return j(o[i],!1)}return j([i,o[i]],!1)}),"values");var z=_.Arguments=_.Array;if(u("keys"),u("values"),u("entries"),!L&&B&&"values"!==z.name)try{x(z,"name",{value:"values"})}catch(s){}},99374:(s,o,i)=>{var a=i(54128),u=i(23805),_=i(44394),w=/^[-+]0x[0-9a-f]+$/i,x=/^0b[01]+$/i,C=/^0o[0-7]+$/i,j=parseInt;s.exports=function toNumber(s){if("number"==typeof s)return s;if(_(s))return NaN;if(u(s)){var o="function"==typeof s.valueOf?s.valueOf():s;s=u(o)?o+"":o}if("string"!=typeof s)return 0===s?s:+s;s=a(s);var i=x.test(s);return i||C.test(s)?j(s.slice(2),i?2:8):w.test(s)?NaN:+s}}},o={};function __webpack_require__(i){var a=o[i];if(void 0!==a)return a.exports;var u=o[i]={id:i,loaded:!1,exports:{}};return s[i].call(u.exports,u,u.exports,__webpack_require__),u.loaded=!0,u.exports}__webpack_require__.n=s=>{var o=s&&s.__esModule?()=>s.default:()=>s;return __webpack_require__.d(o,{a:o}),o},__webpack_require__.d=(s,o)=>{for(var i in o)__webpack_require__.o(o,i)&&!__webpack_require__.o(s,i)&&Object.defineProperty(s,i,{enumerable:!0,get:o[i]})},__webpack_require__.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(s){if("object"==typeof window)return window}}(),__webpack_require__.o=(s,o)=>Object.prototype.hasOwnProperty.call(s,o),__webpack_require__.r=s=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(s,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(s,"__esModule",{value:!0})},__webpack_require__.nmd=s=>(s.paths=[],s.children||(s.children=[]),s);var i={};return(()=>{"use strict";__webpack_require__.d(i,{default:()=>WT});var s={};__webpack_require__.r(s),__webpack_require__.d(s,{CLEAR:()=>at,CLEAR_BY:()=>ct,NEW_AUTH_ERR:()=>it,NEW_SPEC_ERR:()=>st,NEW_SPEC_ERR_BATCH:()=>ot,NEW_THROWN_ERR:()=>rt,NEW_THROWN_ERR_BATCH:()=>nt,clear:()=>clear,clearBy:()=>clearBy,newAuthErr:()=>newAuthErr,newSpecErr:()=>newSpecErr,newSpecErrBatch:()=>newSpecErrBatch,newThrownErr:()=>newThrownErr,newThrownErrBatch:()=>newThrownErrBatch});var o={};__webpack_require__.r(o),__webpack_require__.d(o,{AUTHORIZE:()=>Rt,AUTHORIZE_OAUTH2:()=>Lt,CONFIGURE_AUTH:()=>Ft,LOGOUT:()=>Dt,RESTORE_AUTHORIZATION:()=>Bt,SHOW_AUTH_POPUP:()=>Mt,authPopup:()=>authPopup,authorize:()=>authorize,authorizeAccessCodeWithBasicAuthentication:()=>authorizeAccessCodeWithBasicAuthentication,authorizeAccessCodeWithFormParams:()=>authorizeAccessCodeWithFormParams,authorizeApplication:()=>authorizeApplication,authorizeOauth2:()=>authorizeOauth2,authorizeOauth2WithPersistOption:()=>authorizeOauth2WithPersistOption,authorizePassword:()=>authorizePassword,authorizeRequest:()=>authorizeRequest,authorizeWithPersistOption:()=>authorizeWithPersistOption,configureAuth:()=>configureAuth,logout:()=>logout,logoutWithPersistOption:()=>logoutWithPersistOption,persistAuthorizationIfNeeded:()=>persistAuthorizationIfNeeded,preAuthorizeImplicit:()=>preAuthorizeImplicit,restoreAuthorization:()=>restoreAuthorization,showDefinitions:()=>showDefinitions});var a={};__webpack_require__.r(a),__webpack_require__.d(a,{authorized:()=>Jt,definitionsForRequirements:()=>definitionsForRequirements,definitionsToAuthorize:()=>Wt,getConfigs:()=>Ht,getDefinitionsByNames:()=>getDefinitionsByNames,isAuthorized:()=>isAuthorized,selectAuthPath:()=>selectAuthPath,shownDefinitions:()=>zt});var u={};__webpack_require__.r(u),__webpack_require__.d(u,{TOGGLE_CONFIGS:()=>gn,UPDATE_CONFIGS:()=>mn,downloadConfig:()=>downloadConfig,getConfigByUrl:()=>getConfigByUrl,loaded:()=>actions_loaded,toggle:()=>toggle,update:()=>update});var _={};__webpack_require__.r(_),__webpack_require__.d(_,{get:()=>get});var w={};__webpack_require__.r(w),__webpack_require__.d(w,{transform:()=>transform});var x={};__webpack_require__.r(x),__webpack_require__.d(x,{transform:()=>parameter_oneof_transform});var C={};__webpack_require__.r(C),__webpack_require__.d(C,{allErrors:()=>In,lastError:()=>Tn});var j={};__webpack_require__.r(j),__webpack_require__.d(j,{SHOW:()=>Fn,UPDATE_FILTER:()=>Dn,UPDATE_LAYOUT:()=>Rn,UPDATE_MODE:()=>Ln,changeMode:()=>changeMode,show:()=>actions_show,updateFilter:()=>updateFilter,updateLayout:()=>updateLayout});var L={};__webpack_require__.r(L),__webpack_require__.d(L,{current:()=>current,currentFilter:()=>currentFilter,isShown:()=>isShown,showSummary:()=>$n,whatMode:()=>whatMode});var B={};__webpack_require__.r(B),__webpack_require__.d(B,{taggedOperations:()=>taggedOperations});var $={};__webpack_require__.r($),__webpack_require__.d($,{getActiveLanguage:()=>Vn,getDefaultExpanded:()=>zn,getGenerators:()=>Un,getSnippetGenerators:()=>getSnippetGenerators});var U={};__webpack_require__.r(U),__webpack_require__.d(U,{JsonSchemaArrayItemFile:()=>JsonSchemaArrayItemFile,JsonSchemaArrayItemText:()=>JsonSchemaArrayItemText,JsonSchemaForm:()=>JsonSchemaForm,JsonSchema_array:()=>JsonSchema_array,JsonSchema_boolean:()=>JsonSchema_boolean,JsonSchema_object:()=>JsonSchema_object,JsonSchema_string:()=>JsonSchema_string});var V={};__webpack_require__.r(V),__webpack_require__.d(V,{allowTryItOutFor:()=>allowTryItOutFor,basePath:()=>Hs,canExecuteScheme:()=>canExecuteScheme,consumes:()=>Us,consumesOptionsFor:()=>consumesOptionsFor,contentTypeValues:()=>contentTypeValues,currentProducesFor:()=>currentProducesFor,definitions:()=>Js,externalDocs:()=>Ds,findDefinition:()=>findDefinition,getOAS3RequiredRequestBodyContentType:()=>getOAS3RequiredRequestBodyContentType,getParameter:()=>getParameter,hasHost:()=>ro,host:()=>Ks,info:()=>Rs,isMediaTypeSchemaPropertiesEqual:()=>isMediaTypeSchemaPropertiesEqual,isOAS3:()=>Ms,lastError:()=>Os,mutatedRequestFor:()=>mutatedRequestFor,mutatedRequests:()=>to,operationScheme:()=>operationScheme,operationWithMeta:()=>operationWithMeta,operations:()=>qs,operationsWithRootInherited:()=>Ys,operationsWithTags:()=>Qs,parameterInclusionSettingFor:()=>parameterInclusionSettingFor,parameterValues:()=>parameterValues,parameterWithMeta:()=>parameterWithMeta,parameterWithMetaByIdentity:()=>parameterWithMetaByIdentity,parametersIncludeIn:()=>parametersIncludeIn,parametersIncludeType:()=>parametersIncludeType,paths:()=>Bs,produces:()=>Vs,producesOptionsFor:()=>producesOptionsFor,requestFor:()=>requestFor,requests:()=>eo,responseFor:()=>responseFor,responses:()=>Zs,schemes:()=>Gs,security:()=>zs,securityDefinitions:()=>Ws,semver:()=>Fs,spec:()=>spec,specJS:()=>Is,specJson:()=>Ps,specJsonWithResolvedSubtrees:()=>Ns,specResolved:()=>Ts,specResolvedSubtree:()=>specResolvedSubtree,specSource:()=>js,specStr:()=>Cs,tagDetails:()=>tagDetails,taggedOperations:()=>selectors_taggedOperations,tags:()=>Xs,url:()=>As,validOperationMethods:()=>$s,validateBeforeExecute:()=>validateBeforeExecute,validationErrors:()=>validationErrors,version:()=>Ls});var z={};__webpack_require__.r(z),__webpack_require__.d(z,{CLEAR_REQUEST:()=>wo,CLEAR_RESPONSE:()=>Eo,CLEAR_VALIDATE_PARAMS:()=>xo,LOG_REQUEST:()=>So,SET_MUTATED_REQUEST:()=>_o,SET_REQUEST:()=>bo,SET_RESPONSE:()=>vo,SET_SCHEME:()=>Co,UPDATE_EMPTY_PARAM_INCLUSION:()=>go,UPDATE_JSON:()=>fo,UPDATE_OPERATION_META_VALUE:()=>ko,UPDATE_PARAM:()=>mo,UPDATE_RESOLVED:()=>Oo,UPDATE_RESOLVED_SUBTREE:()=>Ao,UPDATE_SPEC:()=>po,UPDATE_URL:()=>ho,VALIDATE_PARAMS:()=>yo,changeConsumesValue:()=>changeConsumesValue,changeParam:()=>changeParam,changeParamByIdentity:()=>changeParamByIdentity,changeProducesValue:()=>changeProducesValue,clearRequest:()=>clearRequest,clearResponse:()=>clearResponse,clearValidateParams:()=>clearValidateParams,execute:()=>actions_execute,executeRequest:()=>executeRequest,invalidateResolvedSubtreeCache:()=>invalidateResolvedSubtreeCache,logRequest:()=>logRequest,parseToJson:()=>parseToJson,requestResolvedSubtree:()=>requestResolvedSubtree,resolveSpec:()=>resolveSpec,setMutatedRequest:()=>setMutatedRequest,setRequest:()=>setRequest,setResponse:()=>setResponse,setScheme:()=>setScheme,updateEmptyParamInclusion:()=>updateEmptyParamInclusion,updateJsonSpec:()=>updateJsonSpec,updateResolved:()=>updateResolved,updateResolvedSubtree:()=>updateResolvedSubtree,updateSpec:()=>updateSpec,updateUrl:()=>updateUrl,validateParams:()=>validateParams});var Y={};__webpack_require__.r(Y),__webpack_require__.d(Y,{executeRequest:()=>wrap_actions_executeRequest,updateJsonSpec:()=>wrap_actions_updateJsonSpec,updateSpec:()=>wrap_actions_updateSpec,validateParams:()=>wrap_actions_validateParams});var Z={};__webpack_require__.r(Z),__webpack_require__.d(Z,{JsonPatchError:()=>Do,_areEquals:()=>_areEquals,applyOperation:()=>applyOperation,applyPatch:()=>applyPatch,applyReducer:()=>applyReducer,deepClone:()=>Lo,getValueByPointer:()=>getValueByPointer,validate:()=>validate,validator:()=>validator});var ee={};__webpack_require__.r(ee),__webpack_require__.d(ee,{compare:()=>compare,generate:()=>generate,observe:()=>observe,unobserve:()=>unobserve});var ie={};__webpack_require__.r(ie),__webpack_require__.d(ie,{hasElementSourceMap:()=>hasElementSourceMap,includesClasses:()=>includesClasses,includesSymbols:()=>includesSymbols,isAnnotationElement:()=>Fu,isArrayElement:()=>Mu,isBooleanElement:()=>Tu,isCommentElement:()=>Bu,isElement:()=>Cu,isLinkElement:()=>Du,isMemberElement:()=>Ru,isNullElement:()=>Iu,isNumberElement:()=>Pu,isObjectElement:()=>Nu,isParseResultElement:()=>$u,isPrimitiveElement:()=>isPrimitiveElement,isRefElement:()=>Lu,isStringElement:()=>ju});var ae={};__webpack_require__.r(ae),__webpack_require__.d(ae,{isJSONReferenceElement:()=>Ld,isJSONSchemaElement:()=>Dd,isLinkDescriptionElement:()=>Bd,isMediaElement:()=>Fd});var ce={};__webpack_require__.r(ce),__webpack_require__.d(ce,{isBooleanJsonSchemaElement:()=>isBooleanJsonSchemaElement,isCallbackElement:()=>Tm,isComponentsElement:()=>Nm,isContactElement:()=>Mm,isDiscriminatorElement:()=>og,isExampleElement:()=>Rm,isExternalDocumentationElement:()=>Dm,isHeaderElement:()=>Lm,isInfoElement:()=>Fm,isLicenseElement:()=>Bm,isLinkElement:()=>$m,isMediaTypeElement:()=>ng,isOpenApi3_0Element:()=>Um,isOpenapiElement:()=>qm,isOperationElement:()=>Vm,isParameterElement:()=>zm,isPathItemElement:()=>Wm,isPathsElement:()=>Jm,isReferenceElement:()=>Hm,isRequestBodyElement:()=>Km,isResponseElement:()=>Gm,isResponsesElement:()=>Ym,isSchemaElement:()=>Xm,isSecurityRequirementElement:()=>Qm,isSecuritySchemeElement:()=>Zm,isServerElement:()=>eg,isServerVariableElement:()=>rg,isServersElement:()=>sg});var le={};__webpack_require__.r(le),__webpack_require__.d(le,{isJSONReferenceElement:()=>Ld,isJSONSchemaElement:()=>g_,isLinkDescriptionElement:()=>y_,isMediaElement:()=>Fd});var pe={};__webpack_require__.r(pe),__webpack_require__.d(pe,{isJSONReferenceElement:()=>Ld,isJSONSchemaElement:()=>A_,isLinkDescriptionElement:()=>C_});var de={};__webpack_require__.r(de),__webpack_require__.d(de,{isJSONSchemaElement:()=>K_,isLinkDescriptionElement:()=>G_});var fe={};__webpack_require__.r(fe),__webpack_require__.d(fe,{isJSONSchemaElement:()=>oS,isLinkDescriptionElement:()=>iS});var ye={};__webpack_require__.r(ye),__webpack_require__.d(ye,{isBooleanJsonSchemaElement:()=>predicates_isBooleanJsonSchemaElement,isCallbackElement:()=>zS,isComponentsElement:()=>WS,isContactElement:()=>JS,isExampleElement:()=>HS,isExternalDocumentationElement:()=>KS,isHeaderElement:()=>GS,isInfoElement:()=>YS,isJsonSchemaDialectElement:()=>XS,isLicenseElement:()=>QS,isLinkElement:()=>ZS,isMediaTypeElement:()=>mE,isOpenApi3_1Element:()=>tE,isOpenapiElement:()=>eE,isOperationElement:()=>rE,isParameterElement:()=>nE,isPathItemElement:()=>sE,isPathItemElementExternal:()=>isPathItemElementExternal,isPathsElement:()=>oE,isReferenceElement:()=>iE,isReferenceElementExternal:()=>isReferenceElementExternal,isRequestBodyElement:()=>aE,isResponseElement:()=>cE,isResponsesElement:()=>lE,isSchemaElement:()=>uE,isSecurityRequirementElement:()=>pE,isSecuritySchemeElement:()=>hE,isServerElement:()=>dE,isServerVariableElement:()=>fE});var be={};__webpack_require__.r(be),__webpack_require__.d(be,{cookie:()=>cookie,header:()=>parameter_builders_header,path:()=>parameter_builders_path,query:()=>query});var _e={};__webpack_require__.r(_e),__webpack_require__.d(_e,{Button:()=>Button,Col:()=>Col,Collapse:()=>Collapse,Container:()=>Container,Input:()=>Input,Link:()=>layout_utils_Link,Row:()=>Row,Select:()=>Select,TextArea:()=>TextArea});var Se={};__webpack_require__.r(Se),__webpack_require__.d(Se,{basePath:()=>NP,consumes:()=>MP,definitions:()=>jP,findDefinition:()=>CP,hasHost:()=>PP,host:()=>TP,produces:()=>RP,schemes:()=>DP,securityDefinitions:()=>IP,validOperationMethods:()=>wrap_selectors_validOperationMethods});var we={};__webpack_require__.r(we),__webpack_require__.d(we,{definitionsToAuthorize:()=>LP});var xe={};__webpack_require__.r(xe),__webpack_require__.d(xe,{callbacksOperations:()=>$P,findSchema:()=>findSchema,isOAS3:()=>selectors_isOAS3,isOAS30:()=>selectors_isOAS30,isSwagger2:()=>selectors_isSwagger2,servers:()=>BP});var Pe={};__webpack_require__.r(Pe),__webpack_require__.d(Pe,{CLEAR_REQUEST_BODY_VALIDATE_ERROR:()=>iI,CLEAR_REQUEST_BODY_VALUE:()=>aI,SET_REQUEST_BODY_VALIDATE_ERROR:()=>oI,UPDATE_ACTIVE_EXAMPLES_MEMBER:()=>tI,UPDATE_REQUEST_BODY_INCLUSION:()=>eI,UPDATE_REQUEST_BODY_VALUE:()=>QP,UPDATE_REQUEST_BODY_VALUE_RETAIN_FLAG:()=>ZP,UPDATE_REQUEST_CONTENT_TYPE:()=>rI,UPDATE_RESPONSE_CONTENT_TYPE:()=>nI,UPDATE_SELECTED_SERVER:()=>XP,UPDATE_SERVER_VARIABLE_VALUE:()=>sI,clearRequestBodyValidateError:()=>clearRequestBodyValidateError,clearRequestBodyValue:()=>clearRequestBodyValue,initRequestBodyValidateError:()=>initRequestBodyValidateError,setActiveExamplesMember:()=>setActiveExamplesMember,setRequestBodyInclusion:()=>setRequestBodyInclusion,setRequestBodyValidateError:()=>setRequestBodyValidateError,setRequestBodyValue:()=>setRequestBodyValue,setRequestContentType:()=>setRequestContentType,setResponseContentType:()=>setResponseContentType,setRetainRequestBodyValueFlag:()=>setRetainRequestBodyValueFlag,setSelectedServer:()=>setSelectedServer,setServerVariableValue:()=>setServerVariableValue});var Te={};__webpack_require__.r(Te),__webpack_require__.d(Te,{activeExamplesMember:()=>gI,hasUserEditedBody:()=>dI,requestBodyErrors:()=>mI,requestBodyInclusionSetting:()=>fI,requestBodyValue:()=>pI,requestContentType:()=>yI,responseContentType:()=>vI,selectDefaultRequestBodyValue:()=>selectDefaultRequestBodyValue,selectedServer:()=>uI,serverEffectiveValue:()=>SI,serverVariableValue:()=>bI,serverVariables:()=>_I,shouldRetainRequestBodyValue:()=>hI,validOperationMethods:()=>wI,validateBeforeExecute:()=>EI,validateShallowRequired:()=>validateShallowRequired});var Re=__webpack_require__(96540);function formatProdErrorMessage(s){return`Minified Redux error #${s}; visit https://redux.js.org/Errors?code=${s} for the full message or use the non-minified dev environment for full errors. `}var $e=(()=>"function"==typeof Symbol&&Symbol.observable||"@@observable")(),randomString=()=>Math.random().toString(36).substring(7).split("").join("."),qe={INIT:`@@redux/INIT${randomString()}`,REPLACE:`@@redux/REPLACE${randomString()}`,PROBE_UNKNOWN_ACTION:()=>`@@redux/PROBE_UNKNOWN_ACTION${randomString()}`};function isPlainObject(s){if("object"!=typeof s||null===s)return!1;let o=s;for(;null!==Object.getPrototypeOf(o);)o=Object.getPrototypeOf(o);return Object.getPrototypeOf(s)===o||null===Object.getPrototypeOf(s)}function createStore(s,o,i){if("function"!=typeof s)throw new Error(formatProdErrorMessage(2));if("function"==typeof o&&"function"==typeof i||"function"==typeof i&&"function"==typeof arguments[3])throw new Error(formatProdErrorMessage(0));if("function"==typeof o&&void 0===i&&(i=o,o=void 0),void 0!==i){if("function"!=typeof i)throw new Error(formatProdErrorMessage(1));return i(createStore)(s,o)}let a=s,u=o,_=new Map,w=_,x=0,C=!1;function ensureCanMutateNextListeners(){w===_&&(w=new Map,_.forEach(((s,o)=>{w.set(o,s)})))}function getState(){if(C)throw new Error(formatProdErrorMessage(3));return u}function subscribe(s){if("function"!=typeof s)throw new Error(formatProdErrorMessage(4));if(C)throw new Error(formatProdErrorMessage(5));let o=!0;ensureCanMutateNextListeners();const i=x++;return w.set(i,s),function unsubscribe(){if(o){if(C)throw new Error(formatProdErrorMessage(6));o=!1,ensureCanMutateNextListeners(),w.delete(i),_=null}}}function dispatch(s){if(!isPlainObject(s))throw new Error(formatProdErrorMessage(7));if(void 0===s.type)throw new Error(formatProdErrorMessage(8));if("string"!=typeof s.type)throw new Error(formatProdErrorMessage(17));if(C)throw new Error(formatProdErrorMessage(9));try{C=!0,u=a(u,s)}finally{C=!1}return(_=w).forEach((s=>{s()})),s}dispatch({type:qe.INIT});return{dispatch,subscribe,getState,replaceReducer:function replaceReducer(s){if("function"!=typeof s)throw new Error(formatProdErrorMessage(10));a=s,dispatch({type:qe.REPLACE})},[$e]:function observable(){const s=subscribe;return{subscribe(o){if("object"!=typeof o||null===o)throw new Error(formatProdErrorMessage(11));function observeState(){const s=o;s.next&&s.next(getState())}observeState();return{unsubscribe:s(observeState)}},[$e](){return this}}}}}function bindActionCreator(s,o){return function(...i){return o(s.apply(this,i))}}function compose(...s){return 0===s.length?s=>s:1===s.length?s[0]:s.reduce(((s,o)=>(...i)=>s(o(...i))))}var ze=__webpack_require__(9404),We=__webpack_require__.n(ze),He=__webpack_require__(81919),Ye=__webpack_require__.n(He),Xe=__webpack_require__(89593),Qe=__webpack_require__(20334),et=__webpack_require__(55364),tt=__webpack_require__.n(et);const rt="err_new_thrown_err",nt="err_new_thrown_err_batch",st="err_new_spec_err",ot="err_new_spec_err_batch",it="err_new_auth_err",at="err_clear",ct="err_clear_by";function newThrownErr(s){return{type:rt,payload:(0,Qe.serializeError)(s)}}function newThrownErrBatch(s){return{type:nt,payload:s}}function newSpecErr(s){return{type:st,payload:s}}function newSpecErrBatch(s){return{type:ot,payload:s}}function newAuthErr(s){return{type:it,payload:s}}function clear(s={}){return{type:at,payload:s}}function clearBy(s=()=>!0){return{type:ct,payload:s}}const lt=function makeWindow(){var s={location:{},history:{},open:()=>{},close:()=>{},File:function(){},FormData:function(){}};if("undefined"==typeof window)return s;try{s=window;for(var o of["File","Blob","FormData"])o in window&&(s[o]=window[o])}catch(s){console.error(s)}return s}();__webpack_require__(84058),__webpack_require__(55808);var ut=__webpack_require__(50104),pt=__webpack_require__.n(ut),ht=__webpack_require__(7309),dt=__webpack_require__.n(ht),mt=__webpack_require__(42426),gt=__webpack_require__.n(mt),yt=__webpack_require__(75288),vt=__webpack_require__.n(yt),bt=__webpack_require__(1882),_t=__webpack_require__.n(bt),St=__webpack_require__(2205),Et=__webpack_require__.n(St),wt=__webpack_require__(53209),xt=__webpack_require__.n(wt),kt=__webpack_require__(62802),Ot=__webpack_require__.n(kt);const At=We().Set.of("type","format","items","default","maximum","exclusiveMaximum","minimum","exclusiveMinimum","maxLength","minLength","pattern","maxItems","minItems","uniqueItems","enum","multipleOf");function getParameterSchema(s,{isOAS3:o}={}){if(!We().Map.isMap(s))return{schema:We().Map(),parameterContentMediaType:null};if(!o)return"body"===s.get("in")?{schema:s.get("schema",We().Map()),parameterContentMediaType:null}:{schema:s.filter(((s,o)=>At.includes(o))),parameterContentMediaType:null};if(s.get("content")){const o=s.get("content",We().Map({})).keySeq().first();return{schema:s.getIn(["content",o,"schema"],We().Map()),parameterContentMediaType:o}}return{schema:s.get("schema")?s.get("schema",We().Map()):We().Map(),parameterContentMediaType:null}}var Ct=__webpack_require__(48287).Buffer;const jt="default",isImmutable=s=>We().Iterable.isIterable(s),immutableToJS=s=>isImmutable(s)?s.toJS():s;function objectify(s){return isObject(s)?immutableToJS(s):{}}function fromJSOrdered(s){if(isImmutable(s))return s;if(s instanceof lt.File)return s;if(!isObject(s))return s;if(Array.isArray(s))return We().Seq(s).map(fromJSOrdered).toList();if(_t()(s.entries)){const o=function createObjWithHashedKeys(s){if(!_t()(s.entries))return s;const o={},i="_**[]",a={};for(let u of s.entries())if(o[u[0]]||a[u[0]]&&a[u[0]].containsMultiple){if(!a[u[0]]){a[u[0]]={containsMultiple:!0,length:1},o[`${u[0]}${i}${a[u[0]].length}`]=o[u[0]],delete o[u[0]]}a[u[0]].length+=1,o[`${u[0]}${i}${a[u[0]].length}`]=u[1]}else o[u[0]]=u[1];return o}(s);return We().OrderedMap(o).map(fromJSOrdered)}return We().OrderedMap(s).map(fromJSOrdered)}function normalizeArray(s){return Array.isArray(s)?s:[s]}function isFn(s){return"function"==typeof s}function isObject(s){return!!s&&"object"==typeof s}function isFunc(s){return"function"==typeof s}function isArray(s){return Array.isArray(s)}const Pt=pt();function objMap(s,o){return Object.keys(s).reduce(((i,a)=>(i[a]=o(s[a],a),i)),{})}function objReduce(s,o){return Object.keys(s).reduce(((i,a)=>{let u=o(s[a],a);return u&&"object"==typeof u&&Object.assign(i,u),i}),{})}function systemThunkMiddleware(s){return({dispatch:o,getState:i})=>o=>i=>"function"==typeof i?i(s()):o(i)}function validateValueBySchema(s,o,i,a,u){if(!o)return[];let _=[],w=o.get("nullable"),x=o.get("required"),C=o.get("maximum"),j=o.get("minimum"),L=o.get("type"),B=o.get("format"),$=o.get("maxLength"),U=o.get("minLength"),V=o.get("uniqueItems"),z=o.get("maxItems"),Y=o.get("minItems"),Z=o.get("pattern");const ee=i||!0===x,ie=null!=s,ae=ee||ie&&"array"===L||!(!ee&&!ie),ce=w&&null===s;if(ee&&!ie&&!ce&&!a&&!L)return _.push("Required field is not provided"),_;if(ce||!L||!ae)return[];let le="string"===L&&s,pe="array"===L&&Array.isArray(s)&&s.length,de="array"===L&&We().List.isList(s)&&s.count();const fe=[le,pe,de,"array"===L&&"string"==typeof s&&s,"file"===L&&s instanceof lt.File,"boolean"===L&&(s||!1===s),"number"===L&&(s||0===s),"integer"===L&&(s||0===s),"object"===L&&"object"==typeof s&&null!==s,"object"===L&&"string"==typeof s&&s].some((s=>!!s));if(ee&&!fe&&!a)return _.push("Required field is not provided"),_;if("object"===L&&(null===u||"application/json"===u)){let i=s;if("string"==typeof s)try{i=JSON.parse(s)}catch(s){return _.push("Parameter string value must be valid JSON"),_}o&&o.has("required")&&isFunc(x.isList)&&x.isList()&&x.forEach((s=>{void 0===i[s]&&_.push({propKey:s,error:"Required property not found"})})),o&&o.has("properties")&&o.get("properties").forEach(((s,o)=>{const w=validateValueBySchema(i[o],s,!1,a,u);_.push(...w.map((s=>({propKey:o,error:s}))))}))}if(Z){let o=((s,o)=>{if(!new RegExp(o).test(s))return"Value must follow pattern "+o})(s,Z);o&&_.push(o)}if(Y&&"array"===L){let o=((s,o)=>{if(!s&&o>=1||s&&s.length<o)return`Array must contain at least ${o} item${1===o?"":"s"}`})(s,Y);o&&_.push(o)}if(z&&"array"===L){let o=((s,o)=>{if(s&&s.length>o)return`Array must not contain more then ${o} item${1===o?"":"s"}`})(s,z);o&&_.push({needRemove:!0,error:o})}if(V&&"array"===L){let o=((s,o)=>{if(s&&("true"===o||!0===o)){const o=(0,ze.fromJS)(s),i=o.toSet();if(s.length>i.size){let s=(0,ze.Set)();if(o.forEach(((i,a)=>{o.filter((s=>isFunc(s.equals)?s.equals(i):s===i)).size>1&&(s=s.add(a))})),0!==s.size)return s.map((s=>({index:s,error:"No duplicates allowed."}))).toArray()}}})(s,V);o&&_.push(...o)}if($||0===$){let o=((s,o)=>{if(s.length>o)return`Value must be no longer than ${o} character${1!==o?"s":""}`})(s,$);o&&_.push(o)}if(U){let o=((s,o)=>{if(s.length<o)return`Value must be at least ${o} character${1!==o?"s":""}`})(s,U);o&&_.push(o)}if(C||0===C){let o=((s,o)=>{if(s>o)return`Value must be less than or equal to ${o}`})(s,C);o&&_.push(o)}if(j||0===j){let o=((s,o)=>{if(s<o)return`Value must be greater than or equal to ${o}`})(s,j);o&&_.push(o)}if("string"===L){let o;if(o="date-time"===B?(s=>{if(isNaN(Date.parse(s)))return"Value must be a DateTime"})(s):"uuid"===B?(s=>{if(s=s.toString().toLowerCase(),!/^[{(]?[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}[)}]?$/.test(s))return"Value must be a Guid"})(s):(s=>{if(s&&"string"!=typeof s)return"Value must be a string"})(s),!o)return _;_.push(o)}else if("boolean"===L){let o=(s=>{if("true"!==s&&"false"!==s&&!0!==s&&!1!==s)return"Value must be a boolean"})(s);if(!o)return _;_.push(o)}else if("number"===L){let o=(s=>{if(!/^-?\d+(\.?\d+)?$/.test(s))return"Value must be a number"})(s);if(!o)return _;_.push(o)}else if("integer"===L){let o=(s=>{if(!/^-?\d+$/.test(s))return"Value must be an integer"})(s);if(!o)return _;_.push(o)}else if("array"===L){if(!pe&&!de)return _;s&&s.forEach(((s,i)=>{const w=validateValueBySchema(s,o.get("items"),!1,a,u);_.push(...w.map((s=>({index:i,error:s}))))}))}else if("file"===L){let o=(s=>{if(s&&!(s instanceof lt.File))return"Value must be a file"})(s);if(!o)return _;_.push(o)}return _}const utils_btoa=s=>{let o;return o=s instanceof Ct?s:Ct.from(s.toString(),"utf-8"),o.toString("base64")},It={operationsSorter:{alpha:(s,o)=>s.get("path").localeCompare(o.get("path")),method:(s,o)=>s.get("method").localeCompare(o.get("method"))},tagsSorter:{alpha:(s,o)=>s.localeCompare(o)}},buildFormData=s=>{let o=[];for(let i in s){let a=s[i];void 0!==a&&""!==a&&o.push([i,"=",encodeURIComponent(a).replace(/%20/g,"+")].join(""))}return o.join("&")},shallowEqualKeys=(s,o,i)=>!!dt()(i,(i=>vt()(s[i],o[i])));function requiresValidationURL(s){return!(!s||s.indexOf("localhost")>=0||s.indexOf("127.0.0.1")>=0||"none"===s)}const createDeepLinkPath=s=>"string"==typeof s||s instanceof String?s.trim().replace(/\s/g,"%20"):"",escapeDeepLinkPath=s=>Et()(createDeepLinkPath(s).replace(/%20/g,"_")),isExtension=s=>/^x-/.test(s),getExtensions=s=>ze.Map.isMap(s)?s.filter(((s,o)=>isExtension(o))):Object.keys(s).filter((s=>isExtension(s))),getCommonExtensions=s=>s.filter(((s,o)=>/^pattern|maxLength|minLength|maximum|minimum/.test(o)));function deeplyStripKey(s,o,i=()=>!0){if("object"!=typeof s||Array.isArray(s)||null===s||!o)return s;const a=Object.assign({},s);return Object.keys(a).forEach((s=>{s===o&&i(a[s],s)?delete a[s]:a[s]=deeplyStripKey(a[s],o,i)})),a}function stringify(s){if("string"==typeof s)return s;if(s&&s.toJS&&(s=s.toJS()),"object"==typeof s&&null!==s)try{return JSON.stringify(s,null,2)}catch(o){return String(s)}return null==s?"":s.toString()}function paramToIdentifier(s,{returnAll:o=!1,allowHashes:i=!0}={}){if(!We().Map.isMap(s))throw new Error("paramToIdentifier: received a non-Im.Map parameter as input");const a=s.get("name"),u=s.get("in");let _=[];return s&&s.hashCode&&u&&a&&i&&_.push(`${u}.${a}.hash-${s.hashCode()}`),u&&a&&_.push(`${u}.${a}`),_.push(a),o?_:_[0]||""}function paramToValue(s,o){return paramToIdentifier(s,{returnAll:!0}).map((s=>o[s])).filter((s=>void 0!==s))[0]}function b64toB64UrlEncoded(s){return s.replace(/\+/g,"-").replace(/\//g,"_").replace(/=/g,"")}const isEmptyValue=s=>!s||!(!isImmutable(s)||!s.isEmpty()),idFn=s=>s;function createStoreWithMiddleware(s,o,i){let a=[systemThunkMiddleware(i)];return createStore(s,o,(lt.__REDUX_DEVTOOLS_EXTENSION_COMPOSE__||compose)(function applyMiddleware(...s){return o=>(i,a)=>{const u=o(i,a);let dispatch=()=>{throw new Error(formatProdErrorMessage(15))};const _={getState:u.getState,dispatch:(s,...o)=>dispatch(s,...o)},w=s.map((s=>s(_)));return dispatch=compose(...w)(u.dispatch),{...u,dispatch}}}(...a)))}class Store{constructor(s={}){Ye()(this,{state:{},plugins:[],system:{configs:{},fn:{},components:{},rootInjects:{},statePlugins:{}},boundSystem:{},toolbox:{}},s),this.getSystem=this._getSystem.bind(this),this.store=function configureStore(s,o,i){return createStoreWithMiddleware(s,o,i)}(idFn,(0,ze.fromJS)(this.state),this.getSystem),this.buildSystem(!1),this.register(this.plugins)}getStore(){return this.store}register(s,o=!0){var i=combinePlugins(s,this.getSystem());systemExtend(this.system,i),o&&this.buildSystem();callAfterLoad.call(this.system,s,this.getSystem())&&this.buildSystem()}buildSystem(s=!0){let o=this.getStore().dispatch,i=this.getStore().getState;this.boundSystem=Object.assign({},this.getRootInjects(),this.getWrappedAndBoundActions(o),this.getWrappedAndBoundSelectors(i,this.getSystem),this.getStateThunks(i),this.getFn(),this.getConfigs()),s&&this.rebuildReducer()}_getSystem(){return this.boundSystem}getRootInjects(){return Object.assign({getSystem:this.getSystem,getStore:this.getStore.bind(this),getComponents:this.getComponents.bind(this),getState:this.getStore().getState,getConfigs:this._getConfigs.bind(this),Im:We(),React:Re},this.system.rootInjects||{})}_getConfigs(){return this.system.configs}getConfigs(){return{configs:this.system.configs}}setConfigs(s){this.system.configs=s}rebuildReducer(){this.store.replaceReducer(function buildReducer(s,o){return function allReducers(s,o){let i=Object.keys(s).reduce(((i,a)=>(i[a]=function makeReducer(s,o){return(i=new ze.Map,a)=>{if(!s)return i;let u=s[a.type];if(u){const s=wrapWithTryCatch(u,o)(i,a);return null===s?i:s}return i}}(s[a],o),i)),{});if(!Object.keys(i).length)return idFn;return(0,Xe.H)(i)}(objMap(s,(s=>s.reducers)),o)}(this.system.statePlugins,this.getSystem))}getType(s){let o=s[0].toUpperCase()+s.slice(1);return objReduce(this.system.statePlugins,((i,a)=>{let u=i[s];if(u)return{[a+o]:u}}))}getSelectors(){return this.getType("selectors")}getActions(){return objMap(this.getType("actions"),(s=>objReduce(s,((s,o)=>{if(isFn(s))return{[o]:s}}))))}getWrappedAndBoundActions(s){return objMap(this.getBoundActions(s),((s,o)=>{let i=this.system.statePlugins[o.slice(0,-7)].wrapActions;return i?objMap(s,((s,o)=>{let a=i[o];return a?(Array.isArray(a)||(a=[a]),a.reduce(((s,o)=>{let newAction=(...i)=>o(s,this.getSystem())(...i);if(!isFn(newAction))throw new TypeError("wrapActions needs to return a function that returns a new function (ie the wrapped action)");return wrapWithTryCatch(newAction,this.getSystem)}),s||Function.prototype)):s})):s}))}getWrappedAndBoundSelectors(s,o){return objMap(this.getBoundSelectors(s,o),((o,i)=>{let a=[i.slice(0,-9)],u=this.system.statePlugins[a].wrapSelectors;return u?objMap(o,((o,i)=>{let _=u[i];return _?(Array.isArray(_)||(_=[_]),_.reduce(((o,i)=>{let wrappedSelector=(...u)=>i(o,this.getSystem())(s().getIn(a),...u);if(!isFn(wrappedSelector))throw new TypeError("wrapSelector needs to return a function that returns a new function (ie the wrapped action)");return wrappedSelector}),o||Function.prototype)):o})):o}))}getStates(s){return Object.keys(this.system.statePlugins).reduce(((o,i)=>(o[i]=s.get(i),o)),{})}getStateThunks(s){return Object.keys(this.system.statePlugins).reduce(((o,i)=>(o[i]=()=>s().get(i),o)),{})}getFn(){return{fn:this.system.fn}}getComponents(s){const o=this.system.components[s];return Array.isArray(o)?o.reduce(((s,o)=>o(s,this.getSystem()))):void 0!==s?this.system.components[s]:this.system.components}getBoundSelectors(s,o){return objMap(this.getSelectors(),((i,a)=>{let u=[a.slice(0,-9)];return objMap(i,(i=>(...a)=>{let _=wrapWithTryCatch(i,this.getSystem).apply(null,[s().getIn(u),...a]);return"function"==typeof _&&(_=wrapWithTryCatch(_,this.getSystem)(o())),_}))}))}getBoundActions(s){s=s||this.getStore().dispatch;const o=this.getActions(),process=s=>"function"!=typeof s?objMap(s,(s=>process(s))):(...o)=>{var i=null;try{i=s(...o)}catch(s){i={type:rt,error:!0,payload:(0,Qe.serializeError)(s)}}finally{return i}};return objMap(o,(o=>function bindActionCreators(s,o){if("function"==typeof s)return bindActionCreator(s,o);if("object"!=typeof s||null===s)throw new Error(formatProdErrorMessage(16));const i={};for(const a in s){const u=s[a];"function"==typeof u&&(i[a]=bindActionCreator(u,o))}return i}(process(o),s)))}getMapStateToProps(){return()=>Object.assign({},this.getSystem())}getMapDispatchToProps(s){return o=>Ye()({},this.getWrappedAndBoundActions(o),this.getFn(),s)}}function combinePlugins(s,o){return isObject(s)&&!isArray(s)?tt()({},s):isFunc(s)?combinePlugins(s(o),o):isArray(s)?s.map((s=>combinePlugins(s,o))).reduce(systemExtend,{components:o.getComponents()}):{}}function callAfterLoad(s,o,{hasLoaded:i}={}){let a=i;return isObject(s)&&!isArray(s)&&"function"==typeof s.afterLoad&&(a=!0,wrapWithTryCatch(s.afterLoad,o.getSystem).call(this,o)),isFunc(s)?callAfterLoad.call(this,s(o),o,{hasLoaded:a}):isArray(s)?s.map((s=>callAfterLoad.call(this,s,o,{hasLoaded:a}))):a}function systemExtend(s={},o={}){if(!isObject(s))return{};if(!isObject(o))return s;o.wrapComponents&&(objMap(o.wrapComponents,((i,a)=>{const u=s.components&&s.components[a];u&&Array.isArray(u)?(s.components[a]=u.concat([i]),delete o.wrapComponents[a]):u&&(s.components[a]=[u,i],delete o.wrapComponents[a])})),Object.keys(o.wrapComponents).length||delete o.wrapComponents);const{statePlugins:i}=s;if(isObject(i))for(let s in i){const a=i[s];if(!isObject(a))continue;const{wrapActions:u,wrapSelectors:_}=a;if(isObject(u))for(let i in u){let a=u[i];Array.isArray(a)||(a=[a],u[i]=a),o&&o.statePlugins&&o.statePlugins[s]&&o.statePlugins[s].wrapActions&&o.statePlugins[s].wrapActions[i]&&(o.statePlugins[s].wrapActions[i]=u[i].concat(o.statePlugins[s].wrapActions[i]))}if(isObject(_))for(let i in _){let a=_[i];Array.isArray(a)||(a=[a],_[i]=a),o&&o.statePlugins&&o.statePlugins[s]&&o.statePlugins[s].wrapSelectors&&o.statePlugins[s].wrapSelectors[i]&&(o.statePlugins[s].wrapSelectors[i]=_[i].concat(o.statePlugins[s].wrapSelectors[i]))}}return Ye()(s,o)}function wrapWithTryCatch(s,o,{logErrors:i=!0}={}){return"function"!=typeof s?s:function(...a){try{return s.call(this,...a)}catch(s){if(i){const{uncaughtExceptionHandler:i}=o().getConfigs();"function"==typeof i?i(s):console.error(s)}return null}}}var Tt=__webpack_require__(61160),Nt=__webpack_require__.n(Tt);const Mt="show_popup",Rt="authorize",Dt="logout",Lt="authorize_oauth2",Ft="configure_auth",Bt="restore_authorization";function showDefinitions(s){return{type:Mt,payload:s}}function authorize(s){return{type:Rt,payload:s}}const authorizeWithPersistOption=s=>({authActions:o})=>{o.authorize(s),o.persistAuthorizationIfNeeded()};function logout(s){return{type:Dt,payload:s}}const logoutWithPersistOption=s=>({authActions:o})=>{o.logout(s),o.persistAuthorizationIfNeeded()},preAuthorizeImplicit=s=>({authActions:o,errActions:i})=>{let{auth:a,token:u,isValid:_}=s,{schema:w,name:x}=a,C=w.get("flow");delete lt.swaggerUIRedirectOauth2,"accessCode"===C||_||i.newAuthErr({authId:x,source:"auth",level:"warning",message:"Authorization may be unsafe, passed state was changed in server Passed state wasn't returned from auth server"}),u.error?i.newAuthErr({authId:x,source:"auth",level:"error",message:JSON.stringify(u)}):o.authorizeOauth2WithPersistOption({auth:a,token:u})};function authorizeOauth2(s){return{type:Lt,payload:s}}const authorizeOauth2WithPersistOption=s=>({authActions:o})=>{o.authorizeOauth2(s),o.persistAuthorizationIfNeeded()},authorizePassword=s=>({authActions:o})=>{let{schema:i,name:a,username:u,password:_,passwordType:w,clientId:x,clientSecret:C}=s,j={grant_type:"password",scope:s.scopes.join(" "),username:u,password:_},L={};switch(w){case"request-body":!function setClientIdAndSecret(s,o,i){o&&Object.assign(s,{client_id:o});i&&Object.assign(s,{client_secret:i})}(j,x,C);break;case"basic":L.Authorization="Basic "+utils_btoa(x+":"+C);break;default:console.warn(`Warning: invalid passwordType ${w} was passed, not including client id and secret`)}return o.authorizeRequest({body:buildFormData(j),url:i.get("tokenUrl"),name:a,headers:L,query:{},auth:s})};const authorizeApplication=s=>({authActions:o})=>{let{schema:i,scopes:a,name:u,clientId:_,clientSecret:w}=s,x={Authorization:"Basic "+utils_btoa(_+":"+w)},C={grant_type:"client_credentials",scope:a.join(" ")};return o.authorizeRequest({body:buildFormData(C),name:u,url:i.get("tokenUrl"),auth:s,headers:x})},authorizeAccessCodeWithFormParams=({auth:s,redirectUrl:o})=>({authActions:i})=>{let{schema:a,name:u,clientId:_,clientSecret:w,codeVerifier:x}=s,C={grant_type:"authorization_code",code:s.code,client_id:_,client_secret:w,redirect_uri:o,code_verifier:x};return i.authorizeRequest({body:buildFormData(C),name:u,url:a.get("tokenUrl"),auth:s})},authorizeAccessCodeWithBasicAuthentication=({auth:s,redirectUrl:o})=>({authActions:i})=>{let{schema:a,name:u,clientId:_,clientSecret:w,codeVerifier:x}=s,C={Authorization:"Basic "+utils_btoa(_+":"+w)},j={grant_type:"authorization_code",code:s.code,client_id:_,redirect_uri:o,code_verifier:x};return i.authorizeRequest({body:buildFormData(j),name:u,url:a.get("tokenUrl"),auth:s,headers:C})},authorizeRequest=s=>({fn:o,getConfigs:i,authActions:a,errActions:u,oas3Selectors:_,specSelectors:w,authSelectors:x})=>{let C,{body:j,query:L={},headers:B={},name:$,url:U,auth:V}=s,{additionalQueryStringParams:z}=x.getConfigs()||{};if(w.isOAS3()){let s=_.serverEffectiveValue(_.selectedServer());C=Nt()(U,s,!0)}else C=Nt()(U,w.url(),!0);"object"==typeof z&&(C.query=Object.assign({},C.query,z));const Y=C.toString();let Z=Object.assign({Accept:"application/json, text/plain, */*","Content-Type":"application/x-www-form-urlencoded","X-Requested-With":"XMLHttpRequest"},B);o.fetch({url:Y,method:"post",headers:Z,query:L,body:j,requestInterceptor:i().requestInterceptor,responseInterceptor:i().responseInterceptor}).then((function(s){let o=JSON.parse(s.data),i=o&&(o.error||""),_=o&&(o.parseError||"");s.ok?i||_?u.newAuthErr({authId:$,level:"error",source:"auth",message:JSON.stringify(o)}):a.authorizeOauth2WithPersistOption({auth:V,token:o}):u.newAuthErr({authId:$,level:"error",source:"auth",message:s.statusText})})).catch((s=>{let o=new Error(s).message;if(s.response&&s.response.data){const i=s.response.data;try{const s="string"==typeof i?JSON.parse(i):i;s.error&&(o+=`, error: ${s.error}`),s.error_description&&(o+=`, description: ${s.error_description}`)}catch(s){}}u.newAuthErr({authId:$,level:"error",source:"auth",message:o})}))};function configureAuth(s){return{type:Ft,payload:s}}function restoreAuthorization(s){return{type:Bt,payload:s}}const persistAuthorizationIfNeeded=()=>({authSelectors:s,getConfigs:o})=>{if(!o().persistAuthorization)return;const i=s.authorized().toJS();localStorage.setItem("authorized",JSON.stringify(i))},authPopup=(s,o)=>()=>{lt.swaggerUIRedirectOauth2=o,lt.open(s)},$t={[Mt]:(s,{payload:o})=>s.set("showDefinitions",o),[Rt]:(s,{payload:o})=>{let i=(0,ze.fromJS)(o),a=s.get("authorized")||(0,ze.Map)();return i.entrySeq().forEach((([o,i])=>{if(!isFunc(i.getIn))return s.set("authorized",a);let u=i.getIn(["schema","type"]);if("apiKey"===u||"http"===u)a=a.set(o,i);else if("basic"===u){let s=i.getIn(["value","username"]),u=i.getIn(["value","password"]);a=a.setIn([o,"value"],{username:s,header:"Basic "+utils_btoa(s+":"+u)}),a=a.setIn([o,"schema"],i.get("schema"))}})),s.set("authorized",a)},[Lt]:(s,{payload:o})=>{let i,{auth:a,token:u}=o;a.token=Object.assign({},u),i=(0,ze.fromJS)(a);let _=s.get("authorized")||(0,ze.Map)();return _=_.set(i.get("name"),i),s.set("authorized",_)},[Dt]:(s,{payload:o})=>{let i=s.get("authorized").withMutations((s=>{o.forEach((o=>{s.delete(o)}))}));return s.set("authorized",i)},[Ft]:(s,{payload:o})=>s.set("configs",o),[Bt]:(s,{payload:o})=>s.set("authorized",(0,ze.fromJS)(o.authorized))};function assertIsFunction(s,o="expected a function, instead received "+typeof s){if("function"!=typeof s)throw new TypeError(o)}var ensureIsArray=s=>Array.isArray(s)?s:[s];function getDependencies(s){const o=Array.isArray(s[0])?s[0]:s;return function assertIsArrayOfFunctions(s,o="expected all items to be functions, instead received the following types: "){if(!s.every((s=>"function"==typeof s))){const i=s.map((s=>"function"==typeof s?`function ${s.name||"unnamed"}()`:typeof s)).join(", ");throw new TypeError(`${o}[${i}]`)}}(o,"createSelector expects all input-selectors to be functions, but received the following types: "),o}Symbol(),Object.getPrototypeOf({});var qt="undefined"!=typeof WeakRef?WeakRef:class{constructor(s){this.value=s}deref(){return this.value}};function weakMapMemoize(s,o={}){let i={s:0,v:void 0,o:null,p:null};const{resultEqualityCheck:a}=o;let u,_=0;function memoized(){let o=i;const{length:w}=arguments;for(let s=0,i=w;s<i;s++){const i=arguments[s];if("function"==typeof i||"object"==typeof i&&null!==i){let s=o.o;null===s&&(o.o=s=new WeakMap);const a=s.get(i);void 0===a?(o={s:0,v:void 0,o:null,p:null},s.set(i,o)):o=a}else{let s=o.p;null===s&&(o.p=s=new Map);const a=s.get(i);void 0===a?(o={s:0,v:void 0,o:null,p:null},s.set(i,o)):o=a}}const x=o;let C;if(1===o.s)C=o.v;else if(C=s.apply(null,arguments),_++,a){const s=u?.deref?.()??u;null!=s&&a(s,C)&&(C=s,0!==_&&_--);u="object"==typeof C&&null!==C||"function"==typeof C?new qt(C):C}return x.s=1,x.v=C,C}return memoized.clearCache=()=>{i={s:0,v:void 0,o:null,p:null},memoized.resetResultsCount()},memoized.resultsCount=()=>_,memoized.resetResultsCount=()=>{_=0},memoized}function createSelectorCreator(s,...o){const i="function"==typeof s?{memoize:s,memoizeOptions:o}:s,createSelector2=(...s)=>{let o,a=0,u=0,_={},w=s.pop();"object"==typeof w&&(_=w,w=s.pop()),assertIsFunction(w,`createSelector expects an output function after the inputs, but received: [${typeof w}]`);const x={...i,..._},{memoize:C,memoizeOptions:j=[],argsMemoize:L=weakMapMemoize,argsMemoizeOptions:B=[],devModeChecks:$={}}=x,U=ensureIsArray(j),V=ensureIsArray(B),z=getDependencies(s),Y=C((function recomputationWrapper(){return a++,w.apply(null,arguments)}),...U);const Z=L((function dependenciesChecker(){u++;const s=function collectInputSelectorResults(s,o){const i=[],{length:a}=s;for(let u=0;u<a;u++)i.push(s[u].apply(null,o));return i}(z,arguments);return o=Y.apply(null,s),o}),...V);return Object.assign(Z,{resultFunc:w,memoizedResultFunc:Y,dependencies:z,dependencyRecomputations:()=>u,resetDependencyRecomputations:()=>{u=0},lastResult:()=>o,recomputations:()=>a,resetRecomputations:()=>{a=0},memoize:C,argsMemoize:L})};return Object.assign(createSelector2,{withTypes:()=>createSelector2}),createSelector2}var Ut=createSelectorCreator(weakMapMemoize),Vt=Object.assign(((s,o=Ut)=>{!function assertIsObject(s,o="expected an object, instead received "+typeof s){if("object"!=typeof s)throw new TypeError(o)}(s,"createStructuredSelector expects first argument to be an object where each property is a selector, instead received a "+typeof s);const i=Object.keys(s);return o(i.map((o=>s[o])),((...s)=>s.reduce(((s,o,a)=>(s[i[a]]=o,s)),{})))}),{withTypes:()=>Vt});const state=s=>s,zt=Ut(state,(s=>s.get("showDefinitions"))),Wt=Ut(state,(()=>({specSelectors:s})=>{let o=s.securityDefinitions()||(0,ze.Map)({}),i=(0,ze.List)();return o.entrySeq().forEach((([s,o])=>{let a=(0,ze.Map)();a=a.set(s,o),i=i.push(a)})),i})),selectAuthPath=(s,o)=>({specSelectors:s})=>(0,ze.List)(s.isOAS3()?["components","securitySchemes",o]:["securityDefinitions",o]),getDefinitionsByNames=(s,o)=>({specSelectors:s})=>{console.warn("WARNING: getDefinitionsByNames is deprecated and will be removed in the next major version.");let i=s.securityDefinitions(),a=(0,ze.List)();return o.valueSeq().forEach((s=>{let o=(0,ze.Map)();s.entrySeq().forEach((([s,a])=>{let u,_=i.get(s);"oauth2"===_.get("type")&&a.size&&(u=_.get("scopes"),u.keySeq().forEach((s=>{a.contains(s)||(u=u.delete(s))})),_=_.set("allowedScopes",u)),o=o.set(s,_)})),a=a.push(o)})),a},definitionsForRequirements=(s,o=(0,ze.List)())=>({authSelectors:s})=>{const i=s.definitionsToAuthorize()||(0,ze.List)();let a=(0,ze.List)();return i.forEach((s=>{let i=o.find((o=>o.get(s.keySeq().first())));i&&(s.forEach(((o,a)=>{if("oauth2"===o.get("type")){const u=i.get(a);let _=o.get("scopes");ze.List.isList(u)&&ze.Map.isMap(_)&&(_.keySeq().forEach((s=>{u.contains(s)||(_=_.delete(s))})),s=s.set(a,o.set("scopes",_)))}})),a=a.push(s))})),a},Jt=Ut(state,(s=>s.get("authorized")||(0,ze.Map)())),isAuthorized=(s,o)=>({authSelectors:s})=>{let i=s.authorized();return ze.List.isList(o)?!!o.toJS().filter((s=>-1===Object.keys(s).map((s=>!!i.get(s))).indexOf(!1))).length:null},Ht=Ut(state,(s=>s.get("configs"))),execute=(s,{authSelectors:o,specSelectors:i})=>({path:a,method:u,operation:_,extras:w})=>{let x={authorized:o.authorized()&&o.authorized().toJS(),definitions:i.securityDefinitions()&&i.securityDefinitions().toJS(),specSecurity:i.security()&&i.security().toJS()};return s({path:a,method:u,operation:_,securities:x,...w})},loaded=(s,o)=>i=>{const{getConfigs:a,authActions:u}=o,_=a();if(s(i),_.persistAuthorization){const s=localStorage.getItem("authorized");s&&u.restoreAuthorization({authorized:JSON.parse(s)})}},wrap_actions_authorize=(s,o)=>i=>{s(i);if(o.getConfigs().persistAuthorization)try{const[{schema:s,value:o}]=Object.values(i),a=(0,ze.fromJS)(s),u="apiKey"===a.get("type"),_="cookie"===a.get("in");u&&_&&(document.cookie=`${a.get("name")}=${o}; SameSite=None; Secure`)}catch(s){console.error("Error persisting cookie based apiKey in document.cookie.",s)}},wrap_actions_logout=(s,o)=>i=>{const a=o.getConfigs(),u=o.authSelectors.authorized();try{a.persistAuthorization&&Array.isArray(i)&&i.forEach((s=>{const o=u.get(s,{}),i="apiKey"===o.getIn(["schema","type"]),a="cookie"===o.getIn(["schema","in"]);if(i&&a){const s=o.getIn(["schema","name"]);document.cookie=`${s}=; Max-Age=-99999999`}}))}catch(s){console.error("Error deleting cookie based apiKey from document.cookie.",s)}s(i)};var Kt=__webpack_require__(90179),Gt=__webpack_require__.n(Kt);class LockAuthIcon extends Re.Component{mapStateToProps(s,o){return{state:s,ownProps:Gt()(o,Object.keys(o.getSystem()))}}render(){const{getComponent:s,ownProps:o}=this.props,i=s("LockIcon");return Re.createElement(i,o)}}const Yt=LockAuthIcon;class UnlockAuthIcon extends Re.Component{mapStateToProps(s,o){return{state:s,ownProps:Gt()(o,Object.keys(o.getSystem()))}}render(){const{getComponent:s,ownProps:o}=this.props,i=s("UnlockIcon");return Re.createElement(i,o)}}const Xt=UnlockAuthIcon;function auth(){return{afterLoad(s){this.rootInjects=this.rootInjects||{},this.rootInjects.initOAuth=s.authActions.configureAuth,this.rootInjects.preauthorizeApiKey=preauthorizeApiKey.bind(null,s),this.rootInjects.preauthorizeBasic=preauthorizeBasic.bind(null,s)},components:{LockAuthIcon:Yt,UnlockAuthIcon:Xt,LockAuthOperationIcon:Yt,UnlockAuthOperationIcon:Xt},statePlugins:{auth:{reducers:$t,actions:o,selectors:a,wrapActions:{authorize:wrap_actions_authorize,logout:wrap_actions_logout}},configs:{wrapActions:{loaded}},spec:{wrapActions:{execute}}}}}function preauthorizeBasic(s,o,i,a){const{authActions:{authorize:u},specSelectors:{specJson:_,isOAS3:w}}=s,x=w()?["components","securitySchemes"]:["securityDefinitions"],C=_().getIn([...x,o]);return C?u({[o]:{value:{username:i,password:a},schema:C.toJS()}}):null}function preauthorizeApiKey(s,o,i){const{authActions:{authorize:a},specSelectors:{specJson:u,isOAS3:_}}=s,w=_()?["components","securitySchemes"]:["securityDefinitions"],x=u().getIn([...w,o]);return x?a({[o]:{value:i,schema:x.toJS()}}):null}function isNothing(s){return null==s}var Qt=function repeat(s,o){var i,a="";for(i=0;i<o;i+=1)a+=s;return a},Zt=function isNegativeZero(s){return 0===s&&Number.NEGATIVE_INFINITY===1/s},er={isNothing,isObject:function js_yaml_isObject(s){return"object"==typeof s&&null!==s},toArray:function toArray(s){return Array.isArray(s)?s:isNothing(s)?[]:[s]},repeat:Qt,isNegativeZero:Zt,extend:function extend(s,o){var i,a,u,_;if(o)for(i=0,a=(_=Object.keys(o)).length;i<a;i+=1)s[u=_[i]]=o[u];return s}};function formatError(s,o){var i="",a=s.reason||"(unknown reason)";return s.mark?(s.mark.name&&(i+='in "'+s.mark.name+'" '),i+="("+(s.mark.line+1)+":"+(s.mark.column+1)+")",!o&&s.mark.snippet&&(i+="\n\n"+s.mark.snippet),a+" "+i):a}function YAMLException$1(s,o){Error.call(this),this.name="YAMLException",this.reason=s,this.mark=o,this.message=formatError(this,!1),Error.captureStackTrace?Error.captureStackTrace(this,this.constructor):this.stack=(new Error).stack||""}YAMLException$1.prototype=Object.create(Error.prototype),YAMLException$1.prototype.constructor=YAMLException$1,YAMLException$1.prototype.toString=function toString(s){return this.name+": "+formatError(this,s)};var tr=YAMLException$1;function getLine(s,o,i,a,u){var _="",w="",x=Math.floor(u/2)-1;return a-o>x&&(o=a-x+(_=" ... ").length),i-a>x&&(i=a+x-(w=" ...").length),{str:_+s.slice(o,i).replace(/\t/g,"→")+w,pos:a-o+_.length}}function padStart(s,o){return er.repeat(" ",o-s.length)+s}var rr=function makeSnippet(s,o){if(o=Object.create(o||null),!s.buffer)return null;o.maxLength||(o.maxLength=79),"number"!=typeof o.indent&&(o.indent=1),"number"!=typeof o.linesBefore&&(o.linesBefore=3),"number"!=typeof o.linesAfter&&(o.linesAfter=2);for(var i,a=/\r?\n|\r|\0/g,u=[0],_=[],w=-1;i=a.exec(s.buffer);)_.push(i.index),u.push(i.index+i[0].length),s.position<=i.index&&w<0&&(w=u.length-2);w<0&&(w=u.length-1);var x,C,j="",L=Math.min(s.line+o.linesAfter,_.length).toString().length,B=o.maxLength-(o.indent+L+3);for(x=1;x<=o.linesBefore&&!(w-x<0);x++)C=getLine(s.buffer,u[w-x],_[w-x],s.position-(u[w]-u[w-x]),B),j=er.repeat(" ",o.indent)+padStart((s.line-x+1).toString(),L)+" | "+C.str+"\n"+j;for(C=getLine(s.buffer,u[w],_[w],s.position,B),j+=er.repeat(" ",o.indent)+padStart((s.line+1).toString(),L)+" | "+C.str+"\n",j+=er.repeat("-",o.indent+L+3+C.pos)+"^\n",x=1;x<=o.linesAfter&&!(w+x>=_.length);x++)C=getLine(s.buffer,u[w+x],_[w+x],s.position-(u[w]-u[w+x]),B),j+=er.repeat(" ",o.indent)+padStart((s.line+x+1).toString(),L)+" | "+C.str+"\n";return j.replace(/\n$/,"")},nr=["kind","multi","resolve","construct","instanceOf","predicate","represent","representName","defaultStyle","styleAliases"],sr=["scalar","sequence","mapping"];var ir=function Type$1(s,o){if(o=o||{},Object.keys(o).forEach((function(o){if(-1===nr.indexOf(o))throw new tr('Unknown option "'+o+'" is met in definition of "'+s+'" YAML type.')})),this.options=o,this.tag=s,this.kind=o.kind||null,this.resolve=o.resolve||function(){return!0},this.construct=o.construct||function(s){return s},this.instanceOf=o.instanceOf||null,this.predicate=o.predicate||null,this.represent=o.represent||null,this.representName=o.representName||null,this.defaultStyle=o.defaultStyle||null,this.multi=o.multi||!1,this.styleAliases=function compileStyleAliases(s){var o={};return null!==s&&Object.keys(s).forEach((function(i){s[i].forEach((function(s){o[String(s)]=i}))})),o}(o.styleAliases||null),-1===sr.indexOf(this.kind))throw new tr('Unknown kind "'+this.kind+'" is specified for "'+s+'" YAML type.')};function compileList(s,o){var i=[];return s[o].forEach((function(s){var o=i.length;i.forEach((function(i,a){i.tag===s.tag&&i.kind===s.kind&&i.multi===s.multi&&(o=a)})),i[o]=s})),i}function Schema$1(s){return this.extend(s)}Schema$1.prototype.extend=function extend(s){var o=[],i=[];if(s instanceof ir)i.push(s);else if(Array.isArray(s))i=i.concat(s);else{if(!s||!Array.isArray(s.implicit)&&!Array.isArray(s.explicit))throw new tr("Schema.extend argument should be a Type, [ Type ], or a schema definition ({ implicit: [...], explicit: [...] })");s.implicit&&(o=o.concat(s.implicit)),s.explicit&&(i=i.concat(s.explicit))}o.forEach((function(s){if(!(s instanceof ir))throw new tr("Specified list of YAML types (or a single Type object) contains a non-Type object.");if(s.loadKind&&"scalar"!==s.loadKind)throw new tr("There is a non-scalar type in the implicit list of a schema. Implicit resolving of such types is not supported.");if(s.multi)throw new tr("There is a multi type in the implicit list of a schema. Multi tags can only be listed as explicit.")})),i.forEach((function(s){if(!(s instanceof ir))throw new tr("Specified list of YAML types (or a single Type object) contains a non-Type object.")}));var a=Object.create(Schema$1.prototype);return a.implicit=(this.implicit||[]).concat(o),a.explicit=(this.explicit||[]).concat(i),a.compiledImplicit=compileList(a,"implicit"),a.compiledExplicit=compileList(a,"explicit"),a.compiledTypeMap=function compileMap(){var s,o,i={scalar:{},sequence:{},mapping:{},fallback:{},multi:{scalar:[],sequence:[],mapping:[],fallback:[]}};function collectType(s){s.multi?(i.multi[s.kind].push(s),i.multi.fallback.push(s)):i[s.kind][s.tag]=i.fallback[s.tag]=s}for(s=0,o=arguments.length;s<o;s+=1)arguments[s].forEach(collectType);return i}(a.compiledImplicit,a.compiledExplicit),a};var ar=Schema$1,cr=new ir("tag:yaml.org,2002:str",{kind:"scalar",construct:function(s){return null!==s?s:""}}),lr=new ir("tag:yaml.org,2002:seq",{kind:"sequence",construct:function(s){return null!==s?s:[]}}),ur=new ir("tag:yaml.org,2002:map",{kind:"mapping",construct:function(s){return null!==s?s:{}}}),pr=new ar({explicit:[cr,lr,ur]});var dr=new ir("tag:yaml.org,2002:null",{kind:"scalar",resolve:function resolveYamlNull(s){if(null===s)return!0;var o=s.length;return 1===o&&"~"===s||4===o&&("null"===s||"Null"===s||"NULL"===s)},construct:function constructYamlNull(){return null},predicate:function isNull(s){return null===s},represent:{canonical:function(){return"~"},lowercase:function(){return"null"},uppercase:function(){return"NULL"},camelcase:function(){return"Null"},empty:function(){return""}},defaultStyle:"lowercase"});var fr=new ir("tag:yaml.org,2002:bool",{kind:"scalar",resolve:function resolveYamlBoolean(s){if(null===s)return!1;var o=s.length;return 4===o&&("true"===s||"True"===s||"TRUE"===s)||5===o&&("false"===s||"False"===s||"FALSE"===s)},construct:function constructYamlBoolean(s){return"true"===s||"True"===s||"TRUE"===s},predicate:function isBoolean(s){return"[object Boolean]"===Object.prototype.toString.call(s)},represent:{lowercase:function(s){return s?"true":"false"},uppercase:function(s){return s?"TRUE":"FALSE"},camelcase:function(s){return s?"True":"False"}},defaultStyle:"lowercase"});function isOctCode(s){return 48<=s&&s<=55}function isDecCode(s){return 48<=s&&s<=57}var mr=new ir("tag:yaml.org,2002:int",{kind:"scalar",resolve:function resolveYamlInteger(s){if(null===s)return!1;var o,i,a=s.length,u=0,_=!1;if(!a)return!1;if("-"!==(o=s[u])&&"+"!==o||(o=s[++u]),"0"===o){if(u+1===a)return!0;if("b"===(o=s[++u])){for(u++;u<a;u++)if("_"!==(o=s[u])){if("0"!==o&&"1"!==o)return!1;_=!0}return _&&"_"!==o}if("x"===o){for(u++;u<a;u++)if("_"!==(o=s[u])){if(!(48<=(i=s.charCodeAt(u))&&i<=57||65<=i&&i<=70||97<=i&&i<=102))return!1;_=!0}return _&&"_"!==o}if("o"===o){for(u++;u<a;u++)if("_"!==(o=s[u])){if(!isOctCode(s.charCodeAt(u)))return!1;_=!0}return _&&"_"!==o}}if("_"===o)return!1;for(;u<a;u++)if("_"!==(o=s[u])){if(!isDecCode(s.charCodeAt(u)))return!1;_=!0}return!(!_||"_"===o)},construct:function constructYamlInteger(s){var o,i=s,a=1;if(-1!==i.indexOf("_")&&(i=i.replace(/_/g,"")),"-"!==(o=i[0])&&"+"!==o||("-"===o&&(a=-1),o=(i=i.slice(1))[0]),"0"===i)return 0;if("0"===o){if("b"===i[1])return a*parseInt(i.slice(2),2);if("x"===i[1])return a*parseInt(i.slice(2),16);if("o"===i[1])return a*parseInt(i.slice(2),8)}return a*parseInt(i,10)},predicate:function isInteger(s){return"[object Number]"===Object.prototype.toString.call(s)&&s%1==0&&!er.isNegativeZero(s)},represent:{binary:function(s){return s>=0?"0b"+s.toString(2):"-0b"+s.toString(2).slice(1)},octal:function(s){return s>=0?"0o"+s.toString(8):"-0o"+s.toString(8).slice(1)},decimal:function(s){return s.toString(10)},hexadecimal:function(s){return s>=0?"0x"+s.toString(16).toUpperCase():"-0x"+s.toString(16).toUpperCase().slice(1)}},defaultStyle:"decimal",styleAliases:{binary:[2,"bin"],octal:[8,"oct"],decimal:[10,"dec"],hexadecimal:[16,"hex"]}}),gr=new RegExp("^(?:[-+]?(?:[0-9][0-9_]*)(?:\\.[0-9_]*)?(?:[eE][-+]?[0-9]+)?|\\.[0-9_]+(?:[eE][-+]?[0-9]+)?|[-+]?\\.(?:inf|Inf|INF)|\\.(?:nan|NaN|NAN))$");var yr=/^[-+]?[0-9]+e/;var vr=new ir("tag:yaml.org,2002:float",{kind:"scalar",resolve:function resolveYamlFloat(s){return null!==s&&!(!gr.test(s)||"_"===s[s.length-1])},construct:function constructYamlFloat(s){var o,i;return i="-"===(o=s.replace(/_/g,"").toLowerCase())[0]?-1:1,"+-".indexOf(o[0])>=0&&(o=o.slice(1)),".inf"===o?1===i?Number.POSITIVE_INFINITY:Number.NEGATIVE_INFINITY:".nan"===o?NaN:i*parseFloat(o,10)},predicate:function isFloat(s){return"[object Number]"===Object.prototype.toString.call(s)&&(s%1!=0||er.isNegativeZero(s))},represent:function representYamlFloat(s,o){var i;if(isNaN(s))switch(o){case"lowercase":return".nan";case"uppercase":return".NAN";case"camelcase":return".NaN"}else if(Number.POSITIVE_INFINITY===s)switch(o){case"lowercase":return".inf";case"uppercase":return".INF";case"camelcase":return".Inf"}else if(Number.NEGATIVE_INFINITY===s)switch(o){case"lowercase":return"-.inf";case"uppercase":return"-.INF";case"camelcase":return"-.Inf"}else if(er.isNegativeZero(s))return"-0.0";return i=s.toString(10),yr.test(i)?i.replace("e",".e"):i},defaultStyle:"lowercase"}),br=pr.extend({implicit:[dr,fr,mr,vr]}),_r=br,Sr=new RegExp("^([0-9][0-9][0-9][0-9])-([0-9][0-9])-([0-9][0-9])$"),Er=new RegExp("^([0-9][0-9][0-9][0-9])-([0-9][0-9]?)-([0-9][0-9]?)(?:[Tt]|[ \\t]+)([0-9][0-9]?):([0-9][0-9]):([0-9][0-9])(?:\\.([0-9]*))?(?:[ \\t]*(Z|([-+])([0-9][0-9]?)(?::([0-9][0-9]))?))?$");var wr=new ir("tag:yaml.org,2002:timestamp",{kind:"scalar",resolve:function resolveYamlTimestamp(s){return null!==s&&(null!==Sr.exec(s)||null!==Er.exec(s))},construct:function constructYamlTimestamp(s){var o,i,a,u,_,w,x,C,j=0,L=null;if(null===(o=Sr.exec(s))&&(o=Er.exec(s)),null===o)throw new Error("Date resolve error");if(i=+o[1],a=+o[2]-1,u=+o[3],!o[4])return new Date(Date.UTC(i,a,u));if(_=+o[4],w=+o[5],x=+o[6],o[7]){for(j=o[7].slice(0,3);j.length<3;)j+="0";j=+j}return o[9]&&(L=6e4*(60*+o[10]+ +(o[11]||0)),"-"===o[9]&&(L=-L)),C=new Date(Date.UTC(i,a,u,_,w,x,j)),L&&C.setTime(C.getTime()-L),C},instanceOf:Date,represent:function representYamlTimestamp(s){return s.toISOString()}});var xr=new ir("tag:yaml.org,2002:merge",{kind:"scalar",resolve:function resolveYamlMerge(s){return"<<"===s||null===s}}),kr="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\n\r";var Or=new ir("tag:yaml.org,2002:binary",{kind:"scalar",resolve:function resolveYamlBinary(s){if(null===s)return!1;var o,i,a=0,u=s.length,_=kr;for(i=0;i<u;i++)if(!((o=_.indexOf(s.charAt(i)))>64)){if(o<0)return!1;a+=6}return a%8==0},construct:function constructYamlBinary(s){var o,i,a=s.replace(/[\r\n=]/g,""),u=a.length,_=kr,w=0,x=[];for(o=0;o<u;o++)o%4==0&&o&&(x.push(w>>16&255),x.push(w>>8&255),x.push(255&w)),w=w<<6|_.indexOf(a.charAt(o));return 0===(i=u%4*6)?(x.push(w>>16&255),x.push(w>>8&255),x.push(255&w)):18===i?(x.push(w>>10&255),x.push(w>>2&255)):12===i&&x.push(w>>4&255),new Uint8Array(x)},predicate:function isBinary(s){return"[object Uint8Array]"===Object.prototype.toString.call(s)},represent:function representYamlBinary(s){var o,i,a="",u=0,_=s.length,w=kr;for(o=0;o<_;o++)o%3==0&&o&&(a+=w[u>>18&63],a+=w[u>>12&63],a+=w[u>>6&63],a+=w[63&u]),u=(u<<8)+s[o];return 0===(i=_%3)?(a+=w[u>>18&63],a+=w[u>>12&63],a+=w[u>>6&63],a+=w[63&u]):2===i?(a+=w[u>>10&63],a+=w[u>>4&63],a+=w[u<<2&63],a+=w[64]):1===i&&(a+=w[u>>2&63],a+=w[u<<4&63],a+=w[64],a+=w[64]),a}}),Ar=Object.prototype.hasOwnProperty,Cr=Object.prototype.toString;var jr=new ir("tag:yaml.org,2002:omap",{kind:"sequence",resolve:function resolveYamlOmap(s){if(null===s)return!0;var o,i,a,u,_,w=[],x=s;for(o=0,i=x.length;o<i;o+=1){if(a=x[o],_=!1,"[object Object]"!==Cr.call(a))return!1;for(u in a)if(Ar.call(a,u)){if(_)return!1;_=!0}if(!_)return!1;if(-1!==w.indexOf(u))return!1;w.push(u)}return!0},construct:function constructYamlOmap(s){return null!==s?s:[]}}),Pr=Object.prototype.toString;var Ir=new ir("tag:yaml.org,2002:pairs",{kind:"sequence",resolve:function resolveYamlPairs(s){if(null===s)return!0;var o,i,a,u,_,w=s;for(_=new Array(w.length),o=0,i=w.length;o<i;o+=1){if(a=w[o],"[object Object]"!==Pr.call(a))return!1;if(1!==(u=Object.keys(a)).length)return!1;_[o]=[u[0],a[u[0]]]}return!0},construct:function constructYamlPairs(s){if(null===s)return[];var o,i,a,u,_,w=s;for(_=new Array(w.length),o=0,i=w.length;o<i;o+=1)a=w[o],u=Object.keys(a),_[o]=[u[0],a[u[0]]];return _}}),Tr=Object.prototype.hasOwnProperty;var Nr=new ir("tag:yaml.org,2002:set",{kind:"mapping",resolve:function resolveYamlSet(s){if(null===s)return!0;var o,i=s;for(o in i)if(Tr.call(i,o)&&null!==i[o])return!1;return!0},construct:function constructYamlSet(s){return null!==s?s:{}}}),Mr=_r.extend({implicit:[wr,xr],explicit:[Or,jr,Ir,Nr]}),Rr=Object.prototype.hasOwnProperty,Dr=/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x84\x86-\x9F\uFFFE\uFFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/,Lr=/[\x85\u2028\u2029]/,Fr=/[,\[\]\{\}]/,Br=/^(?:!|!!|![a-z\-]+!)$/i,$r=/^(?:!|[^,\[\]\{\}])(?:%[0-9a-f]{2}|[0-9a-z\-#;\/\?:@&=\+\$,_\.!~\*'\(\)\[\]])*$/i;function _class(s){return Object.prototype.toString.call(s)}function is_EOL(s){return 10===s||13===s}function is_WHITE_SPACE(s){return 9===s||32===s}function is_WS_OR_EOL(s){return 9===s||32===s||10===s||13===s}function is_FLOW_INDICATOR(s){return 44===s||91===s||93===s||123===s||125===s}function fromHexCode(s){var o;return 48<=s&&s<=57?s-48:97<=(o=32|s)&&o<=102?o-97+10:-1}function simpleEscapeSequence(s){return 48===s?"\0":97===s?"":98===s?"\b":116===s||9===s?"\t":110===s?"\n":118===s?"\v":102===s?"\f":114===s?"\r":101===s?"":32===s?" ":34===s?'"':47===s?"/":92===s?"\\":78===s?"":95===s?" ":76===s?"\u2028":80===s?"\u2029":""}function charFromCodepoint(s){return s<=65535?String.fromCharCode(s):String.fromCharCode(55296+(s-65536>>10),56320+(s-65536&1023))}function setProperty(s,o,i){"__proto__"===o?Object.defineProperty(s,o,{configurable:!0,enumerable:!0,writable:!0,value:i}):s[o]=i}for(var qr=new Array(256),Ur=new Array(256),Vr=0;Vr<256;Vr++)qr[Vr]=simpleEscapeSequence(Vr)?1:0,Ur[Vr]=simpleEscapeSequence(Vr);function State$1(s,o){this.input=s,this.filename=o.filename||null,this.schema=o.schema||Mr,this.onWarning=o.onWarning||null,this.legacy=o.legacy||!1,this.json=o.json||!1,this.listener=o.listener||null,this.implicitTypes=this.schema.compiledImplicit,this.typeMap=this.schema.compiledTypeMap,this.length=s.length,this.position=0,this.line=0,this.lineStart=0,this.lineIndent=0,this.firstTabInLine=-1,this.documents=[]}function generateError(s,o){var i={name:s.filename,buffer:s.input.slice(0,-1),position:s.position,line:s.line,column:s.position-s.lineStart};return i.snippet=rr(i),new tr(o,i)}function throwError(s,o){throw generateError(s,o)}function throwWarning(s,o){s.onWarning&&s.onWarning.call(null,generateError(s,o))}var zr={YAML:function handleYamlDirective(s,o,i){var a,u,_;null!==s.version&&throwError(s,"duplication of %YAML directive"),1!==i.length&&throwError(s,"YAML directive accepts exactly one argument"),null===(a=/^([0-9]+)\.([0-9]+)$/.exec(i[0]))&&throwError(s,"ill-formed argument of the YAML directive"),u=parseInt(a[1],10),_=parseInt(a[2],10),1!==u&&throwError(s,"unacceptable YAML version of the document"),s.version=i[0],s.checkLineBreaks=_<2,1!==_&&2!==_&&throwWarning(s,"unsupported YAML version of the document")},TAG:function handleTagDirective(s,o,i){var a,u;2!==i.length&&throwError(s,"TAG directive accepts exactly two arguments"),a=i[0],u=i[1],Br.test(a)||throwError(s,"ill-formed tag handle (first argument) of the TAG directive"),Rr.call(s.tagMap,a)&&throwError(s,'there is a previously declared suffix for "'+a+'" tag handle'),$r.test(u)||throwError(s,"ill-formed tag prefix (second argument) of the TAG directive");try{u=decodeURIComponent(u)}catch(o){throwError(s,"tag prefix is malformed: "+u)}s.tagMap[a]=u}};function captureSegment(s,o,i,a){var u,_,w,x;if(o<i){if(x=s.input.slice(o,i),a)for(u=0,_=x.length;u<_;u+=1)9===(w=x.charCodeAt(u))||32<=w&&w<=1114111||throwError(s,"expected valid JSON character");else Dr.test(x)&&throwError(s,"the stream contains non-printable characters");s.result+=x}}function mergeMappings(s,o,i,a){var u,_,w,x;for(er.isObject(i)||throwError(s,"cannot merge mappings; the provided source object is unacceptable"),w=0,x=(u=Object.keys(i)).length;w<x;w+=1)_=u[w],Rr.call(o,_)||(setProperty(o,_,i[_]),a[_]=!0)}function storeMappingPair(s,o,i,a,u,_,w,x,C){var j,L;if(Array.isArray(u))for(j=0,L=(u=Array.prototype.slice.call(u)).length;j<L;j+=1)Array.isArray(u[j])&&throwError(s,"nested arrays are not supported inside keys"),"object"==typeof u&&"[object Object]"===_class(u[j])&&(u[j]="[object Object]");if("object"==typeof u&&"[object Object]"===_class(u)&&(u="[object Object]"),u=String(u),null===o&&(o={}),"tag:yaml.org,2002:merge"===a)if(Array.isArray(_))for(j=0,L=_.length;j<L;j+=1)mergeMappings(s,o,_[j],i);else mergeMappings(s,o,_,i);else s.json||Rr.call(i,u)||!Rr.call(o,u)||(s.line=w||s.line,s.lineStart=x||s.lineStart,s.position=C||s.position,throwError(s,"duplicated mapping key")),setProperty(o,u,_),delete i[u];return o}function readLineBreak(s){var o;10===(o=s.input.charCodeAt(s.position))?s.position++:13===o?(s.position++,10===s.input.charCodeAt(s.position)&&s.position++):throwError(s,"a line break is expected"),s.line+=1,s.lineStart=s.position,s.firstTabInLine=-1}function skipSeparationSpace(s,o,i){for(var a=0,u=s.input.charCodeAt(s.position);0!==u;){for(;is_WHITE_SPACE(u);)9===u&&-1===s.firstTabInLine&&(s.firstTabInLine=s.position),u=s.input.charCodeAt(++s.position);if(o&&35===u)do{u=s.input.charCodeAt(++s.position)}while(10!==u&&13!==u&&0!==u);if(!is_EOL(u))break;for(readLineBreak(s),u=s.input.charCodeAt(s.position),a++,s.lineIndent=0;32===u;)s.lineIndent++,u=s.input.charCodeAt(++s.position)}return-1!==i&&0!==a&&s.lineIndent<i&&throwWarning(s,"deficient indentation"),a}function testDocumentSeparator(s){var o,i=s.position;return!(45!==(o=s.input.charCodeAt(i))&&46!==o||o!==s.input.charCodeAt(i+1)||o!==s.input.charCodeAt(i+2)||(i+=3,0!==(o=s.input.charCodeAt(i))&&!is_WS_OR_EOL(o)))}function writeFoldedLines(s,o){1===o?s.result+=" ":o>1&&(s.result+=er.repeat("\n",o-1))}function readBlockSequence(s,o){var i,a,u=s.tag,_=s.anchor,w=[],x=!1;if(-1!==s.firstTabInLine)return!1;for(null!==s.anchor&&(s.anchorMap[s.anchor]=w),a=s.input.charCodeAt(s.position);0!==a&&(-1!==s.firstTabInLine&&(s.position=s.firstTabInLine,throwError(s,"tab characters must not be used in indentation")),45===a)&&is_WS_OR_EOL(s.input.charCodeAt(s.position+1));)if(x=!0,s.position++,skipSeparationSpace(s,!0,-1)&&s.lineIndent<=o)w.push(null),a=s.input.charCodeAt(s.position);else if(i=s.line,composeNode(s,o,3,!1,!0),w.push(s.result),skipSeparationSpace(s,!0,-1),a=s.input.charCodeAt(s.position),(s.line===i||s.lineIndent>o)&&0!==a)throwError(s,"bad indentation of a sequence entry");else if(s.lineIndent<o)break;return!!x&&(s.tag=u,s.anchor=_,s.kind="sequence",s.result=w,!0)}function readTagProperty(s){var o,i,a,u,_=!1,w=!1;if(33!==(u=s.input.charCodeAt(s.position)))return!1;if(null!==s.tag&&throwError(s,"duplication of a tag property"),60===(u=s.input.charCodeAt(++s.position))?(_=!0,u=s.input.charCodeAt(++s.position)):33===u?(w=!0,i="!!",u=s.input.charCodeAt(++s.position)):i="!",o=s.position,_){do{u=s.input.charCodeAt(++s.position)}while(0!==u&&62!==u);s.position<s.length?(a=s.input.slice(o,s.position),u=s.input.charCodeAt(++s.position)):throwError(s,"unexpected end of the stream within a verbatim tag")}else{for(;0!==u&&!is_WS_OR_EOL(u);)33===u&&(w?throwError(s,"tag suffix cannot contain exclamation marks"):(i=s.input.slice(o-1,s.position+1),Br.test(i)||throwError(s,"named tag handle cannot contain such characters"),w=!0,o=s.position+1)),u=s.input.charCodeAt(++s.position);a=s.input.slice(o,s.position),Fr.test(a)&&throwError(s,"tag suffix cannot contain flow indicator characters")}a&&!$r.test(a)&&throwError(s,"tag name cannot contain such characters: "+a);try{a=decodeURIComponent(a)}catch(o){throwError(s,"tag name is malformed: "+a)}return _?s.tag=a:Rr.call(s.tagMap,i)?s.tag=s.tagMap[i]+a:"!"===i?s.tag="!"+a:"!!"===i?s.tag="tag:yaml.org,2002:"+a:throwError(s,'undeclared tag handle "'+i+'"'),!0}function readAnchorProperty(s){var o,i;if(38!==(i=s.input.charCodeAt(s.position)))return!1;for(null!==s.anchor&&throwError(s,"duplication of an anchor property"),i=s.input.charCodeAt(++s.position),o=s.position;0!==i&&!is_WS_OR_EOL(i)&&!is_FLOW_INDICATOR(i);)i=s.input.charCodeAt(++s.position);return s.position===o&&throwError(s,"name of an anchor node must contain at least one character"),s.anchor=s.input.slice(o,s.position),!0}function composeNode(s,o,i,a,u){var _,w,x,C,j,L,B,$,U,V=1,z=!1,Y=!1;if(null!==s.listener&&s.listener("open",s),s.tag=null,s.anchor=null,s.kind=null,s.result=null,_=w=x=4===i||3===i,a&&skipSeparationSpace(s,!0,-1)&&(z=!0,s.lineIndent>o?V=1:s.lineIndent===o?V=0:s.lineIndent<o&&(V=-1)),1===V)for(;readTagProperty(s)||readAnchorProperty(s);)skipSeparationSpace(s,!0,-1)?(z=!0,x=_,s.lineIndent>o?V=1:s.lineIndent===o?V=0:s.lineIndent<o&&(V=-1)):x=!1;if(x&&(x=z||u),1!==V&&4!==i||($=1===i||2===i?o:o+1,U=s.position-s.lineStart,1===V?x&&(readBlockSequence(s,U)||function readBlockMapping(s,o,i){var a,u,_,w,x,C,j,L=s.tag,B=s.anchor,$={},U=Object.create(null),V=null,z=null,Y=null,Z=!1,ee=!1;if(-1!==s.firstTabInLine)return!1;for(null!==s.anchor&&(s.anchorMap[s.anchor]=$),j=s.input.charCodeAt(s.position);0!==j;){if(Z||-1===s.firstTabInLine||(s.position=s.firstTabInLine,throwError(s,"tab characters must not be used in indentation")),a=s.input.charCodeAt(s.position+1),_=s.line,63!==j&&58!==j||!is_WS_OR_EOL(a)){if(w=s.line,x=s.lineStart,C=s.position,!composeNode(s,i,2,!1,!0))break;if(s.line===_){for(j=s.input.charCodeAt(s.position);is_WHITE_SPACE(j);)j=s.input.charCodeAt(++s.position);if(58===j)is_WS_OR_EOL(j=s.input.charCodeAt(++s.position))||throwError(s,"a whitespace character is expected after the key-value separator within a block mapping"),Z&&(storeMappingPair(s,$,U,V,z,null,w,x,C),V=z=Y=null),ee=!0,Z=!1,u=!1,V=s.tag,z=s.result;else{if(!ee)return s.tag=L,s.anchor=B,!0;throwError(s,"can not read an implicit mapping pair; a colon is missed")}}else{if(!ee)return s.tag=L,s.anchor=B,!0;throwError(s,"can not read a block mapping entry; a multiline key may not be an implicit key")}}else 63===j?(Z&&(storeMappingPair(s,$,U,V,z,null,w,x,C),V=z=Y=null),ee=!0,Z=!0,u=!0):Z?(Z=!1,u=!0):throwError(s,"incomplete explicit mapping pair; a key node is missed; or followed by a non-tabulated empty line"),s.position+=1,j=a;if((s.line===_||s.lineIndent>o)&&(Z&&(w=s.line,x=s.lineStart,C=s.position),composeNode(s,o,4,!0,u)&&(Z?z=s.result:Y=s.result),Z||(storeMappingPair(s,$,U,V,z,Y,w,x,C),V=z=Y=null),skipSeparationSpace(s,!0,-1),j=s.input.charCodeAt(s.position)),(s.line===_||s.lineIndent>o)&&0!==j)throwError(s,"bad indentation of a mapping entry");else if(s.lineIndent<o)break}return Z&&storeMappingPair(s,$,U,V,z,null,w,x,C),ee&&(s.tag=L,s.anchor=B,s.kind="mapping",s.result=$),ee}(s,U,$))||function readFlowCollection(s,o){var i,a,u,_,w,x,C,j,L,B,$,U,V=!0,z=s.tag,Y=s.anchor,Z=Object.create(null);if(91===(U=s.input.charCodeAt(s.position)))w=93,j=!1,_=[];else{if(123!==U)return!1;w=125,j=!0,_={}}for(null!==s.anchor&&(s.anchorMap[s.anchor]=_),U=s.input.charCodeAt(++s.position);0!==U;){if(skipSeparationSpace(s,!0,o),(U=s.input.charCodeAt(s.position))===w)return s.position++,s.tag=z,s.anchor=Y,s.kind=j?"mapping":"sequence",s.result=_,!0;V?44===U&&throwError(s,"expected the node content, but found ','"):throwError(s,"missed comma between flow collection entries"),$=null,x=C=!1,63===U&&is_WS_OR_EOL(s.input.charCodeAt(s.position+1))&&(x=C=!0,s.position++,skipSeparationSpace(s,!0,o)),i=s.line,a=s.lineStart,u=s.position,composeNode(s,o,1,!1,!0),B=s.tag,L=s.result,skipSeparationSpace(s,!0,o),U=s.input.charCodeAt(s.position),!C&&s.line!==i||58!==U||(x=!0,U=s.input.charCodeAt(++s.position),skipSeparationSpace(s,!0,o),composeNode(s,o,1,!1,!0),$=s.result),j?storeMappingPair(s,_,Z,B,L,$,i,a,u):x?_.push(storeMappingPair(s,null,Z,B,L,$,i,a,u)):_.push(L),skipSeparationSpace(s,!0,o),44===(U=s.input.charCodeAt(s.position))?(V=!0,U=s.input.charCodeAt(++s.position)):V=!1}throwError(s,"unexpected end of the stream within a flow collection")}(s,$)?Y=!0:(w&&function readBlockScalar(s,o){var i,a,u,_,w,x=1,C=!1,j=!1,L=o,B=0,$=!1;if(124===(_=s.input.charCodeAt(s.position)))a=!1;else{if(62!==_)return!1;a=!0}for(s.kind="scalar",s.result="";0!==_;)if(43===(_=s.input.charCodeAt(++s.position))||45===_)1===x?x=43===_?3:2:throwError(s,"repeat of a chomping mode identifier");else{if(!((u=48<=(w=_)&&w<=57?w-48:-1)>=0))break;0===u?throwError(s,"bad explicit indentation width of a block scalar; it cannot be less than one"):j?throwError(s,"repeat of an indentation width identifier"):(L=o+u-1,j=!0)}if(is_WHITE_SPACE(_)){do{_=s.input.charCodeAt(++s.position)}while(is_WHITE_SPACE(_));if(35===_)do{_=s.input.charCodeAt(++s.position)}while(!is_EOL(_)&&0!==_)}for(;0!==_;){for(readLineBreak(s),s.lineIndent=0,_=s.input.charCodeAt(s.position);(!j||s.lineIndent<L)&&32===_;)s.lineIndent++,_=s.input.charCodeAt(++s.position);if(!j&&s.lineIndent>L&&(L=s.lineIndent),is_EOL(_))B++;else{if(s.lineIndent<L){3===x?s.result+=er.repeat("\n",C?1+B:B):1===x&&C&&(s.result+="\n");break}for(a?is_WHITE_SPACE(_)?($=!0,s.result+=er.repeat("\n",C?1+B:B)):$?($=!1,s.result+=er.repeat("\n",B+1)):0===B?C&&(s.result+=" "):s.result+=er.repeat("\n",B):s.result+=er.repeat("\n",C?1+B:B),C=!0,j=!0,B=0,i=s.position;!is_EOL(_)&&0!==_;)_=s.input.charCodeAt(++s.position);captureSegment(s,i,s.position,!1)}}return!0}(s,$)||function readSingleQuotedScalar(s,o){var i,a,u;if(39!==(i=s.input.charCodeAt(s.position)))return!1;for(s.kind="scalar",s.result="",s.position++,a=u=s.position;0!==(i=s.input.charCodeAt(s.position));)if(39===i){if(captureSegment(s,a,s.position,!0),39!==(i=s.input.charCodeAt(++s.position)))return!0;a=s.position,s.position++,u=s.position}else is_EOL(i)?(captureSegment(s,a,u,!0),writeFoldedLines(s,skipSeparationSpace(s,!1,o)),a=u=s.position):s.position===s.lineStart&&testDocumentSeparator(s)?throwError(s,"unexpected end of the document within a single quoted scalar"):(s.position++,u=s.position);throwError(s,"unexpected end of the stream within a single quoted scalar")}(s,$)||function readDoubleQuotedScalar(s,o){var i,a,u,_,w,x,C;if(34!==(x=s.input.charCodeAt(s.position)))return!1;for(s.kind="scalar",s.result="",s.position++,i=a=s.position;0!==(x=s.input.charCodeAt(s.position));){if(34===x)return captureSegment(s,i,s.position,!0),s.position++,!0;if(92===x){if(captureSegment(s,i,s.position,!0),is_EOL(x=s.input.charCodeAt(++s.position)))skipSeparationSpace(s,!1,o);else if(x<256&&qr[x])s.result+=Ur[x],s.position++;else if((w=120===(C=x)?2:117===C?4:85===C?8:0)>0){for(u=w,_=0;u>0;u--)(w=fromHexCode(x=s.input.charCodeAt(++s.position)))>=0?_=(_<<4)+w:throwError(s,"expected hexadecimal character");s.result+=charFromCodepoint(_),s.position++}else throwError(s,"unknown escape sequence");i=a=s.position}else is_EOL(x)?(captureSegment(s,i,a,!0),writeFoldedLines(s,skipSeparationSpace(s,!1,o)),i=a=s.position):s.position===s.lineStart&&testDocumentSeparator(s)?throwError(s,"unexpected end of the document within a double quoted scalar"):(s.position++,a=s.position)}throwError(s,"unexpected end of the stream within a double quoted scalar")}(s,$)?Y=!0:!function readAlias(s){var o,i,a;if(42!==(a=s.input.charCodeAt(s.position)))return!1;for(a=s.input.charCodeAt(++s.position),o=s.position;0!==a&&!is_WS_OR_EOL(a)&&!is_FLOW_INDICATOR(a);)a=s.input.charCodeAt(++s.position);return s.position===o&&throwError(s,"name of an alias node must contain at least one character"),i=s.input.slice(o,s.position),Rr.call(s.anchorMap,i)||throwError(s,'unidentified alias "'+i+'"'),s.result=s.anchorMap[i],skipSeparationSpace(s,!0,-1),!0}(s)?function readPlainScalar(s,o,i){var a,u,_,w,x,C,j,L,B=s.kind,$=s.result;if(is_WS_OR_EOL(L=s.input.charCodeAt(s.position))||is_FLOW_INDICATOR(L)||35===L||38===L||42===L||33===L||124===L||62===L||39===L||34===L||37===L||64===L||96===L)return!1;if((63===L||45===L)&&(is_WS_OR_EOL(a=s.input.charCodeAt(s.position+1))||i&&is_FLOW_INDICATOR(a)))return!1;for(s.kind="scalar",s.result="",u=_=s.position,w=!1;0!==L;){if(58===L){if(is_WS_OR_EOL(a=s.input.charCodeAt(s.position+1))||i&&is_FLOW_INDICATOR(a))break}else if(35===L){if(is_WS_OR_EOL(s.input.charCodeAt(s.position-1)))break}else{if(s.position===s.lineStart&&testDocumentSeparator(s)||i&&is_FLOW_INDICATOR(L))break;if(is_EOL(L)){if(x=s.line,C=s.lineStart,j=s.lineIndent,skipSeparationSpace(s,!1,-1),s.lineIndent>=o){w=!0,L=s.input.charCodeAt(s.position);continue}s.position=_,s.line=x,s.lineStart=C,s.lineIndent=j;break}}w&&(captureSegment(s,u,_,!1),writeFoldedLines(s,s.line-x),u=_=s.position,w=!1),is_WHITE_SPACE(L)||(_=s.position+1),L=s.input.charCodeAt(++s.position)}return captureSegment(s,u,_,!1),!!s.result||(s.kind=B,s.result=$,!1)}(s,$,1===i)&&(Y=!0,null===s.tag&&(s.tag="?")):(Y=!0,null===s.tag&&null===s.anchor||throwError(s,"alias node should not have any properties")),null!==s.anchor&&(s.anchorMap[s.anchor]=s.result)):0===V&&(Y=x&&readBlockSequence(s,U))),null===s.tag)null!==s.anchor&&(s.anchorMap[s.anchor]=s.result);else if("?"===s.tag){for(null!==s.result&&"scalar"!==s.kind&&throwError(s,'unacceptable node kind for !<?> tag; it should be "scalar", not "'+s.kind+'"'),C=0,j=s.implicitTypes.length;C<j;C+=1)if((B=s.implicitTypes[C]).resolve(s.result)){s.result=B.construct(s.result),s.tag=B.tag,null!==s.anchor&&(s.anchorMap[s.anchor]=s.result);break}}else if("!"!==s.tag){if(Rr.call(s.typeMap[s.kind||"fallback"],s.tag))B=s.typeMap[s.kind||"fallback"][s.tag];else for(B=null,C=0,j=(L=s.typeMap.multi[s.kind||"fallback"]).length;C<j;C+=1)if(s.tag.slice(0,L[C].tag.length)===L[C].tag){B=L[C];break}B||throwError(s,"unknown tag !<"+s.tag+">"),null!==s.result&&B.kind!==s.kind&&throwError(s,"unacceptable node kind for !<"+s.tag+'> tag; it should be "'+B.kind+'", not "'+s.kind+'"'),B.resolve(s.result,s.tag)?(s.result=B.construct(s.result,s.tag),null!==s.anchor&&(s.anchorMap[s.anchor]=s.result)):throwError(s,"cannot resolve a node with !<"+s.tag+"> explicit tag")}return null!==s.listener&&s.listener("close",s),null!==s.tag||null!==s.anchor||Y}function readDocument(s){var o,i,a,u,_=s.position,w=!1;for(s.version=null,s.checkLineBreaks=s.legacy,s.tagMap=Object.create(null),s.anchorMap=Object.create(null);0!==(u=s.input.charCodeAt(s.position))&&(skipSeparationSpace(s,!0,-1),u=s.input.charCodeAt(s.position),!(s.lineIndent>0||37!==u));){for(w=!0,u=s.input.charCodeAt(++s.position),o=s.position;0!==u&&!is_WS_OR_EOL(u);)u=s.input.charCodeAt(++s.position);for(a=[],(i=s.input.slice(o,s.position)).length<1&&throwError(s,"directive name must not be less than one character in length");0!==u;){for(;is_WHITE_SPACE(u);)u=s.input.charCodeAt(++s.position);if(35===u){do{u=s.input.charCodeAt(++s.position)}while(0!==u&&!is_EOL(u));break}if(is_EOL(u))break;for(o=s.position;0!==u&&!is_WS_OR_EOL(u);)u=s.input.charCodeAt(++s.position);a.push(s.input.slice(o,s.position))}0!==u&&readLineBreak(s),Rr.call(zr,i)?zr[i](s,i,a):throwWarning(s,'unknown document directive "'+i+'"')}skipSeparationSpace(s,!0,-1),0===s.lineIndent&&45===s.input.charCodeAt(s.position)&&45===s.input.charCodeAt(s.position+1)&&45===s.input.charCodeAt(s.position+2)?(s.position+=3,skipSeparationSpace(s,!0,-1)):w&&throwError(s,"directives end mark is expected"),composeNode(s,s.lineIndent-1,4,!1,!0),skipSeparationSpace(s,!0,-1),s.checkLineBreaks&&Lr.test(s.input.slice(_,s.position))&&throwWarning(s,"non-ASCII line breaks are interpreted as content"),s.documents.push(s.result),s.position===s.lineStart&&testDocumentSeparator(s)?46===s.input.charCodeAt(s.position)&&(s.position+=3,skipSeparationSpace(s,!0,-1)):s.position<s.length-1&&throwError(s,"end of the stream or a document separator is expected")}function loadDocuments(s,o){o=o||{},0!==(s=String(s)).length&&(10!==s.charCodeAt(s.length-1)&&13!==s.charCodeAt(s.length-1)&&(s+="\n"),65279===s.charCodeAt(0)&&(s=s.slice(1)));var i=new State$1(s,o),a=s.indexOf("\0");for(-1!==a&&(i.position=a,throwError(i,"null byte is not allowed in input")),i.input+="\0";32===i.input.charCodeAt(i.position);)i.lineIndent+=1,i.position+=1;for(;i.position<i.length-1;)readDocument(i);return i.documents}var Wr={loadAll:function loadAll$1(s,o,i){null!==o&&"object"==typeof o&&void 0===i&&(i=o,o=null);var a=loadDocuments(s,i);if("function"!=typeof o)return a;for(var u=0,_=a.length;u<_;u+=1)o(a[u])},load:function load$1(s,o){var i=loadDocuments(s,o);if(0!==i.length){if(1===i.length)return i[0];throw new tr("expected a single document in the stream, but found more")}}},Jr=Object.prototype.toString,Hr=Object.prototype.hasOwnProperty,Kr=65279,Gr={0:"\\0",7:"\\a",8:"\\b",9:"\\t",10:"\\n",11:"\\v",12:"\\f",13:"\\r",27:"\\e",34:'\\"',92:"\\\\",133:"\\N",160:"\\_",8232:"\\L",8233:"\\P"},Yr=["y","Y","yes","Yes","YES","on","On","ON","n","N","no","No","NO","off","Off","OFF"],Xr=/^[-+]?[0-9_]+(?::[0-9_]+)+(?:\.[0-9_]*)?$/;function encodeHex(s){var o,i,a;if(o=s.toString(16).toUpperCase(),s<=255)i="x",a=2;else if(s<=65535)i="u",a=4;else{if(!(s<=4294967295))throw new tr("code point within a string may not be greater than 0xFFFFFFFF");i="U",a=8}return"\\"+i+er.repeat("0",a-o.length)+o}function State(s){this.schema=s.schema||Mr,this.indent=Math.max(1,s.indent||2),this.noArrayIndent=s.noArrayIndent||!1,this.skipInvalid=s.skipInvalid||!1,this.flowLevel=er.isNothing(s.flowLevel)?-1:s.flowLevel,this.styleMap=function compileStyleMap(s,o){var i,a,u,_,w,x,C;if(null===o)return{};for(i={},u=0,_=(a=Object.keys(o)).length;u<_;u+=1)w=a[u],x=String(o[w]),"!!"===w.slice(0,2)&&(w="tag:yaml.org,2002:"+w.slice(2)),(C=s.compiledTypeMap.fallback[w])&&Hr.call(C.styleAliases,x)&&(x=C.styleAliases[x]),i[w]=x;return i}(this.schema,s.styles||null),this.sortKeys=s.sortKeys||!1,this.lineWidth=s.lineWidth||80,this.noRefs=s.noRefs||!1,this.noCompatMode=s.noCompatMode||!1,this.condenseFlow=s.condenseFlow||!1,this.quotingType='"'===s.quotingType?2:1,this.forceQuotes=s.forceQuotes||!1,this.replacer="function"==typeof s.replacer?s.replacer:null,this.implicitTypes=this.schema.compiledImplicit,this.explicitTypes=this.schema.compiledExplicit,this.tag=null,this.result="",this.duplicates=[],this.usedDuplicates=null}function indentString(s,o){for(var i,a=er.repeat(" ",o),u=0,_=-1,w="",x=s.length;u<x;)-1===(_=s.indexOf("\n",u))?(i=s.slice(u),u=x):(i=s.slice(u,_+1),u=_+1),i.length&&"\n"!==i&&(w+=a),w+=i;return w}function generateNextLine(s,o){return"\n"+er.repeat(" ",s.indent*o)}function isWhitespace(s){return 32===s||9===s}function isPrintable(s){return 32<=s&&s<=126||161<=s&&s<=55295&&8232!==s&&8233!==s||57344<=s&&s<=65533&&s!==Kr||65536<=s&&s<=1114111}function isNsCharOrWhitespace(s){return isPrintable(s)&&s!==Kr&&13!==s&&10!==s}function isPlainSafe(s,o,i){var a=isNsCharOrWhitespace(s),u=a&&!isWhitespace(s);return(i?a:a&&44!==s&&91!==s&&93!==s&&123!==s&&125!==s)&&35!==s&&!(58===o&&!u)||isNsCharOrWhitespace(o)&&!isWhitespace(o)&&35===s||58===o&&u}function codePointAt(s,o){var i,a=s.charCodeAt(o);return a>=55296&&a<=56319&&o+1<s.length&&(i=s.charCodeAt(o+1))>=56320&&i<=57343?1024*(a-55296)+i-56320+65536:a}function needIndentIndicator(s){return/^\n* /.test(s)}function chooseScalarStyle(s,o,i,a,u,_,w,x){var C,j=0,L=null,B=!1,$=!1,U=-1!==a,V=-1,z=function isPlainSafeFirst(s){return isPrintable(s)&&s!==Kr&&!isWhitespace(s)&&45!==s&&63!==s&&58!==s&&44!==s&&91!==s&&93!==s&&123!==s&&125!==s&&35!==s&&38!==s&&42!==s&&33!==s&&124!==s&&61!==s&&62!==s&&39!==s&&34!==s&&37!==s&&64!==s&&96!==s}(codePointAt(s,0))&&function isPlainSafeLast(s){return!isWhitespace(s)&&58!==s}(codePointAt(s,s.length-1));if(o||w)for(C=0;C<s.length;j>=65536?C+=2:C++){if(!isPrintable(j=codePointAt(s,C)))return 5;z=z&&isPlainSafe(j,L,x),L=j}else{for(C=0;C<s.length;j>=65536?C+=2:C++){if(10===(j=codePointAt(s,C)))B=!0,U&&($=$||C-V-1>a&&" "!==s[V+1],V=C);else if(!isPrintable(j))return 5;z=z&&isPlainSafe(j,L,x),L=j}$=$||U&&C-V-1>a&&" "!==s[V+1]}return B||$?i>9&&needIndentIndicator(s)?5:w?2===_?5:2:$?4:3:!z||w||u(s)?2===_?5:2:1}function writeScalar(s,o,i,a,u){s.dump=function(){if(0===o.length)return 2===s.quotingType?'""':"''";if(!s.noCompatMode&&(-1!==Yr.indexOf(o)||Xr.test(o)))return 2===s.quotingType?'"'+o+'"':"'"+o+"'";var _=s.indent*Math.max(1,i),w=-1===s.lineWidth?-1:Math.max(Math.min(s.lineWidth,40),s.lineWidth-_),x=a||s.flowLevel>-1&&i>=s.flowLevel;switch(chooseScalarStyle(o,x,s.indent,w,(function testAmbiguity(o){return function testImplicitResolving(s,o){var i,a;for(i=0,a=s.implicitTypes.length;i<a;i+=1)if(s.implicitTypes[i].resolve(o))return!0;return!1}(s,o)}),s.quotingType,s.forceQuotes&&!a,u)){case 1:return o;case 2:return"'"+o.replace(/'/g,"''")+"'";case 3:return"|"+blockHeader(o,s.indent)+dropEndingNewline(indentString(o,_));case 4:return">"+blockHeader(o,s.indent)+dropEndingNewline(indentString(function foldString(s,o){var i,a,u=/(\n+)([^\n]*)/g,_=(x=s.indexOf("\n"),x=-1!==x?x:s.length,u.lastIndex=x,foldLine(s.slice(0,x),o)),w="\n"===s[0]||" "===s[0];var x;for(;a=u.exec(s);){var C=a[1],j=a[2];i=" "===j[0],_+=C+(w||i||""===j?"":"\n")+foldLine(j,o),w=i}return _}(o,w),_));case 5:return'"'+function escapeString(s){for(var o,i="",a=0,u=0;u<s.length;a>=65536?u+=2:u++)a=codePointAt(s,u),!(o=Gr[a])&&isPrintable(a)?(i+=s[u],a>=65536&&(i+=s[u+1])):i+=o||encodeHex(a);return i}(o)+'"';default:throw new tr("impossible error: invalid scalar style")}}()}function blockHeader(s,o){var i=needIndentIndicator(s)?String(o):"",a="\n"===s[s.length-1];return i+(a&&("\n"===s[s.length-2]||"\n"===s)?"+":a?"":"-")+"\n"}function dropEndingNewline(s){return"\n"===s[s.length-1]?s.slice(0,-1):s}function foldLine(s,o){if(""===s||" "===s[0])return s;for(var i,a,u=/ [^ ]/g,_=0,w=0,x=0,C="";i=u.exec(s);)(x=i.index)-_>o&&(a=w>_?w:x,C+="\n"+s.slice(_,a),_=a+1),w=x;return C+="\n",s.length-_>o&&w>_?C+=s.slice(_,w)+"\n"+s.slice(w+1):C+=s.slice(_),C.slice(1)}function writeBlockSequence(s,o,i,a){var u,_,w,x="",C=s.tag;for(u=0,_=i.length;u<_;u+=1)w=i[u],s.replacer&&(w=s.replacer.call(i,String(u),w)),(writeNode(s,o+1,w,!0,!0,!1,!0)||void 0===w&&writeNode(s,o+1,null,!0,!0,!1,!0))&&(a&&""===x||(x+=generateNextLine(s,o)),s.dump&&10===s.dump.charCodeAt(0)?x+="-":x+="- ",x+=s.dump);s.tag=C,s.dump=x||"[]"}function detectType(s,o,i){var a,u,_,w,x,C;for(_=0,w=(u=i?s.explicitTypes:s.implicitTypes).length;_<w;_+=1)if(((x=u[_]).instanceOf||x.predicate)&&(!x.instanceOf||"object"==typeof o&&o instanceof x.instanceOf)&&(!x.predicate||x.predicate(o))){if(i?x.multi&&x.representName?s.tag=x.representName(o):s.tag=x.tag:s.tag="?",x.represent){if(C=s.styleMap[x.tag]||x.defaultStyle,"[object Function]"===Jr.call(x.represent))a=x.represent(o,C);else{if(!Hr.call(x.represent,C))throw new tr("!<"+x.tag+'> tag resolver accepts not "'+C+'" style');a=x.represent[C](o,C)}s.dump=a}return!0}return!1}function writeNode(s,o,i,a,u,_,w){s.tag=null,s.dump=i,detectType(s,i,!1)||detectType(s,i,!0);var x,C=Jr.call(s.dump),j=a;a&&(a=s.flowLevel<0||s.flowLevel>o);var L,B,$="[object Object]"===C||"[object Array]"===C;if($&&(B=-1!==(L=s.duplicates.indexOf(i))),(null!==s.tag&&"?"!==s.tag||B||2!==s.indent&&o>0)&&(u=!1),B&&s.usedDuplicates[L])s.dump="*ref_"+L;else{if($&&B&&!s.usedDuplicates[L]&&(s.usedDuplicates[L]=!0),"[object Object]"===C)a&&0!==Object.keys(s.dump).length?(!function writeBlockMapping(s,o,i,a){var u,_,w,x,C,j,L="",B=s.tag,$=Object.keys(i);if(!0===s.sortKeys)$.sort();else if("function"==typeof s.sortKeys)$.sort(s.sortKeys);else if(s.sortKeys)throw new tr("sortKeys must be a boolean or a function");for(u=0,_=$.length;u<_;u+=1)j="",a&&""===L||(j+=generateNextLine(s,o)),x=i[w=$[u]],s.replacer&&(x=s.replacer.call(i,w,x)),writeNode(s,o+1,w,!0,!0,!0)&&((C=null!==s.tag&&"?"!==s.tag||s.dump&&s.dump.length>1024)&&(s.dump&&10===s.dump.charCodeAt(0)?j+="?":j+="? "),j+=s.dump,C&&(j+=generateNextLine(s,o)),writeNode(s,o+1,x,!0,C)&&(s.dump&&10===s.dump.charCodeAt(0)?j+=":":j+=": ",L+=j+=s.dump));s.tag=B,s.dump=L||"{}"}(s,o,s.dump,u),B&&(s.dump="&ref_"+L+s.dump)):(!function writeFlowMapping(s,o,i){var a,u,_,w,x,C="",j=s.tag,L=Object.keys(i);for(a=0,u=L.length;a<u;a+=1)x="",""!==C&&(x+=", "),s.condenseFlow&&(x+='"'),w=i[_=L[a]],s.replacer&&(w=s.replacer.call(i,_,w)),writeNode(s,o,_,!1,!1)&&(s.dump.length>1024&&(x+="? "),x+=s.dump+(s.condenseFlow?'"':"")+":"+(s.condenseFlow?"":" "),writeNode(s,o,w,!1,!1)&&(C+=x+=s.dump));s.tag=j,s.dump="{"+C+"}"}(s,o,s.dump),B&&(s.dump="&ref_"+L+" "+s.dump));else if("[object Array]"===C)a&&0!==s.dump.length?(s.noArrayIndent&&!w&&o>0?writeBlockSequence(s,o-1,s.dump,u):writeBlockSequence(s,o,s.dump,u),B&&(s.dump="&ref_"+L+s.dump)):(!function writeFlowSequence(s,o,i){var a,u,_,w="",x=s.tag;for(a=0,u=i.length;a<u;a+=1)_=i[a],s.replacer&&(_=s.replacer.call(i,String(a),_)),(writeNode(s,o,_,!1,!1)||void 0===_&&writeNode(s,o,null,!1,!1))&&(""!==w&&(w+=","+(s.condenseFlow?"":" ")),w+=s.dump);s.tag=x,s.dump="["+w+"]"}(s,o,s.dump),B&&(s.dump="&ref_"+L+" "+s.dump));else{if("[object String]"!==C){if("[object Undefined]"===C)return!1;if(s.skipInvalid)return!1;throw new tr("unacceptable kind of an object to dump "+C)}"?"!==s.tag&&writeScalar(s,s.dump,o,_,j)}null!==s.tag&&"?"!==s.tag&&(x=encodeURI("!"===s.tag[0]?s.tag.slice(1):s.tag).replace(/!/g,"%21"),x="!"===s.tag[0]?"!"+x:"tag:yaml.org,2002:"===x.slice(0,18)?"!!"+x.slice(18):"!<"+x+">",s.dump=x+" "+s.dump)}return!0}function getDuplicateReferences(s,o){var i,a,u=[],_=[];for(inspectNode(s,u,_),i=0,a=_.length;i<a;i+=1)o.duplicates.push(u[_[i]]);o.usedDuplicates=new Array(a)}function inspectNode(s,o,i){var a,u,_;if(null!==s&&"object"==typeof s)if(-1!==(u=o.indexOf(s)))-1===i.indexOf(u)&&i.push(u);else if(o.push(s),Array.isArray(s))for(u=0,_=s.length;u<_;u+=1)inspectNode(s[u],o,i);else for(u=0,_=(a=Object.keys(s)).length;u<_;u+=1)inspectNode(s[a[u]],o,i)}var Qr=function dump$1(s,o){var i=new State(o=o||{});i.noRefs||getDuplicateReferences(s,i);var a=s;return i.replacer&&(a=i.replacer.call({"":a},"",a)),writeNode(i,0,a,!0,!0)?i.dump+"\n":""};function renamed(s,o){return function(){throw new Error("Function yaml."+s+" is removed in js-yaml 4. Use yaml."+o+" instead, which is now safe by default.")}}var Zr=ir,en=ar,tn=pr,rn=br,nn=_r,sn=Mr,on=Wr.load,an=Wr.loadAll,cn={dump:Qr}.dump,ln=tr,un={binary:Or,float:vr,map:ur,null:dr,pairs:Ir,set:Nr,timestamp:wr,bool:fr,int:mr,merge:xr,omap:jr,seq:lr,str:cr},pn=renamed("safeLoad","load"),hn=renamed("safeLoadAll","loadAll"),dn=renamed("safeDump","dump"),fn={Type:Zr,Schema:en,FAILSAFE_SCHEMA:tn,JSON_SCHEMA:rn,CORE_SCHEMA:nn,DEFAULT_SCHEMA:sn,load:on,loadAll:an,dump:cn,YAMLException:ln,types:un,safeLoad:pn,safeLoadAll:hn,safeDump:dn};const mn="configs_update",gn="configs_toggle";function update(s,o){return{type:mn,payload:{[s]:o}}}function toggle(s){return{type:gn,payload:s}}const actions_loaded=()=>()=>{},downloadConfig=s=>o=>{const{fn:{fetch:i}}=o;return i(s)},getConfigByUrl=(s,o)=>i=>{const{specActions:a,configsActions:u}=i;if(s)return u.downloadConfig(s).then(next,next);function next(u){u instanceof Error||u.status>=400?(a.updateLoadingStatus("failedConfig"),a.updateLoadingStatus("failedConfig"),a.updateUrl(""),console.error(u.statusText+" "+s.url),o(null)):o(((s,o)=>{try{return fn.load(s)}catch(s){return o&&o.errActions.newThrownErr(new Error(s)),{}}})(u.text,i))}},get=(s,o)=>s.getIn(Array.isArray(o)?o:[o]),yn={[mn]:(s,o)=>s.merge((0,ze.fromJS)(o.payload)),[gn]:(s,o)=>{const i=o.payload,a=s.get(i);return s.set(i,!a)}};function configsPlugin(){return{statePlugins:{configs:{reducers:yn,actions:u,selectors:_}}}}const setHash=s=>s?history.pushState(null,null,`#${s}`):window.location.hash="";var vn=__webpack_require__(86215),bn=__webpack_require__.n(vn);const _n="layout_scroll_to",Sn="layout_clear_scroll";const En={fn:{getScrollParent:function getScrollParent(s,o){const i=document.documentElement;let a=getComputedStyle(s);const u="absolute"===a.position,_=o?/(auto|scroll|hidden)/:/(auto|scroll)/;if("fixed"===a.position)return i;for(let o=s;o=o.parentElement;)if(a=getComputedStyle(o),(!u||"static"!==a.position)&&_.test(a.overflow+a.overflowY+a.overflowX))return o;return i}},statePlugins:{layout:{actions:{scrollToElement:(s,o)=>i=>{try{o=o||i.fn.getScrollParent(s),bn().createScroller(o).to(s)}catch(s){console.error(s)}},scrollTo:s=>({type:_n,payload:Array.isArray(s)?s:[s]}),clearScrollTo:()=>({type:Sn}),readyToScroll:(s,o)=>i=>{const a=i.layoutSelectors.getScrollToKey();We().is(a,(0,ze.fromJS)(s))&&(i.layoutActions.scrollToElement(o),i.layoutActions.clearScrollTo())},parseDeepLinkHash:s=>({layoutActions:o,layoutSelectors:i,getConfigs:a})=>{if(a().deepLinking&&s){let a=s.slice(1);"!"===a[0]&&(a=a.slice(1)),"/"===a[0]&&(a=a.slice(1));const u=a.split("/").map((s=>s||"")),_=i.isShownKeyFromUrlHashArray(u),[w,x="",C=""]=_;if("operations"===w){const s=i.isShownKeyFromUrlHashArray([x]);x.indexOf("_")>-1&&(console.warn("Warning: escaping deep link whitespace with `_` will be unsupported in v4.0, use `%20` instead."),o.show(s.map((s=>s.replace(/_/g," "))),!0)),o.show(s,!0)}(x.indexOf("_")>-1||C.indexOf("_")>-1)&&(console.warn("Warning: escaping deep link whitespace with `_` will be unsupported in v4.0, use `%20` instead."),o.show(_.map((s=>s.replace(/_/g," "))),!0)),o.show(_,!0),o.scrollTo(_)}}},selectors:{getScrollToKey:s=>s.get("scrollToKey"),isShownKeyFromUrlHashArray(s,o){const[i,a]=o;return a?["operations",i,a]:i?["operations-tag",i]:[]},urlHashArrayFromIsShownKey(s,o){let[i,a,u]=o;return"operations"==i?[a,u]:"operations-tag"==i?[a]:[]}},reducers:{[_n]:(s,o)=>s.set("scrollToKey",We().fromJS(o.payload)),[Sn]:s=>s.delete("scrollToKey")},wrapActions:{show:(s,{getConfigs:o,layoutSelectors:i})=>(...a)=>{if(s(...a),o().deepLinking)try{let[s,o]=a;s=Array.isArray(s)?s:[s];const u=i.urlHashArrayFromIsShownKey(s);if(!u.length)return;const[_,w]=u;if(!o)return setHash("/");2===u.length?setHash(createDeepLinkPath(`/${encodeURIComponent(_)}/${encodeURIComponent(w)}`)):1===u.length&&setHash(createDeepLinkPath(`/${encodeURIComponent(_)}`))}catch(s){console.error(s)}}}}}};var wn=__webpack_require__(2209),xn=__webpack_require__.n(wn);const operation_wrapper=(s,o)=>class OperationWrapper extends Re.Component{onLoad=s=>{const{operation:i}=this.props,{tag:a,operationId:u}=i.toObject();let{isShownKey:_}=i.toObject();_=_||["operations",a,u],o.layoutActions.readyToScroll(_,s)};render(){return Re.createElement("span",{ref:this.onLoad},Re.createElement(s,this.props))}},operation_tag_wrapper=(s,o)=>class OperationTagWrapper extends Re.Component{onLoad=s=>{const{tag:i}=this.props,a=["operations-tag",i];o.layoutActions.readyToScroll(a,s)};render(){return Re.createElement("span",{ref:this.onLoad},Re.createElement(s,this.props))}};function deep_linking(){return[En,{statePlugins:{configs:{wrapActions:{loaded:(s,o)=>(...i)=>{s(...i);const a=decodeURIComponent(window.location.hash);o.layoutActions.parseDeepLinkHash(a)}}}},wrapComponents:{operation:operation_wrapper,OperationTag:operation_tag_wrapper}}]}var kn=__webpack_require__(40860),On=__webpack_require__.n(kn);function transform(s){return s.map((s=>{let o="is not of a type(s)",i=s.get("message").indexOf(o);if(i>-1){let o=s.get("message").slice(i+19).split(",");return s.set("message",s.get("message").slice(0,i)+function makeNewMessage(s){return s.reduce(((s,o,i,a)=>i===a.length-1&&a.length>1?s+"or "+o:a[i+1]&&a.length>2?s+o+", ":a[i+1]?s+o+" ":s+o),"should be a")}(o))}return s}))}var An=__webpack_require__(58156),Cn=__webpack_require__.n(An);function parameter_oneof_transform(s,{jsSpec:o}){return s}const jn=[w,x];function transformErrors(s){let o={jsSpec:{}},i=On()(jn,((s,i)=>{try{return i.transform(s,o).filter((s=>!!s))}catch(o){return console.error("Transformer error:",o),s}}),s);return i.filter((s=>!!s)).map((s=>(!s.get("line")&&s.get("path"),s)))}let Pn={line:0,level:"error",message:"Unknown error"};const In=Ut((s=>s),(s=>s.get("errors",(0,ze.List)()))),Tn=Ut(In,(s=>s.last()));function err(o){return{statePlugins:{err:{reducers:{[rt]:(s,{payload:o})=>{let i=Object.assign(Pn,o,{type:"thrown"});return s.update("errors",(s=>(s||(0,ze.List)()).push((0,ze.fromJS)(i)))).update("errors",(s=>transformErrors(s)))},[nt]:(s,{payload:o})=>(o=o.map((s=>(0,ze.fromJS)(Object.assign(Pn,s,{type:"thrown"})))),s.update("errors",(s=>(s||(0,ze.List)()).concat((0,ze.fromJS)(o)))).update("errors",(s=>transformErrors(s)))),[st]:(s,{payload:o})=>{let i=(0,ze.fromJS)(o);return i=i.set("type","spec"),s.update("errors",(s=>(s||(0,ze.List)()).push((0,ze.fromJS)(i)).sortBy((s=>s.get("line"))))).update("errors",(s=>transformErrors(s)))},[ot]:(s,{payload:o})=>(o=o.map((s=>(0,ze.fromJS)(Object.assign(Pn,s,{type:"spec"})))),s.update("errors",(s=>(s||(0,ze.List)()).concat((0,ze.fromJS)(o)))).update("errors",(s=>transformErrors(s)))),[it]:(s,{payload:o})=>{let i=(0,ze.fromJS)(Object.assign({},o));return i=i.set("type","auth"),s.update("errors",(s=>(s||(0,ze.List)()).push((0,ze.fromJS)(i)))).update("errors",(s=>transformErrors(s)))},[at]:(s,{payload:o})=>{if(!o||!s.get("errors"))return s;let i=s.get("errors").filter((s=>s.keySeq().every((i=>{const a=s.get(i),u=o[i];return!u||a!==u}))));return s.merge({errors:i})},[ct]:(s,{payload:o})=>{if(!o||"function"!=typeof o)return s;let i=s.get("errors").filter((s=>o(s)));return s.merge({errors:i})}},actions:s,selectors:C}}}}function opsFilter(s,o){return s.filter(((s,i)=>-1!==i.indexOf(o)))}function filter(){return{fn:{opsFilter}}}var Nn=__webpack_require__(7666),Mn=__webpack_require__.n(Nn);const arrow_up=({className:s=null,width:o=20,height:i=20,...a})=>Re.createElement("svg",Mn()({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",className:s,width:o,height:i,"aria-hidden":"true",focusable:"false"},a),Re.createElement("path",{d:"M 17.418 14.908 C 17.69 15.176 18.127 15.176 18.397 14.908 C 18.667 14.64 18.668 14.207 18.397 13.939 L 10.489 6.109 C 10.219 5.841 9.782 5.841 9.51 6.109 L 1.602 13.939 C 1.332 14.207 1.332 14.64 1.602 14.908 C 1.873 15.176 2.311 15.176 2.581 14.908 L 10 7.767 L 17.418 14.908 Z"})),arrow_down=({className:s=null,width:o=20,height:i=20,...a})=>Re.createElement("svg",Mn()({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",className:s,width:o,height:i,"aria-hidden":"true",focusable:"false"},a),Re.createElement("path",{d:"M17.418 6.109c.272-.268.709-.268.979 0s.271.701 0 .969l-7.908 7.83c-.27.268-.707.268-.979 0l-7.908-7.83c-.27-.268-.27-.701 0-.969.271-.268.709-.268.979 0L10 13.25l7.418-7.141z"})),arrow=({className:s=null,width:o=20,height:i=20,...a})=>Re.createElement("svg",Mn()({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",className:s,width:o,height:i,"aria-hidden":"true",focusable:"false"},a),Re.createElement("path",{d:"M13.25 10L6.109 2.58c-.268-.27-.268-.707 0-.979.268-.27.701-.27.969 0l7.83 7.908c.268.271.268.709 0 .979l-7.83 7.908c-.268.271-.701.27-.969 0-.268-.269-.268-.707 0-.979L13.25 10z"})),components_close=({className:s=null,width:o=20,height:i=20,...a})=>Re.createElement("svg",Mn()({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",className:s,width:o,height:i,"aria-hidden":"true",focusable:"false"},a),Re.createElement("path",{d:"M14.348 14.849c-.469.469-1.229.469-1.697 0L10 11.819l-2.651 3.029c-.469.469-1.229.469-1.697 0-.469-.469-.469-1.229 0-1.697l2.758-3.15-2.759-3.152c-.469-.469-.469-1.228 0-1.697.469-.469 1.228-.469 1.697 0L10 8.183l2.651-3.031c.469-.469 1.228-.469 1.697 0 .469.469.469 1.229 0 1.697l-2.758 3.152 2.758 3.15c.469.469.469 1.229 0 1.698z"})),copy=({className:s=null,width:o=15,height:i=16,...a})=>Re.createElement("svg",Mn()({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 15 16",className:s,width:o,height:i,"aria-hidden":"true",focusable:"false"},a),Re.createElement("g",{transform:"translate(2, -1)"},Re.createElement("path",{fill:"#ffffff",fillRule:"evenodd",d:"M2 13h4v1H2v-1zm5-6H2v1h5V7zm2 3V8l-3 3 3 3v-2h5v-2H9zM4.5 9H2v1h2.5V9zM2 12h2.5v-1H2v1zm9 1h1v2c-.02.28-.11.52-.3.7-.19.18-.42.28-.7.3H1c-.55 0-1-.45-1-1V4c0-.55.45-1 1-1h3c0-1.11.89-2 2-2 1.11 0 2 .89 2 2h3c.55 0 1 .45 1 1v5h-1V6H1v9h10v-2zM2 5h8c0-.55-.45-1-1-1H8c-.55 0-1-.45-1-1s-.45-1-1-1-1 .45-1 1-.45 1-1 1H3c-.55 0-1 .45-1 1z"}))),lock=({className:s=null,width:o=20,height:i=20,...a})=>Re.createElement("svg",Mn()({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",className:s,width:o,height:i,"aria-hidden":"true",focusable:"false"},a),Re.createElement("path",{d:"M15.8 8H14V5.6C14 2.703 12.665 1 10 1 7.334 1 6 2.703 6 5.6V8H4c-.553 0-1 .646-1 1.199V17c0 .549.428 1.139.951 1.307l1.197.387C5.672 18.861 6.55 19 7.1 19h5.8c.549 0 1.428-.139 1.951-.307l1.196-.387c.524-.167.953-.757.953-1.306V9.199C17 8.646 16.352 8 15.8 8zM12 8H8V5.199C8 3.754 8.797 3 10 3c1.203 0 2 .754 2 2.199V8z"})),unlock=({className:s=null,width:o=20,height:i=20,...a})=>Re.createElement("svg",Mn()({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",className:s,width:o,height:i,"aria-hidden":"true",focusable:"false"},a),Re.createElement("path",{d:"M15.8 8H14V5.6C14 2.703 12.665 1 10 1 7.334 1 6 2.703 6 5.6V6h2v-.801C8 3.754 8.797 3 10 3c1.203 0 2 .754 2 2.199V8H4c-.553 0-1 .646-1 1.199V17c0 .549.428 1.139.951 1.307l1.197.387C5.672 18.861 6.55 19 7.1 19h5.8c.549 0 1.428-.139 1.951-.307l1.196-.387c.524-.167.953-.757.953-1.306V9.199C17 8.646 16.352 8 15.8 8z"})),icons=()=>({components:{ArrowUpIcon:arrow_up,ArrowDownIcon:arrow_down,ArrowIcon:arrow,CloseIcon:components_close,CopyIcon:copy,LockIcon:lock,UnlockIcon:unlock}}),Rn="layout_update_layout",Dn="layout_update_filter",Ln="layout_update_mode",Fn="layout_show";function updateLayout(s){return{type:Rn,payload:s}}function updateFilter(s){return{type:Dn,payload:s}}function actions_show(s,o=!0){return s=normalizeArray(s),{type:Fn,payload:{thing:s,shown:o}}}function changeMode(s,o=""){return s=normalizeArray(s),{type:Ln,payload:{thing:s,mode:o}}}const Bn={[Rn]:(s,o)=>s.set("layout",o.payload),[Dn]:(s,o)=>s.set("filter",o.payload),[Fn]:(s,o)=>{const i=o.payload.shown,a=(0,ze.fromJS)(o.payload.thing);return s.update("shown",(0,ze.fromJS)({}),(s=>s.set(a,i)))},[Ln]:(s,o)=>{let i=o.payload.thing,a=o.payload.mode;return s.setIn(["modes"].concat(i),(a||"")+"")}},current=s=>s.get("layout"),currentFilter=s=>s.get("filter"),isShown=(s,o,i)=>(o=normalizeArray(o),s.get("shown",(0,ze.fromJS)({})).get((0,ze.fromJS)(o),i)),whatMode=(s,o,i="")=>(o=normalizeArray(o),s.getIn(["modes",...o],i)),$n=Ut((s=>s),(s=>!isShown(s,"editor"))),taggedOperations=(s,o)=>(i,...a)=>{let u=s(i,...a);const{fn:_,layoutSelectors:w,getConfigs:x}=o.getSystem(),C=x(),{maxDisplayedTags:j}=C;let L=w.currentFilter();return L&&!0!==L&&(u=_.opsFilter(u,L)),j>=0&&(u=u.slice(0,j)),u};function plugins_layout(){return{statePlugins:{layout:{reducers:Bn,actions:j,selectors:L},spec:{wrapSelectors:B}}}}function logs({configs:s}){const o={debug:0,info:1,log:2,warn:3,error:4},getLevel=s=>o[s]||-1;let{logLevel:i}=s,a=getLevel(i);function log(s,...o){getLevel(s)>=a&&console[s](...o)}return log.warn=log.bind(null,"warn"),log.error=log.bind(null,"error"),log.info=log.bind(null,"info"),log.debug=log.bind(null,"debug"),{rootInjects:{log}}}let qn=!1;function on_complete(){return{statePlugins:{spec:{wrapActions:{updateSpec:s=>(...o)=>(qn=!0,s(...o)),updateJsonSpec:(s,o)=>(...i)=>{const a=o.getConfigs().onComplete;return qn&&"function"==typeof a&&(setTimeout(a,0),qn=!1),s(...i)}}}}}}const extractKey=s=>{const o="_**[]";return s.indexOf(o)<0?s:s.split(o)[0].trim()},escapeShell=s=>"-d "===s||/^[_\/-]/g.test(s)?s:"'"+s.replace(/'/g,"'\\''")+"'",escapeCMD=s=>"-d "===(s=s.replace(/\^/g,"^^").replace(/\\"/g,'\\\\"').replace(/"/g,'""').replace(/\n/g,"^\n"))?s.replace(/-d /g,"-d ^\n"):/^[_\/-]/g.test(s)?s:'"'+s+'"',escapePowershell=s=>{if("-d "===s)return s;if(/\n/.test(s)){return`@"\n${s.replace(/`/g,"``").replace(/\$/g,"`$")}\n"@`}if(!/^[_\/-]/.test(s)){return`'${s.replace(/'/g,"''")}'`}return s};const curlify=(s,o,i,a="")=>{let u=!1,_="";const addWords=(...s)=>_+=" "+s.map(o).join(" "),addWordsWithoutLeadingSpace=(...s)=>_+=s.map(o).join(" "),addNewLine=()=>_+=` ${i}`,addIndent=(s=1)=>_+="  ".repeat(s);let w=s.get("headers");_+="curl"+a;const x=s.get("curlOptions");if(ze.List.isList(x)&&!x.isEmpty()&&addWords(...s.get("curlOptions")),addWords("-X",s.get("method")),addNewLine(),addIndent(),addWordsWithoutLeadingSpace(`${s.get("url")}`),w&&w.size)for(let o of s.get("headers").entries()){addNewLine(),addIndent();let[s,i]=o;addWordsWithoutLeadingSpace("-H",`${s}: ${i}`),u=u||/^content-type$/i.test(s)&&/^multipart\/form-data$/i.test(i)}const C=s.get("body");if(C)if(u&&["POST","PUT","PATCH"].includes(s.get("method")))for(let[s,o]of C.entrySeq()){let i=extractKey(s);addNewLine(),addIndent(),addWordsWithoutLeadingSpace("-F"),o instanceof lt.File&&"string"==typeof o.valueOf()?addWords(`${i}=${o.data}${o.type?`;type=${o.type}`:""}`):o instanceof lt.File?addWords(`${i}=@${o.name}${o.type?`;type=${o.type}`:""}`):addWords(`${i}=${o}`)}else if(C instanceof lt.File)addNewLine(),addIndent(),addWordsWithoutLeadingSpace(`--data-binary '@${C.name}'`);else{addNewLine(),addIndent(),addWordsWithoutLeadingSpace("-d ");let o=C;ze.Map.isMap(o)?addWordsWithoutLeadingSpace(function getStringBodyOfMap(s){let o=[];for(let[i,a]of s.get("body").entrySeq()){let s=extractKey(i);a instanceof lt.File?o.push(`  "${s}": {\n    "name": "${a.name}"${a.type?`,\n    "type": "${a.type}"`:""}\n  }`):o.push(`  "${s}": ${JSON.stringify(a,null,2).replace(/(\r\n|\r|\n)/g,"\n  ")}`)}return`{\n${o.join(",\n")}\n}`}(s)):("string"!=typeof o&&(o=JSON.stringify(o)),addWordsWithoutLeadingSpace(o))}else C||"POST"!==s.get("method")||(addNewLine(),addIndent(),addWordsWithoutLeadingSpace("-d ''"));return _},requestSnippetGenerator_curl_powershell=s=>curlify(s,escapePowershell,"`\n",".exe"),requestSnippetGenerator_curl_bash=s=>curlify(s,escapeShell,"\\\n"),requestSnippetGenerator_curl_cmd=s=>curlify(s,escapeCMD,"^\n"),request_snippets_selectors_state=s=>s||(0,ze.Map)(),Un=Ut(request_snippets_selectors_state,(s=>{const o=s.get("languages"),i=s.get("generators",(0,ze.Map)());return!o||o.isEmpty()?i:i.filter(((s,i)=>o.includes(i)))})),getSnippetGenerators=s=>({fn:o})=>Un(s).map(((s,i)=>{const a=(s=>o[`requestSnippetGenerator_${s}`])(i);return"function"!=typeof a?null:s.set("fn",a)})).filter((s=>s)),Vn=Ut(request_snippets_selectors_state,(s=>s.get("activeLanguage"))),zn=Ut(request_snippets_selectors_state,(s=>s.get("defaultExpanded")));var Wn=__webpack_require__(46942),Jn=__webpack_require__.n(Wn),Hn=__webpack_require__(59399);const Kn={cursor:"pointer",lineHeight:1,display:"inline-flex",backgroundColor:"rgb(250, 250, 250)",paddingBottom:"0",paddingTop:"0",border:"1px solid rgb(51, 51, 51)",borderRadius:"4px 4px 0 0",boxShadow:"none",borderBottom:"none"},Gn={cursor:"pointer",lineHeight:1,display:"inline-flex",backgroundColor:"rgb(51, 51, 51)",boxShadow:"none",border:"1px solid rgb(51, 51, 51)",paddingBottom:"0",paddingTop:"0",borderRadius:"4px 4px 0 0",marginTop:"-5px",marginRight:"-5px",marginLeft:"-5px",zIndex:"9999",borderBottom:"none"},request_snippets=({request:s,requestSnippetsSelectors:o,getComponent:i})=>{const a=(0,Re.useRef)(null),u=i("ArrowUpIcon"),_=i("ArrowDownIcon"),w=i("SyntaxHighlighter",!0),[x,C]=(0,Re.useState)(o.getSnippetGenerators()?.keySeq().first()),[j,L]=(0,Re.useState)(o?.getDefaultExpanded()),B=o.getSnippetGenerators(),$=B.get(x),U=$.get("fn")(s),handleSetIsExpanded=()=>{L(!j)},handleGetBtnStyle=s=>s===x?Gn:Kn,handlePreventYScrollingBeyondElement=s=>{const{target:o,deltaY:i}=s,{scrollHeight:a,offsetHeight:u,scrollTop:_}=o;a>u&&(0===_&&i<0||u+_>=a&&i>0)&&s.preventDefault()};return(0,Re.useEffect)((()=>{}),[]),(0,Re.useEffect)((()=>{const s=Array.from(a.current.childNodes).filter((s=>!!s.nodeType&&s.classList?.contains("curl-command")));return s.forEach((s=>s.addEventListener("mousewheel",handlePreventYScrollingBeyondElement,{passive:!1}))),()=>{s.forEach((s=>s.removeEventListener("mousewheel",handlePreventYScrollingBeyondElement)))}}),[s]),Re.createElement("div",{className:"request-snippets",ref:a},Re.createElement("div",{style:{width:"100%",display:"flex",justifyContent:"flex-start",alignItems:"center",marginBottom:"15px"}},Re.createElement("h4",{onClick:()=>handleSetIsExpanded(),style:{cursor:"pointer"}},"Snippets"),Re.createElement("button",{onClick:()=>handleSetIsExpanded(),style:{border:"none",background:"none"},title:j?"Collapse operation":"Expand operation"},j?Re.createElement(_,{className:"arrow",width:"10",height:"10"}):Re.createElement(u,{className:"arrow",width:"10",height:"10"}))),j&&Re.createElement("div",{className:"curl-command"},Re.createElement("div",{style:{paddingLeft:"15px",paddingRight:"10px",width:"100%",display:"flex"}},B.entrySeq().map((([s,o])=>Re.createElement("div",{className:Jn()("btn",{active:s===x}),style:handleGetBtnStyle(s),key:s,onClick:()=>(s=>{x!==s&&C(s)})(s)},Re.createElement("h4",{style:s===x?{color:"white"}:{}},o.get("title")))))),Re.createElement("div",{className:"copy-to-clipboard"},Re.createElement(Hn.CopyToClipboard,{text:U},Re.createElement("button",null))),Re.createElement("div",null,Re.createElement(w,{language:$.get("syntax"),className:"curl microlight",renderPlainText:({children:s,PlainTextViewer:o})=>Re.createElement(o,{className:"curl"},s)},U))))},plugins_request_snippets=()=>({components:{RequestSnippets:request_snippets},fn:{requestSnippetGenerator_curl_bash,requestSnippetGenerator_curl_cmd,requestSnippetGenerator_curl_powershell},statePlugins:{requestSnippets:{selectors:$}}});class ModelCollapse extends Re.Component{static defaultProps={collapsedContent:"{...}",expanded:!1,title:null,onToggle:()=>{},hideSelfOnExpand:!1,specPath:We().List([])};constructor(s,o){super(s,o);let{expanded:i,collapsedContent:a}=this.props;this.state={expanded:i,collapsedContent:a||ModelCollapse.defaultProps.collapsedContent}}componentDidMount(){const{hideSelfOnExpand:s,expanded:o,modelName:i}=this.props;s&&o&&this.props.onToggle(i,o)}UNSAFE_componentWillReceiveProps(s){this.props.expanded!==s.expanded&&this.setState({expanded:s.expanded})}toggleCollapsed=()=>{this.props.onToggle&&this.props.onToggle(this.props.modelName,!this.state.expanded),this.setState({expanded:!this.state.expanded})};onLoad=s=>{if(s&&this.props.layoutSelectors){const o=this.props.layoutSelectors.getScrollToKey();We().is(o,this.props.specPath)&&this.toggleCollapsed(),this.props.layoutActions.readyToScroll(this.props.specPath,s.parentElement)}};render(){const{title:s,classes:o}=this.props;return this.state.expanded&&this.props.hideSelfOnExpand?Re.createElement("span",{className:o||""},this.props.children):Re.createElement("span",{className:o||"",ref:this.onLoad},Re.createElement("button",{"aria-expanded":this.state.expanded,className:"model-box-control",onClick:this.toggleCollapsed},s&&Re.createElement("span",{className:"pointer"},s),Re.createElement("span",{className:"model-toggle"+(this.state.expanded?"":" collapsed")}),!this.state.expanded&&Re.createElement("span",null,this.state.collapsedContent)),this.state.expanded&&this.props.children)}}const useTabs=({initialTab:s,isExecute:o,schema:i,example:a})=>{const u=(0,Re.useMemo)((()=>({example:"example",model:"model"})),[]),_=(0,Re.useMemo)((()=>Object.keys(u)),[u]).includes(s)&&i&&!o?s:u.example,w=(s=>{const o=(0,Re.useRef)();return(0,Re.useEffect)((()=>{o.current=s})),o.current})(o),[x,C]=(0,Re.useState)(_),j=(0,Re.useCallback)((s=>{C(s.target.dataset.name)}),[]);return(0,Re.useEffect)((()=>{w&&!o&&a&&C(u.example)}),[w,o,a]),{activeTab:x,onTabChange:j,tabs:u}},model_example=({schema:s,example:o,isExecute:i=!1,specPath:a,includeWriteOnly:u=!1,includeReadOnly:_=!1,getComponent:w,getConfigs:x,specSelectors:C})=>{const{defaultModelRendering:j,defaultModelExpandDepth:L}=x(),B=w("ModelWrapper"),$=w("HighlightCode",!0),U=xt()(5).toString("base64"),V=xt()(5).toString("base64"),z=xt()(5).toString("base64"),Y=xt()(5).toString("base64"),Z=C.isOAS3(),{activeTab:ee,tabs:ie,onTabChange:ae}=useTabs({initialTab:j,isExecute:i,schema:s,example:o});return Re.createElement("div",{className:"model-example"},Re.createElement("ul",{className:"tab",role:"tablist"},Re.createElement("li",{className:Jn()("tabitem",{active:ee===ie.example}),role:"presentation"},Re.createElement("button",{"aria-controls":V,"aria-selected":ee===ie.example,className:"tablinks","data-name":"example",id:U,onClick:ae,role:"tab"},i?"Edit Value":"Example Value")),s&&Re.createElement("li",{className:Jn()("tabitem",{active:ee===ie.model}),role:"presentation"},Re.createElement("button",{"aria-controls":Y,"aria-selected":ee===ie.model,className:Jn()("tablinks",{inactive:i}),"data-name":"model",id:z,onClick:ae,role:"tab"},Z?"Schema":"Model"))),ee===ie.example&&Re.createElement("div",{"aria-hidden":ee!==ie.example,"aria-labelledby":U,"data-name":"examplePanel",id:V,role:"tabpanel",tabIndex:"0"},o||Re.createElement($,null,"(no example available")),ee===ie.model&&Re.createElement("div",{className:"model-container","aria-hidden":ee===ie.example,"aria-labelledby":z,"data-name":"modelPanel",id:Y,role:"tabpanel",tabIndex:"0"},Re.createElement(B,{schema:s,getComponent:w,getConfigs:x,specSelectors:C,expandDepth:L,specPath:a,includeReadOnly:_,includeWriteOnly:u})))};class ModelWrapper extends Re.Component{onToggle=(s,o)=>{this.props.layoutActions&&this.props.layoutActions.show(this.props.fullPath,o)};render(){let{getComponent:s,getConfigs:o}=this.props;const i=s("Model");let a;return this.props.layoutSelectors&&(a=this.props.layoutSelectors.isShown(this.props.fullPath)),Re.createElement("div",{className:"model-box"},Re.createElement(i,Mn()({},this.props,{getConfigs:o,expanded:a,depth:1,onToggle:this.onToggle,expandDepth:this.props.expandDepth||0})))}}function _typeof(s){return _typeof="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(s){return typeof s}:function(s){return s&&"function"==typeof Symbol&&s.constructor===Symbol&&s!==Symbol.prototype?"symbol":typeof s},_typeof(s)}function _defineProperties(s,o){for(var i=0;i<o.length;i++){var a=o[i];a.enumerable=a.enumerable||!1,a.configurable=!0,"value"in a&&(a.writable=!0),Object.defineProperty(s,a.key,a)}}function _defineProperty(s,o,i){return o in s?Object.defineProperty(s,o,{value:i,enumerable:!0,configurable:!0,writable:!0}):s[o]=i,s}function ownKeys(s,o){var i=Object.keys(s);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(s);o&&(a=a.filter((function(o){return Object.getOwnPropertyDescriptor(s,o).enumerable}))),i.push.apply(i,a)}return i}function _getPrototypeOf(s){return _getPrototypeOf=Object.setPrototypeOf?Object.getPrototypeOf:function _getPrototypeOf(s){return s.__proto__||Object.getPrototypeOf(s)},_getPrototypeOf(s)}function _setPrototypeOf(s,o){return _setPrototypeOf=Object.setPrototypeOf||function _setPrototypeOf(s,o){return s.__proto__=o,s},_setPrototypeOf(s,o)}function _possibleConstructorReturn(s,o){return!o||"object"!=typeof o&&"function"!=typeof o?function _assertThisInitialized(s){if(void 0===s)throw new ReferenceError("this hasn't been initialised - super() hasn't been called");return s}(s):o}var Yn={};function react_immutable_pure_component_es_get(s,o,i){return function isInvalid(s){return null==s}(s)?i:function isMapLike(s){return null!==s&&"object"===_typeof(s)&&"function"==typeof s.get&&"function"==typeof s.has}(s)?s.has(o)?s.get(o):i:hasOwnProperty.call(s,o)?s[o]:i}function getIn(s,o,i){for(var a=0;a!==o.length;)if((s=react_immutable_pure_component_es_get(s,o[a++],Yn))===Yn)return i;return s}function check(s){var o=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{},i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:{},a=function createChecker(s,o){return function(i){if("string"==typeof i)return(0,ze.is)(o[i],s[i]);if(Array.isArray(i))return(0,ze.is)(getIn(o,i),getIn(s,i));throw new TypeError("Invalid key: expected Array or string: "+i)}}(o,i),u=s||Object.keys(function _objectSpread2(s){for(var o=1;o<arguments.length;o++){var i=null!=arguments[o]?arguments[o]:{};o%2?ownKeys(i,!0).forEach((function(o){_defineProperty(s,o,i[o])})):Object.getOwnPropertyDescriptors?Object.defineProperties(s,Object.getOwnPropertyDescriptors(i)):ownKeys(i).forEach((function(o){Object.defineProperty(s,o,Object.getOwnPropertyDescriptor(i,o))}))}return s}({},i,{},o));return u.every(a)}const Xn=function(s){function ImmutablePureComponent(){return function _classCallCheck(s,o){if(!(s instanceof o))throw new TypeError("Cannot call a class as a function")}(this,ImmutablePureComponent),_possibleConstructorReturn(this,_getPrototypeOf(ImmutablePureComponent).apply(this,arguments))}return function _inherits(s,o){if("function"!=typeof o&&null!==o)throw new TypeError("Super expression must either be null or a function");s.prototype=Object.create(o&&o.prototype,{constructor:{value:s,writable:!0,configurable:!0}}),o&&_setPrototypeOf(s,o)}(ImmutablePureComponent,s),function _createClass(s,o,i){return o&&_defineProperties(s.prototype,o),i&&_defineProperties(s,i),s}(ImmutablePureComponent,[{key:"shouldComponentUpdate",value:function shouldComponentUpdate(s){var o=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{};return!check(this.updateOnProps,this.props,s,"updateOnProps")||!check(this.updateOnStates,this.state,o,"updateOnStates")}}]),ImmutablePureComponent}(Re.Component);var Qn,Zn=__webpack_require__(5556),es=__webpack_require__.n(Zn);function _extends(){return _extends=Object.assign?Object.assign.bind():function(s){for(var o=1;o<arguments.length;o++){var i=arguments[o];for(var a in i)({}).hasOwnProperty.call(i,a)&&(s[a]=i[a])}return s},_extends.apply(null,arguments)}const rolling_load=s=>Re.createElement("svg",_extends({xmlns:"http://www.w3.org/2000/svg",width:200,height:200,className:"rolling-load_svg__lds-rolling",preserveAspectRatio:"xMidYMid",style:{backgroundImage:"none",backgroundPosition:"initial initial",backgroundRepeat:"initial initial"},viewBox:"0 0 100 100"},s),Qn||(Qn=Re.createElement("circle",{cx:50,cy:50,r:35,fill:"none",stroke:"#555",strokeDasharray:"164.93361431346415 56.97787143782138",strokeWidth:10},Re.createElement("animateTransform",{attributeName:"transform",begin:"0s",calcMode:"linear",dur:"1s",keyTimes:"0;1",repeatCount:"indefinite",type:"rotate",values:"0 50 50;360 50 50"})))),decodeRefName=s=>{const o=s.replace(/~1/g,"/").replace(/~0/g,"~");try{return decodeURIComponent(o)}catch{return o}};class Model extends Xn{static propTypes={schema:xn().map.isRequired,getComponent:es().func.isRequired,getConfigs:es().func.isRequired,specSelectors:es().object.isRequired,name:es().string,displayName:es().string,isRef:es().bool,required:es().bool,expandDepth:es().number,depth:es().number,specPath:xn().list.isRequired,includeReadOnly:es().bool,includeWriteOnly:es().bool};getModelName=s=>-1!==s.indexOf("#/definitions/")?decodeRefName(s.replace(/^.*#\/definitions\//,"")):-1!==s.indexOf("#/components/schemas/")?decodeRefName(s.replace(/^.*#\/components\/schemas\//,"")):void 0;getRefSchema=s=>{let{specSelectors:o}=this.props;return o.findDefinition(s)};render(){let{getComponent:s,getConfigs:o,specSelectors:i,schema:a,required:u,name:_,isRef:w,specPath:x,displayName:C,includeReadOnly:j,includeWriteOnly:L}=this.props;const B=s("ObjectModel"),$=s("ArrayModel"),U=s("PrimitiveModel");let V="object",z=a&&a.get("$$ref"),Y=a&&a.get("$ref");if(!_&&z&&(_=this.getModelName(z)),Y){const s=this.getModelName(Y),o=this.getRefSchema(s);ze.Map.isMap(o)?(a=o.mergeDeep(a),z||(a=a.set("$$ref",Y),z=Y)):ze.Map.isMap(a)&&1===a.size&&(a=null,_=Y)}if(!a)return Re.createElement("span",{className:"model model-title"},Re.createElement("span",{className:"model-title__text"},C||_),!Y&&Re.createElement(rolling_load,{height:"20px",width:"20px"}));const Z=i.isOAS3()&&a.get("deprecated");switch(w=void 0!==w?w:!!z,V=a&&a.get("type")||V,V){case"object":return Re.createElement(B,Mn()({className:"object"},this.props,{specPath:x,getConfigs:o,schema:a,name:_,deprecated:Z,isRef:w,includeReadOnly:j,includeWriteOnly:L}));case"array":return Re.createElement($,Mn()({className:"array"},this.props,{getConfigs:o,schema:a,name:_,deprecated:Z,required:u,includeReadOnly:j,includeWriteOnly:L}));default:return Re.createElement(U,Mn()({},this.props,{getComponent:s,getConfigs:o,schema:a,name:_,deprecated:Z,required:u}))}}}class Models extends Re.Component{getSchemaBasePath=()=>this.props.specSelectors.isOAS3()?["components","schemas"]:["definitions"];getCollapsedContent=()=>" ";handleToggle=(s,o)=>{const{layoutActions:i}=this.props;i.show([...this.getSchemaBasePath(),s],o),o&&this.props.specActions.requestResolvedSubtree([...this.getSchemaBasePath(),s])};onLoadModels=s=>{s&&this.props.layoutActions.readyToScroll(this.getSchemaBasePath(),s)};onLoadModel=s=>{if(s){const o=s.getAttribute("data-name");this.props.layoutActions.readyToScroll([...this.getSchemaBasePath(),o],s)}};render(){let{specSelectors:s,getComponent:o,layoutSelectors:i,layoutActions:a,getConfigs:u}=this.props,_=s.definitions(),{docExpansion:w,defaultModelsExpandDepth:x}=u();if(!_.size||x<0)return null;const C=this.getSchemaBasePath();let j=i.isShown(C,x>0&&"none"!==w);const L=s.isOAS3(),B=o("ModelWrapper"),$=o("Collapse"),U=o("ModelCollapse"),V=o("JumpToPath",!0),z=o("ArrowUpIcon"),Y=o("ArrowDownIcon");return Re.createElement("section",{className:j?"models is-open":"models",ref:this.onLoadModels},Re.createElement("h4",null,Re.createElement("button",{"aria-expanded":j,className:"models-control",onClick:()=>a.show(C,!j)},Re.createElement("span",null,L?"Schemas":"Models"),j?Re.createElement(z,null):Re.createElement(Y,null))),Re.createElement($,{isOpened:j},_.entrySeq().map((([_])=>{const w=[...C,_],j=We().List(w),L=s.specResolvedSubtree(w),$=s.specJson().getIn(w),z=ze.Map.isMap(L)?L:We().Map(),Y=ze.Map.isMap($)?$:We().Map(),Z=z.get("title")||Y.get("title")||_,ee=i.isShown(w,!1);ee&&0===z.size&&Y.size>0&&this.props.specActions.requestResolvedSubtree(w);const ie=Re.createElement(B,{name:_,expandDepth:x,schema:z||We().Map(),displayName:Z,fullPath:w,specPath:j,getComponent:o,specSelectors:s,getConfigs:u,layoutSelectors:i,layoutActions:a,includeReadOnly:!0,includeWriteOnly:!0}),ae=Re.createElement("span",{className:"model-box"},Re.createElement("span",{className:"model model-title"},Z));return Re.createElement("div",{id:`model-${_}`,className:"model-container",key:`models-section-${_}`,"data-name":_,ref:this.onLoadModel},Re.createElement("span",{className:"models-jump-to-path"},Re.createElement(V,{path:j})),Re.createElement(U,{classes:"model-box",collapsedContent:this.getCollapsedContent(_),onToggle:this.handleToggle,title:ae,displayName:Z,modelName:_,specPath:j,layoutSelectors:i,layoutActions:a,hideSelfOnExpand:!0,expanded:x>0&&ee},ie))})).toArray()))}}const enum_model=({value:s,getComponent:o})=>{let i=o("ModelCollapse"),a=Re.createElement("span",null,"Array [ ",s.count()," ]");return Re.createElement("span",{className:"prop-enum"},"Enum:",Re.createElement("br",null),Re.createElement(i,{collapsedContent:a},"[ ",s.map(String).join(", ")," ]"))};function isAbsoluteUrl(s){return s.match(/^(?:[a-z]+:)?\/\//i)}function buildBaseUrl(s,o){return s?isAbsoluteUrl(s)?function addProtocol(s){return s.match(/^\/\//i)?`${window.location.protocol}${s}`:s}(s):new URL(s,o).href:o}function safeBuildUrl(s,o,{selectedServer:i=""}={}){try{return function buildUrl(s,o,{selectedServer:i=""}={}){if(!s)return;if(isAbsoluteUrl(s))return s;const a=buildBaseUrl(i,o);return isAbsoluteUrl(a)?new URL(s,a).href:new URL(s,window.location.href).href}(s,o,{selectedServer:i})}catch{return}}function sanitizeUrl(s){if("string"!=typeof s||""===s.trim())return"";const o=s.trim(),i="about:blank";try{const s=`https://base${String(Math.random()).slice(2)}`,a=new URL(o,s),u=a.protocol.slice(0,-1);if(["javascript","data","vbscript"].includes(u.toLowerCase()))return i;if(a.origin===s){if(o.startsWith("/"))return`${a.pathname}${a.search}${a.hash}`;if(o.startsWith("./")||o.startsWith("../")){const s=o.match(/^(\.\.?\/)+/)[0];return`${s}${a.pathname.substring(1)}${a.search}${a.hash}`}return`${a.pathname.substring(1)}${a.search}${a.hash}`}return String(a)}catch{return i}}class ObjectModel extends Re.Component{render(){let{schema:s,name:o,displayName:i,isRef:a,getComponent:u,getConfigs:_,depth:w,onToggle:x,expanded:C,specPath:j,...L}=this.props,{specSelectors:B,expandDepth:$,includeReadOnly:U,includeWriteOnly:V}=L;const{isOAS3:z}=B,Y=w>2||2===w&&"items"!==j.last();if(!s)return null;const{showExtensions:Z}=_(),ee=Z?getExtensions(s):(0,ze.List)();let ie=s.get("description"),ae=s.get("properties"),ce=s.get("additionalProperties"),le=s.get("title")||i||o,pe=s.get("required"),de=s.filter(((s,o)=>-1!==["maxProperties","minProperties","nullable","example"].indexOf(o))),fe=s.get("deprecated"),ye=s.getIn(["externalDocs","url"]),be=s.getIn(["externalDocs","description"]);const _e=u("JumpToPath",!0),Se=u("Markdown",!0),we=u("Model"),xe=u("ModelCollapse"),Pe=u("Property"),Te=u("Link"),$e=u("ModelExtensions"),JumpToPathSection=()=>Re.createElement("span",{className:"model-jump-to-path"},Re.createElement(_e,{path:j})),qe=Re.createElement("span",null,Re.createElement("span",null,"{"),"...",Re.createElement("span",null,"}"),a?Re.createElement(JumpToPathSection,null):""),We=B.isOAS3()?s.get("allOf"):null,He=B.isOAS3()?s.get("anyOf"):null,Ye=B.isOAS3()?s.get("oneOf"):null,Xe=B.isOAS3()?s.get("not"):null,Qe=le&&Re.createElement("span",{className:"model-title"},a&&s.get("$$ref")&&Re.createElement("span",{className:Jn()("model-hint",{"model-hint--embedded":Y})},s.get("$$ref")),Re.createElement("span",{className:"model-title__text"},le));return Re.createElement("span",{className:"model"},Re.createElement(xe,{modelName:o,title:Qe,onToggle:x,expanded:!!C||w<=$,collapsedContent:qe},Re.createElement("span",{className:"brace-open object"},"{"),a?Re.createElement(JumpToPathSection,null):null,Re.createElement("span",{className:"inner-object"},Re.createElement("table",{className:"model"},Re.createElement("tbody",null,ie?Re.createElement("tr",{className:"description"},Re.createElement("td",null,"description:"),Re.createElement("td",null,Re.createElement(Se,{source:ie}))):null,ye&&Re.createElement("tr",{className:"external-docs"},Re.createElement("td",null,"externalDocs:"),Re.createElement("td",null,Re.createElement(Te,{target:"_blank",href:sanitizeUrl(ye)},be||ye))),fe?Re.createElement("tr",{className:"property"},Re.createElement("td",null,"deprecated:"),Re.createElement("td",null,"true")):null,ae&&ae.size?ae.entrySeq().filter((([,s])=>(!s.get("readOnly")||U)&&(!s.get("writeOnly")||V))).map((([s,i])=>{let a=z()&&i.get("deprecated"),x=ze.List.isList(pe)&&pe.contains(s),C=["property-row"];return a&&C.push("deprecated"),x&&C.push("required"),Re.createElement("tr",{key:s,className:C.join(" ")},Re.createElement("td",null,s,x&&Re.createElement("span",{className:"star"},"*")),Re.createElement("td",null,Re.createElement(we,Mn()({key:`object-${o}-${s}_${i}`},L,{required:x,getComponent:u,specPath:j.push("properties",s),getConfigs:_,schema:i,depth:w+1}))))})).toArray():null,0===ee.size?null:Re.createElement(Re.Fragment,null,Re.createElement("tr",null,Re.createElement("td",null," ")),Re.createElement($e,{extensions:ee,propClass:"extension"})),ce&&ce.size?Re.createElement("tr",null,Re.createElement("td",null,"< * >:"),Re.createElement("td",null,Re.createElement(we,Mn()({},L,{required:!1,getComponent:u,specPath:j.push("additionalProperties"),getConfigs:_,schema:ce,depth:w+1})))):null,We?Re.createElement("tr",null,Re.createElement("td",null,"allOf ->"),Re.createElement("td",null,We.map(((s,o)=>Re.createElement("div",{key:o},Re.createElement(we,Mn()({},L,{required:!1,getComponent:u,specPath:j.push("allOf",o),getConfigs:_,schema:s,depth:w+1}))))))):null,He?Re.createElement("tr",null,Re.createElement("td",null,"anyOf ->"),Re.createElement("td",null,He.map(((s,o)=>Re.createElement("div",{key:o},Re.createElement(we,Mn()({},L,{required:!1,getComponent:u,specPath:j.push("anyOf",o),getConfigs:_,schema:s,depth:w+1}))))))):null,Ye?Re.createElement("tr",null,Re.createElement("td",null,"oneOf ->"),Re.createElement("td",null,Ye.map(((s,o)=>Re.createElement("div",{key:o},Re.createElement(we,Mn()({},L,{required:!1,getComponent:u,specPath:j.push("oneOf",o),getConfigs:_,schema:s,depth:w+1}))))))):null,Xe?Re.createElement("tr",null,Re.createElement("td",null,"not ->"),Re.createElement("td",null,Re.createElement("div",null,Re.createElement(we,Mn()({},L,{required:!1,getComponent:u,specPath:j.push("not"),getConfigs:_,schema:Xe,depth:w+1}))))):null))),Re.createElement("span",{className:"brace-close"},"}")),de.size?de.entrySeq().map((([s,o])=>Re.createElement(Pe,{key:`${s}-${o}`,propKey:s,propVal:o,propClass:"property"}))):null)}}class ArrayModel extends Re.Component{render(){let{getComponent:s,getConfigs:o,schema:i,depth:a,expandDepth:u,name:_,displayName:w,specPath:x}=this.props,C=i.get("description"),j=i.get("items"),L=i.get("title")||w||_,B=i.filter(((s,o)=>-1===["type","items","description","$$ref","externalDocs"].indexOf(o))),$=i.getIn(["externalDocs","url"]),U=i.getIn(["externalDocs","description"]);const V=s("Markdown",!0),z=s("ModelCollapse"),Y=s("Model"),Z=s("Property"),ee=s("Link"),ie=L&&Re.createElement("span",{className:"model-title"},Re.createElement("span",{className:"model-title__text"},L));return Re.createElement("span",{className:"model"},Re.createElement(z,{title:ie,expanded:a<=u,collapsedContent:"[...]"},"[",B.size?B.entrySeq().map((([s,o])=>Re.createElement(Z,{key:`${s}-${o}`,propKey:s,propVal:o,propClass:"property"}))):null,C?Re.createElement(V,{source:C}):B.size?Re.createElement("div",{className:"markdown"}):null,$&&Re.createElement("div",{className:"external-docs"},Re.createElement(ee,{target:"_blank",href:sanitizeUrl($)},U||$)),Re.createElement("span",null,Re.createElement(Y,Mn()({},this.props,{getConfigs:o,specPath:x.push("items"),name:null,schema:j,required:!1,depth:a+1}))),"]"))}}const ts="property primitive";class Primitive extends Re.Component{render(){let{schema:s,getComponent:o,getConfigs:i,name:a,displayName:u,depth:_,expandDepth:w}=this.props;const{showExtensions:x}=i();if(!s||!s.get)return Re.createElement("div",null);let C=s.get("type"),j=s.get("format"),L=s.get("xml"),B=s.get("enum"),$=s.get("title")||u||a,U=s.get("description");const V=getExtensions(s);let z=s.filter(((s,o)=>-1===["enum","type","format","description","$$ref","externalDocs"].indexOf(o))).filterNot(((s,o)=>V.has(o))),Y=s.getIn(["externalDocs","url"]),Z=s.getIn(["externalDocs","description"]);const ee=o("Markdown",!0),ie=o("EnumModel"),ae=o("Property"),ce=o("ModelCollapse"),le=o("Link"),pe=o("ModelExtensions"),de=$&&Re.createElement("span",{className:"model-title"},Re.createElement("span",{className:"model-title__text"},$));return Re.createElement("span",{className:"model"},Re.createElement(ce,{title:de,expanded:_<=w,collapsedContent:"[...]"},Re.createElement("span",{className:"prop"},a&&_>1&&Re.createElement("span",{className:"prop-name"},$),Re.createElement("span",{className:"prop-type"},C),j&&Re.createElement("span",{className:"prop-format"},"($",j,")"),z.size?z.entrySeq().map((([s,o])=>Re.createElement(ae,{key:`${s}-${o}`,propKey:s,propVal:o,propClass:ts}))):null,x&&V.size>0?Re.createElement(pe,{extensions:V,propClass:`${ts} extension`}):null,U?Re.createElement(ee,{source:U}):null,Y&&Re.createElement("div",{className:"external-docs"},Re.createElement(le,{target:"_blank",href:sanitizeUrl(Y)},Z||Y)),L&&L.size?Re.createElement("span",null,Re.createElement("br",null),Re.createElement("span",{className:ts},"xml:"),L.entrySeq().map((([s,o])=>Re.createElement("span",{key:`${s}-${o}`,className:ts},Re.createElement("br",null),"   ",s,": ",String(o)))).toArray()):null,B&&Re.createElement(ie,{value:B,getComponent:o}))))}}class Schemes extends Re.Component{UNSAFE_componentWillMount(){let{schemes:s}=this.props;this.setScheme(s.first())}UNSAFE_componentWillReceiveProps(s){this.props.currentScheme&&s.schemes.includes(this.props.currentScheme)||this.setScheme(s.schemes.first())}onChange=s=>{this.setScheme(s.target.value)};setScheme=s=>{let{path:o,method:i,specActions:a}=this.props;a.setScheme(s,o,i)};render(){let{schemes:s,currentScheme:o}=this.props;return Re.createElement("label",{htmlFor:"schemes"},Re.createElement("span",{className:"schemes-title"},"Schemes"),Re.createElement("select",{onChange:this.onChange,value:o,id:"schemes"},s.valueSeq().map((s=>Re.createElement("option",{value:s,key:s},s))).toArray()))}}class SchemesContainer extends Re.Component{render(){const{specActions:s,specSelectors:o,getComponent:i}=this.props,a=o.operationScheme(),u=o.schemes(),_=i("schemes");return u&&u.size?Re.createElement(_,{currentScheme:a,schemes:u,specActions:s}):null}}var rs=__webpack_require__(24677),ns=__webpack_require__.n(rs);const ss={value:"",onChange:()=>{},schema:{},keyName:"",required:!1,errors:(0,ze.List)()};class JsonSchemaForm extends Re.Component{static defaultProps=ss;componentDidMount(){const{dispatchInitialValue:s,value:o,onChange:i}=this.props;s?i(o):!1===s&&i("")}render(){let{schema:s,errors:o,value:i,onChange:a,getComponent:u,fn:_,disabled:w}=this.props;const x=s&&s.get?s.get("format"):null,C=s&&s.get?s.get("type"):null,j=_.getSchemaObjectType(s),L=_.isFileUploadIntended(s);let getComponentSilently=s=>u(s,!1,{failSilently:!0}),B=C?getComponentSilently(x?`JsonSchema_${C}_${x}`:`JsonSchema_${C}`):u("JsonSchema_string");return L||!ze.List.isList(C)||"array"!==j&&"object"!==j||(B=u("JsonSchema_object")),B||(B=u("JsonSchema_string")),Re.createElement(B,Mn()({},this.props,{errors:o,fn:_,getComponent:u,value:i,onChange:a,schema:s,disabled:w}))}}class JsonSchema_string extends Re.Component{static defaultProps=ss;onChange=s=>{const o=this.props.schema&&"file"===this.props.schema.get("type")?s.target.files[0]:s.target.value;this.props.onChange(o,this.props.keyName)};onEnumChange=s=>this.props.onChange(s);render(){let{getComponent:s,value:o,schema:i,errors:a,required:u,description:_,disabled:w}=this.props;const x=i&&i.get?i.get("enum"):null,C=i&&i.get?i.get("format"):null,j=i&&i.get?i.get("type"):null,L=i&&i.get?i.get("in"):null;if(o?(isImmutable(o)||"object"==typeof o)&&(o=stringify(o)):o="",a=a.toJS?a.toJS():[],x){const i=s("Select");return Re.createElement(i,{className:a.length?"invalid":"",title:a.length?a:"",allowedValues:[...x],value:o,allowEmptyValue:!u,disabled:w,onChange:this.onEnumChange})}const B=w||L&&"formData"===L&&!("FormData"in window),$=s("Input");return j&&"file"===j?Re.createElement($,{type:"file",className:a.length?"invalid":"",title:a.length?a:"",onChange:this.onChange,disabled:B}):Re.createElement(ns(),{type:C&&"password"===C?"password":"text",className:a.length?"invalid":"",title:a.length?a:"",value:o,minLength:0,debounceTimeout:350,placeholder:_,onChange:this.onChange,disabled:B})}}class JsonSchema_array extends Re.PureComponent{static defaultProps=ss;constructor(s,o){super(s,o),this.state={value:valueOrEmptyList(s.value),schema:s.schema}}UNSAFE_componentWillReceiveProps(s){const o=valueOrEmptyList(s.value);o!==this.state.value&&this.setState({value:o}),s.schema!==this.state.schema&&this.setState({schema:s.schema})}onChange=()=>{this.props.onChange(this.state.value)};onItemChange=(s,o)=>{this.setState((({value:i})=>({value:i.set(o,s)})),this.onChange)};removeItem=s=>{this.setState((({value:o})=>({value:o.delete(s)})),this.onChange)};addItem=()=>{const{fn:s}=this.props;let o=valueOrEmptyList(this.state.value);this.setState((()=>({value:o.push(s.getSampleSchema(this.state.schema.get("items"),!1,{includeWriteOnly:!0}))})),this.onChange)};onEnumChange=s=>{this.setState((()=>({value:s})),this.onChange)};render(){let{getComponent:s,required:o,schema:i,errors:a,fn:u,disabled:_}=this.props;a=a.toJS?a.toJS():Array.isArray(a)?a:[];const w=a.filter((s=>"string"==typeof s)),x=a.filter((s=>void 0!==s.needRemove)).map((s=>s.error)),C=this.state.value,j=!!(C&&C.count&&C.count()>0),L=i.getIn(["items","enum"]),B=i.get("items"),$=u.getSchemaObjectType(B),U=u.getSchemaObjectTypeLabel(B),V=i.getIn(["items","format"]),z=i.get("items");let Y,Z=!1,ee="file"===$||"string"===$&&"binary"===V;if($&&V?Y=s(`JsonSchema_${$}_${V}`):"boolean"!==$&&"array"!==$&&"object"!==$||(Y=s(`JsonSchema_${$}`)),!ze.List.isList(B?.get("type"))||"array"!==$&&"object"!==$||(Y=s("JsonSchema_object")),Y||ee||(Z=!0),L){const i=s("Select");return Re.createElement(i,{className:a.length?"invalid":"",title:a.length?a:"",multiple:!0,value:C,disabled:_,allowedValues:L,allowEmptyValue:!o,onChange:this.onEnumChange})}const ie=s("Button");return Re.createElement("div",{className:"json-schema-array"},j?C.map(((o,i)=>{const w=(0,ze.fromJS)([...a.filter((s=>s.index===i)).map((s=>s.error))]);return Re.createElement("div",{key:i,className:"json-schema-form-item"},ee?Re.createElement(JsonSchemaArrayItemFile,{value:o,onChange:s=>this.onItemChange(s,i),disabled:_,errors:w,getComponent:s}):Z?Re.createElement(JsonSchemaArrayItemText,{value:o,onChange:s=>this.onItemChange(s,i),disabled:_,errors:w}):Re.createElement(Y,Mn()({},this.props,{value:o,onChange:s=>this.onItemChange(s,i),disabled:_,errors:w,schema:z,getComponent:s,fn:u})),_?null:Re.createElement(ie,{className:`btn btn-sm json-schema-form-item-remove ${x.length?"invalid":null}`,title:x.length?x:"",onClick:()=>this.removeItem(i)}," - "))})):null,_?null:Re.createElement(ie,{className:`btn btn-sm json-schema-form-item-add ${w.length?"invalid":null}`,title:w.length?w:"",onClick:this.addItem},"Add ",U," item"))}}class JsonSchemaArrayItemText extends Re.Component{static defaultProps=ss;onChange=s=>{const o=s.target.value;this.props.onChange(o,this.props.keyName)};render(){let{value:s,errors:o,description:i,disabled:a}=this.props;return s?(isImmutable(s)||"object"==typeof s)&&(s=stringify(s)):s="",o=o.toJS?o.toJS():[],Re.createElement(ns(),{type:"text",className:o.length?"invalid":"",title:o.length?o:"",value:s,minLength:0,debounceTimeout:350,placeholder:i,onChange:this.onChange,disabled:a})}}class JsonSchemaArrayItemFile extends Re.Component{static defaultProps=ss;onFileChange=s=>{const o=s.target.files[0];this.props.onChange(o,this.props.keyName)};render(){let{getComponent:s,errors:o,disabled:i}=this.props;const a=s("Input"),u=i||!("FormData"in window);return Re.createElement(a,{type:"file",className:o.length?"invalid":"",title:o.length?o:"",onChange:this.onFileChange,disabled:u})}}class JsonSchema_boolean extends Re.Component{static defaultProps=ss;onEnumChange=s=>this.props.onChange(s);render(){let{getComponent:s,value:o,errors:i,schema:a,required:u,disabled:_}=this.props;i=i.toJS?i.toJS():[];let w=a&&a.get?a.get("enum"):null,x=!w||!u,C=!w&&["true","false"];const j=s("Select");return Re.createElement(j,{className:i.length?"invalid":"",title:i.length?i:"",value:String(o),disabled:_,allowedValues:w?[...w]:C,allowEmptyValue:x,onChange:this.onEnumChange})}}const stringifyObjectErrors=s=>s.map((s=>{const o=void 0!==s.propKey?s.propKey:s.index;let i="string"==typeof s?s:"string"==typeof s.error?s.error:null;if(!o&&i)return i;let a=s.error,u=`/${s.propKey}`;for(;"object"==typeof a;){const s=void 0!==a.propKey?a.propKey:a.index;if(void 0===s)break;if(u+=`/${s}`,!a.error)break;a=a.error}return`${u}: ${a}`}));class JsonSchema_object extends Re.PureComponent{constructor(){super()}static defaultProps=ss;onChange=s=>{this.props.onChange(s)};handleOnChange=s=>{const o=s.target.value;this.onChange(o)};render(){let{getComponent:s,value:o,errors:i,disabled:a}=this.props;const u=s("TextArea");return i=i.toJS?i.toJS():Array.isArray(i)?i:[],Re.createElement("div",null,Re.createElement(u,{className:Jn()({invalid:i.length}),title:i.length?stringifyObjectErrors(i).join(", "):"",value:stringify(o),disabled:a,onChange:this.handleOnChange}))}}function valueOrEmptyList(s){return ze.List.isList(s)?s:Array.isArray(s)?(0,ze.fromJS)(s):(0,ze.List)()}const ModelExtensions=({extensions:s,propClass:o=""})=>s.entrySeq().map((([s,i])=>{const a=immutableToJS(i)??null;return Re.createElement("tr",{key:s,className:o},Re.createElement("td",null,s),Re.createElement("td",null,JSON.stringify(a)))})).toArray();var os=__webpack_require__(11331),as=__webpack_require__.n(os);const hasSchemaType=(s,o)=>{const i=ze.Map.isMap(s);if(!i&&!as()(s))return!1;const a=i?s.get("type"):s.type;return o===a||Array.isArray(o)&&o.includes(a)},getType=(s,o=new WeakSet)=>{if(null==s)return"any";if(o.has(s))return"any";o.add(s);const{type:i,items:a}=s;return Object.hasOwn(s,"items")?(()=>{if(a)return`array<${getType(a,o)}>`;return"array<any>"})():i},getSchemaObjectTypeLabel=s=>getType(immutableToJS(s)),json_schema_5=()=>({components:{modelExample:model_example,ModelWrapper,ModelCollapse,Model,Models,EnumModel:enum_model,ObjectModel,ArrayModel,PrimitiveModel:Primitive,ModelExtensions,schemes:Schemes,SchemesContainer,...U},fn:{hasSchemaType,getSchemaObjectTypeLabel}});var cs=__webpack_require__(19123),ls=__webpack_require__.n(cs),us=__webpack_require__(41859),ps=__webpack_require__.n(us),hs=__webpack_require__(62193),ds=__webpack_require__.n(hs);const shallowArrayEquals=s=>o=>Array.isArray(s)&&Array.isArray(o)&&s.length===o.length&&s.every(((s,i)=>s===o[i])),list=(...s)=>s;class Cache extends Map{delete(s){const o=Array.from(this.keys()).find(shallowArrayEquals(s));return super.delete(o)}get(s){const o=Array.from(this.keys()).find(shallowArrayEquals(s));return super.get(o)}has(s){return-1!==Array.from(this.keys()).findIndex(shallowArrayEquals(s))}}const utils_memoizeN=(s,o=list)=>{const{Cache:i}=pt();pt().Cache=Cache;const a=pt()(s,o);return pt().Cache=i,a},fs={string:s=>s.pattern?(s=>{try{const o=/(?<=(?<!\\)\{)(\d{3,})(?=\})|(?<=(?<!\\)\{\d*,)(\d{3,})(?=\})|(?<=(?<!\\)\{)(\d{3,})(?=,\d*\})/g,i=s.replace(o,"100"),a=new(ps())(i);return a.max=100,a.gen()}catch(s){return"string"}})(s.pattern):"string",string_email:()=>"user@example.com","string_date-time":()=>(new Date).toISOString(),string_date:()=>(new Date).toISOString().substring(0,10),string_time:()=>(new Date).toISOString().substring(11),string_uuid:()=>"3fa85f64-5717-4562-b3fc-2c963f66afa6",string_hostname:()=>"example.com",string_ipv4:()=>"198.51.100.42",string_ipv6:()=>"2001:0db8:5b96:0000:0000:426f:8e17:642a",number:()=>0,number_float:()=>0,integer:()=>0,boolean:s=>"boolean"!=typeof s.default||s.default},primitive=s=>{s=objectify(s);let{type:o,format:i}=s,a=fs[`${o}_${i}`]||fs[o];return isFunc(a)?a(s):"Unknown Type: "+s.type},sanitizeRef=s=>deeplyStripKey(s,"$$ref",(s=>"string"==typeof s&&s.indexOf("#")>-1)),ms=["maxProperties","minProperties"],gs=["minItems","maxItems"],ys=["minimum","maximum","exclusiveMinimum","exclusiveMaximum"],vs=["minLength","maxLength"],mergeJsonSchema=(s,o,i={})=>{const a={...s};if(["example","default","enum","xml","type",...ms,...gs,...ys,...vs].forEach((s=>(s=>{void 0===a[s]&&void 0!==o[s]&&(a[s]=o[s])})(s))),void 0!==o.required&&Array.isArray(o.required)&&(void 0!==a.required&&a.required.length||(a.required=[]),o.required.forEach((s=>{a.required.includes(s)||a.required.push(s)}))),o.properties){a.properties||(a.properties={});let s=objectify(o.properties);for(let u in s)Object.prototype.hasOwnProperty.call(s,u)&&(s[u]&&s[u].deprecated||s[u]&&s[u].readOnly&&!i.includeReadOnly||s[u]&&s[u].writeOnly&&!i.includeWriteOnly||a.properties[u]||(a.properties[u]=s[u],!o.required&&Array.isArray(o.required)&&-1!==o.required.indexOf(u)&&(a.required?a.required.push(u):a.required=[u])))}return o.items&&(a.items||(a.items={}),a.items=mergeJsonSchema(a.items,o.items,i)),a},sampleFromSchemaGeneric=(s,o={},i=void 0,a=!1)=>{s&&isFunc(s.toJS)&&(s=s.toJS());let u=void 0!==i||s&&void 0!==s.example||s&&void 0!==s.default;const _=!u&&s&&s.oneOf&&s.oneOf.length>0,w=!u&&s&&s.anyOf&&s.anyOf.length>0;if(!u&&(_||w)){const i=objectify(_?s.oneOf[0]:s.anyOf[0]);if(!(s=mergeJsonSchema(s,i,o)).xml&&i.xml&&(s.xml=i.xml),void 0!==s.example&&void 0!==i.example)u=!0;else if(i.properties){s.properties||(s.properties={});let a=objectify(i.properties);for(let u in a)Object.prototype.hasOwnProperty.call(a,u)&&(a[u]&&a[u].deprecated||a[u]&&a[u].readOnly&&!o.includeReadOnly||a[u]&&a[u].writeOnly&&!o.includeWriteOnly||s.properties[u]||(s.properties[u]=a[u],!i.required&&Array.isArray(i.required)&&-1!==i.required.indexOf(u)&&(s.required?s.required.push(u):s.required=[u])))}}const x={};let{xml:C,type:j,example:L,properties:B,additionalProperties:$,items:U}=s||{},{includeReadOnly:V,includeWriteOnly:z}=o;C=C||{};let Y,{name:Z,prefix:ee,namespace:ie}=C,ae={};if(a&&(Z=Z||"notagname",Y=(ee?ee+":":"")+Z,ie)){x[ee?"xmlns:"+ee:"xmlns"]=ie}a&&(ae[Y]=[]);const schemaHasAny=o=>o.some((o=>Object.prototype.hasOwnProperty.call(s,o)));s&&!j&&(B||$||schemaHasAny(ms)?j="object":U||schemaHasAny(gs)?j="array":schemaHasAny(ys)?(j="number",s.type="number"):u||s.enum||(j="string",s.type="string"));const handleMinMaxItems=o=>{if(null!=s?.maxItems&&(o=o.slice(0,s?.maxItems)),null!=s?.minItems){let i=0;for(;o.length<s?.minItems;)o.push(o[i++%o.length])}return o},ce=objectify(B);let le,pe=0;const hasExceededMaxProperties=()=>s&&null!==s.maxProperties&&void 0!==s.maxProperties&&pe>=s.maxProperties,canAddProperty=o=>!s||null===s.maxProperties||void 0===s.maxProperties||!hasExceededMaxProperties()&&(!(o=>!(s&&s.required&&s.required.length&&s.required.includes(o)))(o)||s.maxProperties-pe-(()=>{if(!s||!s.required)return 0;let o=0;return a?s.required.forEach((s=>o+=void 0===ae[s]?0:1)):s.required.forEach((s=>o+=void 0===ae[Y]?.find((o=>void 0!==o[s]))?0:1)),s.required.length-o})()>0);if(le=a?(i,u=void 0)=>{if(s&&ce[i]){if(ce[i].xml=ce[i].xml||{},ce[i].xml.attribute){const s=Array.isArray(ce[i].enum)?ce[i].enum[0]:void 0,o=ce[i].example,a=ce[i].default;return void(x[ce[i].xml.name||i]=void 0!==o?o:void 0!==a?a:void 0!==s?s:primitive(ce[i]))}ce[i].xml.name=ce[i].xml.name||i}else ce[i]||!1===$||(ce[i]={xml:{name:i}});let _=sampleFromSchemaGeneric(s&&ce[i]||void 0,o,u,a);canAddProperty(i)&&(pe++,Array.isArray(_)?ae[Y]=ae[Y].concat(_):ae[Y].push(_))}:(i,u)=>{if(canAddProperty(i)){if(Object.prototype.hasOwnProperty.call(s,"discriminator")&&s.discriminator&&Object.prototype.hasOwnProperty.call(s.discriminator,"mapping")&&s.discriminator.mapping&&Object.prototype.hasOwnProperty.call(s,"$$ref")&&s.$$ref&&s.discriminator.propertyName===i){for(let o in s.discriminator.mapping)if(-1!==s.$$ref.search(s.discriminator.mapping[o])){ae[i]=o;break}}else ae[i]=sampleFromSchemaGeneric(ce[i],o,u,a);pe++}},u){let u;if(u=sanitizeRef(void 0!==i?i:void 0!==L?L:s.default),!a){if("number"==typeof u&&"string"===j)return`${u}`;if("string"!=typeof u||"string"===j)return u;try{return JSON.parse(u)}catch(s){return u}}if(s||(j=Array.isArray(u)?"array":typeof u),"array"===j){if(!Array.isArray(u)){if("string"==typeof u)return u;u=[u]}const i=s?s.items:void 0;i&&(i.xml=i.xml||C||{},i.xml.name=i.xml.name||C.name);let _=u.map((s=>sampleFromSchemaGeneric(i,o,s,a)));return _=handleMinMaxItems(_),C.wrapped?(ae[Y]=_,ds()(x)||ae[Y].push({_attr:x})):ae=_,ae}if("object"===j){if("string"==typeof u)return u;for(let o in u)Object.prototype.hasOwnProperty.call(u,o)&&(s&&ce[o]&&ce[o].readOnly&&!V||s&&ce[o]&&ce[o].writeOnly&&!z||(s&&ce[o]&&ce[o].xml&&ce[o].xml.attribute?x[ce[o].xml.name||o]=u[o]:le(o,u[o])));return ds()(x)||ae[Y].push({_attr:x}),ae}return ae[Y]=ds()(x)?u:[{_attr:x},u],ae}if("object"===j){for(let s in ce)Object.prototype.hasOwnProperty.call(ce,s)&&(ce[s]&&ce[s].deprecated||ce[s]&&ce[s].readOnly&&!V||ce[s]&&ce[s].writeOnly&&!z||le(s));if(a&&x&&ae[Y].push({_attr:x}),hasExceededMaxProperties())return ae;if(!0===$)a?ae[Y].push({additionalProp:"Anything can be here"}):ae.additionalProp1={},pe++;else if($){const i=objectify($),u=sampleFromSchemaGeneric(i,o,void 0,a);if(a&&i.xml&&i.xml.name&&"notagname"!==i.xml.name)ae[Y].push(u);else{const o=i["x-additionalPropertiesName"]||"additionalProp",_=null!==s.minProperties&&void 0!==s.minProperties&&pe<s.minProperties?s.minProperties-pe:3;for(let s=1;s<=_;s++){if(hasExceededMaxProperties())return ae;if(a){const i={};i[o+s]=u.notagname,ae[Y].push(i)}else ae[o+s]=u;pe++}}}return ae}if("array"===j){if(!U)return;let i;if(a&&(U.xml=U.xml||s?.xml||{},U.xml.name=U.xml.name||C.name),Array.isArray(U.anyOf))i=U.anyOf.map((s=>sampleFromSchemaGeneric(mergeJsonSchema(s,U,o),o,void 0,a)));else if(Array.isArray(U.oneOf))i=U.oneOf.map((s=>sampleFromSchemaGeneric(mergeJsonSchema(s,U,o),o,void 0,a)));else{if(!(!a||a&&C.wrapped))return sampleFromSchemaGeneric(U,o,void 0,a);i=[sampleFromSchemaGeneric(U,o,void 0,a)]}return i=handleMinMaxItems(i),a&&C.wrapped?(ae[Y]=i,ds()(x)||ae[Y].push({_attr:x}),ae):i}let de;if(s&&Array.isArray(s.enum))de=normalizeArray(s.enum)[0];else{if(!s)return;if(de=primitive(s),"number"==typeof de){let o=s.minimum;null!=o&&(s.exclusiveMinimum&&o++,de=o);let i=s.maximum;null!=i&&(s.exclusiveMaximum&&i--,de=i)}if("string"==typeof de&&(null!==s.maxLength&&void 0!==s.maxLength&&(de=de.slice(0,s.maxLength)),null!==s.minLength&&void 0!==s.minLength)){let o=0;for(;de.length<s.minLength;)de+=de[o++%de.length]}}if("file"!==j)return a?(ae[Y]=ds()(x)?de:[{_attr:x},de],ae):de},inferSchema=s=>(s.schema&&(s=s.schema),s.properties&&(s.type="object"),s),createXMLExample=(s,o,i)=>{const a=sampleFromSchemaGeneric(s,o,i,!0);if(a)return"string"==typeof a?a:ls()(a,{declaration:!0,indent:"\t"})},sampleFromSchema=(s,o,i)=>sampleFromSchemaGeneric(s,o,i,!1),resolver=(s,o,i)=>[s,JSON.stringify(o),JSON.stringify(i)],bs=utils_memoizeN(createXMLExample,resolver),_s=utils_memoizeN(sampleFromSchema,resolver),getSchemaObjectType=s=>immutableToJS(s)?.type??"string",Ss=[{when:/json/,shouldStringifyTypes:["string"]}],Es=["object"],get_json_sample_schema=s=>(o,i,a,u)=>{const{fn:_}=s(),w=_.memoizedSampleFromSchema(o,i,u),x=typeof w,C=Ss.reduce(((s,o)=>o.when.test(a)?[...s,...o.shouldStringifyTypes]:s),Es);return gt()(C,(s=>s===x))?JSON.stringify(w,null,2):w},get_yaml_sample_schema=s=>(o,i,a,u)=>{const{fn:_}=s(),w=_.getJsonSampleSchema(o,i,a,u);let x;try{x=fn.dump(fn.load(w),{lineWidth:-1},{schema:rn}),"\n"===x[x.length-1]&&(x=x.slice(0,x.length-1))}catch(s){return console.error(s),"error: could not generate yaml example"}return x.replace(/\t/g,"  ")},get_xml_sample_schema=s=>(o,i,a)=>{const{fn:u}=s();if(o&&!o.xml&&(o.xml={}),o&&!o.xml.name){if(!o.$$ref&&(o.type||o.items||o.properties||o.additionalProperties))return'<?xml version="1.0" encoding="UTF-8"?>\n\x3c!-- XML example cannot be generated; root element name is undefined --\x3e';if(o.$$ref){let s=o.$$ref.match(/\S*\/(\S+)$/);o.xml.name=s[1]}}return u.memoizedCreateXMLExample(o,i,a)},get_sample_schema=s=>(o,i="",a={},u=void 0)=>{const{fn:_}=s();return"function"==typeof o?.toJS&&(o=o.toJS()),"function"==typeof u?.toJS&&(u=u.toJS()),/xml/.test(i)?_.getXmlSampleSchema(o,a,u):/(yaml|yml)/.test(i)?_.getYamlSampleSchema(o,a,i,u):_.getJsonSampleSchema(o,a,i,u)},json_schema_5_samples=({getSystem:s})=>{const o=get_json_sample_schema(s),i=get_yaml_sample_schema(s),a=get_xml_sample_schema(s),u=get_sample_schema(s);return{fn:{jsonSchema5:{inferSchema,sampleFromSchema,sampleFromSchemaGeneric,createXMLExample,memoizedSampleFromSchema:_s,memoizedCreateXMLExample:bs,getJsonSampleSchema:o,getYamlSampleSchema:i,getXmlSampleSchema:a,getSampleSchema:u,mergeJsonSchema},inferSchema,sampleFromSchema,sampleFromSchemaGeneric,createXMLExample,memoizedSampleFromSchema:_s,memoizedCreateXMLExample:bs,getJsonSampleSchema:o,getYamlSampleSchema:i,getXmlSampleSchema:a,getSampleSchema:u,mergeJsonSchema,getSchemaObjectType}}};var ws=__webpack_require__(37334),xs=__webpack_require__.n(ws);const ks=["get","put","post","delete","options","head","patch","trace"],spec_selectors_state=s=>s||(0,ze.Map)(),Os=Ut(spec_selectors_state,(s=>s.get("lastError"))),As=Ut(spec_selectors_state,(s=>s.get("url"))),Cs=Ut(spec_selectors_state,(s=>s.get("spec")||"")),js=Ut(spec_selectors_state,(s=>s.get("specSource")||"not-editor")),Ps=Ut(spec_selectors_state,(s=>s.get("json",(0,ze.Map)()))),Is=Ut(Ps,(s=>s.toJS())),Ts=Ut(spec_selectors_state,(s=>s.get("resolved",(0,ze.Map)()))),specResolvedSubtree=(s,o)=>s.getIn(["resolvedSubtrees",...o],void 0),mergerFn=(s,o)=>ze.Map.isMap(s)&&ze.Map.isMap(o)?o.get("$$ref")?o:(0,ze.OrderedMap)().mergeWith(mergerFn,s,o):o,Ns=Ut(spec_selectors_state,(s=>(0,ze.OrderedMap)().mergeWith(mergerFn,s.get("json"),s.get("resolvedSubtrees")))),spec=s=>Ps(s),Ms=Ut(spec,(()=>!1)),Rs=Ut(spec,(s=>returnSelfOrNewMap(s&&s.get("info")))),Ds=Ut(spec,(s=>returnSelfOrNewMap(s&&s.get("externalDocs")))),Ls=Ut(Rs,(s=>s&&s.get("version"))),Fs=Ut(Ls,(s=>/v?([0-9]*)\.([0-9]*)\.([0-9]*)/i.exec(s).slice(1))),Bs=Ut(Ns,(s=>s.get("paths"))),$s=xs()(["get","put","post","delete","options","head","patch"]),qs=Ut(Bs,(s=>{let o=(0,ze.List)();return!ze.Map.isMap(s)||s.isEmpty()||s.forEach(((s,i)=>{if(!s||!s.forEach)return{};s.forEach(((s,a)=>{ks.indexOf(a)<0||(o=o.push((0,ze.fromJS)({path:i,method:a,operation:s,id:`${a}-${i}`})))}))})),o})),Us=Ut(spec,(s=>(0,ze.Set)(s.get("consumes")))),Vs=Ut(spec,(s=>(0,ze.Set)(s.get("produces")))),zs=Ut(spec,(s=>s.get("security",(0,ze.List)()))),Ws=Ut(spec,(s=>s.get("securityDefinitions"))),findDefinition=(s,o)=>{const i=s.getIn(["resolvedSubtrees","definitions",o],null),a=s.getIn(["json","definitions",o],null);return i||a||null},Js=Ut(spec,(s=>{const o=s.get("definitions");return ze.Map.isMap(o)?o:(0,ze.Map)()})),Hs=Ut(spec,(s=>s.get("basePath"))),Ks=Ut(spec,(s=>s.get("host"))),Gs=Ut(spec,(s=>s.get("schemes",(0,ze.Map)()))),Ys=Ut([qs,Us,Vs],((s,o,i)=>s.map((s=>s.update("operation",(s=>ze.Map.isMap(s)?s.withMutations((s=>(s.get("consumes")||s.update("consumes",(s=>(0,ze.Set)(s).merge(o))),s.get("produces")||s.update("produces",(s=>(0,ze.Set)(s).merge(i))),s))):(0,ze.Map)())))))),Xs=Ut(spec,(s=>{const o=s.get("tags",(0,ze.List)());return ze.List.isList(o)?o.filter((s=>ze.Map.isMap(s))):(0,ze.List)()})),tagDetails=(s,o)=>(Xs(s)||(0,ze.List)()).filter(ze.Map.isMap).find((s=>s.get("name")===o),(0,ze.Map)()),Qs=Ut(Ys,Xs,((s,o)=>s.reduce(((s,o)=>{let i=(0,ze.Set)(o.getIn(["operation","tags"]));return i.count()<1?s.update("default",(0,ze.List)(),(s=>s.push(o))):i.reduce(((s,i)=>s.update(i,(0,ze.List)(),(s=>s.push(o)))),s)}),o.reduce(((s,o)=>s.set(o.get("name"),(0,ze.List)())),(0,ze.OrderedMap)())))),selectors_taggedOperations=s=>({getConfigs:o})=>{let{tagsSorter:i,operationsSorter:a}=o();return Qs(s).sortBy(((s,o)=>o),((s,o)=>{let a="function"==typeof i?i:It.tagsSorter[i];return a?a(s,o):null})).map(((o,i)=>{let u="function"==typeof a?a:It.operationsSorter[a],_=u?o.sort(u):o;return(0,ze.Map)({tagDetails:tagDetails(s,i),operations:_})}))},Zs=Ut(spec_selectors_state,(s=>s.get("responses",(0,ze.Map)()))),eo=Ut(spec_selectors_state,(s=>s.get("requests",(0,ze.Map)()))),to=Ut(spec_selectors_state,(s=>s.get("mutatedRequests",(0,ze.Map)()))),responseFor=(s,o,i)=>Zs(s).getIn([o,i],null),requestFor=(s,o,i)=>eo(s).getIn([o,i],null),mutatedRequestFor=(s,o,i)=>to(s).getIn([o,i],null),allowTryItOutFor=()=>!0,parameterWithMetaByIdentity=(s,o,i)=>{const a=Ns(s).getIn(["paths",...o,"parameters"],(0,ze.OrderedMap)()),u=s.getIn(["meta","paths",...o,"parameters"],(0,ze.OrderedMap)());return a.map((s=>{const o=u.get(`${i.get("in")}.${i.get("name")}`),a=u.get(`${i.get("in")}.${i.get("name")}.hash-${i.hashCode()}`);return(0,ze.OrderedMap)().merge(s,o,a)})).find((s=>s.get("in")===i.get("in")&&s.get("name")===i.get("name")),(0,ze.OrderedMap)())},parameterInclusionSettingFor=(s,o,i,a)=>{const u=`${a}.${i}`;return s.getIn(["meta","paths",...o,"parameter_inclusions",u],!1)},parameterWithMeta=(s,o,i,a)=>{const u=Ns(s).getIn(["paths",...o,"parameters"],(0,ze.OrderedMap)()).find((s=>s.get("in")===a&&s.get("name")===i),(0,ze.OrderedMap)());return parameterWithMetaByIdentity(s,o,u)},operationWithMeta=(s,o,i)=>{const a=Ns(s).getIn(["paths",o,i],(0,ze.OrderedMap)()),u=s.getIn(["meta","paths",o,i],(0,ze.OrderedMap)()),_=a.get("parameters",(0,ze.List)()).map((a=>parameterWithMetaByIdentity(s,[o,i],a)));return(0,ze.OrderedMap)().merge(a,u).set("parameters",_)};function getParameter(s,o,i,a){return o=o||[],s.getIn(["meta","paths",...o,"parameters"],(0,ze.fromJS)([])).find((s=>ze.Map.isMap(s)&&s.get("name")===i&&s.get("in")===a))||(0,ze.Map)()}const ro=Ut(spec,(s=>{const o=s.get("host");return"string"==typeof o&&o.length>0&&"/"!==o[0]}));function parameterValues(s,o,i){return o=o||[],operationWithMeta(s,...o).get("parameters",(0,ze.List)()).reduce(((s,o)=>{let a=i&&"body"===o.get("in")?o.get("value_xml"):o.get("value");return ze.List.isList(a)&&(a=a.filter((s=>""!==s))),s.set(paramToIdentifier(o,{allowHashes:!1}),a)}),(0,ze.fromJS)({}))}function parametersIncludeIn(s,o=""){if(ze.List.isList(s))return s.some((s=>ze.Map.isMap(s)&&s.get("in")===o))}function parametersIncludeType(s,o=""){if(ze.List.isList(s))return s.some((s=>ze.Map.isMap(s)&&s.get("type")===o))}function contentTypeValues(s,o){o=o||[];let i=Ns(s).getIn(["paths",...o],(0,ze.fromJS)({})),a=s.getIn(["meta","paths",...o],(0,ze.fromJS)({})),u=currentProducesFor(s,o);const _=i.get("parameters")||new ze.List,w=a.get("consumes_value")?a.get("consumes_value"):parametersIncludeType(_,"file")?"multipart/form-data":parametersIncludeType(_,"formData")?"application/x-www-form-urlencoded":void 0;return(0,ze.fromJS)({requestContentType:w,responseContentType:u})}function currentProducesFor(s,o){o=o||[];const i=Ns(s).getIn(["paths",...o],null);if(null===i)return;const a=s.getIn(["meta","paths",...o,"produces_value"],null),u=i.getIn(["produces",0],null);return a||u||"application/json"}function producesOptionsFor(s,o){o=o||[];const i=Ns(s),a=i.getIn(["paths",...o],null);if(null===a)return;const[u]=o,_=a.get("produces",null),w=i.getIn(["paths",u,"produces"],null),x=i.getIn(["produces"],null);return _||w||x}function consumesOptionsFor(s,o){o=o||[];const i=Ns(s),a=i.getIn(["paths",...o],null);if(null===a)return;const[u]=o,_=a.get("consumes",null),w=i.getIn(["paths",u,"consumes"],null),x=i.getIn(["consumes"],null);return _||w||x}const operationScheme=(s,o,i)=>{let a=s.get("url").match(/^([a-z][a-z0-9+\-.]*):/),u=Array.isArray(a)?a[1]:null;return s.getIn(["scheme",o,i])||s.getIn(["scheme","_defaultScheme"])||u||""},canExecuteScheme=(s,o,i)=>["http","https"].indexOf(operationScheme(s,o,i))>-1,validationErrors=(s,o)=>{o=o||[];const i=s.getIn(["meta","paths",...o,"parameters"],(0,ze.fromJS)([])),a=[];if(0===i.length)return a;const getErrorsWithPaths=(s,o=[])=>{const getNestedErrorsWithPaths=(s,o)=>{const i=[...o,s.get("propKey")||s.get("index")];return ze.Map.isMap(s.get("error"))?getErrorsWithPaths(s.get("error"),i):{error:s.get("error"),path:i}};return ze.List.isList(s)?s.map((s=>ze.Map.isMap(s)?getNestedErrorsWithPaths(s,o):{error:s,path:o})):getNestedErrorsWithPaths(s,o)};return i.forEach(((s,o)=>{const i=o.split(".").slice(1,-1).join("."),u=s.get("errors");if(u&&u.count()){getErrorsWithPaths(u).forEach((({error:s,path:o})=>{a.push(((s,o,i)=>`For '${i}'${(o=o.reduce(((s,o)=>"number"==typeof o?`${s}[${o}]`:s?`${s}.${o}`:o),""))?` at path '${o}'`:""}: ${s}.`)(s,o,i))}))}})),a},validateBeforeExecute=(s,o)=>0===validationErrors(s,o).length,getOAS3RequiredRequestBodyContentType=(s,o)=>{let i={requestBody:!1,requestContentType:{}},a=s.getIn(["resolvedSubtrees","paths",...o,"requestBody"],(0,ze.fromJS)([]));return a.size<1||(a.getIn(["required"])&&(i.requestBody=a.getIn(["required"])),a.getIn(["content"]).entrySeq().forEach((s=>{const o=s[0];if(s[1].getIn(["schema","required"])){const a=s[1].getIn(["schema","required"]).toJS();i.requestContentType[o]=a}}))),i},isMediaTypeSchemaPropertiesEqual=(s,o,i,a)=>{if((i||a)&&i===a)return!0;let u=s.getIn(["resolvedSubtrees","paths",...o,"requestBody","content"],(0,ze.fromJS)([]));if(u.size<2||!i||!a)return!1;let _=u.getIn([i,"schema","properties"],(0,ze.fromJS)([])),w=u.getIn([a,"schema","properties"],(0,ze.fromJS)([]));return!!_.equals(w)};function returnSelfOrNewMap(s){return ze.Map.isMap(s)?s:new ze.Map}var no=__webpack_require__(85015),so=__webpack_require__.n(no),oo=__webpack_require__(38221),io=__webpack_require__.n(oo),ao=__webpack_require__(63560),co=__webpack_require__.n(ao),lo=__webpack_require__(56367),uo=__webpack_require__.n(lo);const po="spec_update_spec",ho="spec_update_url",fo="spec_update_json",mo="spec_update_param",go="spec_update_empty_param_inclusion",yo="spec_validate_param",vo="spec_set_response",bo="spec_set_request",_o="spec_set_mutated_request",So="spec_log_request",Eo="spec_clear_response",wo="spec_clear_request",xo="spec_clear_validate_param",ko="spec_update_operation_meta_value",Oo="spec_update_resolved",Ao="spec_update_resolved_subtree",Co="set_scheme",toStr=s=>so()(s)?s:"";function updateSpec(s){const o=toStr(s).replace(/\t/g,"  ");if("string"==typeof s)return{type:po,payload:o}}function updateResolved(s){return{type:Oo,payload:s}}function updateUrl(s){return{type:ho,payload:s}}function updateJsonSpec(s){return{type:fo,payload:s}}const parseToJson=s=>({specActions:o,specSelectors:i,errActions:a})=>{let{specStr:u}=i,_=null;try{s=s||u(),a.clear({source:"parser"}),_=fn.load(s,{schema:rn})}catch(s){return console.error(s),a.newSpecErr({source:"parser",level:"error",message:s.reason,line:s.mark&&s.mark.line?s.mark.line+1:void 0})}return _&&"object"==typeof _?o.updateJsonSpec(_):o.updateJsonSpec({})};let jo=!1;const resolveSpec=(s,o)=>({specActions:i,specSelectors:a,errActions:u,fn:{fetch:_,resolve:w,AST:x={}},getConfigs:C})=>{jo||(console.warn("specActions.resolveSpec is deprecated since v3.10.0 and will be removed in v4.0.0; use requestResolvedSubtree instead!"),jo=!0);const{modelPropertyMacro:j,parameterMacro:L,requestInterceptor:B,responseInterceptor:$}=C();void 0===s&&(s=a.specJson()),void 0===o&&(o=a.url());let U=x.getLineNumberForPath?x.getLineNumberForPath:()=>{},V=a.specStr();return w({fetch:_,spec:s,baseDoc:String(new URL(o,document.baseURI)),modelPropertyMacro:j,parameterMacro:L,requestInterceptor:B,responseInterceptor:$}).then((({spec:s,errors:o})=>{if(u.clear({type:"thrown"}),Array.isArray(o)&&o.length>0){let s=o.map((s=>(console.error(s),s.line=s.fullPath?U(V,s.fullPath):null,s.path=s.fullPath?s.fullPath.join("."):null,s.level="error",s.type="thrown",s.source="resolver",Object.defineProperty(s,"message",{enumerable:!0,value:s.message}),s)));u.newThrownErrBatch(s)}return i.updateResolved(s)}))};let Po=[];const Io=io()((()=>{const s=Po.reduce(((s,{path:o,system:i})=>(s.has(i)||s.set(i,[]),s.get(i).push(o),s)),new Map);Po=[],s.forEach((async(s,o)=>{if(!o)return void console.error("debResolveSubtrees: don't have a system to operate on, aborting.");if(!o.fn.resolveSubtree)return void console.error("Error: Swagger-Client did not provide a `resolveSubtree` method, doing nothing.");const{errActions:i,errSelectors:a,fn:{resolveSubtree:u,fetch:_,AST:w={}},specSelectors:x,specActions:C}=o,j=w.getLineNumberForPath??xs()(void 0),L=x.specStr(),{modelPropertyMacro:B,parameterMacro:$,requestInterceptor:U,responseInterceptor:V}=o.getConfigs();try{const o=await s.reduce((async(s,o)=>{let{resultMap:w,specWithCurrentSubtrees:C}=await s;const{errors:z,spec:Y}=await u(C,o,{baseDoc:String(new URL(x.url(),document.baseURI)),modelPropertyMacro:B,parameterMacro:$,requestInterceptor:U,responseInterceptor:V});if(a.allErrors().size&&i.clearBy((s=>"thrown"!==s.get("type")||"resolver"!==s.get("source")||!s.get("fullPath")?.every(((s,i)=>s===o[i]||void 0===o[i])))),Array.isArray(z)&&z.length>0){let s=z.map((s=>(s.line=s.fullPath?j(L,s.fullPath):null,s.path=s.fullPath?s.fullPath.join("."):null,s.level="error",s.type="thrown",s.source="resolver",Object.defineProperty(s,"message",{enumerable:!0,value:s.message}),s)));i.newThrownErrBatch(s)}return Y&&x.isOAS3()&&"components"===o[0]&&"securitySchemes"===o[1]&&await Promise.all(Object.values(Y).filter((s=>"openIdConnect"===s?.type)).map((async s=>{const o={url:s.openIdConnectUrl,requestInterceptor:U,responseInterceptor:V};try{const i=await _(o);i instanceof Error||i.status>=400?console.error(i.statusText+" "+o.url):s.openIdConnectData=JSON.parse(i.text)}catch(s){console.error(s)}}))),co()(w,o,Y),C=uo()(o,Y,C),{resultMap:w,specWithCurrentSubtrees:C}}),Promise.resolve({resultMap:(x.specResolvedSubtree([])||(0,ze.Map)()).toJS(),specWithCurrentSubtrees:x.specJS()}));C.updateResolvedSubtree([],o.resultMap)}catch(s){console.error(s)}}))}),35),requestResolvedSubtree=s=>o=>{Po.find((({path:i,system:a})=>a===o&&i.toString()===s.toString()))||(Po.push({path:s,system:o}),Io())};function changeParam(s,o,i,a,u){return{type:mo,payload:{path:s,value:a,paramName:o,paramIn:i,isXml:u}}}function changeParamByIdentity(s,o,i,a){return{type:mo,payload:{path:s,param:o,value:i,isXml:a}}}const updateResolvedSubtree=(s,o)=>({type:Ao,payload:{path:s,value:o}}),invalidateResolvedSubtreeCache=()=>({type:Ao,payload:{path:[],value:(0,ze.Map)()}}),validateParams=(s,o)=>({type:yo,payload:{pathMethod:s,isOAS3:o}}),updateEmptyParamInclusion=(s,o,i,a)=>({type:go,payload:{pathMethod:s,paramName:o,paramIn:i,includeEmptyValue:a}});function clearValidateParams(s){return{type:xo,payload:{pathMethod:s}}}function changeConsumesValue(s,o){return{type:ko,payload:{path:s,value:o,key:"consumes_value"}}}function changeProducesValue(s,o){return{type:ko,payload:{path:s,value:o,key:"produces_value"}}}const setResponse=(s,o,i)=>({payload:{path:s,method:o,res:i},type:vo}),setRequest=(s,o,i)=>({payload:{path:s,method:o,req:i},type:bo}),setMutatedRequest=(s,o,i)=>({payload:{path:s,method:o,req:i},type:_o}),logRequest=s=>({payload:s,type:So}),executeRequest=s=>({fn:o,specActions:i,specSelectors:a,getConfigs:u,oas3Selectors:_})=>{let{pathName:w,method:x,operation:C}=s,{requestInterceptor:j,responseInterceptor:L}=u(),B=C.toJS();if(C&&C.get("parameters")&&C.get("parameters").filter((s=>s&&!0===s.get("allowEmptyValue"))).forEach((o=>{if(a.parameterInclusionSettingFor([w,x],o.get("name"),o.get("in"))){s.parameters=s.parameters||{};const i=paramToValue(o,s.parameters);(!i||i&&0===i.size)&&(s.parameters[o.get("name")]="")}})),s.contextUrl=Nt()(a.url()).toString(),B&&B.operationId?s.operationId=B.operationId:B&&w&&x&&(s.operationId=o.opId(B,w,x)),a.isOAS3()){const o=`${w}:${x}`;s.server=_.selectedServer(o)||_.selectedServer();const i=_.serverVariables({server:s.server,namespace:o}).toJS(),a=_.serverVariables({server:s.server}).toJS();s.serverVariables=Object.keys(i).length?i:a,s.requestContentType=_.requestContentType(w,x),s.responseContentType=_.responseContentType(w,x)||"*/*";const u=_.requestBodyValue(w,x),C=_.requestBodyInclusionSetting(w,x);u&&u.toJS?s.requestBody=u.map((s=>ze.Map.isMap(s)?s.get("value"):s)).filter(((s,o)=>(Array.isArray(s)?0!==s.length:!isEmptyValue(s))||C.get(o))).toJS():s.requestBody=u}let $=Object.assign({},s);$=o.buildRequest($),i.setRequest(s.pathName,s.method,$);s.requestInterceptor=async o=>{let a=await j.apply(void 0,[o]),u=Object.assign({},a);return i.setMutatedRequest(s.pathName,s.method,u),a},s.responseInterceptor=L;const U=Date.now();return o.execute(s).then((o=>{o.duration=Date.now()-U,i.setResponse(s.pathName,s.method,o)})).catch((o=>{"Failed to fetch"===o.message&&(o.name="",o.message='**Failed to fetch.**  \n**Possible Reasons:** \n  - CORS \n  - Network Failure \n  - URL scheme must be "http" or "https" for CORS request.'),i.setResponse(s.pathName,s.method,{error:!0,err:o})}))},actions_execute=({path:s,method:o,...i}={})=>a=>{let{fn:{fetch:u},specSelectors:_,specActions:w}=a,x=_.specJsonWithResolvedSubtrees().toJS(),C=_.operationScheme(s,o),{requestContentType:j,responseContentType:L}=_.contentTypeValues([s,o]).toJS(),B=/xml/i.test(j),$=_.parameterValues([s,o],B).toJS();return w.executeRequest({...i,fetch:u,spec:x,pathName:s,method:o,parameters:$,requestContentType:j,scheme:C,responseContentType:L})};function clearResponse(s,o){return{type:Eo,payload:{path:s,method:o}}}function clearRequest(s,o){return{type:wo,payload:{path:s,method:o}}}function setScheme(s,o,i){return{type:Co,payload:{scheme:s,path:o,method:i}}}const To={[po]:(s,o)=>"string"==typeof o.payload?s.set("spec",o.payload):s,[ho]:(s,o)=>s.set("url",o.payload+""),[fo]:(s,o)=>s.set("json",fromJSOrdered(o.payload)),[Oo]:(s,o)=>s.setIn(["resolved"],fromJSOrdered(o.payload)),[Ao]:(s,o)=>{const{value:i,path:a}=o.payload;return s.setIn(["resolvedSubtrees",...a],fromJSOrdered(i))},[mo]:(s,{payload:o})=>{let{path:i,paramName:a,paramIn:u,param:_,value:w,isXml:x}=o,C=_?paramToIdentifier(_):`${u}.${a}`;const j=x?"value_xml":"value";return s.setIn(["meta","paths",...i,"parameters",C,j],(0,ze.fromJS)(w))},[go]:(s,{payload:o})=>{let{pathMethod:i,paramName:a,paramIn:u,includeEmptyValue:_}=o;if(!a||!u)return console.warn("Warning: UPDATE_EMPTY_PARAM_INCLUSION could not generate a paramKey."),s;const w=`${u}.${a}`;return s.setIn(["meta","paths",...i,"parameter_inclusions",w],_)},[yo]:(s,{payload:{pathMethod:o,isOAS3:i}})=>{const a=Ns(s).getIn(["paths",...o]),u=parameterValues(s,o).toJS();return s.updateIn(["meta","paths",...o,"parameters"],(0,ze.fromJS)({}),(_=>a.get("parameters",(0,ze.List)()).reduce(((a,_)=>{const w=paramToValue(_,u),x=parameterInclusionSettingFor(s,o,_.get("name"),_.get("in")),C=((s,o,{isOAS3:i=!1,bypassRequiredCheck:a=!1}={})=>{let u=s.get("required"),{schema:_,parameterContentMediaType:w}=getParameterSchema(s,{isOAS3:i});return validateValueBySchema(o,_,u,a,w)})(_,w,{bypassRequiredCheck:x,isOAS3:i});return a.setIn([paramToIdentifier(_),"errors"],(0,ze.fromJS)(C))}),_)))},[xo]:(s,{payload:{pathMethod:o}})=>s.updateIn(["meta","paths",...o,"parameters"],(0,ze.fromJS)([]),(s=>s.map((s=>s.set("errors",(0,ze.fromJS)([])))))),[vo]:(s,{payload:{res:o,path:i,method:a}})=>{let u;u=o.error?Object.assign({error:!0,name:o.err.name,message:o.err.message,statusCode:o.err.statusCode},o.err.response):o,u.headers=u.headers||{};let _=s.setIn(["responses",i,a],fromJSOrdered(u));return lt.Blob&&u.data instanceof lt.Blob&&(_=_.setIn(["responses",i,a,"text"],u.data)),_},[bo]:(s,{payload:{req:o,path:i,method:a}})=>s.setIn(["requests",i,a],fromJSOrdered(o)),[_o]:(s,{payload:{req:o,path:i,method:a}})=>s.setIn(["mutatedRequests",i,a],fromJSOrdered(o)),[ko]:(s,{payload:{path:o,value:i,key:a}})=>{let u=["paths",...o],_=["meta","paths",...o];return s.getIn(["json",...u])||s.getIn(["resolved",...u])||s.getIn(["resolvedSubtrees",...u])?s.setIn([..._,a],(0,ze.fromJS)(i)):s},[Eo]:(s,{payload:{path:o,method:i}})=>s.deleteIn(["responses",o,i]),[wo]:(s,{payload:{path:o,method:i}})=>s.deleteIn(["requests",o,i]),[Co]:(s,{payload:{scheme:o,path:i,method:a}})=>i&&a?s.setIn(["scheme",i,a],o):i||a?void 0:s.setIn(["scheme","_defaultScheme"],o)},wrap_actions_updateSpec=(s,{specActions:o})=>(...i)=>{s(...i),o.parseToJson(...i)},wrap_actions_updateJsonSpec=(s,{specActions:o})=>(...i)=>{s(...i),o.invalidateResolvedSubtreeCache();const[a]=i,u=Cn()(a,["paths"])||{};Object.keys(u).forEach((s=>{const i=Cn()(u,[s]);as()(i)&&i.$ref&&o.requestResolvedSubtree(["paths",s])})),o.requestResolvedSubtree(["components","securitySchemes"])},wrap_actions_executeRequest=(s,{specActions:o})=>i=>(o.logRequest(i),s(i)),wrap_actions_validateParams=(s,{specSelectors:o})=>i=>s(i,o.isOAS3()),plugins_spec=()=>({statePlugins:{spec:{wrapActions:{...Y},reducers:{...To},actions:{...z},selectors:{...V}}}});var No=function(){var extendStatics=function(s,o){return extendStatics=Object.setPrototypeOf||{__proto__:[]}instanceof Array&&function(s,o){s.__proto__=o}||function(s,o){for(var i in o)o.hasOwnProperty(i)&&(s[i]=o[i])},extendStatics(s,o)};return function(s,o){function __(){this.constructor=s}extendStatics(s,o),s.prototype=null===o?Object.create(o):(__.prototype=o.prototype,new __)}}(),Mo=Object.prototype.hasOwnProperty;function module_helpers_hasOwnProperty(s,o){return Mo.call(s,o)}function _objectKeys(s){if(Array.isArray(s)){for(var o=new Array(s.length),i=0;i<o.length;i++)o[i]=""+i;return o}if(Object.keys)return Object.keys(s);var a=[];for(var u in s)module_helpers_hasOwnProperty(s,u)&&a.push(u);return a}function _deepClone(s){switch(typeof s){case"object":return JSON.parse(JSON.stringify(s));case"undefined":return null;default:return s}}function helpers_isInteger(s){for(var o,i=0,a=s.length;i<a;){if(!((o=s.charCodeAt(i))>=48&&o<=57))return!1;i++}return!0}function escapePathComponent(s){return-1===s.indexOf("/")&&-1===s.indexOf("~")?s:s.replace(/~/g,"~0").replace(/\//g,"~1")}function unescapePathComponent(s){return s.replace(/~1/g,"/").replace(/~0/g,"~")}function hasUndefined(s){if(void 0===s)return!0;if(s)if(Array.isArray(s)){for(var o=0,i=s.length;o<i;o++)if(hasUndefined(s[o]))return!0}else if("object"==typeof s)for(var a=_objectKeys(s),u=a.length,_=0;_<u;_++)if(hasUndefined(s[a[_]]))return!0;return!1}function patchErrorMessageFormatter(s,o){var i=[s];for(var a in o){var u="object"==typeof o[a]?JSON.stringify(o[a],null,2):o[a];void 0!==u&&i.push(a+": "+u)}return i.join("\n")}var Ro=function(s){function PatchError(o,i,a,u,_){var w=this.constructor,x=s.call(this,patchErrorMessageFormatter(o,{name:i,index:a,operation:u,tree:_}))||this;return x.name=i,x.index=a,x.operation=u,x.tree=_,Object.setPrototypeOf(x,w.prototype),x.message=patchErrorMessageFormatter(o,{name:i,index:a,operation:u,tree:_}),x}return No(PatchError,s),PatchError}(Error),Do=Ro,Lo=_deepClone,Fo={add:function(s,o,i){return s[o]=this.value,{newDocument:i}},remove:function(s,o,i){var a=s[o];return delete s[o],{newDocument:i,removed:a}},replace:function(s,o,i){var a=s[o];return s[o]=this.value,{newDocument:i,removed:a}},move:function(s,o,i){var a=getValueByPointer(i,this.path);a&&(a=_deepClone(a));var u=applyOperation(i,{op:"remove",path:this.from}).removed;return applyOperation(i,{op:"add",path:this.path,value:u}),{newDocument:i,removed:a}},copy:function(s,o,i){var a=getValueByPointer(i,this.from);return applyOperation(i,{op:"add",path:this.path,value:_deepClone(a)}),{newDocument:i}},test:function(s,o,i){return{newDocument:i,test:_areEquals(s[o],this.value)}},_get:function(s,o,i){return this.value=s[o],{newDocument:i}}},Bo={add:function(s,o,i){return helpers_isInteger(o)?s.splice(o,0,this.value):s[o]=this.value,{newDocument:i,index:o}},remove:function(s,o,i){return{newDocument:i,removed:s.splice(o,1)[0]}},replace:function(s,o,i){var a=s[o];return s[o]=this.value,{newDocument:i,removed:a}},move:Fo.move,copy:Fo.copy,test:Fo.test,_get:Fo._get};function getValueByPointer(s,o){if(""==o)return s;var i={op:"_get",path:o};return applyOperation(s,i),i.value}function applyOperation(s,o,i,a,u,_){if(void 0===i&&(i=!1),void 0===a&&(a=!0),void 0===u&&(u=!0),void 0===_&&(_=0),i&&("function"==typeof i?i(o,0,s,o.path):validator(o,0)),""===o.path){var w={newDocument:s};if("add"===o.op)return w.newDocument=o.value,w;if("replace"===o.op)return w.newDocument=o.value,w.removed=s,w;if("move"===o.op||"copy"===o.op)return w.newDocument=getValueByPointer(s,o.from),"move"===o.op&&(w.removed=s),w;if("test"===o.op){if(w.test=_areEquals(s,o.value),!1===w.test)throw new Do("Test operation failed","TEST_OPERATION_FAILED",_,o,s);return w.newDocument=s,w}if("remove"===o.op)return w.removed=s,w.newDocument=null,w;if("_get"===o.op)return o.value=s,w;if(i)throw new Do("Operation `op` property is not one of operations defined in RFC-6902","OPERATION_OP_INVALID",_,o,s);return w}a||(s=_deepClone(s));var x=(o.path||"").split("/"),C=s,j=1,L=x.length,B=void 0,$=void 0,U=void 0;for(U="function"==typeof i?i:validator;;){if(($=x[j])&&-1!=$.indexOf("~")&&($=unescapePathComponent($)),u&&("__proto__"==$||"prototype"==$&&j>0&&"constructor"==x[j-1]))throw new TypeError("JSON-Patch: modifying `__proto__` or `constructor/prototype` prop is banned for security reasons, if this was on purpose, please set `banPrototypeModifications` flag false and pass it to this function. More info in fast-json-patch README");if(i&&void 0===B&&(void 0===C[$]?B=x.slice(0,j).join("/"):j==L-1&&(B=o.path),void 0!==B&&U(o,0,s,B)),j++,Array.isArray(C)){if("-"===$)$=C.length;else{if(i&&!helpers_isInteger($))throw new Do("Expected an unsigned base-10 integer value, making the new referenced value the array element with the zero-based index","OPERATION_PATH_ILLEGAL_ARRAY_INDEX",_,o,s);helpers_isInteger($)&&($=~~$)}if(j>=L){if(i&&"add"===o.op&&$>C.length)throw new Do("The specified index MUST NOT be greater than the number of elements in the array","OPERATION_VALUE_OUT_OF_BOUNDS",_,o,s);if(!1===(w=Bo[o.op].call(o,C,$,s)).test)throw new Do("Test operation failed","TEST_OPERATION_FAILED",_,o,s);return w}}else if(j>=L){if(!1===(w=Fo[o.op].call(o,C,$,s)).test)throw new Do("Test operation failed","TEST_OPERATION_FAILED",_,o,s);return w}if(C=C[$],i&&j<L&&(!C||"object"!=typeof C))throw new Do("Cannot perform operation at the desired path","OPERATION_PATH_UNRESOLVABLE",_,o,s)}}function applyPatch(s,o,i,a,u){if(void 0===a&&(a=!0),void 0===u&&(u=!0),i&&!Array.isArray(o))throw new Do("Patch sequence must be an array","SEQUENCE_NOT_AN_ARRAY");a||(s=_deepClone(s));for(var _=new Array(o.length),w=0,x=o.length;w<x;w++)_[w]=applyOperation(s,o[w],i,!0,u,w),s=_[w].newDocument;return _.newDocument=s,_}function applyReducer(s,o,i){var a=applyOperation(s,o);if(!1===a.test)throw new Do("Test operation failed","TEST_OPERATION_FAILED",i,o,s);return a.newDocument}function validator(s,o,i,a){if("object"!=typeof s||null===s||Array.isArray(s))throw new Do("Operation is not an object","OPERATION_NOT_AN_OBJECT",o,s,i);if(!Fo[s.op])throw new Do("Operation `op` property is not one of operations defined in RFC-6902","OPERATION_OP_INVALID",o,s,i);if("string"!=typeof s.path)throw new Do("Operation `path` property is not a string","OPERATION_PATH_INVALID",o,s,i);if(0!==s.path.indexOf("/")&&s.path.length>0)throw new Do('Operation `path` property must start with "/"',"OPERATION_PATH_INVALID",o,s,i);if(("move"===s.op||"copy"===s.op)&&"string"!=typeof s.from)throw new Do("Operation `from` property is not present (applicable in `move` and `copy` operations)","OPERATION_FROM_REQUIRED",o,s,i);if(("add"===s.op||"replace"===s.op||"test"===s.op)&&void 0===s.value)throw new Do("Operation `value` property is not present (applicable in `add`, `replace` and `test` operations)","OPERATION_VALUE_REQUIRED",o,s,i);if(("add"===s.op||"replace"===s.op||"test"===s.op)&&hasUndefined(s.value))throw new Do("Operation `value` property is not present (applicable in `add`, `replace` and `test` operations)","OPERATION_VALUE_CANNOT_CONTAIN_UNDEFINED",o,s,i);if(i)if("add"==s.op){var u=s.path.split("/").length,_=a.split("/").length;if(u!==_+1&&u!==_)throw new Do("Cannot perform an `add` operation at the desired path","OPERATION_PATH_CANNOT_ADD",o,s,i)}else if("replace"===s.op||"remove"===s.op||"_get"===s.op){if(s.path!==a)throw new Do("Cannot perform the operation at a path that does not exist","OPERATION_PATH_UNRESOLVABLE",o,s,i)}else if("move"===s.op||"copy"===s.op){var w=validate([{op:"_get",path:s.from,value:void 0}],i);if(w&&"OPERATION_PATH_UNRESOLVABLE"===w.name)throw new Do("Cannot perform the operation from a path that does not exist","OPERATION_FROM_UNRESOLVABLE",o,s,i)}}function validate(s,o,i){try{if(!Array.isArray(s))throw new Do("Patch sequence must be an array","SEQUENCE_NOT_AN_ARRAY");if(o)applyPatch(_deepClone(o),_deepClone(s),i||!0);else{i=i||validator;for(var a=0;a<s.length;a++)i(s[a],a,o,void 0)}}catch(s){if(s instanceof Do)return s;throw s}}function _areEquals(s,o){if(s===o)return!0;if(s&&o&&"object"==typeof s&&"object"==typeof o){var i,a,u,_=Array.isArray(s),w=Array.isArray(o);if(_&&w){if((a=s.length)!=o.length)return!1;for(i=a;0!=i--;)if(!_areEquals(s[i],o[i]))return!1;return!0}if(_!=w)return!1;var x=Object.keys(s);if((a=x.length)!==Object.keys(o).length)return!1;for(i=a;0!=i--;)if(!o.hasOwnProperty(x[i]))return!1;for(i=a;0!=i--;)if(!_areEquals(s[u=x[i]],o[u]))return!1;return!0}return s!=s&&o!=o}var $o=new WeakMap,qo=function qo(s){this.observers=new Map,this.obj=s},Uo=function Uo(s,o){this.callback=s,this.observer=o};function unobserve(s,o){o.unobserve()}function observe(s,o){var i,a=function getMirror(s){return $o.get(s)}(s);if(a){var u=function getObserverFromMirror(s,o){return s.observers.get(o)}(a,o);i=u&&u.observer}else a=new qo(s),$o.set(s,a);if(i)return i;if(i={},a.value=_deepClone(s),o){i.callback=o,i.next=null;var dirtyCheck=function(){generate(i)},fastCheck=function(){clearTimeout(i.next),i.next=setTimeout(dirtyCheck)};"undefined"!=typeof window&&(window.addEventListener("mouseup",fastCheck),window.addEventListener("keyup",fastCheck),window.addEventListener("mousedown",fastCheck),window.addEventListener("keydown",fastCheck),window.addEventListener("change",fastCheck))}return i.patches=[],i.object=s,i.unobserve=function(){generate(i),clearTimeout(i.next),function removeObserverFromMirror(s,o){s.observers.delete(o.callback)}(a,i),"undefined"!=typeof window&&(window.removeEventListener("mouseup",fastCheck),window.removeEventListener("keyup",fastCheck),window.removeEventListener("mousedown",fastCheck),window.removeEventListener("keydown",fastCheck),window.removeEventListener("change",fastCheck))},a.observers.set(o,new Uo(o,i)),i}function generate(s,o){void 0===o&&(o=!1);var i=$o.get(s.object);_generate(i.value,s.object,s.patches,"",o),s.patches.length&&applyPatch(i.value,s.patches);var a=s.patches;return a.length>0&&(s.patches=[],s.callback&&s.callback(a)),a}function _generate(s,o,i,a,u){if(o!==s){"function"==typeof o.toJSON&&(o=o.toJSON());for(var _=_objectKeys(o),w=_objectKeys(s),x=!1,C=w.length-1;C>=0;C--){var j=s[B=w[C]];if(!module_helpers_hasOwnProperty(o,B)||void 0===o[B]&&void 0!==j&&!1===Array.isArray(o))Array.isArray(s)===Array.isArray(o)?(u&&i.push({op:"test",path:a+"/"+escapePathComponent(B),value:_deepClone(j)}),i.push({op:"remove",path:a+"/"+escapePathComponent(B)}),x=!0):(u&&i.push({op:"test",path:a,value:s}),i.push({op:"replace",path:a,value:o}),!0);else{var L=o[B];"object"==typeof j&&null!=j&&"object"==typeof L&&null!=L&&Array.isArray(j)===Array.isArray(L)?_generate(j,L,i,a+"/"+escapePathComponent(B),u):j!==L&&(u&&i.push({op:"test",path:a+"/"+escapePathComponent(B),value:_deepClone(j)}),i.push({op:"replace",path:a+"/"+escapePathComponent(B),value:_deepClone(L)}))}}if(x||_.length!=w.length)for(C=0;C<_.length;C++){var B;module_helpers_hasOwnProperty(s,B=_[C])||void 0===o[B]||i.push({op:"add",path:a+"/"+escapePathComponent(B),value:_deepClone(o[B])})}}}function compare(s,o,i){void 0===i&&(i=!1);var a=[];return _generate(s,o,a,"",i),a}Object.assign({},Z,ee,{JsonPatchError:Ro,deepClone:_deepClone,escapePathComponent,unescapePathComponent});var Vo=__webpack_require__(14744),zo=__webpack_require__.n(Vo);const Wo={add:function add(s,o){return{op:"add",path:s,value:o}},replace,remove:function remove(s){return{op:"remove",path:s}},merge:function lib_merge(s,o){return{type:"mutation",op:"merge",path:s,value:o}},mergeDeep:function mergeDeep(s,o){return{type:"mutation",op:"mergeDeep",path:s,value:o}},context:function context(s,o){return{type:"context",path:s,value:o}},getIn:function lib_getIn(s,o){return o.reduce(((s,o)=>void 0!==o&&s?s[o]:s),s)},applyPatch:function lib_applyPatch(s,o,i){if(i=i||{},"merge"===(o={...o,path:o.path&&normalizeJSONPath(o.path)}).op){const i=getInByJsonPath(s,o.path);Object.assign(i,o.value),applyPatch(s,[replace(o.path,i)])}else if("mergeDeep"===o.op){const i=getInByJsonPath(s,o.path),a=zo()(i,o.value,{customMerge:s=>{if("enum"===s)return(s,o)=>Array.isArray(s)&&Array.isArray(o)?[...new Set([...s,...o])]:zo()(s,o)}});s=applyPatch(s,[replace(o.path,a)]).newDocument}else if("add"===o.op&&""===o.path&&lib_isObject(o.value)){applyPatch(s,Object.keys(o.value).reduce(((s,i)=>(s.push({op:"add",path:`/${normalizeJSONPath(i)}`,value:o.value[i]}),s)),[]))}else if("replace"===o.op&&""===o.path){let{value:a}=o;i.allowMetaPatches&&o.meta&&isAdditiveMutation(o)&&(Array.isArray(o.value)||lib_isObject(o.value))&&(a={...a,...o.meta}),s=a}else if(applyPatch(s,[o]),i.allowMetaPatches&&o.meta&&isAdditiveMutation(o)&&(Array.isArray(o.value)||lib_isObject(o.value))){const i={...getInByJsonPath(s,o.path),...o.meta};applyPatch(s,[replace(o.path,i)])}return s},parentPathMatch:function parentPathMatch(s,o){if(!Array.isArray(o))return!1;for(let i=0,a=o.length;i<a;i+=1)if(o[i]!==s[i])return!1;return!0},flatten,fullyNormalizeArray:function fullyNormalizeArray(s){return cleanArray(flatten(lib_normalizeArray(s)))},normalizeArray:lib_normalizeArray,isPromise:function isPromise(s){return lib_isObject(s)&&lib_isFunction(s.then)},forEachNew:function forEachNew(s,o){try{return forEachNewPatch(s,forEach,o)}catch(s){return s}},forEachNewPrimitive:function forEachNewPrimitive(s,o){try{return forEachNewPatch(s,forEachPrimitive,o)}catch(s){return s}},isJsonPatch,isContextPatch:function isContextPatch(s){return isPatch(s)&&"context"===s.type},isPatch,isMutation,isAdditiveMutation,isGenerator:function isGenerator(s){return"[object GeneratorFunction]"===Object.prototype.toString.call(s)},isFunction:lib_isFunction,isObject:lib_isObject,isError:function lib_isError(s){return s instanceof Error}};function normalizeJSONPath(s){return Array.isArray(s)?s.length<1?"":`/${s.map((s=>(s+"").replace(/~/g,"~0").replace(/\//g,"~1"))).join("/")}`:s}function replace(s,o,i){return{op:"replace",path:s,value:o,meta:i}}function forEachNewPatch(s,o,i){return cleanArray(flatten(s.filter(isAdditiveMutation).map((s=>o(s.value,i,s.path)))||[]))}function forEachPrimitive(s,o,i){return i=i||[],Array.isArray(s)?s.map(((s,a)=>forEachPrimitive(s,o,i.concat(a)))):lib_isObject(s)?Object.keys(s).map((a=>forEachPrimitive(s[a],o,i.concat(a)))):o(s,i[i.length-1],i)}function forEach(s,o,i){let a=[];if((i=i||[]).length>0){const u=o(s,i[i.length-1],i);u&&(a=a.concat(u))}if(Array.isArray(s)){const u=s.map(((s,a)=>forEach(s,o,i.concat(a))));u&&(a=a.concat(u))}else if(lib_isObject(s)){const u=Object.keys(s).map((a=>forEach(s[a],o,i.concat(a))));u&&(a=a.concat(u))}return a=flatten(a),a}function lib_normalizeArray(s){return Array.isArray(s)?s:[s]}function flatten(s){return[].concat(...s.map((s=>Array.isArray(s)?flatten(s):s)))}function cleanArray(s){return s.filter((s=>void 0!==s))}function lib_isObject(s){return s&&"object"==typeof s}function lib_isFunction(s){return s&&"function"==typeof s}function isJsonPatch(s){if(isPatch(s)){const{op:o}=s;return"add"===o||"remove"===o||"replace"===o}return!1}function isMutation(s){return isJsonPatch(s)||isPatch(s)&&"mutation"===s.type}function isAdditiveMutation(s){return isMutation(s)&&("add"===s.op||"replace"===s.op||"merge"===s.op||"mergeDeep"===s.op)}function isPatch(s){return s&&"object"==typeof s}function getInByJsonPath(s,o){try{return getValueByPointer(s,o)}catch(s){return console.error(s),{}}}var Jo=__webpack_require__(48675);const Ho=class ApiDOMAggregateError extends Jo{constructor(s,o,i){if(super(s,o,i),this.name=this.constructor.name,"string"==typeof o&&(this.message=o),"function"==typeof Error.captureStackTrace?Error.captureStackTrace(this,this.constructor):this.stack=new Error(o).stack,null!=i&&"object"==typeof i&&Object.hasOwn(i,"cause")&&!("cause"in this)){const{cause:s}=i;this.cause=s,s instanceof Error&&"stack"in s&&(this.stack=`${this.stack}\nCAUSE: ${s.stack}`)}}};class ApiDOMError extends Error{static[Symbol.hasInstance](s){return super[Symbol.hasInstance](s)||Function.prototype[Symbol.hasInstance].call(Ho,s)}constructor(s,o){if(super(s,o),this.name=this.constructor.name,"string"==typeof s&&(this.message=s),"function"==typeof Error.captureStackTrace?Error.captureStackTrace(this,this.constructor):this.stack=new Error(s).stack,null!=o&&"object"==typeof o&&Object.hasOwn(o,"cause")&&!("cause"in this)){const{cause:s}=o;this.cause=s,s instanceof Error&&"stack"in s&&(this.stack=`${this.stack}\nCAUSE: ${s.stack}`)}}}const Ko=ApiDOMError;const Go=class ApiDOMStructuredError extends Ko{constructor(s,o){if(super(s,o),null!=o&&"object"==typeof o){const{cause:s,...i}=o;Object.assign(this,i)}}};var Yo=__webpack_require__(65606);function _isPlaceholder(s){return null!=s&&"object"==typeof s&&!0===s["@@functional/placeholder"]}function _curry1(s){return function f1(o){return 0===arguments.length||_isPlaceholder(o)?f1:s.apply(this,arguments)}}function _curry2(s){return function f2(o,i){switch(arguments.length){case 0:return f2;case 1:return _isPlaceholder(o)?f2:_curry1((function(i){return s(o,i)}));default:return _isPlaceholder(o)&&_isPlaceholder(i)?f2:_isPlaceholder(o)?_curry1((function(o){return s(o,i)})):_isPlaceholder(i)?_curry1((function(i){return s(o,i)})):s(o,i)}}}function _curry3(s){return function f3(o,i,a){switch(arguments.length){case 0:return f3;case 1:return _isPlaceholder(o)?f3:_curry2((function(i,a){return s(o,i,a)}));case 2:return _isPlaceholder(o)&&_isPlaceholder(i)?f3:_isPlaceholder(o)?_curry2((function(o,a){return s(o,i,a)})):_isPlaceholder(i)?_curry2((function(i,a){return s(o,i,a)})):_curry1((function(a){return s(o,i,a)}));default:return _isPlaceholder(o)&&_isPlaceholder(i)&&_isPlaceholder(a)?f3:_isPlaceholder(o)&&_isPlaceholder(i)?_curry2((function(o,i){return s(o,i,a)})):_isPlaceholder(o)&&_isPlaceholder(a)?_curry2((function(o,a){return s(o,i,a)})):_isPlaceholder(i)&&_isPlaceholder(a)?_curry2((function(i,a){return s(o,i,a)})):_isPlaceholder(o)?_curry1((function(o){return s(o,i,a)})):_isPlaceholder(i)?_curry1((function(i){return s(o,i,a)})):_isPlaceholder(a)?_curry1((function(a){return s(o,i,a)})):s(o,i,a)}}}const Xo=Number.isInteger||function _isInteger(s){return(s|0)===s};function _isString(s){return"[object String]"===Object.prototype.toString.call(s)}function _nth(s,o){var i=s<0?o.length+s:s;return _isString(o)?o.charAt(i):o[i]}function _path(s,o){for(var i=o,a=0;a<s.length;a+=1){if(null==i)return;var u=s[a];i=Xo(u)?_nth(u,i):i[u]}return i}const Qo=_curry3((function pathSatisfies(s,o,i){return s(_path(o,i))}));function _cloneRegExp(s){return new RegExp(s.source,s.flags?s.flags:(s.global?"g":"")+(s.ignoreCase?"i":"")+(s.multiline?"m":"")+(s.sticky?"y":"")+(s.unicode?"u":"")+(s.dotAll?"s":""))}function _arrayFromIterator(s){for(var o,i=[];!(o=s.next()).done;)i.push(o.value);return i}function _includesWith(s,o,i){for(var a=0,u=i.length;a<u;){if(s(o,i[a]))return!0;a+=1}return!1}function _has(s,o){return Object.prototype.hasOwnProperty.call(o,s)}const Zo="function"==typeof Object.is?Object.is:function _objectIs(s,o){return s===o?0!==s||1/s==1/o:s!=s&&o!=o};var _i=Object.prototype.toString;const Ei=function(){return"[object Arguments]"===_i.call(arguments)?function _isArguments(s){return"[object Arguments]"===_i.call(s)}:function _isArguments(s){return _has("callee",s)}}();var Oi=!{toString:null}.propertyIsEnumerable("toString"),Pi=["constructor","valueOf","isPrototypeOf","toString","propertyIsEnumerable","hasOwnProperty","toLocaleString"],Mi=function(){return arguments.propertyIsEnumerable("length")}(),Ri=function contains(s,o){for(var i=0;i<s.length;){if(s[i]===o)return!0;i+=1}return!1},Wi="function"!=typeof Object.keys||Mi?_curry1((function keys(s){if(Object(s)!==s)return[];var o,i,a=[],u=Mi&&Ei(s);for(o in s)!_has(o,s)||u&&"length"===o||(a[a.length]=o);if(Oi)for(i=Pi.length-1;i>=0;)_has(o=Pi[i],s)&&!Ri(a,o)&&(a[a.length]=o),i-=1;return a})):_curry1((function keys(s){return Object(s)!==s?[]:Object.keys(s)}));const ea=Wi;const ra=_curry1((function type(s){return null===s?"Null":void 0===s?"Undefined":Object.prototype.toString.call(s).slice(8,-1)}));function _uniqContentEquals(s,o,i,a){var u=_arrayFromIterator(s);function eq(s,o){return _equals(s,o,i.slice(),a.slice())}return!_includesWith((function(s,o){return!_includesWith(eq,o,s)}),_arrayFromIterator(o),u)}function _equals(s,o,i,a){if(Zo(s,o))return!0;var u=ra(s);if(u!==ra(o))return!1;if("function"==typeof s["fantasy-land/equals"]||"function"==typeof o["fantasy-land/equals"])return"function"==typeof s["fantasy-land/equals"]&&s["fantasy-land/equals"](o)&&"function"==typeof o["fantasy-land/equals"]&&o["fantasy-land/equals"](s);if("function"==typeof s.equals||"function"==typeof o.equals)return"function"==typeof s.equals&&s.equals(o)&&"function"==typeof o.equals&&o.equals(s);switch(u){case"Arguments":case"Array":case"Object":if("function"==typeof s.constructor&&"Promise"===function _functionName(s){var o=String(s).match(/^function (\w*)/);return null==o?"":o[1]}(s.constructor))return s===o;break;case"Boolean":case"Number":case"String":if(typeof s!=typeof o||!Zo(s.valueOf(),o.valueOf()))return!1;break;case"Date":if(!Zo(s.valueOf(),o.valueOf()))return!1;break;case"Error":return s.name===o.name&&s.message===o.message;case"RegExp":if(s.source!==o.source||s.global!==o.global||s.ignoreCase!==o.ignoreCase||s.multiline!==o.multiline||s.sticky!==o.sticky||s.unicode!==o.unicode)return!1}for(var _=i.length-1;_>=0;){if(i[_]===s)return a[_]===o;_-=1}switch(u){case"Map":return s.size===o.size&&_uniqContentEquals(s.entries(),o.entries(),i.concat([s]),a.concat([o]));case"Set":return s.size===o.size&&_uniqContentEquals(s.values(),o.values(),i.concat([s]),a.concat([o]));case"Arguments":case"Array":case"Object":case"Boolean":case"Number":case"String":case"Date":case"Error":case"RegExp":case"Int8Array":case"Uint8Array":case"Uint8ClampedArray":case"Int16Array":case"Uint16Array":case"Int32Array":case"Uint32Array":case"Float32Array":case"Float64Array":case"ArrayBuffer":break;default:return!1}var w=ea(s);if(w.length!==ea(o).length)return!1;var x=i.concat([s]),C=a.concat([o]);for(_=w.length-1;_>=0;){var j=w[_];if(!_has(j,o)||!_equals(o[j],s[j],x,C))return!1;_-=1}return!0}const na=_curry2((function equals(s,o){return _equals(s,o,[],[])}));function _includes(s,o){return function _indexOf(s,o,i){var a,u;if("function"==typeof s.indexOf)switch(typeof o){case"number":if(0===o){for(a=1/o;i<s.length;){if(0===(u=s[i])&&1/u===a)return i;i+=1}return-1}if(o!=o){for(;i<s.length;){if("number"==typeof(u=s[i])&&u!=u)return i;i+=1}return-1}return s.indexOf(o,i);case"string":case"boolean":case"function":case"undefined":return s.indexOf(o,i);case"object":if(null===o)return s.indexOf(o,i)}for(;i<s.length;){if(na(s[i],o))return i;i+=1}return-1}(o,s,0)>=0}function _map(s,o){for(var i=0,a=o.length,u=Array(a);i<a;)u[i]=s(o[i]),i+=1;return u}function _quote(s){return'"'+s.replace(/\\/g,"\\\\").replace(/[\b]/g,"\\b").replace(/\f/g,"\\f").replace(/\n/g,"\\n").replace(/\r/g,"\\r").replace(/\t/g,"\\t").replace(/\v/g,"\\v").replace(/\0/g,"\\0").replace(/"/g,'\\"')+'"'}var ia=function pad(s){return(s<10?"0":"")+s};const aa="function"==typeof Date.prototype.toISOString?function _toISOString(s){return s.toISOString()}:function _toISOString(s){return s.getUTCFullYear()+"-"+ia(s.getUTCMonth()+1)+"-"+ia(s.getUTCDate())+"T"+ia(s.getUTCHours())+":"+ia(s.getUTCMinutes())+":"+ia(s.getUTCSeconds())+"."+(s.getUTCMilliseconds()/1e3).toFixed(3).slice(2,5)+"Z"};function _complement(s){return function(){return!s.apply(this,arguments)}}function _arrayReduce(s,o,i){for(var a=0,u=i.length;a<u;)o=s(o,i[a]),a+=1;return o}const ca=Array.isArray||function _isArray(s){return null!=s&&s.length>=0&&"[object Array]"===Object.prototype.toString.call(s)};function _dispatchable(s,o,i){return function(){if(0===arguments.length)return i();var a=arguments[arguments.length-1];if(!ca(a)){for(var u=0;u<s.length;){if("function"==typeof a[s[u]])return a[s[u]].apply(a,Array.prototype.slice.call(arguments,0,-1));u+=1}if(function _isTransformer(s){return null!=s&&"function"==typeof s["@@transducer/step"]}(a))return o.apply(null,Array.prototype.slice.call(arguments,0,-1))(a)}return i.apply(this,arguments)}}function _isObject(s){return"[object Object]"===Object.prototype.toString.call(s)}const _xfBase_init=function(){return this.xf["@@transducer/init"]()},_xfBase_result=function(s){return this.xf["@@transducer/result"](s)};var la=function(){function XFilter(s,o){this.xf=o,this.f=s}return XFilter.prototype["@@transducer/init"]=_xfBase_init,XFilter.prototype["@@transducer/result"]=_xfBase_result,XFilter.prototype["@@transducer/step"]=function(s,o){return this.f(o)?this.xf["@@transducer/step"](s,o):s},XFilter}();function _xfilter(s){return function(o){return new la(s,o)}}var ua=_curry2(_dispatchable(["fantasy-land/filter","filter"],_xfilter,(function(s,o){return _isObject(o)?_arrayReduce((function(i,a){return s(o[a])&&(i[a]=o[a]),i}),{},ea(o)):function _filter(s,o){for(var i=0,a=o.length,u=[];i<a;)s(o[i])&&(u[u.length]=o[i]),i+=1;return u}(s,o)})));const da=ua;const ma=_curry2((function reject(s,o){return da(_complement(s),o)}));function _toString_toString(s,o){var i=function recur(i){var a=o.concat([s]);return _includes(i,a)?"<Circular>":_toString_toString(i,a)},mapPairs=function(s,o){return _map((function(o){return _quote(o)+": "+i(s[o])}),o.slice().sort())};switch(Object.prototype.toString.call(s)){case"[object Arguments]":return"(function() { return arguments; }("+_map(i,s).join(", ")+"))";case"[object Array]":return"["+_map(i,s).concat(mapPairs(s,ma((function(s){return/^\d+$/.test(s)}),ea(s)))).join(", ")+"]";case"[object Boolean]":return"object"==typeof s?"new Boolean("+i(s.valueOf())+")":s.toString();case"[object Date]":return"new Date("+(isNaN(s.valueOf())?i(NaN):_quote(aa(s)))+")";case"[object Map]":return"new Map("+i(Array.from(s))+")";case"[object Null]":return"null";case"[object Number]":return"object"==typeof s?"new Number("+i(s.valueOf())+")":1/s==-1/0?"-0":s.toString(10);case"[object Set]":return"new Set("+i(Array.from(s).sort())+")";case"[object String]":return"object"==typeof s?"new String("+i(s.valueOf())+")":_quote(s);case"[object Undefined]":return"undefined";default:if("function"==typeof s.toString){var a=s.toString();if("[object Object]"!==a)return a}return"{"+mapPairs(s,ea(s)).join(", ")+"}"}}const ga=_curry1((function toString(s){return _toString_toString(s,[])}));var ya=_curry2((function test(s,o){if(!function _isRegExp(s){return"[object RegExp]"===Object.prototype.toString.call(s)}(s))throw new TypeError("‘test’ requires a value of type RegExp as its first argument; received "+ga(s));return _cloneRegExp(s).test(o)}));const va=ya;function _arity(s,o){switch(s){case 0:return function(){return o.apply(this,arguments)};case 1:return function(s){return o.apply(this,arguments)};case 2:return function(s,i){return o.apply(this,arguments)};case 3:return function(s,i,a){return o.apply(this,arguments)};case 4:return function(s,i,a,u){return o.apply(this,arguments)};case 5:return function(s,i,a,u,_){return o.apply(this,arguments)};case 6:return function(s,i,a,u,_,w){return o.apply(this,arguments)};case 7:return function(s,i,a,u,_,w,x){return o.apply(this,arguments)};case 8:return function(s,i,a,u,_,w,x,C){return o.apply(this,arguments)};case 9:return function(s,i,a,u,_,w,x,C,j){return o.apply(this,arguments)};case 10:return function(s,i,a,u,_,w,x,C,j,L){return o.apply(this,arguments)};default:throw new Error("First argument to _arity must be a non-negative integer no greater than ten")}}function _pipe(s,o){return function(){return o.call(this,s.apply(this,arguments))}}const ba=_curry1((function isArrayLike(s){return!!ca(s)||!!s&&("object"==typeof s&&(!_isString(s)&&(0===s.length||s.length>0&&(s.hasOwnProperty(0)&&s.hasOwnProperty(s.length-1)))))}));var _a="undefined"!=typeof Symbol?Symbol.iterator:"@@iterator";function _createReduce(s,o,i){return function _reduce(a,u,_){if(ba(_))return s(a,u,_);if(null==_)return u;if("function"==typeof _["fantasy-land/reduce"])return o(a,u,_,"fantasy-land/reduce");if(null!=_[_a])return i(a,u,_[_a]());if("function"==typeof _.next)return i(a,u,_);if("function"==typeof _.reduce)return o(a,u,_,"reduce");throw new TypeError("reduce: list must be array or iterable")}}function _xArrayReduce(s,o,i){for(var a=0,u=i.length;a<u;){if((o=s["@@transducer/step"](o,i[a]))&&o["@@transducer/reduced"]){o=o["@@transducer/value"];break}a+=1}return s["@@transducer/result"](o)}const Ea=_curry2((function bind(s,o){return _arity(s.length,(function(){return s.apply(o,arguments)}))}));function _xIterableReduce(s,o,i){for(var a=i.next();!a.done;){if((o=s["@@transducer/step"](o,a.value))&&o["@@transducer/reduced"]){o=o["@@transducer/value"];break}a=i.next()}return s["@@transducer/result"](o)}function _xMethodReduce(s,o,i,a){return s["@@transducer/result"](i[a](Ea(s["@@transducer/step"],s),o))}const wa=_createReduce(_xArrayReduce,_xMethodReduce,_xIterableReduce);var xa=function(){function XWrap(s){this.f=s}return XWrap.prototype["@@transducer/init"]=function(){throw new Error("init not implemented on XWrap")},XWrap.prototype["@@transducer/result"]=function(s){return s},XWrap.prototype["@@transducer/step"]=function(s,o){return this.f(s,o)},XWrap}();function _xwrap(s){return new xa(s)}var ka=_curry3((function(s,o,i){return wa("function"==typeof s?_xwrap(s):s,o,i)}));const Aa=ka;function _checkForMethod(s,o){return function(){var i=arguments.length;if(0===i)return o();var a=arguments[i-1];return ca(a)||"function"!=typeof a[s]?o.apply(this,arguments):a[s].apply(a,Array.prototype.slice.call(arguments,0,i-1))}}var Ca=_curry3(_checkForMethod("slice",(function slice(s,o,i){return Array.prototype.slice.call(i,s,o)})));const ja=Ca;const Ia=_curry1(_checkForMethod("tail",ja(1,1/0)));function pipe(){if(0===arguments.length)throw new Error("pipe requires at least one argument");return _arity(arguments[0].length,Aa(_pipe,arguments[0],Ia(arguments)))}const Na=_curry2((function defaultTo(s,o){return null==o||o!=o?s:o}));const Da=_curry2((function prop(s,o){if(null!=o)return Xo(s)?_nth(s,o):o[s]}));const La=_curry3((function propOr(s,o,i){return Na(s,Da(o,i))}));var Fa=_curry1((function(s){return _nth(-1,s)}));const Ba=Fa;function _curryN(s,o,i){return function(){for(var a=[],u=0,_=s,w=0,x=!1;w<o.length||u<arguments.length;){var C;w<o.length&&(!_isPlaceholder(o[w])||u>=arguments.length)?C=o[w]:(C=arguments[u],u+=1),a[w]=C,_isPlaceholder(C)?x=!0:_-=1,w+=1}return!x&&_<=0?i.apply(this,a):_arity(Math.max(0,_),_curryN(s,a,i))}}const $a=_curry2((function curryN(s,o){return 1===s?_curry1(o):_arity(s,_curryN(s,[],o))}));const za=_curry1((function curry(s){return $a(s.length,s)}));function _isFunction(s){var o=Object.prototype.toString.call(s);return"[object Function]"===o||"[object AsyncFunction]"===o||"[object GeneratorFunction]"===o||"[object AsyncGeneratorFunction]"===o}const Ja=_curry2((function invoker(s,o){return $a(s+1,(function(){var i=arguments[s];if(null!=i&&_isFunction(i[o]))return i[o].apply(i,Array.prototype.slice.call(arguments,0,s));throw new TypeError(ga(i)+' does not have a method named "'+o+'"')}))}));const Ha=Ja(1,"split");function dropLastWhile(s,o){for(var i=o.length-1;i>=0&&s(o[i]);)i-=1;return ja(0,i+1,o)}var Ga=function(){function XDropLastWhile(s,o){this.f=s,this.retained=[],this.xf=o}return XDropLastWhile.prototype["@@transducer/init"]=_xfBase_init,XDropLastWhile.prototype["@@transducer/result"]=function(s){return this.retained=null,this.xf["@@transducer/result"](s)},XDropLastWhile.prototype["@@transducer/step"]=function(s,o){return this.f(o)?this.retain(s,o):this.flush(s,o)},XDropLastWhile.prototype.flush=function(s,o){return s=wa(this.xf,s,this.retained),this.retained=[],this.xf["@@transducer/step"](s,o)},XDropLastWhile.prototype.retain=function(s,o){return this.retained.push(o),s},XDropLastWhile}();function _xdropLastWhile(s){return function(o){return new Ga(s,o)}}const ec=_curry2(_dispatchable([],_xdropLastWhile,dropLastWhile));const rc=Ja(1,"join");const sc=_curry1((function flip(s){return $a(s.length,(function(o,i){var a=Array.prototype.slice.call(arguments,0);return a[0]=i,a[1]=o,s.apply(this,a)}))}))(_curry2(_includes));const oc=za((function(s,o){return pipe(Ha(""),ec(sc(s)),rc(""))(o)}));function _iterableReduce(s,o,i){for(var a=i.next();!a.done;)o=s(o,a.value),a=i.next();return o}function _methodReduce(s,o,i,a){return i[a](s,o)}const ic=_createReduce(_arrayReduce,_methodReduce,_iterableReduce);var ac=function(){function XMap(s,o){this.xf=o,this.f=s}return XMap.prototype["@@transducer/init"]=_xfBase_init,XMap.prototype["@@transducer/result"]=_xfBase_result,XMap.prototype["@@transducer/step"]=function(s,o){return this.xf["@@transducer/step"](s,this.f(o))},XMap}();const cc=_curry2(_dispatchable(["fantasy-land/map","map"],(function _xmap(s){return function(o){return new ac(s,o)}}),(function map(s,o){switch(Object.prototype.toString.call(o)){case"[object Function]":return $a(o.length,(function(){return s.call(this,o.apply(this,arguments))}));case"[object Object]":return _arrayReduce((function(i,a){return i[a]=s(o[a]),i}),{},ea(o));default:return _map(s,o)}})));const lc=_curry2((function ap(s,o){return"function"==typeof o["fantasy-land/ap"]?o["fantasy-land/ap"](s):"function"==typeof s.ap?s.ap(o):"function"==typeof s?function(i){return s(i)(o(i))}:ic((function(s,i){return function _concat(s,o){var i;o=o||[];var a=(s=s||[]).length,u=o.length,_=[];for(i=0;i<a;)_[_.length]=s[i],i+=1;for(i=0;i<u;)_[_.length]=o[i],i+=1;return _}(s,cc(i,o))}),[],s)}));const pc=_curry2((function liftN(s,o){var i=$a(s,o);return $a(s,(function(){return _arrayReduce(lc,cc(i,arguments[0]),Array.prototype.slice.call(arguments,1))}))}));const hc=_curry1((function lift(s){return pc(s.length,s)}));const dc=hc(_curry1((function not(s){return!s})));const fc=_curry1((function always(s){return function(){return s}}));const gc=fc(void 0);const bc=na(gc());const _c=dc(bc);const Ec=_curry2((function max(s,o){if(s===o)return o;function safeMax(s,o){if(s>o!=o>s)return o>s?o:s}var i=safeMax(s,o);if(void 0!==i)return i;var a=safeMax(typeof s,typeof o);if(void 0!==a)return a===typeof s?s:o;var u=ga(s),_=safeMax(u,ga(o));return void 0!==_&&_===u?s:o}));var kc=_curry2((function pluck(s,o){return cc(Da(s),o)}));const Oc=kc;const jc=_curry1((function anyPass(s){return $a(Aa(Ec,0,Oc("length",s)),(function(){for(var o=0,i=s.length;o<i;){if(s[o].apply(this,arguments))return!0;o+=1}return!1}))}));var identical=function(s,o){switch(arguments.length){case 0:return identical;case 1:return function unaryIdentical(o){return 0===arguments.length?unaryIdentical:Zo(s,o)};default:return Zo(s,o)}};const Pc=identical;const Ic=$a(1,pipe(ra,Pc("GeneratorFunction")));const Nc=$a(1,pipe(ra,Pc("AsyncFunction")));const Mc=jc([pipe(ra,Pc("Function")),Ic,Nc]);var Rc=_curry3((function replace(s,o,i){return i.replace(s,o)}));const Lc=Rc;const Fc=$a(1,pipe(ra,Pc("RegExp")));const qc=_curry3((function when(s,o,i){return s(i)?o(i):i}));const Jc=$a(1,pipe(ra,Pc("String")));const Hc=qc(Jc,Lc(/[.*+?^${}()|[\]\\-]/g,"\\$&"));var Kc=function checkValue(s,o){if("string"!=typeof s&&!(s instanceof String))throw TypeError("`".concat(o,"` must be a string"))};const Gc=function replaceAll(s,o,i){!function checkArguments(s,o,i){if(null==i||null==s||null==o)throw TypeError("Input values must not be `null` or `undefined`")}(s,o,i),Kc(i,"str"),Kc(o,"replaceValue"),function checkSearchValue(s){if(!("string"==typeof s||s instanceof String||s instanceof RegExp))throw TypeError("`searchValue` must be a string or an regexp")}(s);var a=new RegExp(Fc(s)?s:Hc(s),"g");return Lc(a,o,i)};var Qc=$a(3,Gc),tl=Ja(2,"replaceAll");const sl=Mc(String.prototype.replaceAll)?tl:Qc,isWindows=()=>Qo(va(/^win/),["platform"],Yo),getProtocol=s=>{try{const o=new URL(s);return oc(":",o.protocol)}catch{return}},ul=(pipe(getProtocol,_c),s=>{if(Yo.browser)return!1;const o=getProtocol(s);return bc(o)||"file"===o||/^[a-zA-Z]$/.test(o)}),isHttpUrl=s=>{const o=getProtocol(s);return"http"===o||"https"===o},toFileSystemPath=(s,o)=>{const i=[/%23/g,"#",/%24/g,"$",/%26/g,"&",/%2C/g,",",/%40/g,"@"],a=La(!1,"keepFileProtocol",o),u=La(isWindows,"isWindows",o);let _=decodeURI(s);for(let s=0;s<i.length;s+=2)_=_.replace(i[s],i[s+1]);let w="file://"===_.substring(0,7).toLowerCase();return w&&(_="/"===_[7]?_.substring(8):_.substring(7),u()&&"/"===_[1]&&(_=`${_[0]}:${_.substring(1)}`),a?_=`file:///${_}`:(w=!1,_=u()?_:`/${_}`)),u()&&!w&&(_=sl("/","\\",_),":\\"===_.substring(1,3)&&(_=_[0].toUpperCase()+_.substring(1))),_},getHash=s=>{const o=s.indexOf("#");return-1!==o?s.substring(o):"#"},stripHash=s=>{const o=s.indexOf("#");let i=s;return o>=0&&(i=s.substring(0,o)),i},url_cwd=()=>{if(Yo.browser)return stripHash(globalThis.location.href);const s=Yo.cwd(),o=Ba(s);return["/","\\"].includes(o)?s:s+(isWindows()?"\\":"/")},resolve=(s,o)=>{const i=new URL(o,new URL(s,"resolve://"));if("resolve:"===i.protocol){const{pathname:s,search:o,hash:a}=i;return s+o+a}return i.toString()},sanitize=s=>{if(ul(s))return(s=>{const o=[/\?/g,"%3F",/#/g,"%23"];let i=s;isWindows()&&(i=i.replace(/\\/g,"/")),i=encodeURI(i);for(let s=0;s<o.length;s+=2)i=i.replace(o[s],o[s+1]);return i})(toFileSystemPath(s));try{return new URL(s).toString()}catch{return encodeURI(decodeURI(s)).replace(/%5B/g,"[").replace(/%5D/g,"]")}},unsanitize=s=>ul(s)?toFileSystemPath(s):decodeURI(s),{fetch:yl,Response:vl,Headers:_l,Request:Sl,FormData:El,File:wl,Blob:xl}=globalThis;function _array_like_to_array(s,o){(null==o||o>s.length)&&(o=s.length);for(var i=0,a=new Array(o);i<o;i++)a[i]=s[i];return a}function legacy_defineProperties(s,o){for(var i=0;i<o.length;i++){var a=o[i];a.enumerable=a.enumerable||!1,a.configurable=!0,"value"in a&&(a.writable=!0),Object.defineProperty(s,a.key,a)}}function _instanceof(s,o){return null!=o&&"undefined"!=typeof Symbol&&o[Symbol.hasInstance]?!!o[Symbol.hasInstance](s):s instanceof o}function _sliced_to_array(s,o){return function _array_with_holes(s){if(Array.isArray(s))return s}(s)||function _iterable_to_array_limit(s,o){var i=null==s?null:"undefined"!=typeof Symbol&&s[Symbol.iterator]||s["@@iterator"];if(null!=i){var a,u,_=[],w=!0,x=!1;try{for(i=i.call(s);!(w=(a=i.next()).done)&&(_.push(a.value),!o||_.length!==o);w=!0);}catch(s){x=!0,u=s}finally{try{w||null==i.return||i.return()}finally{if(x)throw u}}return _}}(s,o)||function _unsupported_iterable_to_array(s,o){if(!s)return;if("string"==typeof s)return _array_like_to_array(s,o);var i=Object.prototype.toString.call(s).slice(8,-1);"Object"===i&&s.constructor&&(i=s.constructor.name);if("Map"===i||"Set"===i)return Array.from(i);if("Arguments"===i||/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(i))return _array_like_to_array(s,o)}(s,o)||function _non_iterable_rest(){throw new TypeError("Invalid attempt to destructure non-iterable instance.\\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.")}()}function _type_of(s){return s&&"undefined"!=typeof Symbol&&s.constructor===Symbol?"symbol":typeof s}void 0===globalThis.fetch&&(globalThis.fetch=yl),void 0===globalThis.Headers&&(globalThis.Headers=_l),void 0===globalThis.Request&&(globalThis.Request=Sl),void 0===globalThis.Response&&(globalThis.Response=vl),void 0===globalThis.FormData&&(globalThis.FormData=El),void 0===globalThis.File&&(globalThis.File=wl),void 0===globalThis.Blob&&(globalThis.Blob=xl);var __typeError=function(s){throw TypeError(s)},__accessCheck=function(s,o,i){return o.has(s)||__typeError("Cannot "+i)},__privateGet=function(s,o,i){return __accessCheck(s,o,"read from private field"),i?i.call(s):o.get(s)},__privateAdd=function(s,o,i){return o.has(s)?__typeError("Cannot add the same private member more than once"):_instanceof(o,WeakSet)?o.add(s):o.set(s,i)},__privateSet=function(s,o,i,a){return __accessCheck(s,o,"write to private field"),a?a.call(s,i):o.set(s,i),i},to_string=function(s){return Object.prototype.toString.call(s)},is_typed_array=function(s){return ArrayBuffer.isView(s)&&!_instanceof(s,DataView)},kl=Array.isArray,Ol=Object.getOwnPropertyDescriptor,Al=Object.prototype.propertyIsEnumerable,Cl=Object.getOwnPropertySymbols,Pl=Object.prototype.hasOwnProperty;function own_enumerable_keys(s){for(var o=Object.keys(s),i=Cl(s),a=0;a<i.length;a++)Al.call(s,i[a])&&o.push(i[a]);return o}function is_writable(s,o){var i;return!(null===(i=Ol(s,o))||void 0===i?void 0:i.writable)}function legacy_copy(s,o){if("object"===(void 0===s?"undefined":_type_of(s))&&null!==s){var i;if(kl(s))i=[];else if("[object Date]"===to_string(s))i=new Date(s.getTime?s.getTime():s);else if(function(s){return"[object RegExp]"===to_string(s)}(s))i=new RegExp(s);else if(function(s){return"[object Error]"===to_string(s)}(s))i={message:s.message};else if(function(s){return"[object Boolean]"===to_string(s)}(s)||function(s){return"[object Number]"===to_string(s)}(s)||function(s){return"[object String]"===to_string(s)}(s))i=Object(s);else{if(is_typed_array(s))return s.slice();i=Object.create(Object.getPrototypeOf(s))}var a=o.includeSymbols?own_enumerable_keys:Object.keys,u=!0,_=!1,w=void 0;try{for(var x,C=a(s)[Symbol.iterator]();!(u=(x=C.next()).done);u=!0){var j=x.value;i[j]=s[j]}}catch(s){_=!0,w=s}finally{try{u||null==C.return||C.return()}finally{if(_)throw w}}return i}return s}var Il,Tl,Nl={includeSymbols:!1,immutable:!1};function walk(s,o){var i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:Nl,a=[],u=[],_=!0,w=i.includeSymbols?own_enumerable_keys:Object.keys,x=!!i.immutable;return function walker(s){var C=x?legacy_copy(s,i):s,j={},L=!0,B={node:C,node_:s,path:[].concat(a),parent:u[u.length-1],parents:u,key:a[a.length-1],isRoot:0===a.length,level:a.length,circular:void 0,isLeaf:!1,notLeaf:!0,notRoot:!0,isFirst:!1,isLast:!1,update:function update(s){var o=arguments.length>1&&void 0!==arguments[1]&&arguments[1];B.isRoot||(B.parent.node[B.key]=s),B.node=s,o&&(L=!1)},delete:function _delete(s){delete B.parent.node[B.key],s&&(L=!1)},remove:function remove(s){kl(B.parent.node)?B.parent.node.splice(B.key,1):delete B.parent.node[B.key],s&&(L=!1)},keys:null,before:function before(s){j.before=s},after:function after(s){j.after=s},pre:function pre(s){j.pre=s},post:function post(s){j.post=s},stop:function stop(){_=!1},block:function block(){L=!1}};if(!_)return B;function update_state(){if("object"===_type_of(B.node)&&null!==B.node){B.keys&&B.node_===B.node||(B.keys=w(B.node)),B.isLeaf=0===B.keys.length;for(var o=0;o<u.length;o++)if(u[o].node_===s){B.circular=u[o];break}}else B.isLeaf=!0,B.keys=null;B.notLeaf=!B.isLeaf,B.notRoot=!B.isRoot}update_state();var $=o.call(B,B.node);if(void 0!==$&&B.update&&B.update($),j.before&&j.before.call(B,B.node),!L)return B;if("object"===_type_of(B.node)&&null!==B.node&&!B.circular){var U;u.push(B),update_state();var V=!0,z=!1,Y=void 0;try{for(var Z,ee=Object.entries(null!==(U=B.keys)&&void 0!==U?U:[])[Symbol.iterator]();!(V=(Z=ee.next()).done);V=!0){var ie,ae=_sliced_to_array(Z.value,2),ce=ae[0],le=ae[1];a.push(le),j.pre&&j.pre.call(B,B.node[le],le);var pe=walker(B.node[le]);x&&Pl.call(B.node,le)&&!is_writable(B.node,le)&&(B.node[le]=pe.node),pe.isLast=!!(null===(ie=B.keys)||void 0===ie?void 0:ie.length)&&+ce==B.keys.length-1,pe.isFirst=0==+ce,j.post&&j.post.call(B,pe),a.pop()}}catch(s){z=!0,Y=s}finally{try{V||null==ee.return||ee.return()}finally{if(z)throw Y}}u.pop()}return j.after&&j.after.call(B,B.node),B}(s).node}var Ml=function(){function Traverse(s){var o=arguments.length>1&&void 0!==arguments[1]?arguments[1]:Nl;!function _class_call_check(s,o){if(!(s instanceof o))throw new TypeError("Cannot call a class as a function")}(this,Traverse),__privateAdd(this,Il),__privateAdd(this,Tl),__privateSet(this,Il,s),__privateSet(this,Tl,o)}return function _create_class(s,o,i){return o&&legacy_defineProperties(s.prototype,o),i&&legacy_defineProperties(s,i),s}(Traverse,[{key:"get",value:function get(s){for(var o=__privateGet(this,Il),i=0;o&&i<s.length;i++){var a=s[i];if(!Pl.call(o,a)||!__privateGet(this,Tl).includeSymbols&&"symbol"===(void 0===a?"undefined":_type_of(a)))return;o=o[a]}return o}},{key:"has",value:function has(s){for(var o=__privateGet(this,Il),i=0;o&&i<s.length;i++){var a=s[i];if(!Pl.call(o,a)||!__privateGet(this,Tl).includeSymbols&&"symbol"===(void 0===a?"undefined":_type_of(a)))return!1;o=o[a]}return!0}},{key:"set",value:function set(s,o){var i=__privateGet(this,Il),a=0;for(a=0;a<s.length-1;a++){var u=s[a];Pl.call(i,u)||(i[u]={}),i=i[u]}return i[s[a]]=o,o}},{key:"map",value:function map(s){return walk(__privateGet(this,Il),s,{immutable:!0,includeSymbols:!!__privateGet(this,Tl).includeSymbols})}},{key:"forEach",value:function forEach(s){return __privateSet(this,Il,walk(__privateGet(this,Il),s,__privateGet(this,Tl))),__privateGet(this,Il)}},{key:"reduce",value:function reduce(s,o){var i=1===arguments.length,a=i?__privateGet(this,Il):o;return this.forEach((function(o){this.isRoot&&i||(a=s.call(this,a,o))})),a}},{key:"paths",value:function paths(){var s=[];return this.forEach((function(){s.push(this.path)})),s}},{key:"nodes",value:function nodes(){var s=[];return this.forEach((function(){s.push(this.node)})),s}},{key:"clone",value:function clone(){var s=[],o=[],i=__privateGet(this,Tl);return is_typed_array(__privateGet(this,Il))?__privateGet(this,Il).slice():function clone(a){for(var u=0;u<s.length;u++)if(s[u]===a)return o[u];if("object"===(void 0===a?"undefined":_type_of(a))&&null!==a){var _=legacy_copy(a,i);s.push(a),o.push(_);var w=i.includeSymbols?own_enumerable_keys:Object.keys,x=!0,C=!1,j=void 0;try{for(var L,B=w(a)[Symbol.iterator]();!(x=(L=B.next()).done);x=!0){var $=L.value;_[$]=clone(a[$])}}catch(s){C=!0,j=s}finally{try{x||null==B.return||B.return()}finally{if(C)throw j}}return s.pop(),o.pop(),_}return a}(__privateGet(this,Il))}}]),Traverse}();Il=new WeakMap,Tl=new WeakMap;var traverse=function(s,o){return new Ml(s,o)};traverse.get=function(s,o,i){return new Ml(s,i).get(o)},traverse.set=function(s,o,i,a){return new Ml(s,a).set(o,i)},traverse.has=function(s,o,i){return new Ml(s,i).has(o)},traverse.map=function(s,o,i){return new Ml(s,i).map(o)},traverse.forEach=function(s,o,i){return new Ml(s,i).forEach(o)},traverse.reduce=function(s,o,i,a){return new Ml(s,a).reduce(o,i)},traverse.paths=function(s,o){return new Ml(s,o).paths()},traverse.nodes=function(s,o){return new Ml(s,o).nodes()},traverse.clone=function(s,o){return new Ml(s,o).clone()};var Rl=traverse;const Dl="application/json, application/yaml",Ll="https://swagger.io",Fl=Object.freeze({url:"/"}),Bl=3e3,$l=["properties"],Ul=["properties"],Vl=["definitions","parameters","responses","securityDefinitions","components/schemas","components/responses","components/parameters","components/securitySchemes"],zl=["schema/example","items/example"];function isFreelyNamed(s){const o=s[s.length-1],i=s[s.length-2],a=s.join("/");return $l.indexOf(o)>-1&&-1===Ul.indexOf(i)||Vl.indexOf(a)>-1||zl.some((s=>a.indexOf(s)>-1))}function absolutifyPointer(s,o){const[i,a]=s.split("#"),u=null!=o?o:"",_=null!=i?i:"";let w;if(isHttpUrl(u))w=resolve(u,_);else{const s=resolve(Ll,u),o=resolve(s,_).replace(Ll,"");w=_.startsWith("/")?o:o.substring(1)}return a?`${w}#${a}`:w}const Wl=/^([a-z]+:\/\/|\/\/)/i;class JSONRefError extends Go{}const Jl={},Hl=new WeakMap,Kl=[s=>"paths"===s[0]&&"responses"===s[3]&&"examples"===s[5],s=>"paths"===s[0]&&"responses"===s[3]&&"content"===s[5]&&"example"===s[7],s=>"paths"===s[0]&&"responses"===s[3]&&"content"===s[5]&&"examples"===s[7]&&"value"===s[9],s=>"paths"===s[0]&&"requestBody"===s[3]&&"content"===s[4]&&"example"===s[6],s=>"paths"===s[0]&&"requestBody"===s[3]&&"content"===s[4]&&"examples"===s[6]&&"value"===s[8],s=>"paths"===s[0]&&"parameters"===s[2]&&"example"===s[4],s=>"paths"===s[0]&&"parameters"===s[3]&&"example"===s[5],s=>"paths"===s[0]&&"parameters"===s[2]&&"examples"===s[4]&&"value"===s[6],s=>"paths"===s[0]&&"parameters"===s[3]&&"examples"===s[5]&&"value"===s[7],s=>"paths"===s[0]&&"parameters"===s[2]&&"content"===s[4]&&"example"===s[6],s=>"paths"===s[0]&&"parameters"===s[2]&&"content"===s[4]&&"examples"===s[6]&&"value"===s[8],s=>"paths"===s[0]&&"parameters"===s[3]&&"content"===s[4]&&"example"===s[7],s=>"paths"===s[0]&&"parameters"===s[3]&&"content"===s[5]&&"examples"===s[7]&&"value"===s[9]],Gl={key:"$ref",plugin:(s,o,i,a)=>{const u=a.getInstance(),_=i.slice(0,-1);if(isFreelyNamed(_)||(s=>Kl.some((o=>o(s))))(_))return;const{baseDoc:w}=a.getContext(i);if("string"!=typeof s)return new JSONRefError("$ref: must be a string (JSON-Ref)",{$ref:s,baseDoc:w,fullPath:i});const x=refs_split(s),C=x[0],j=x[1]||"";let L,B,$;try{L=w||C?absoluteify(C,w):null}catch(o){return wrapError(o,{pointer:j,$ref:s,basePath:L,fullPath:i})}if(function pointerAlreadyInPath(s,o,i,a){let u=Hl.get(a);u||(u={},Hl.set(a,u));const _=function arrayToJsonPointer(s){if(0===s.length)return"";return`/${s.map(escapeJsonPointerToken).join("/")}`}(i),w=`${o||"<specmap-base>"}#${s}`,x=_.replace(/allOf\/\d+\/?/g,""),C=a.contextTree.get([]).baseDoc;if(o===C&&pointerIsAParent(x,s))return!0;let j="";const L=i.some((s=>(j=`${j}/${escapeJsonPointerToken(s)}`,u[j]&&u[j].some((s=>pointerIsAParent(s,w)||pointerIsAParent(w,s))))));if(L)return!0;return void(u[x]=(u[x]||[]).concat(w))}(j,L,_,a)&&!u.useCircularStructures){const o=absolutifyPointer(s,L);return s===o?null:Wo.replace(i,o)}if(null==L?($=jsonPointerToArray(j),B=a.get($),void 0===B&&(B=new JSONRefError(`Could not resolve reference: ${s}`,{pointer:j,$ref:s,baseDoc:w,fullPath:i}))):(B=extractFromDoc(L,j),B=null!=B.__value?B.__value:B.catch((o=>{throw wrapError(o,{pointer:j,$ref:s,baseDoc:w,fullPath:i})}))),B instanceof Error)return[Wo.remove(i),B];const U=absolutifyPointer(s,L),V=Wo.replace(_,B,{$$ref:U});if(L&&L!==w)return[V,Wo.context(_,{baseDoc:L})];try{if(!function patchValueAlreadyInPath(s,o){const i=[s];return o.path.reduce(((s,o)=>(i.push(s[o]),s[o])),s),pointToAncestor(o.value);function pointToAncestor(s){return Wo.isObject(s)&&(i.indexOf(s)>=0||Object.keys(s).some((o=>pointToAncestor(s[o]))))}}(a.state,V)||u.useCircularStructures)return V}catch(s){return null}}},Yl=Object.assign(Gl,{docCache:Jl,absoluteify,clearCache:function clearCache(s){void 0!==s?delete Jl[s]:Object.keys(Jl).forEach((s=>{delete Jl[s]}))},JSONRefError,wrapError,getDoc,split:refs_split,extractFromDoc,fetchJSON:function fetchJSON(s){return fetch(s,{headers:{Accept:Dl},loadSpec:!0}).then((s=>s.text())).then((s=>fn.load(s)))},extract,jsonPointerToArray,unescapeJsonPointerToken}),Xl=Yl;function absoluteify(s,o){if(!Wl.test(s)){if(!o)throw new JSONRefError(`Tried to resolve a relative URL, without having a basePath. path: '${s}' basePath: '${o}'`);return resolve(o,s)}return s}function wrapError(s,o){let i;return i=s&&s.response&&s.response.body?`${s.response.body.code} ${s.response.body.message}`:s.message,new JSONRefError(`Could not resolve reference: ${i}`,{...o,cause:s})}function refs_split(s){return(s+"").split("#")}function extractFromDoc(s,o){const i=Jl[s];if(i&&!Wo.isPromise(i))try{const s=extract(o,i);return Object.assign(Promise.resolve(s),{__value:s})}catch(s){return Promise.reject(s)}return getDoc(s).then((s=>extract(o,s)))}function getDoc(s){const o=Jl[s];return o?Wo.isPromise(o)?o:Promise.resolve(o):(Jl[s]=Yl.fetchJSON(s).then((o=>(Jl[s]=o,o))),Jl[s])}function extract(s,o){const i=jsonPointerToArray(s);if(i.length<1)return o;const a=Wo.getIn(o,i);if(void 0===a)throw new JSONRefError(`Could not resolve pointer: ${s} does not exist in document`,{pointer:s});return a}function jsonPointerToArray(s){if("string"!=typeof s)throw new TypeError("Expected a string, got a "+typeof s);return"/"===s[0]&&(s=s.substr(1)),""===s?[]:s.split("/").map(unescapeJsonPointerToken)}function unescapeJsonPointerToken(s){if("string"!=typeof s)return s;return new URLSearchParams(`=${s.replace(/~1/g,"/").replace(/~0/g,"~")}`).get("")}function escapeJsonPointerToken(s){return new URLSearchParams([["",s.replace(/~/g,"~0").replace(/\//g,"~1")]]).toString().slice(1)}const pointerBoundaryChar=s=>!s||"/"===s||"#"===s;function pointerIsAParent(s,o){if(pointerBoundaryChar(o))return!0;const i=s.charAt(o.length),a=o.slice(-1);return 0===s.indexOf(o)&&(!i||"/"===i||"#"===i)&&"#"!==a}const Ql={key:"allOf",plugin:(s,o,i,a,u)=>{if(u.meta&&u.meta.$$ref)return;const _=i.slice(0,-1);if(isFreelyNamed(_))return;if(!Array.isArray(s)){const s=new TypeError("allOf must be an array");return s.fullPath=i,s}let w=!1,x=u.value;if(_.forEach((s=>{x&&(x=x[s])})),x={...x},0===Object.keys(x).length)return;delete x.allOf;const C=[];return C.push(a.replace(_,{})),s.forEach(((s,o)=>{if(!a.isObject(s)){if(w)return null;w=!0;const s=new TypeError("Elements in allOf must be objects");return s.fullPath=i,C.push(s)}C.push(a.mergeDeep(_,s));const u=function generateAbsoluteRefPatches(s,o,{specmap:i,getBaseUrlForNodePath:a=s=>i.getContext([...o,...s]).baseDoc,targetKeys:u=["$ref","$$ref"]}={}){const _=[];return Rl(s).forEach((function callback(){if(u.includes(this.key)&&"string"==typeof this.node){const s=this.path,u=o.concat(this.path),w=absolutifyPointer(this.node,a(s));_.push(i.replace(u,w))}})),_}(s,i.slice(0,-1),{getBaseUrlForNodePath:s=>a.getContext([...i,o,...s]).baseDoc,specmap:a});C.push(...u)})),x.example&&C.push(a.remove([].concat(_,"example"))),C.push(a.mergeDeep(_,x)),x.$$ref||C.push(a.remove([].concat(_,"$$ref"))),C}},Zl={key:"parameters",plugin:(s,o,i,a)=>{if(Array.isArray(s)&&s.length){const o=Object.assign([],s),u=i.slice(0,-1),_={...Wo.getIn(a.spec,u)};for(let u=0;u<s.length;u+=1){const w=s[u];try{o[u].default=a.parameterMacro(_,w)}catch(s){const o=new Error(s);return o.fullPath=i,o}}return Wo.replace(i,o)}return Wo.replace(i,s)}},eu={key:"properties",plugin:(s,o,i,a)=>{const u={...s};for(const o in s)try{u[o].default=a.modelPropertyMacro(u[o])}catch(s){const o=new Error(s);return o.fullPath=i,o}return Wo.replace(i,u)}};class ContextTree{constructor(s){this.root=context_tree_createNode(s||{})}set(s,o){const i=this.getParent(s,!0);if(!i)return void context_tree_updateNode(this.root,o,null);const a=s[s.length-1],{children:u}=i;u[a]?context_tree_updateNode(u[a],o,i):u[a]=context_tree_createNode(o,i)}get(s){if((s=s||[]).length<1)return this.root.value;let o,i,a=this.root;for(let u=0;u<s.length&&(i=s[u],o=a.children,o[i]);u+=1)a=o[i];return a&&a.protoValue}getParent(s,o){return!s||s.length<1?null:s.length<2?this.root:s.slice(0,-1).reduce(((s,i)=>{if(!s)return s;const{children:a}=s;return!a[i]&&o&&(a[i]=context_tree_createNode(null,s)),a[i]}),this.root)}}function context_tree_createNode(s,o){return context_tree_updateNode({children:{}},s,o)}function context_tree_updateNode(s,o,i){return s.value=o||{},s.protoValue=i?{...i.protoValue,...s.value}:s.value,Object.keys(s.children).forEach((o=>{const i=s.children[o];s.children[o]=context_tree_updateNode(i,i.value,s)})),s}const specmap_noop=()=>{};class SpecMap{static getPluginName(s){return s.pluginName}static getPatchesOfType(s,o){return s.filter(o)}constructor(s){Object.assign(this,{spec:"",debugLevel:"info",plugins:[],pluginHistory:{},errors:[],mutations:[],promisedPatches:[],state:{},patches:[],context:{},contextTree:new ContextTree,showDebug:!1,allPatches:[],pluginProp:"specMap",libMethods:Object.assign(Object.create(this),Wo,{getInstance:()=>this}),allowMetaPatches:!1},s),this.get=this._get.bind(this),this.getContext=this._getContext.bind(this),this.hasRun=this._hasRun.bind(this),this.wrappedPlugins=this.plugins.map(this.wrapPlugin.bind(this)).filter(Wo.isFunction),this.patches.push(Wo.add([],this.spec)),this.patches.push(Wo.context([],this.context)),this.updatePatches(this.patches)}debug(s,...o){this.debugLevel===s&&console.log(...o)}verbose(s,...o){"verbose"===this.debugLevel&&console.log(`[${s}]   `,...o)}wrapPlugin(s,o){const{pathDiscriminator:i}=this;let a,u=null;return s[this.pluginProp]?(u=s,a=s[this.pluginProp]):Wo.isFunction(s)?a=s:Wo.isObject(s)&&(a=function createKeyBasedPlugin(s){const isSubPath=(s,o)=>!Array.isArray(s)||s.every(((s,i)=>s===o[i]));return function*generator(o,a){const u={};for(const[s,i]of o.filter(Wo.isAdditiveMutation).entries()){if(!(s<Bl))return;yield*traverse(i.value,i.path,i)}function*traverse(o,_,w){if(Wo.isObject(o)){const x=_.length-1,C=_[x],j=_.indexOf("properties"),L="properties"===C&&x===j,B=a.allowMetaPatches&&u[o.$$ref];for(const x of Object.keys(o)){const C=o[x],j=_.concat(x),$=Wo.isObject(C),U=o.$$ref;if(B||$&&(a.allowMetaPatches&&U&&isSubPath(i,j)&&(u[U]=!0),yield*traverse(C,j,w)),!L&&x===s.key){const o=isSubPath(i,_);i&&!o||(yield s.plugin(C,x,j,a,w))}}}else s.key===_[_.length-1]&&(yield s.plugin(o,s.key,_,a))}}}(s)),Object.assign(a.bind(u),{pluginName:s.name||o,isGenerator:Wo.isGenerator(a)})}nextPlugin(){return this.wrappedPlugins.find((s=>this.getMutationsForPlugin(s).length>0))}nextPromisedPatch(){if(this.promisedPatches.length>0)return Promise.race(this.promisedPatches.map((s=>s.value)))}getPluginHistory(s){const o=this.constructor.getPluginName(s);return this.pluginHistory[o]||[]}getPluginRunCount(s){return this.getPluginHistory(s).length}getPluginHistoryTip(s){const o=this.getPluginHistory(s);return o&&o[o.length-1]||{}}getPluginMutationIndex(s){const o=this.getPluginHistoryTip(s).mutationIndex;return"number"!=typeof o?-1:o}updatePluginHistory(s,o){const i=this.constructor.getPluginName(s);this.pluginHistory[i]=this.pluginHistory[i]||[],this.pluginHistory[i].push(o)}updatePatches(s){Wo.normalizeArray(s).forEach((s=>{if(s instanceof Error)this.errors.push(s);else try{if(!Wo.isObject(s))return void this.debug("updatePatches","Got a non-object patch",s);if(this.showDebug&&this.allPatches.push(s),Wo.isPromise(s.value))return this.promisedPatches.push(s),void this.promisedPatchThen(s);if(Wo.isContextPatch(s))return void this.setContext(s.path,s.value);Wo.isMutation(s)&&this.updateMutations(s)}catch(s){console.error(s),this.errors.push(s)}}))}updateMutations(s){"object"==typeof s.value&&!Array.isArray(s.value)&&this.allowMetaPatches&&(s.value={...s.value});const o=Wo.applyPatch(this.state,s,{allowMetaPatches:this.allowMetaPatches});o&&(this.mutations.push(s),this.state=o)}removePromisedPatch(s){const o=this.promisedPatches.indexOf(s);o<0?this.debug("Tried to remove a promisedPatch that isn't there!"):this.promisedPatches.splice(o,1)}promisedPatchThen(s){return s.value=s.value.then((o=>{const i={...s,value:o};this.removePromisedPatch(s),this.updatePatches(i)})).catch((o=>{this.removePromisedPatch(s),this.updatePatches(o)})),s.value}getMutations(s,o){return s=s||0,"number"!=typeof o&&(o=this.mutations.length),this.mutations.slice(s,o)}getCurrentMutations(){return this.getMutationsForPlugin(this.getCurrentPlugin())}getMutationsForPlugin(s){const o=this.getPluginMutationIndex(s);return this.getMutations(o+1)}getCurrentPlugin(){return this.currentPlugin}getLib(){return this.libMethods}_get(s){return Wo.getIn(this.state,s)}_getContext(s){return this.contextTree.get(s)}setContext(s,o){return this.contextTree.set(s,o)}_hasRun(s){return this.getPluginRunCount(this.getCurrentPlugin())>(s||0)}dispatch(){const s=this,o=this.nextPlugin();if(!o){const s=this.nextPromisedPatch();if(s)return s.then((()=>this.dispatch())).catch((()=>this.dispatch()));const o={spec:this.state,errors:this.errors};return this.showDebug&&(o.patches=this.allPatches),Promise.resolve(o)}if(s.pluginCount=s.pluginCount||new WeakMap,s.pluginCount.set(o,(s.pluginCount.get(o)||0)+1),s.pluginCount[o]>100)return Promise.resolve({spec:s.state,errors:s.errors.concat(new Error("We've reached a hard limit of 100 plugin runs"))});if(o!==this.currentPlugin&&this.promisedPatches.length){const s=this.promisedPatches.map((s=>s.value));return Promise.all(s.map((s=>s.then(specmap_noop,specmap_noop)))).then((()=>this.dispatch()))}return function executePlugin(){s.currentPlugin=o;const i=s.getCurrentMutations(),a=s.mutations.length-1;try{if(o.isGenerator)for(const a of o(i,s.getLib()))updatePatches(a);else{updatePatches(o(i,s.getLib()))}}catch(s){console.error(s),updatePatches([Object.assign(Object.create(s),{plugin:o})])}finally{s.updatePluginHistory(o,{mutationIndex:a})}return s.dispatch()}();function updatePatches(i){i&&(i=Wo.fullyNormalizeArray(i),s.updatePatches(i,o))}}}const tu={refs:Xl,allOf:Ql,parameters:Zl,properties:eu};function makeFetchJSON(s,o={}){const{requestInterceptor:i,responseInterceptor:a}=o,u=s.withCredentials?"include":"same-origin";return o=>s({url:o,loadSpec:!0,requestInterceptor:i,responseInterceptor:a,headers:{Accept:Dl},credentials:u}).then((s=>s.body))}function isFile(s,o){return o||"undefined"==typeof navigator||(o=navigator),o&&"ReactNative"===o.product?!(!s||"object"!=typeof s||"string"!=typeof s.uri):"undefined"!=typeof File&&s instanceof File||("undefined"!=typeof Blob&&s instanceof Blob||(!!ArrayBuffer.isView(s)||null!==s&&"object"==typeof s&&"function"==typeof s.pipe))}function isArrayOfFile(s,o){return Array.isArray(s)&&s.some((s=>isFile(s,o)))}class FileWithData extends File{constructor(s,o="",i={}){super([s],o,i),this.data=s}valueOf(){return this.data}toString(){return this.valueOf()}}const isRfc3986Reserved=s=>":/?#[]@!$&'()*+,;=".indexOf(s)>-1,isRfc3986Unreserved=s=>/^[a-z0-9\-._~]+$/i.test(s);function encodeCharacters(s,o="reserved"){return[...s].map((s=>{if(isRfc3986Unreserved(s))return s;if(isRfc3986Reserved(s)&&"unsafe"===o)return s;const i=new TextEncoder;return Array.from(i.encode(s)).map((s=>`0${s.toString(16).toUpperCase()}`.slice(-2))).map((s=>`%${s}`)).join("")})).join("")}function stylize(s){const{value:o}=s;return Array.isArray(o)?function encodeArray({key:s,value:o,style:i,explode:a,escape:u}){if("simple"===i)return o.map((s=>valueEncoder(s,u))).join(",");if("label"===i)return`.${o.map((s=>valueEncoder(s,u))).join(".")}`;if("matrix"===i)return o.map((s=>valueEncoder(s,u))).reduce(((o,i)=>!o||a?`${o||""};${s}=${i}`:`${o},${i}`),"");if("form"===i){const i=a?`&${s}=`:",";return o.map((s=>valueEncoder(s,u))).join(i)}if("spaceDelimited"===i){const i=a?`${s}=`:"";return o.map((s=>valueEncoder(s,u))).join(` ${i}`)}if("pipeDelimited"===i){const i=a?`${s}=`:"";return o.map((s=>valueEncoder(s,u))).join(`|${i}`)}return}(s):"object"==typeof o?function encodeObject({key:s,value:o,style:i,explode:a,escape:u}){const _=Object.keys(o);if("simple"===i)return _.reduce(((s,i)=>{const _=valueEncoder(o[i],u);return`${s?`${s},`:""}${i}${a?"=":","}${_}`}),"");if("label"===i)return _.reduce(((s,i)=>{const _=valueEncoder(o[i],u);return`${s?`${s}.`:"."}${i}${a?"=":"."}${_}`}),"");if("matrix"===i&&a)return _.reduce(((s,i)=>`${s?`${s};`:";"}${i}=${valueEncoder(o[i],u)}`),"");if("matrix"===i)return _.reduce(((i,a)=>{const _=valueEncoder(o[a],u);return`${i?`${i},`:`;${s}=`}${a},${_}`}),"");if("form"===i)return _.reduce(((s,i)=>{const _=valueEncoder(o[i],u);return`${s?`${s}${a?"&":","}`:""}${i}${a?"=":","}${_}`}),"");return}(s):function encodePrimitive({key:s,value:o,style:i,escape:a}){if("simple"===i)return valueEncoder(o,a);if("label"===i)return`.${valueEncoder(o,a)}`;if("matrix"===i)return`;${s}=${valueEncoder(o,a)}`;if("form"===i)return valueEncoder(o,a);if("deepObject"===i)return valueEncoder(o,a);return}(s)}function valueEncoder(s,o=!1){return Array.isArray(s)||null!==s&&"object"==typeof s?s=JSON.stringify(s):"number"!=typeof s&&"boolean"!=typeof s||(s=String(s)),o&&"string"==typeof s&&s.length>0?encodeCharacters(s,o):null!=s?s:""}const ru={form:",",spaceDelimited:"%20",pipeDelimited:"|"},nu={csv:",",ssv:"%20",tsv:"%09",pipes:"|"};function formatKeyValue(s,o,i=!1){const{collectionFormat:a,allowEmptyValue:u,serializationOption:_,encoding:w}=o,x="object"!=typeof o||Array.isArray(o)?o:o.value,C=i?s=>s.toString():s=>encodeURIComponent(s),j=C(s);if(void 0===x&&u)return[[j,""]];if(isFile(x)||isArrayOfFile(x))return[[j,x]];if(_)return formatKeyValueBySerializationOption(s,x,i,_);if(w){if([typeof w.style,typeof w.explode,typeof w.allowReserved].some((s=>"undefined"!==s))){const{style:o,explode:a,allowReserved:u}=w;return formatKeyValueBySerializationOption(s,x,i,{style:o,explode:a,allowReserved:u})}if("string"==typeof w.contentType){if(w.contentType.startsWith("application/json")){const s=C("string"==typeof x?x:JSON.stringify(x));return[[j,new FileWithData(s,"blob",{type:w.contentType})]]}const s=C(String(x));return[[j,new FileWithData(s,"blob",{type:w.contentType})]]}return"object"!=typeof x?[[j,C(x)]]:Array.isArray(x)&&x.every((s=>"object"!=typeof s))?[[j,x.map(C).join(",")]]:[[j,C(JSON.stringify(x))]]}return"object"!=typeof x?[[j,C(x)]]:Array.isArray(x)?"multi"===a?[[j,x.map(C)]]:[[j,x.map(C).join(nu[a||"csv"])]]:[[j,""]]}function formatKeyValueBySerializationOption(s,o,i,a){const u=a.style||"form",_=void 0===a.explode?"form"===u:a.explode,w=!i&&(a&&a.allowReserved?"unsafe":"reserved"),encodeFn=s=>valueEncoder(s,w),x=i?s=>s:s=>encodeFn(s);return"object"!=typeof o?[[x(s),encodeFn(o)]]:Array.isArray(o)?_?[[x(s),o.map(encodeFn)]]:[[x(s),o.map(encodeFn).join(ru[u])]]:"deepObject"===u?Object.keys(o).map((i=>[x(`${s}[${i}]`),encodeFn(o[i])])):_?Object.keys(o).map((s=>[x(s),encodeFn(o[s])])):[[x(s),Object.keys(o).map((s=>[`${x(s)},${encodeFn(o[s])}`])).join(",")]]}function encodeFormOrQuery(s){return((s,{encode:o=!0}={})=>{const buildNestedParams=(s,o,i)=>(Array.isArray(i)?i.reduce(((i,a)=>buildNestedParams(s,o,a)),s):i instanceof Date?s.append(o,i.toISOString()):"object"==typeof i?Object.entries(i).reduce(((i,[a,u])=>buildNestedParams(s,`${o}[${a}]`,u)),s):s.append(o,i),s),i=Object.entries(s).reduce(((s,[o,i])=>buildNestedParams(s,o,i)),new URLSearchParams),a=String(i);return o?a:decodeURIComponent(a)})(Object.keys(s).reduce(((o,i)=>{for(const[a,u]of formatKeyValue(i,s[i]))o[a]=u instanceof FileWithData?u.valueOf():u;return o}),{}),{encode:!1})}function serializeRequest(s={}){const{url:o="",query:i,form:a}=s;if(a){const o=Object.keys(a).some((s=>{const{value:o}=a[s];return isFile(o)||isArrayOfFile(o)})),i=s.headers["content-type"]||s.headers["Content-Type"];if(o||/multipart\/form-data/i.test(i)){const o=function request_buildFormData(s){return Object.entries(s).reduce(((s,[o,i])=>{for(const[a,u]of formatKeyValue(o,i,!0))if(Array.isArray(u))for(const o of u)if(ArrayBuffer.isView(o)){const i=new Blob([o]);s.append(a,i)}else s.append(a,o);else if(ArrayBuffer.isView(u)){const o=new Blob([u]);s.append(a,o)}else s.append(a,u);return s}),new FormData)}(s.form);s.formdata=o,s.body=o}else s.body=encodeFormOrQuery(a);delete s.form}if(i){const[a,u]=o.split("?");let _="";if(u){const s=new URLSearchParams(u);Object.keys(i).forEach((o=>s.delete(o))),_=String(s)}const w=((...s)=>{const o=s.filter((s=>s)).join("&");return o?`?${o}`:""})(_,encodeFormOrQuery(i));s.url=a+w,delete s.query}return s}function serializeHeaders(s={}){return"function"!=typeof s.entries?{}:Array.from(s.entries()).reduce(((s,[o,i])=>(s[o]=function serializeHeaderValue(s){return s.includes(", ")?s.split(", "):s}(i),s)),{})}function serializeResponse(s,o,{loadSpec:i=!1}={}){const a={ok:s.ok,url:s.url||o,status:s.status,statusText:s.statusText,headers:serializeHeaders(s.headers)},u=a.headers["content-type"],_=i||((s="")=>/(json|xml|yaml|text)\b/.test(s))(u);return(_?s.text:s.blob||s.buffer).call(s).then((s=>{if(a.text=s,a.data=s,_)try{const o=function parseBody(s,o){if(o){if(0===o.indexOf("application/json")||o.indexOf("+json")>0)return JSON.parse(s);if(0===o.indexOf("application/xml")||o.indexOf("+xml")>0)return s}return fn.load(s)}(s,u);a.body=o,a.obj=o}catch(s){a.parseError=s}return a}))}async function http_http(s,o={}){"object"==typeof s&&(s=(o=s).url),o.headers=o.headers||{},(o=serializeRequest(o)).headers&&Object.keys(o.headers).forEach((s=>{const i=o.headers[s];"string"==typeof i&&(o.headers[s]=i.replace(/\n+/g," "))})),o.requestInterceptor&&(o=await o.requestInterceptor(o)||o);const i=o.headers["content-type"]||o.headers["Content-Type"];let a;/multipart\/form-data/i.test(i)&&(delete o.headers["content-type"],delete o.headers["Content-Type"]);try{a=await(o.userFetch||fetch)(o.url,o),a=await serializeResponse(a,s,o),o.responseInterceptor&&(a=await o.responseInterceptor(a)||a)}catch(s){if(!a)throw s;const o=new Error(a.statusText||`response status is ${a.status}`);throw o.status=a.status,o.statusCode=a.status,o.responseError=s,o}if(!a.ok){const s=new Error(a.statusText||`response status is ${a.status}`);throw s.status=a.status,s.statusCode=a.status,s.response=a,s}return a}const options_retrievalURI=s=>{var o,i;const{baseDoc:a,url:u}=s,_=null!==(o=null!=a?a:u)&&void 0!==o?o:"";return"string"==typeof(null===(i=globalThis.document)||void 0===i?void 0:i.baseURI)?String(new URL(_,globalThis.document.baseURI)):_},options_httpClient=s=>{const{fetch:o,http:i}=s;return o||i||http_http};async function resolveGenericStrategy(s){const{spec:o,mode:i,allowMetaPatches:a=!0,pathDiscriminator:u,modelPropertyMacro:_,parameterMacro:w,requestInterceptor:x,responseInterceptor:C,skipNormalization:j=!1,useCircularStructures:L,strategies:B}=s,$=options_retrievalURI(s),U=options_httpClient(s),V=B.find((s=>s.match(o)));return async function doResolve(s){$&&(tu.refs.docCache[$]=s);tu.refs.fetchJSON=makeFetchJSON(U,{requestInterceptor:x,responseInterceptor:C});const o=[tu.refs];"function"==typeof w&&o.push(tu.parameters);"function"==typeof _&&o.push(tu.properties);"strict"!==i&&o.push(tu.allOf);const B=await function mapSpec(s){return new SpecMap(s).dispatch()}({spec:s,context:{baseDoc:$},plugins:o,allowMetaPatches:a,pathDiscriminator:u,parameterMacro:w,modelPropertyMacro:_,useCircularStructures:L});j||(B.spec=V.normalize(B.spec));return B}(o)}const su=_curry2((function and(s,o){return s&&o}));const ou=_curry2((function both(s,o){return _isFunction(s)?function _both(){return s.apply(this,arguments)&&o.apply(this,arguments)}:hc(su)(s,o)}));const iu=na(null);const au=dc(iu);function isOfTypeObject_typeof(s){return isOfTypeObject_typeof="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(s){return typeof s}:function(s){return s&&"function"==typeof Symbol&&s.constructor===Symbol&&s!==Symbol.prototype?"symbol":typeof s},isOfTypeObject_typeof(s)}const cu=function isOfTypeObject(s){return"object"===isOfTypeObject_typeof(s)};const lu=$a(1,ou(au,cu));var uu=pipe(ra,Pc("Object")),pu=pipe(ga,na(ga(Object))),hu=Qo(ou(Mc,pu),["constructor"]),du=$a(1,(function(s){if(!lu(s)||!uu(s))return!1;var o=Object.getPrototypeOf(s);return!!iu(o)||hu(o)}));const fu=du,replace_special_chars_with_underscore=s=>s.replace(/\W/gi,"_");function opId(s,o,i="",{v2OperationIdCompatibilityMode:a}={}){if(!s||"object"!=typeof s)return null;return(s.operationId||"").replace(/\s/g,"").length?replace_special_chars_with_underscore(s.operationId):function idFromPathMethod(s,o,{v2OperationIdCompatibilityMode:i}={}){if(i){let i=`${o.toLowerCase()}_${s}`.replace(/[\s!@#$%^&*()_+=[{\]};:<>|./?,\\'""-]/g,"_");return i=i||`${s.substring(1)}_${o}`,i.replace(/((_){2,})/g,"_").replace(/^(_)*/g,"").replace(/([_])*$/g,"")}return`${o.toLowerCase()}${replace_special_chars_with_underscore(s)}`}(o,i,{v2OperationIdCompatibilityMode:a})}function normalize_normalize(s){const{spec:o}=s,{paths:i}=o,a={};if(!i||o.$$normalized)return s;for(const s in i){const u=i[s];if(null==u||!["object","function"].includes(typeof u))continue;const _=u.parameters;for(const i in u){const w=u[i];if(null==w||!["object","function"].includes(typeof w))continue;const x=opId(w,s,i);if(x){a[x]?a[x].push(w):a[x]=[w];const s=a[x];if(s.length>1)s.forEach(((s,o)=>{s.__originalOperationId=s.__originalOperationId||s.operationId,s.operationId=`${x}${o+1}`}));else if(void 0!==w.operationId){const o=s[0];o.__originalOperationId=o.__originalOperationId||w.operationId,o.operationId=x}}if("parameters"!==i){const s=[],i={};for(const a in o)"produces"!==a&&"consumes"!==a&&"security"!==a||(i[a]=o[a],s.push(i));if(_&&(i.parameters=_,s.push(i)),s.length)for(const o of s)for(const s in o)if(Array.isArray(w[s])){if("parameters"===s)for(const i of o[s]){w[s].some((s=>!(!fu(s)&&!fu(i))&&(s===i||["name","$ref","$$ref"].some((o=>"string"==typeof s[o]&&"string"==typeof i[o]&&s[o]===i[o])))))||w[s].push(i)}}else w[s]=o[s]}}}return o.$$normalized=!0,s}const mu={name:"generic",match:()=>!0,normalize(s){const{spec:o}=normalize_normalize({spec:s});return o},resolve:async s=>resolveGenericStrategy(s)},gu=mu;const isOpenAPI30=s=>{try{const{openapi:o}=s;return"string"==typeof o&&/^3\.0\.(?:[1-9]\d*|0)$/.test(o)}catch{return!1}},isOpenAPI31=s=>{try{const{openapi:o}=s;return"string"==typeof o&&/^3\.1\.(?:[1-9]\d*|0)$/.test(o)}catch{return!1}},isOpenAPI3=s=>isOpenAPI30(s)||isOpenAPI31(s),yu={name:"openapi-2",match:s=>(s=>{try{const{swagger:o}=s;return"2.0"===o}catch{return!1}})(s),normalize(s){const{spec:o}=normalize_normalize({spec:s});return o},resolve:async s=>async function resolveOpenAPI2Strategy(s){return resolveGenericStrategy(s)}(s)},vu=yu;const bu={name:"openapi-3-0",match:s=>isOpenAPI30(s),normalize(s){const{spec:o}=normalize_normalize({spec:s});return o},resolve:async s=>async function resolveOpenAPI30Strategy(s){return resolveGenericStrategy(s)}(s)},_u=bu;var Su=__webpack_require__(34035);function _reduced(s){return s&&s["@@transducer/reduced"]?s:{"@@transducer/value":s,"@@transducer/reduced":!0}}var Eu=function(){function XAll(s,o){this.xf=o,this.f=s,this.all=!0}return XAll.prototype["@@transducer/init"]=_xfBase_init,XAll.prototype["@@transducer/result"]=function(s){return this.all&&(s=this.xf["@@transducer/step"](s,!0)),this.xf["@@transducer/result"](s)},XAll.prototype["@@transducer/step"]=function(s,o){return this.f(o)||(this.all=!1,s=_reduced(this.xf["@@transducer/step"](s,!1))),s},XAll}();function _xall(s){return function(o){return new Eu(s,o)}}var wu=_curry2(_dispatchable(["all"],_xall,(function all(s,o){for(var i=0;i<o.length;){if(!s(o[i]))return!1;i+=1}return!0})));const xu=wu;class Annotation extends Su.Om{constructor(s,o,i){super(s,o,i),this.element="annotation"}get code(){return this.attributes.get("code")}set code(s){this.attributes.set("code",s)}}const ku=Annotation;class Comment extends Su.Om{constructor(s,o,i){super(s,o,i),this.element="comment"}}const Ou=Comment;class ParseResult extends Su.wE{constructor(s,o,i){super(s,o,i),this.element="parseResult"}get api(){return this.children.filter((s=>s.classes.contains("api"))).first}get results(){return this.children.filter((s=>s.classes.contains("result")))}get result(){return this.results.first}get annotations(){return this.children.filter((s=>"annotation"===s.element))}get warnings(){return this.children.filter((s=>"annotation"===s.element&&s.classes.contains("warning")))}get errors(){return this.children.filter((s=>"annotation"===s.element&&s.classes.contains("error")))}get isEmpty(){return this.children.reject((s=>"annotation"===s.element)).isEmpty}replaceResult(s){const{result:o}=this;if(bc(o))return!1;const i=this.content.findIndex((s=>s===o));return-1!==i&&(this.content[i]=s,!0)}}const Au=ParseResult,hasMethod=(s,o)=>"object"==typeof o&&null!==o&&s in o&&"function"==typeof o[s],hasBasicElementProps=s=>"object"==typeof s&&null!=s&&"_storedElement"in s&&"string"==typeof s._storedElement&&"_content"in s,primitiveEq=(s,o)=>"object"==typeof o&&null!==o&&"primitive"in o&&("function"==typeof o.primitive&&o.primitive()===s),hasClass=(s,o)=>"object"==typeof o&&null!==o&&"classes"in o&&(Array.isArray(o.classes)||o.classes instanceof Su.wE)&&o.classes.includes(s),isElementType=(s,o)=>"object"==typeof o&&null!==o&&"element"in o&&o.element===s,helpers=s=>s({hasMethod,hasBasicElementProps,primitiveEq,isElementType,hasClass}),Cu=helpers((({hasBasicElementProps:s,primitiveEq:o})=>i=>i instanceof Su.Hg||s(i)&&o(void 0,i))),ju=helpers((({hasBasicElementProps:s,primitiveEq:o})=>i=>i instanceof Su.Om||s(i)&&o("string",i))),Pu=helpers((({hasBasicElementProps:s,primitiveEq:o})=>i=>i instanceof Su.kT||s(i)&&o("number",i))),Iu=helpers((({hasBasicElementProps:s,primitiveEq:o})=>i=>i instanceof Su.Os||s(i)&&o("null",i))),Tu=helpers((({hasBasicElementProps:s,primitiveEq:o})=>i=>i instanceof Su.bd||s(i)&&o("boolean",i))),Nu=helpers((({hasBasicElementProps:s,primitiveEq:o,hasMethod:i})=>a=>a instanceof Su.Sh||s(a)&&o("object",a)&&i("keys",a)&&i("values",a)&&i("items",a))),Mu=helpers((({hasBasicElementProps:s,primitiveEq:o,hasMethod:i})=>a=>a instanceof Su.wE&&!(a instanceof Su.Sh)||s(a)&&o("array",a)&&i("push",a)&&i("unshift",a)&&i("map",a)&&i("reduce",a))),Ru=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Su.Pr||s(a)&&o("member",a)&&i(void 0,a))),Du=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Su.Ft||s(a)&&o("link",a)&&i(void 0,a))),Lu=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Su.sI||s(a)&&o("ref",a)&&i(void 0,a))),Fu=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof ku||s(a)&&o("annotation",a)&&i("array",a))),Bu=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Ou||s(a)&&o("comment",a)&&i("string",a))),$u=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Au||s(a)&&o("parseResult",a)&&i("array",a))),isPrimitiveElement=s=>isElementType("object",s)||isElementType("array",s)||isElementType("boolean",s)||isElementType("number",s)||isElementType("string",s)||isElementType("null",s)||isElementType("member",s),hasElementSourceMap=s=>!!Cu(s)&&(Number.isInteger(s.startPositionRow)&&Number.isInteger(s.startPositionColumn)&&Number.isInteger(s.startIndex)&&Number.isInteger(s.endPositionRow)&&Number.isInteger(s.endPositionColumn)&&Number.isInteger(s.endIndex)),includesSymbols=(s,o)=>{if(0===s.length)return!0;const i=o.attributes.get("symbols");return!!Mu(i)&&xu(sc(i.toValue()),s)},includesClasses=(s,o)=>0===s.length||xu(sc(o.classes.toValue()),s);const es_T=function(){return!0};const es_F=function(){return!1},getVisitFn=(s,o,i)=>{const a=s[o];if(null!=a){if(!i&&"function"==typeof a)return a;const s=i?a.leave:a.enter;if("function"==typeof s)return s}else{const a=i?s.leave:s.enter;if(null!=a){if("function"==typeof a)return a;const s=a[o];if("function"==typeof s)return s}}return null},qu={},getNodeType=s=>null==s?void 0:s.type,isNode=s=>"string"==typeof getNodeType(s),cloneNode=s=>Object.create(Object.getPrototypeOf(s),Object.getOwnPropertyDescriptors(s)),mergeAll=(s,{visitFnGetter:o=getVisitFn,nodeTypeGetter:i=getNodeType,breakSymbol:a=qu,deleteNodeSymbol:u=null,skipVisitingNodeSymbol:_=!1,exposeEdits:w=!1}={})=>{const x=Symbol("skip"),C=new Array(s.length).fill(x);return{enter(j,L,B,$,U,V){let z=j,Y=!1;const Z={...V,replaceWith(s,o){V.replaceWith(s,o),z=s}};for(let j=0;j<s.length;j+=1)if(C[j]===x){const x=o(s[j],i(z),!1);if("function"==typeof x){const o=x.call(s[j],z,L,B,$,U,Z);if("function"==typeof(null==o?void 0:o.then))throw new Go("Async visitor not supported in sync mode",{visitor:s[j],visitFn:x});if(o===_)C[j]=z;else if(o===a)C[j]=a;else{if(o===u)return o;if(void 0!==o){if(!w)return o;z=o,Y=!0}}}}return Y?z:void 0},leave(u,w,j,L,B,$){let U=u;const V={...$,replaceWith(s,o){$.replaceWith(s,o),U=s}};for(let u=0;u<s.length;u+=1)if(C[u]===x){const x=o(s[u],i(U),!0);if("function"==typeof x){const o=x.call(s[u],U,w,j,L,B,V);if("function"==typeof(null==o?void 0:o.then))throw new Go("Async visitor not supported in sync mode",{visitor:s[u],visitFn:x});if(o===a)C[u]=a;else if(void 0!==o&&o!==_)return o}}else C[u]===U&&(C[u]=x)}}};mergeAll[Symbol.for("nodejs.util.promisify.custom")]=(s,{visitFnGetter:o=getVisitFn,nodeTypeGetter:i=getNodeType,breakSymbol:a=qu,deleteNodeSymbol:u=null,skipVisitingNodeSymbol:_=!1,exposeEdits:w=!1}={})=>{const x=Symbol("skip"),C=new Array(s.length).fill(x);return{async enter(j,L,B,$,U,V){let z=j,Y=!1;const Z={...V,replaceWith(s,o){V.replaceWith(s,o),z=s}};for(let j=0;j<s.length;j+=1)if(C[j]===x){const x=o(s[j],i(z),!1);if("function"==typeof x){const o=await x.call(s[j],z,L,B,$,U,Z);if(o===_)C[j]=z;else if(o===a)C[j]=a;else{if(o===u)return o;if(void 0!==o){if(!w)return o;z=o,Y=!0}}}}return Y?z:void 0},async leave(u,w,j,L,B,$){let U=u;const V={...$,replaceWith(s,o){$.replaceWith(s,o),U=s}};for(let u=0;u<s.length;u+=1)if(C[u]===x){const x=o(s[u],i(U),!0);if("function"==typeof x){const o=await x.call(s[u],U,w,j,L,B,V);if(o===a)C[u]=a;else if(void 0!==o&&o!==_)return o}}else C[u]===U&&(C[u]=x)}}};const visit=(s,o,{keyMap:i=null,state:a={},breakSymbol:u=qu,deleteNodeSymbol:_=null,skipVisitingNodeSymbol:w=!1,visitFnGetter:x=getVisitFn,nodeTypeGetter:C=getNodeType,nodePredicate:j=isNode,nodeCloneFn:L=cloneNode,detectCycles:B=!0,detectCyclesCallback:$=null}={})=>{const U=i||{};let V,z,Y=Array.isArray(s),Z=[s],ee=-1,ie=[],ae=s;const ce=[],le=[];do{ee+=1;const s=ee===Z.length;let i;const fe=s&&0!==ie.length;if(s){if(i=0===le.length?void 0:ce.pop(),ae=z,z=le.pop(),fe)if(Y){ae=ae.slice();let s=0;for(const[o,i]of ie){const a=o-s;i===_?(ae.splice(a,1),s+=1):ae[a]=i}}else{ae=L(ae);for(const[s,o]of ie)ae[s]=o}ee=V.index,Z=V.keys,ie=V.edits,Y=V.inArray,V=V.prev}else if(z!==_&&void 0!==z){if(i=Y?ee:Z[ee],ae=z[i],ae===_||void 0===ae)continue;ce.push(i)}let ye;if(!Array.isArray(ae)){var pe;if(!j(ae))throw new Go(`Invalid AST Node:  ${String(ae)}`,{node:ae});if(B&&le.includes(ae)){"function"==typeof $&&$(ae,i,z,ce,le),ce.pop();continue}const _=x(o,C(ae),s);if(_){for(const[s,i]of Object.entries(a))o[s]=i;const u={replaceWith(o,a){"function"==typeof a?a(o,ae,i,z,ce,le):z&&(z[i]=o),s||(ae=o)}};ye=_.call(o,ae,i,z,ce,le,u)}if("function"==typeof(null===(pe=ye)||void 0===pe?void 0:pe.then))throw new Go("Async visitor not supported in sync mode",{visitor:o,visitFn:_});if(ye===u)break;if(ye===w){if(!s){ce.pop();continue}}else if(void 0!==ye&&(ie.push([i,ye]),!s)){if(!j(ye)){ce.pop();continue}ae=ye}}var de;if(void 0===ye&&fe&&ie.push([i,ae]),!s)V={inArray:Y,index:ee,keys:Z,edits:ie,prev:V},Y=Array.isArray(ae),Z=Y?ae:null!==(de=U[C(ae)])&&void 0!==de?de:[],ee=-1,ie=[],z!==_&&void 0!==z&&le.push(z),z=ae}while(void 0!==V);return 0!==ie.length?ie[ie.length-1][1]:s};visit[Symbol.for("nodejs.util.promisify.custom")]=async(s,o,{keyMap:i=null,state:a={},breakSymbol:u=qu,deleteNodeSymbol:_=null,skipVisitingNodeSymbol:w=!1,visitFnGetter:x=getVisitFn,nodeTypeGetter:C=getNodeType,nodePredicate:j=isNode,nodeCloneFn:L=cloneNode,detectCycles:B=!0,detectCyclesCallback:$=null}={})=>{const U=i||{};let V,z,Y=Array.isArray(s),Z=[s],ee=-1,ie=[],ae=s;const ce=[],le=[];do{ee+=1;const s=ee===Z.length;let i;const de=s&&0!==ie.length;if(s){if(i=0===le.length?void 0:ce.pop(),ae=z,z=le.pop(),de)if(Y){ae=ae.slice();let s=0;for(const[o,i]of ie){const a=o-s;i===_?(ae.splice(a,1),s+=1):ae[a]=i}}else{ae=L(ae);for(const[s,o]of ie)ae[s]=o}ee=V.index,Z=V.keys,ie=V.edits,Y=V.inArray,V=V.prev}else if(z!==_&&void 0!==z){if(i=Y?ee:Z[ee],ae=z[i],ae===_||void 0===ae)continue;ce.push(i)}let fe;if(!Array.isArray(ae)){if(!j(ae))throw new Go(`Invalid AST Node: ${String(ae)}`,{node:ae});if(B&&le.includes(ae)){"function"==typeof $&&$(ae,i,z,ce,le),ce.pop();continue}const _=x(o,C(ae),s);if(_){for(const[s,i]of Object.entries(a))o[s]=i;const u={replaceWith(o,a){"function"==typeof a?a(o,ae,i,z,ce,le):z&&(z[i]=o),s||(ae=o)}};fe=await _.call(o,ae,i,z,ce,le,u)}if(fe===u)break;if(fe===w){if(!s){ce.pop();continue}}else if(void 0!==fe&&(ie.push([i,fe]),!s)){if(!j(fe)){ce.pop();continue}ae=fe}}var pe;if(void 0===fe&&de&&ie.push([i,ae]),!s)V={inArray:Y,index:ee,keys:Z,edits:ie,prev:V},Y=Array.isArray(ae),Z=Y?ae:null!==(pe=U[C(ae)])&&void 0!==pe?pe:[],ee=-1,ie=[],z!==_&&void 0!==z&&le.push(z),z=ae}while(void 0!==V);return 0!==ie.length?ie[ie.length-1][1]:s};const Uu=class CloneError extends Go{value;constructor(s,o){super(s,o),void 0!==o&&(this.value=o.value)}};const Vu=class DeepCloneError extends Uu{};const zu=class ShallowCloneError extends Uu{};const Wu=_curry2((function mapObjIndexed(s,o){return _arrayReduce((function(i,a){return i[a]=s(o[a],a,o),i}),{},ea(o))}));const Ju=_curry1((function isNil(s){return null==s}));var Hu=_curry2((function hasPath(s,o){if(0===s.length||Ju(o))return!1;for(var i=o,a=0;a<s.length;){if(Ju(i)||!_has(s[a],i))return!1;i=i[s[a]],a+=1}return!0}));const Ku=Hu;var Gu=_curry2((function has(s,o){return Ku([s],o)}));const Yu=Gu;const Xu=_curry3((function propSatisfies(s,o,i){return s(Da(o,i))}));const Qu=_curry2(_path);var Zu=function(){function XDropWhile(s,o){this.xf=o,this.f=s}return XDropWhile.prototype["@@transducer/init"]=_xfBase_init,XDropWhile.prototype["@@transducer/result"]=_xfBase_result,XDropWhile.prototype["@@transducer/step"]=function(s,o){if(this.f){if(this.f(o))return s;this.f=null}return this.xf["@@transducer/step"](s,o)},XDropWhile}();function _xdropWhile(s){return function(o){return new Zu(s,o)}}const ep=_curry2(_dispatchable(["dropWhile"],_xdropWhile,(function dropWhile(s,o){for(var i=0,a=o.length;i<a&&s(o[i]);)i+=1;return ja(i,1/0,o)})));const tp=za((function(s,o){return pipe(Ha(""),ep(sc(s)),rc(""))(o)})),dereference=(s,o)=>{const i=Na(s,o);return Wu((s=>{if(fu(s)&&Yu("$ref",s)&&Xu(Jc,"$ref",s)){const o=Qu(["$ref"],s),a=tp("#/",o);return Qu(a.split("/"),i)}return fu(s)?dereference(s,i):s}),s)},assignSourceMap=(s,o)=>(s.startPositionRow=null==o?void 0:o.startPositionRow,s.startPositionColumn=null==o?void 0:o.startPositionColumn,s.startIndex=null==o?void 0:o.startIndex,s.endPositionRow=null==o?void 0:o.endPositionRow,s.endPositionColumn=null==o?void 0:o.endPositionColumn,s.endIndex=null==o?void 0:o.endIndex,s),cloneDeep=(s,o={})=>{const{visited:i=new WeakMap}=o,a={...o,visited:i};if(i.has(s))return i.get(s);if(s instanceof Su.KeyValuePair){const{key:o,value:u}=s,_=Cu(o)?cloneDeep(o,a):o,w=Cu(u)?cloneDeep(u,a):u,x=new Su.KeyValuePair(_,w);return i.set(s,x),x}if(s instanceof Su.ot){const mapper=s=>cloneDeep(s,a),o=[...s].map(mapper),u=new Su.ot(o);return i.set(s,u),u}if(s instanceof Su.G6){const mapper=s=>cloneDeep(s,a),o=[...s].map(mapper),u=new Su.G6(o);return i.set(s,u),u}if(Cu(s)){const o=cloneShallow(s);if(i.set(s,o),s.content)if(Cu(s.content))o.content=cloneDeep(s.content,a);else if(s.content instanceof Su.KeyValuePair)o.content=cloneDeep(s.content,a);else if(Array.isArray(s.content)){const mapper=s=>cloneDeep(s,a);o.content=s.content.map(mapper)}else o.content=s.content;else o.content=s.content;return o}throw new Vu("Value provided to cloneDeep function couldn't be cloned",{value:s})};cloneDeep.safe=s=>{try{return cloneDeep(s)}catch{return s}};const cloneShallowKeyValuePair=s=>{const{key:o,value:i}=s;return new Su.KeyValuePair(o,i)},cloneShallowElement=s=>{const o=new s.constructor;if(o.element=s.element,hasElementSourceMap(s)&&assignSourceMap(o,s),s.meta.length>0&&(o._meta=cloneDeep(s.meta)),s.attributes.length>0&&(o._attributes=cloneDeep(s.attributes)),Cu(s.content)){const i=s.content;o.content=cloneShallowElement(i)}else Array.isArray(s.content)?o.content=[...s.content]:s.content instanceof Su.KeyValuePair?o.content=cloneShallowKeyValuePair(s.content):o.content=s.content;return o},cloneShallow=s=>{if(s instanceof Su.KeyValuePair)return cloneShallowKeyValuePair(s);if(s instanceof Su.ot)return(s=>{const o=[...s];return new Su.ot(o)})(s);if(s instanceof Su.G6)return(s=>{const o=[...s];return new Su.G6(o)})(s);if(Cu(s))return cloneShallowElement(s);throw new zu("Value provided to cloneShallow function couldn't be cloned",{value:s})};cloneShallow.safe=s=>{try{return cloneShallow(s)}catch{return s}};const visitor_getNodeType=s=>Nu(s)?"ObjectElement":Mu(s)?"ArrayElement":Ru(s)?"MemberElement":ju(s)?"StringElement":Tu(s)?"BooleanElement":Pu(s)?"NumberElement":Iu(s)?"NullElement":Du(s)?"LinkElement":Lu(s)?"RefElement":void 0,visitor_cloneNode=s=>Cu(s)?cloneShallow(s):cloneNode(s),rp=pipe(visitor_getNodeType,Jc),np={ObjectElement:["content"],ArrayElement:["content"],MemberElement:["key","value"],StringElement:[],BooleanElement:[],NumberElement:[],NullElement:[],RefElement:[],LinkElement:[],Annotation:[],Comment:[],ParseResultElement:["content"]};class PredicateVisitor{result;predicate;returnOnTrue;returnOnFalse;constructor({predicate:s=es_F,returnOnTrue:o,returnOnFalse:i}={}){this.result=[],this.predicate=s,this.returnOnTrue=o,this.returnOnFalse=i}enter(s){return this.predicate(s)?(this.result.push(s),this.returnOnTrue):this.returnOnFalse}}const visitor_visit=(s,o,{keyMap:i=np,...a}={})=>visit(s,o,{keyMap:i,nodeTypeGetter:visitor_getNodeType,nodePredicate:rp,nodeCloneFn:visitor_cloneNode,...a});visitor_visit[Symbol.for("nodejs.util.promisify.custom")]=async(s,o,{keyMap:i=np,...a}={})=>visit[Symbol.for("nodejs.util.promisify.custom")](s,o,{keyMap:i,nodeTypeGetter:visitor_getNodeType,nodePredicate:rp,nodeCloneFn:visitor_cloneNode,...a});const nodeTypeGetter=s=>"string"==typeof(null==s?void 0:s.type)?s.type:visitor_getNodeType(s),sp={EphemeralObject:["content"],EphemeralArray:["content"],...np},value_visitor_visit=(s,o,{keyMap:i=sp,...a}={})=>visitor_visit(s,o,{keyMap:i,nodeTypeGetter,nodePredicate:es_T,detectCycles:!1,deleteNodeSymbol:Symbol.for("delete-node"),skipVisitingNodeSymbol:Symbol.for("skip-visiting-node"),...a});value_visitor_visit[Symbol.for("nodejs.util.promisify.custom")]=async(s,{keyMap:o=sp,...i}={})=>visitor_visit[Symbol.for("nodejs.util.promisify.custom")](s,visitor,{keyMap:o,nodeTypeGetter,nodePredicate:es_T,detectCycles:!1,deleteNodeSymbol:Symbol.for("delete-node"),skipVisitingNodeSymbol:Symbol.for("skip-visiting-node"),...i});const op=class EphemeralArray{type="EphemeralArray";content=[];reference=void 0;constructor(s){this.content=s,this.reference=[]}toReference(){return this.reference}toArray(){return this.reference.push(...this.content),this.reference}};const ip=class EphemeralObject{type="EphemeralObject";content=[];reference=void 0;constructor(s){this.content=s,this.reference={}}toReference(){return this.reference}toObject(){return Object.assign(this.reference,Object.fromEntries(this.content))}};class Visitor{ObjectElement={enter:s=>{if(this.references.has(s))return this.references.get(s).toReference();const o=new ip(s.content);return this.references.set(s,o),o}};EphemeralObject={leave:s=>s.toObject()};MemberElement={enter:s=>[s.key,s.value]};ArrayElement={enter:s=>{if(this.references.has(s))return this.references.get(s).toReference();const o=new op(s.content);return this.references.set(s,o),o}};EphemeralArray={leave:s=>s.toArray()};references=new WeakMap;BooleanElement(s){return s.toValue()}NumberElement(s){return s.toValue()}StringElement(s){return s.toValue()}NullElement(){return null}RefElement(s,...o){var i;const a=o[3];return"EphemeralObject"===(null===(i=a[a.length-1])||void 0===i?void 0:i.type)?Symbol.for("delete-node"):String(s.toValue())}LinkElement(s){return ju(s.href)?s.href.toValue():""}}const serializers_value=s=>Cu(s)?ju(s)||Pu(s)||Tu(s)||Iu(s)?s.toValue():value_visitor_visit(s,new Visitor):s;const cp=_curry3((function mergeWithKey(s,o,i){var a,u={};for(a in i=i||{},o=o||{})_has(a,o)&&(u[a]=_has(a,i)?s(a,o[a],i[a]):o[a]);for(a in i)_has(a,i)&&!_has(a,u)&&(u[a]=i[a]);return u}));const lp=_curry3((function mergeDeepWithKey(s,o,i){return cp((function(o,i,a){return _isObject(i)&&_isObject(a)?mergeDeepWithKey(s,i,a):s(o,i,a)}),o,i)}));const up=_curry2((function mergeDeepRight(s,o){return lp((function(s,o,i){return i}),s,o)}));const pp=ja(0,-1);const hp=_curry2((function apply(s,o){return s.apply(this,o)}));const dp=dc(Mc);var fp=_curry1((function empty(s){return null!=s&&"function"==typeof s["fantasy-land/empty"]?s["fantasy-land/empty"]():null!=s&&null!=s.constructor&&"function"==typeof s.constructor["fantasy-land/empty"]?s.constructor["fantasy-land/empty"]():null!=s&&"function"==typeof s.empty?s.empty():null!=s&&null!=s.constructor&&"function"==typeof s.constructor.empty?s.constructor.empty():ca(s)?[]:_isString(s)?"":_isObject(s)?{}:Ei(s)?function(){return arguments}():function _isTypedArray(s){var o=Object.prototype.toString.call(s);return"[object Uint8ClampedArray]"===o||"[object Int8Array]"===o||"[object Uint8Array]"===o||"[object Int16Array]"===o||"[object Uint16Array]"===o||"[object Int32Array]"===o||"[object Uint32Array]"===o||"[object Float32Array]"===o||"[object Float64Array]"===o||"[object BigInt64Array]"===o||"[object BigUint64Array]"===o}(s)?s.constructor.from(""):void 0}));const mp=fp;const gp=_curry1((function isEmpty(s){return null!=s&&na(s,mp(s))}));const yp=$a(1,Mc(Array.isArray)?Array.isArray:pipe(ra,Pc("Array")));const vp=ou(yp,gp);var bp=$a(3,(function(s,o,i){var a=Qu(s,i),u=Qu(pp(s),i);if(!dp(a)&&!vp(s)){var _=Ea(a,u);return hp(_,o)}}));const _p=bp;class Namespace extends Su.g${constructor(){super(),this.register("annotation",ku),this.register("comment",Ou),this.register("parseResult",Au)}}const Sp=new Namespace,createNamespace=s=>{const o=new Namespace;return fu(s)&&o.use(s),o},Ep=Sp,toolbox=()=>({predicates:{...ie},namespace:Ep}),wp={toolboxCreator:toolbox,visitorOptions:{nodeTypeGetter:visitor_getNodeType,exposeEdits:!0}},dispatchPluginsSync=(s,o,i={})=>{if(0===o.length)return s;const a=up(wp,i),{toolboxCreator:u,visitorOptions:_}=a,w=u(),x=o.map((s=>s(w))),C=mergeAll(x.map(La({},"visitor")),{..._});x.forEach(_p(["pre"],[]));const j=visitor_visit(s,C,_);return x.forEach(_p(["post"],[])),j};dispatchPluginsSync[Symbol.for("nodejs.util.promisify.custom")]=async(s,o,i={})=>{if(0===o.length)return s;const a=up(wp,i),{toolboxCreator:u,visitorOptions:_}=a,w=u(),x=o.map((s=>s(w))),C=mergeAll[Symbol.for("nodejs.util.promisify.custom")],j=visitor_visit[Symbol.for("nodejs.util.promisify.custom")],L=C(x.map(La({},"visitor")),{..._});await Promise.allSettled(x.map(_p(["pre"],[])));const B=await j(s,L,_);return await Promise.allSettled(x.map(_p(["post"],[]))),B};const refract=(s,{Type:o,plugins:i=[]})=>{const a=new o(s);return Cu(s)&&(s.meta.length>0&&(a.meta=cloneDeep(s.meta)),s.attributes.length>0&&(a.attributes=cloneDeep(s.attributes))),dispatchPluginsSync(a,i,{toolboxCreator:toolbox,visitorOptions:{nodeTypeGetter:visitor_getNodeType}})},createRefractor=s=>(o,i={})=>refract(o,{...i,Type:s});Su.Sh.refract=createRefractor(Su.Sh),Su.wE.refract=createRefractor(Su.wE),Su.Om.refract=createRefractor(Su.Om),Su.bd.refract=createRefractor(Su.bd),Su.Os.refract=createRefractor(Su.Os),Su.kT.refract=createRefractor(Su.kT),Su.Ft.refract=createRefractor(Su.Ft),Su.sI.refract=createRefractor(Su.sI),ku.refract=createRefractor(ku),Ou.refract=createRefractor(Ou),Au.refract=createRefractor(Au);const computeEdges=(s,o=new WeakMap)=>(Ru(s)?(o.set(s.key,s),computeEdges(s.key,o),o.set(s.value,s),computeEdges(s.value,o)):s.children.forEach((i=>{o.set(i,s),computeEdges(i,o)})),o);const xp=class Transcluder_Transcluder{element;edges;constructor({element:s}){this.element=s}transclude(s,o){var i;if(s===this.element)return o;if(s===o)return this.element;this.edges=null!==(i=this.edges)&&void 0!==i?i:computeEdges(this.element);const a=this.edges.get(s);return bc(a)?void 0:(Nu(a)?((s,o,i)=>{const a=i.get(s);Nu(a)&&(a.content=a.map(((u,_,w)=>w===s?(i.delete(s),i.set(o,a),o):w)))})(s,o,this.edges):Mu(a)?((s,o,i)=>{const a=i.get(s);Mu(a)&&(a.content=a.map((u=>u===s?(i.delete(s),i.set(o,a),o):u)))})(s,o,this.edges):Ru(a)&&((s,o,i)=>{const a=i.get(s);Ru(a)&&(a.key===s&&(a.key=o,i.delete(s),i.set(o,a)),a.value===s&&(a.value=o,i.delete(s),i.set(o,a)))})(s,o,this.edges),this.element)}},fromURIReference=s=>{const o=s.indexOf("#");return(s=>{try{const o=s.startsWith("#")?s.slice(1):s;return decodeURIComponent(o)}catch{return s}})(-1===o?"#":s.substring(o))},kp=function fnparser(){const s=Pp,o=jp,i=this,a="parser.js: Parser(): ";i.ast=void 0,i.stats=void 0,i.trace=void 0,i.callbacks=[];let u,_,w,x,C,j,L,B=0,$=0,U=0,V=0,z=0,Y=new function systemData(){this.state=s.ACTIVE,this.phraseLength=0,this.refresh=()=>{this.state=s.ACTIVE,this.phraseLength=0}};i.parse=(Z,ee,ie,ae)=>{const ce=`${a}parse(): `;B=0,$=0,U=0,V=0,z=0,u=void 0,_=void 0,w=void 0,x=void 0,Y.refresh(),C=void 0,j=void 0,L=void 0,x=o.stringToChars(ie),u=Z.rules,_=Z.udts;const le=ee.toLowerCase();let pe;for(const s in u)if(u.hasOwnProperty(s)&&le===u[s].lower){pe=u[s].index;break}if(void 0===pe)throw new Error(`${ce}start rule name '${startRule}' not recognized`);(()=>{const s=`${a}initializeCallbacks(): `;let o,w;for(C=[],j=[],o=0;o<u.length;o+=1)C[o]=void 0;for(o=0;o<_.length;o+=1)j[o]=void 0;const x=[];for(o=0;o<u.length;o+=1)x.push(u[o].lower);for(o=0;o<_.length;o+=1)x.push(_[o].lower);for(const a in i.callbacks)if(i.callbacks.hasOwnProperty(a)){if(o=x.indexOf(a.toLowerCase()),o<0)throw new Error(`${s}syntax callback '${a}' not a rule or udt name`);if(w=i.callbacks[a]?i.callbacks[a]:void 0,"function"!=typeof w&&void 0!==w)throw new Error(`${s}syntax callback[${a}] must be function reference or falsy)`);o<u.length?C[o]=w:j[o-u.length]=w}})(),i.trace&&i.trace.init(u,_,x),i.stats&&i.stats.init(u,_),i.ast&&i.ast.init(u,_,x),L=ae,w=[{type:s.RNM,index:pe}],opExecute(0,0),w=void 0;let de=!1;switch(Y.state){case s.ACTIVE:throw new Error(`${ce}final state should never be 'ACTIVE'`);case s.NOMATCH:de=!1;break;case s.EMPTY:case s.MATCH:de=Y.phraseLength===x.length;break;default:throw new Error("unrecognized state")}return{success:de,state:Y.state,stateName:s.idName(Y.state),length:x.length,matched:Y.phraseLength,maxMatched:z,maxTreeDepth:U,nodeHits:V}};const validateRnmCallbackResult=(o,i,u,_)=>{if(i.phraseLength>u){let s=`${a}opRNM(${o.name}): callback function error: `;throw s+=`sysData.phraseLength: ${i.phraseLength}`,s+=` must be <= remaining chars: ${u}`,new Error(s)}switch(i.state){case s.ACTIVE:if(!_)throw new Error(`${a}opRNM(${o.name}): callback function return error. ACTIVE state not allowed.`);break;case s.EMPTY:i.phraseLength=0;break;case s.MATCH:0===i.phraseLength&&(i.state=s.EMPTY);break;case s.NOMATCH:i.phraseLength=0;break;default:throw new Error(`${a}opRNM(${o.name}): callback function return error. Unrecognized return state: ${i.state}`)}},opUDT=(o,C)=>{let $,U,V;const z=w[o],Z=_[z.index];Y.UdtIndex=Z.index,B||(V=i.ast&&i.ast.udtDefined(z.index),V&&(U=u.length+z.index,$=i.ast.getLength(),i.ast.down(U,Z.name)));const ee=x.length-C;j[z.index](Y,x,C,L),((o,i,u)=>{if(i.phraseLength>u){let s=`${a}opUDT(${o.name}): callback function error: `;throw s+=`sysData.phraseLength: ${i.phraseLength}`,s+=` must be <= remaining chars: ${u}`,new Error(s)}switch(i.state){case s.ACTIVE:throw new Error(`${a}opUDT(${o.name}) ACTIVE state return not allowed.`);case s.EMPTY:if(!o.empty)throw new Error(`${a}opUDT(${o.name}) may not return EMPTY.`);i.phraseLength=0;break;case s.MATCH:if(0===i.phraseLength){if(!o.empty)throw new Error(`${a}opUDT(${o.name}) may not return EMPTY.`);i.state=s.EMPTY}break;case s.NOMATCH:i.phraseLength=0;break;default:throw new Error(`${a}opUDT(${o.name}): callback function return error. Unrecognized return state: ${i.state}`)}})(Z,Y,ee),B||V&&(Y.state===s.NOMATCH?i.ast.setLength($):i.ast.up(U,Z.name,C,Y.phraseLength))},opExecute=(o,_)=>{const j=`${a}opExecute(): `,Z=w[o];switch(V+=1,$>U&&(U=$),$+=1,Y.refresh(),i.trace&&i.trace.down(Z,_),Z.type){case s.ALT:((o,i)=>{const a=w[o];for(let o=0;o<a.children.length&&(opExecute(a.children[o],i),Y.state===s.NOMATCH);o+=1);})(o,_);break;case s.CAT:((o,a)=>{let u,_,x,C;const j=w[o];i.ast&&(_=i.ast.getLength()),u=!0,x=a,C=0;for(let o=0;o<j.children.length;o+=1){if(opExecute(j.children[o],x),Y.state===s.NOMATCH){u=!1;break}x+=Y.phraseLength,C+=Y.phraseLength}u?(Y.state=0===C?s.EMPTY:s.MATCH,Y.phraseLength=C):(Y.state=s.NOMATCH,Y.phraseLength=0,i.ast&&i.ast.setLength(_))})(o,_);break;case s.REP:((o,a)=>{let u,_,C,j;const L=w[o];if(0===L.max)return Y.state=s.EMPTY,void(Y.phraseLength=0);for(_=a,C=0,j=0,i.ast&&(u=i.ast.getLength());!(_>=x.length)&&(opExecute(o+1,_),Y.state!==s.NOMATCH)&&Y.state!==s.EMPTY&&(j+=1,C+=Y.phraseLength,_+=Y.phraseLength,j!==L.max););Y.state===s.EMPTY||j>=L.min?(Y.state=0===C?s.EMPTY:s.MATCH,Y.phraseLength=C):(Y.state=s.NOMATCH,Y.phraseLength=0,i.ast&&i.ast.setLength(u))})(o,_);break;case s.RNM:((o,a)=>{let _,j,$;const U=w[o],V=u[U.index],z=C[V.index];if(B||(j=i.ast&&i.ast.ruleDefined(U.index),j&&(_=i.ast.getLength(),i.ast.down(U.index,u[U.index].name))),z){const o=x.length-a;z(Y,x,a,L),validateRnmCallbackResult(V,Y,o,!0),Y.state===s.ACTIVE&&($=w,w=V.opcodes,opExecute(0,a),w=$,z(Y,x,a,L),validateRnmCallbackResult(V,Y,o,!1))}else $=w,w=V.opcodes,opExecute(0,a,Y),w=$;B||j&&(Y.state===s.NOMATCH?i.ast.setLength(_):i.ast.up(U.index,V.name,a,Y.phraseLength))})(o,_);break;case s.TRG:((o,i)=>{const a=w[o];Y.state=s.NOMATCH,i<x.length&&a.min<=x[i]&&x[i]<=a.max&&(Y.state=s.MATCH,Y.phraseLength=1)})(o,_);break;case s.TBS:((o,i)=>{const a=w[o],u=a.string.length;if(Y.state=s.NOMATCH,i+u<=x.length){for(let s=0;s<u;s+=1)if(x[i+s]!==a.string[s])return;Y.state=s.MATCH,Y.phraseLength=u}})(o,_);break;case s.TLS:((o,i)=>{let a;const u=w[o];Y.state=s.NOMATCH;const _=u.string.length;if(0!==_){if(i+_<=x.length){for(let s=0;s<_;s+=1)if(a=x[i+s],a>=65&&a<=90&&(a+=32),a!==u.string[s])return;Y.state=s.MATCH,Y.phraseLength=_}}else Y.state=s.EMPTY})(o,_);break;case s.UDT:opUDT(o,_);break;case s.AND:((o,i)=>{switch(B+=1,opExecute(o+1,i),B-=1,Y.phraseLength=0,Y.state){case s.EMPTY:case s.MATCH:Y.state=s.EMPTY;break;case s.NOMATCH:Y.state=s.NOMATCH;break;default:throw new Error(`opAND: invalid state ${Y.state}`)}})(o,_);break;case s.NOT:((o,i)=>{switch(B+=1,opExecute(o+1,i),B-=1,Y.phraseLength=0,Y.state){case s.EMPTY:case s.MATCH:Y.state=s.NOMATCH;break;case s.NOMATCH:Y.state=s.EMPTY;break;default:throw new Error(`opNOT: invalid state ${Y.state}`)}})(o,_);break;default:throw new Error(`${j}unrecognized operator`)}B||_+Y.phraseLength>z&&(z=_+Y.phraseLength),i.stats&&i.stats.collect(Z,Y),i.trace&&i.trace.up(Z,Y.state,_,Y.phraseLength),$-=1}},Op=function fnast(){const s=Pp,o=jp,i=this;let a,u,_,w=0;const x=[],C=[],j=[];function indent(s){let o="";for(;s-- >0;)o+=" ";return o}i.callbacks=[],i.init=(s,o,L)=>{let B;C.length=0,j.length=0,w=0,a=s,u=o,_=L;const $=[];for(B=0;B<a.length;B+=1)$.push(a[B].lower);for(B=0;B<u.length;B+=1)$.push(u[B].lower);for(w=a.length+u.length,B=0;B<w;B+=1)x[B]=void 0;for(const s in i.callbacks)if(i.callbacks.hasOwnProperty(s)){const o=s.toLowerCase();if(B=$.indexOf(o),B<0)throw new Error(`parser.js: Ast()): init: node '${s}' not a rule or udt name`);x[B]=i.callbacks[s]}},i.ruleDefined=s=>!!x[s],i.udtDefined=s=>!!x[a.length+s],i.down=(o,i)=>{const a=j.length;return C.push(a),j.push({name:i,thisIndex:a,thatIndex:void 0,state:s.SEM_PRE,callbackIndex:o,phraseIndex:void 0,phraseLength:void 0,stack:C.length}),a},i.up=(o,i,a,u)=>{const _=j.length,w=C.pop();return j.push({name:i,thisIndex:_,thatIndex:w,state:s.SEM_POST,callbackIndex:o,phraseIndex:a,phraseLength:u,stack:C.length}),j[w].thatIndex=_,j[w].phraseIndex=a,j[w].phraseLength=u,_},i.translate=o=>{let i,a;for(let u=0;u<j.length;u+=1)a=j[u],i=x[a.callbackIndex],i&&(a.state===s.SEM_PRE?i(s.SEM_PRE,_,a.phraseIndex,a.phraseLength,o):i&&i(s.SEM_POST,_,a.phraseIndex,a.phraseLength,o))},i.setLength=s=>{j.length=s,C.length=s>0?j[s-1].stack:0},i.getLength=()=>j.length,i.toXml=()=>{let i="",a=0;return i+='<?xml version="1.0" encoding="utf-8"?>\n',i+=`<root nodes="${j.length/2}" characters="${_.length}">\n`,i+="\x3c!-- input string --\x3e\n",i+=indent(a+2),i+=o.charsToString(_),i+="\n",j.forEach((u=>{u.state===s.SEM_PRE?(a+=1,i+=indent(a),i+=`<node name="${u.name}" index="${u.phraseIndex}" length="${u.phraseLength}">\n`,i+=indent(a+2),i+=o.charsToString(_,u.phraseIndex,u.phraseLength),i+="\n"):(i+=indent(a),i+=`</node>\x3c!-- name="${u.name}" --\x3e\n`,a-=1)})),i+="</root>\n",i}},Ap=function fntrace(){const s=Pp,o=jp,i="parser.js: Trace(): ";let a,u,_,w="",x=0;const C=this,indent=s=>{let o="",i=0;if(s>=0)for(;s--;)i+=1,5===i?(o+="|",i=0):o+=".";return o};C.init=(s,o,i)=>{u=s,_=o,a=i};const opName=a=>{let w;switch(a.type){case s.ALT:w="ALT";break;case s.CAT:w="CAT";break;case s.REP:w=a.max===1/0?`REP(${a.min},inf)`:`REP(${a.min},${a.max})`;break;case s.RNM:w=`RNM(${u[a.index].name})`;break;case s.TRG:w=`TRG(${a.min},${a.max})`;break;case s.TBS:w=a.string.length>6?`TBS(${o.charsToString(a.string,0,3)}...)`:`TBS(${o.charsToString(a.string,0,6)})`;break;case s.TLS:w=a.string.length>6?`TLS(${o.charsToString(a.string,0,3)}...)`:`TLS(${o.charsToString(a.string,0,6)})`;break;case s.UDT:w=`UDT(${_[a.index].name})`;break;case s.AND:w="AND";break;case s.NOT:w="NOT";break;default:throw new Error(`${i}Trace: opName: unrecognized opcode`)}return w};C.down=(s,i)=>{const u=indent(x),_=Math.min(100,a.length-i);let C=o.charsToString(a,i,_);_<a.length-i&&(C+="..."),C=`${u}|-|[${opName(s)}]${C}\n`,w+=C,x+=1},C.up=(u,_,C,j)=>{const L=`${i}trace.up: `;x-=1;const B=indent(x);let $,U,V;switch(_){case s.EMPTY:V="|E|",U="''";break;case s.MATCH:V="|M|",$=Math.min(100,j),U=$<j?`'${o.charsToString(a,C,$)}...'`:`'${o.charsToString(a,C,$)}'`;break;case s.NOMATCH:V="|N|",U="";break;default:throw new Error(`${L} unrecognized state`)}U=`${B}${V}[${opName(u)}]${U}\n`,w+=U},C.displayTrace=()=>w},Cp=function fnstats(){const s=Pp;let o,i,a;const u=[],_=[],w=[];this.init=(s,a)=>{o=s,i=a,clear()},this.collect=(o,i)=>{incStat(a,i.state,i.phraseLength),incStat(u[o.type],i.state,i.phraseLength),o.type===s.RNM&&incStat(_[o.index],i.state,i.phraseLength),o.type===s.UDT&&incStat(w[o.index],i.state,i.phraseLength)},this.displayStats=()=>{let o="";const i={match:0,empty:0,nomatch:0,total:0},displayRow=(s,o,a,u,_)=>{i.match+=o,i.empty+=a,i.nomatch+=u,i.total+=_;return`${s} | ${normalize(o)} | ${normalize(a)} | ${normalize(u)} | ${normalize(_)} |\n`};return o+="          OPERATOR STATS\n",o+="      |   MATCH |   EMPTY | NOMATCH |   TOTAL |\n",o+=displayRow("  ALT",u[s.ALT].match,u[s.ALT].empty,u[s.ALT].nomatch,u[s.ALT].total),o+=displayRow("  CAT",u[s.CAT].match,u[s.CAT].empty,u[s.CAT].nomatch,u[s.CAT].total),o+=displayRow("  REP",u[s.REP].match,u[s.REP].empty,u[s.REP].nomatch,u[s.REP].total),o+=displayRow("  RNM",u[s.RNM].match,u[s.RNM].empty,u[s.RNM].nomatch,u[s.RNM].total),o+=displayRow("  TRG",u[s.TRG].match,u[s.TRG].empty,u[s.TRG].nomatch,u[s.TRG].total),o+=displayRow("  TBS",u[s.TBS].match,u[s.TBS].empty,u[s.TBS].nomatch,u[s.TBS].total),o+=displayRow("  TLS",u[s.TLS].match,u[s.TLS].empty,u[s.TLS].nomatch,u[s.TLS].total),o+=displayRow("  UDT",u[s.UDT].match,u[s.UDT].empty,u[s.UDT].nomatch,u[s.UDT].total),o+=displayRow("  AND",u[s.AND].match,u[s.AND].empty,u[s.AND].nomatch,u[s.AND].total),o+=displayRow("  NOT",u[s.NOT].match,u[s.NOT].empty,u[s.NOT].nomatch,u[s.NOT].total),o+=displayRow("TOTAL",i.match,i.empty,i.nomatch,i.total),o},this.displayHits=s=>{let o="";const displayRow=(s,o,i,u,_)=>{a.match+=s,a.empty+=o,a.nomatch+=i,a.total+=u;return`| ${normalize(s)} | ${normalize(o)} | ${normalize(i)} | ${normalize(u)} | ${_}\n`};"string"==typeof s&&"a"===s.toLowerCase()[0]?(_.sort(sortAlpha),w.sort(sortAlpha),o+="    RULES/UDTS ALPHABETICALLY\n"):"string"==typeof s&&"i"===s.toLowerCase()[0]?(_.sort(sortIndex),w.sort(sortIndex),o+="    RULES/UDTS BY INDEX\n"):(_.sort(sortHits),w.sort(sortHits),o+="    RULES/UDTS BY HIT COUNT\n"),o+="|   MATCH |   EMPTY | NOMATCH |   TOTAL | NAME\n";for(let s=0;s<_.length;s+=1){let i=_[s];i.total&&(o+=displayRow(i.match,i.empty,i.nomatch,i.total,i.name))}for(let s=0;s<w.length;s+=1){let i=w[s];i.total&&(o+=displayRow(i.match,i.empty,i.nomatch,i.total,i.name))}return o};const normalize=s=>s<10?`      ${s}`:s<100?`     ${s}`:s<1e3?`    ${s}`:s<1e4?`   ${s}`:s<1e5?`  ${s}`:s<1e6?` ${s}`:`${s}`,sortAlpha=(s,o)=>s.lower<o.lower?-1:s.lower>o.lower?1:0,sortHits=(s,o)=>s.total<o.total?1:s.total>o.total?-1:sortAlpha(s,o),sortIndex=(s,o)=>s.index<o.index?-1:s.index>o.index?1:0,x=function fnempty(){this.empty=0,this.match=0,this.nomatch=0,this.total=0},clear=()=>{u.length=0,a=new x,u[s.ALT]=new x,u[s.CAT]=new x,u[s.REP]=new x,u[s.RNM]=new x,u[s.TRG]=new x,u[s.TBS]=new x,u[s.TLS]=new x,u[s.UDT]=new x,u[s.AND]=new x,u[s.NOT]=new x,_.length=0;for(let s=0;s<o.length;s+=1)_.push({empty:0,match:0,nomatch:0,total:0,name:o[s].name,lower:o[s].lower,index:o[s].index});if(i.length>0){w.length=0;for(let s=0;s<i.length;s+=1)w.push({empty:0,match:0,nomatch:0,total:0,name:i[s].name,lower:i[s].lower,index:i[s].index})}},incStat=(o,i)=>{switch(o.total+=1,i){case s.EMPTY:o.empty+=1;break;case s.MATCH:o.match+=1;break;case s.NOMATCH:o.nomatch+=1;break;default:throw new Error(`parser.js: Stats(): collect(): incStat(): unrecognized state: ${i}`)}}},jp={stringToChars:s=>[...s].map((s=>s.codePointAt(0))),charsToString:(s,o,i)=>{let a=s;for(;!(void 0===o||o<0);){if(void 0===i){a=s.slice(o);break}if(i<=0)return"";a=s.slice(o,o+i);break}return String.fromCodePoint(...a)}},Pp={ALT:1,CAT:2,REP:3,RNM:4,TRG:5,TBS:6,TLS:7,UDT:11,AND:12,NOT:13,ACTIVE:100,MATCH:101,EMPTY:102,NOMATCH:103,SEM_PRE:200,SEM_POST:201,SEM_OK:300,idName:s=>{switch(s){case Pp.ALT:return"ALT";case Pp.CAT:return"CAT";case Pp.REP:return"REP";case Pp.RNM:return"RNM";case Pp.TRG:return"TRG";case Pp.TBS:return"TBS";case Pp.TLS:return"TLS";case Pp.UDT:return"UDT";case Pp.AND:return"AND";case Pp.NOT:return"NOT";case Pp.ACTIVE:return"ACTIVE";case Pp.EMPTY:return"EMPTY";case Pp.MATCH:return"MATCH";case Pp.NOMATCH:return"NOMATCH";case Pp.SEM_PRE:return"SEM_PRE";case Pp.SEM_POST:return"SEM_POST";case Pp.SEM_OK:return"SEM_OK";default:return"UNRECOGNIZED STATE"}}};function grammar(){this.grammarObject="grammarObject",this.rules=[],this.rules[0]={name:"json-pointer",lower:"json-pointer",index:0,isBkr:!1},this.rules[1]={name:"reference-token",lower:"reference-token",index:1,isBkr:!1},this.rules[2]={name:"unescaped",lower:"unescaped",index:2,isBkr:!1},this.rules[3]={name:"escaped",lower:"escaped",index:3,isBkr:!1},this.rules[4]={name:"array-location",lower:"array-location",index:4,isBkr:!1},this.rules[5]={name:"array-index",lower:"array-index",index:5,isBkr:!1},this.rules[6]={name:"array-dash",lower:"array-dash",index:6,isBkr:!1},this.rules[7]={name:"slash",lower:"slash",index:7,isBkr:!1},this.udts=[],this.rules[0].opcodes=[],this.rules[0].opcodes[0]={type:3,min:0,max:1/0},this.rules[0].opcodes[1]={type:2,children:[2,3]},this.rules[0].opcodes[2]={type:4,index:7},this.rules[0].opcodes[3]={type:4,index:1},this.rules[1].opcodes=[],this.rules[1].opcodes[0]={type:3,min:0,max:1/0},this.rules[1].opcodes[1]={type:1,children:[2,3]},this.rules[1].opcodes[2]={type:4,index:2},this.rules[1].opcodes[3]={type:4,index:3},this.rules[2].opcodes=[],this.rules[2].opcodes[0]={type:1,children:[1,2,3]},this.rules[2].opcodes[1]={type:5,min:0,max:46},this.rules[2].opcodes[2]={type:5,min:48,max:125},this.rules[2].opcodes[3]={type:5,min:127,max:1114111},this.rules[3].opcodes=[],this.rules[3].opcodes[0]={type:2,children:[1,2]},this.rules[3].opcodes[1]={type:7,string:[126]},this.rules[3].opcodes[2]={type:1,children:[3,4]},this.rules[3].opcodes[3]={type:7,string:[48]},this.rules[3].opcodes[4]={type:7,string:[49]},this.rules[4].opcodes=[],this.rules[4].opcodes[0]={type:1,children:[1,2]},this.rules[4].opcodes[1]={type:4,index:5},this.rules[4].opcodes[2]={type:4,index:6},this.rules[5].opcodes=[],this.rules[5].opcodes[0]={type:1,children:[1,2]},this.rules[5].opcodes[1]={type:6,string:[48]},this.rules[5].opcodes[2]={type:2,children:[3,4]},this.rules[5].opcodes[3]={type:5,min:49,max:57},this.rules[5].opcodes[4]={type:3,min:0,max:1/0},this.rules[5].opcodes[5]={type:5,min:48,max:57},this.rules[6].opcodes=[],this.rules[6].opcodes[0]={type:7,string:[45]},this.rules[7].opcodes=[],this.rules[7].opcodes[0]={type:7,string:[47]},this.toString=function toString(){let s="";return s+="; JavaScript Object Notation (JSON) Pointer ABNF syntax\n",s+="; https://datatracker.ietf.org/doc/html/rfc6901\n",s+="json-pointer    = *( slash reference-token ) ; MODIFICATION: surrogate text rule used\n",s+="reference-token = *( unescaped / escaped )\n",s+="unescaped       = %x00-2E / %x30-7D / %x7F-10FFFF\n",s+="                ; %x2F ('/') and %x7E ('~') are excluded from 'unescaped'\n",s+='escaped         = "~" ( "0" / "1" )\n',s+="                ; representing '~' and '/', respectively\n",s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc6901#section-4\n",s+="array-location  = array-index / array-dash\n",s+="array-index     = %x30 / ( %x31-39 *(%x30-39) )\n",s+='                ; "0", or digits without a leading "0"\n',s+='array-dash      = "-"\n',s+="\n",s+="; Surrogate named rules\n",s+='slash           = "/"\n','; JavaScript Object Notation (JSON) Pointer ABNF syntax\n; https://datatracker.ietf.org/doc/html/rfc6901\njson-pointer    = *( slash reference-token ) ; MODIFICATION: surrogate text rule used\nreference-token = *( unescaped / escaped )\nunescaped       = %x00-2E / %x30-7D / %x7F-10FFFF\n                ; %x2F (\'/\') and %x7E (\'~\') are excluded from \'unescaped\'\nescaped         = "~" ( "0" / "1" )\n                ; representing \'~\' and \'/\', respectively\n\n; https://datatracker.ietf.org/doc/html/rfc6901#section-4\narray-location  = array-index / array-dash\narray-index     = %x30 / ( %x31-39 *(%x30-39) )\n                ; "0", or digits without a leading "0"\narray-dash      = "-"\n\n; Surrogate named rules\nslash           = "/"\n'}}class JSONPointerError extends Error{constructor(s,o=void 0){if(super(s,o),this.name=this.constructor.name,"string"==typeof s&&(this.message=s),"function"==typeof Error.captureStackTrace?Error.captureStackTrace(this,this.constructor):this.stack=new Error(s).stack,null!=o&&"object"==typeof o&&Object.prototype.hasOwnProperty.call(o,"cause")&&!("cause"in this)){const{cause:s}=o;this.cause=s,s instanceof Error&&"stack"in s&&(this.stack=`${this.stack}\nCAUSE: ${s.stack}`)}if(null!=o&&"object"==typeof o){const{cause:s,...i}=o;Object.assign(this,i)}}}const Ip=JSONPointerError;const Tp=class JSONPointerParseError extends Ip{},callbacks_cst=s=>(o,i,a,u,_)=>{if("object"!=typeof _||null===_||Array.isArray(_))throw new Tp("parser's user data must be an object");if(o===Pp.SEM_PRE){const o={type:s,text:jp.charsToString(i,a,u),start:a,length:u,children:[]};if(_.stack.length>0){_.stack[_.stack.length-1].children.push(o)}else _.root=o;_.stack.push(o)}o===Pp.SEM_POST&&_.stack.pop()};const Np=class CSTTranslator_CSTTranslator extends Op{constructor(){super(),this.callbacks["json-pointer"]=callbacks_cst("json-pointer"),this.callbacks["reference-token"]=callbacks_cst("reference-token"),this.callbacks.slash=callbacks_cst("text")}getTree(){const s={stack:[],root:null};return this.translate(s),delete s.stack,s}},es_unescape=s=>{if("string"!=typeof s)throw new TypeError("Reference token must be a string");return s.replace(/~1/g,"/").replace(/~0/g,"~")};const Mp=class ASTTranslator extends Np{getTree(){const{root:s}=super.getTree();return s.children.filter((({type:s})=>"reference-token"===s)).map((({text:s})=>es_unescape(s)))}};const Rp=class Expectations extends Array{toString(){return this.map((s=>`"${String(s)}"`)).join(", ")}};const Dp=class Trace extends Ap{inferExpectations(){const s=this.displayTrace().split("\n"),o=new Set;let i=-1;for(let a=0;a<s.length;a++){const u=s[a];if(u.includes("M|")){const s=u.match(/]'(.*)'$/);s&&s[1]&&(i=a)}if(a>i){const s=u.match(/N\|\[TLS\(([^)]+)\)]/);s&&o.add(s[1])}}return new Rp(...o)}},Lp=new grammar,es_parse=(s,{translator:o=new Mp,stats:i=!1,trace:a=!1}={})=>{if("string"!=typeof s)throw new TypeError("JSON Pointer must be a string");try{const u=new kp;o&&(u.ast=o),i&&(u.stats=new Cp),a&&(u.trace=new Dp);const _=u.parse(Lp,"json-pointer",s);return{result:_,tree:_.success&&o?u.ast.getTree():void 0,stats:u.stats,trace:u.trace}}catch(o){throw new Tp("Unexpected error during JSON Pointer parsing",{cause:o,jsonPointer:s})}};new grammar,new kp,new grammar,new kp;const Fp=new grammar,Bp=new kp,array_index=s=>{if("string"!=typeof s)return!1;try{return Bp.parse(Fp,"array-index",s).success}catch{return!1}},$p=new grammar,qp=new kp,array_dash=s=>{if("string"!=typeof s)return!1;try{return qp.parse($p,"array-dash",s).success}catch{return!1}},es_escape=s=>{if("string"!=typeof s&&"number"!=typeof s)throw new TypeError("Reference token must be a string or number");return String(s).replace(/~/g,"~0").replace(/\//g,"~1")};const Up=class JSONPointerCompileError extends Ip{},es_compile=s=>{if(!Array.isArray(s))throw new TypeError("Reference tokens must be a list of strings or numbers");try{return 0===s.length?"":`/${s.map((s=>{if("string"!=typeof s&&"number"!=typeof s)throw new TypeError("Reference token must be a string or number");return es_escape(String(s))})).join("/")}`}catch(o){throw new Up("Unexpected error during JSON Pointer compilation",{cause:o,referenceTokens:s})}};const Vp=class TraceBuilder{#e;#t;#r;constructor(s,o={}){this.#e=s,this.#e.steps=[],this.#e.failed=!1,this.#e.failedAt=-1,this.#e.message=`JSON Pointer "${o.jsonPointer}" was successfully evaluated against the provided value`,this.#e.context={...o,realm:o.realm.name},this.#t=[],this.#r=o.realm}step({referenceToken:s,input:o,output:i,success:a=!0,reason:u}){const _=this.#t.length;this.#t.push(s);const w={referenceToken:s,referenceTokenPosition:_,input:o,inputType:this.#r.isObject(o)?"object":this.#r.isArray(o)?"array":"unrecognized",output:i,success:a};u&&(w.reason=u),this.#e.steps.push(w),a||(this.#e.failed=!0,this.#e.failedAt=_,this.#e.message=u)}};const zp=class EvaluationRealm{name="";isArray(s){throw new Ip("Realm.isArray(node) must be implemented in a subclass")}isObject(s){throw new Ip("Realm.isObject(node) must be implemented in a subclass")}sizeOf(s){throw new Ip("Realm.sizeOf(node) must be implemented in a subclass")}has(s,o){throw new Ip("Realm.has(node) must be implemented in a subclass")}evaluate(s,o){throw new Ip("Realm.evaluate(node) must be implemented in a subclass")}};const Wp=class JSONPointerEvaluateError extends Ip{};const Jp=class JSONPointerIndexError extends Wp{};const Hp=class JSONEvaluationRealm extends zp{name="json";isArray(s){return Array.isArray(s)}isObject(s){return"object"==typeof s&&null!==s&&!this.isArray(s)}sizeOf(s){return this.isArray(s)?s.length:this.isObject(s)?Object.keys(s).length:0}has(s,o){if(this.isArray(s)){const i=Number(o),a=i>>>0;if(i!==a)throw new Jp(`Invalid array index "${o}": index must be an unsinged 32-bit integer`,{referenceToken:o,currentValue:s,realm:this.name});return a<this.sizeOf(s)&&Object.prototype.hasOwnProperty.call(s,i)}return!!this.isObject(s)&&Object.prototype.hasOwnProperty.call(s,o)}evaluate(s,o){return this.isArray(s)?s[Number(o)]:s[o]}};const Kp=class JSONPointerTypeError extends Wp{};const Gp=class JSONPointerKeyError extends Wp{},es_evaluate=(s,o,{strictArrays:i=!0,strictObjects:a=!0,realm:u=new Hp,trace:_=!0}={})=>{const{result:w,tree:x,trace:C}=es_parse(o,{trace:!!_}),j="object"==typeof _&&null!==_?new Vp(_,{jsonPointer:o,referenceTokens:x,strictArrays:i,strictObjects:a,realm:u,value:s}):null;try{let _;if(!w.success){let i=`Invalid JSON Pointer: "${o}". Syntax error at position ${w.maxMatched}`;throw i+=C?`, expected ${C.inferExpectations()}`:"",new Wp(i,{jsonPointer:o,currentValue:s,realm:u.name})}return x.reduce(((s,w,C)=>{if(u.isArray(s)){if(array_dash(w)){if(i)throw new Jp(`Invalid array index "-" at position ${C} in "${o}". The "-" token always refers to a nonexistent element during evaluation`,{jsonPointer:o,referenceTokens:x,referenceToken:w,referenceTokenPosition:C,currentValue:s,realm:u.name});return _=u.evaluate(s,String(u.sizeOf(s))),null==j||j.step({referenceToken:w,input:s,output:_}),_}if(!array_index(w))throw new Jp(`Invalid array index "${w}" at position ${C} in "${o}": index MUST be "0", or digits without a leading "0"`,{jsonPointer:o,referenceTokens:x,referenceToken:w,referenceTokenPosition:C,currentValue:s,realm:u.name});const a=Number(w);if(!Number.isSafeInteger(a))throw new Jp(`Invalid array index "${w}" at position ${C} in "${o}": index must be a safe integer`,{jsonPointer:o,referenceTokens:x,referenceToken:w,referenceTokenPosition:C,currentValue:s,realm:u.name});if(!u.has(s,w)&&i)throw new Jp(`Invalid array index "${w}" at position ${C} in "${o}": index not found in array`,{jsonPointer:o,referenceTokens:x,referenceToken:w,referenceTokenPosition:C,currentValue:s,realm:u.name});return _=u.evaluate(s,w),null==j||j.step({referenceToken:w,input:s,output:_}),_}if(u.isObject(s)){if(!u.has(s,w)&&a)throw new Gp(`Invalid object key "${w}" at position ${C} in "${o}": key not found in object`,{jsonPointer:o,referenceTokens:x,referenceToken:w,referenceTokenPosition:C,currentValue:s,realm:u.name});return _=u.evaluate(s,w),null==j||j.step({referenceToken:w,input:s,output:_}),_}throw new Kp(`Invalid reference token "${w}" at position ${C} in "${o}": cannot be applied to a non-object/non-array value`,{jsonPointer:o,referenceTokens:x,referenceToken:w,referenceTokenPosition:C,currentValue:s,realm:u.name})}),s)}catch(s){if(null==j||j.step({referenceToken:s.referenceToken,input:s.currentValue,success:!1,reason:s.message}),s instanceof Wp)throw s;throw new Wp("Unexpected error during JSON Pointer evaluation",{cause:s,jsonPointer:o,referenceTokens:x})}};const Yp=class ApiDOMEvaluationRealm extends zp{name="apidom";isArray(s){return Mu(s)}isObject(s){return Nu(s)}sizeOf(s){return this.isArray(s)||this.isObject(s)?s.length:0}has(s,o){if(this.isArray(s)){const i=Number(o),a=i>>>0;if(i!==a)throw new Jp(`Invalid array index "${o}": index must be an unsinged 32-bit integer`,{referenceToken:o,currentValue:s,realm:this.name});return a<this.sizeOf(s)}if(this.isObject(s)){const i=s.keys(),a=new Set(i);if(i.length!==a.size)throw new Gp(`Object key "${o}" is not unique — JSON Pointer requires unique member names`,{referenceToken:o,currentValue:s,realm:this.name});return s.hasKey(o)}return!1}evaluate(s,o){return this.isArray(s)?s.get(Number(o)):s.get(o)}},apidom_evaluate=(s,o,i={})=>es_evaluate(s,o,{...i,realm:new Yp});class Callback extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="callback"}}const Xp=Callback;class Components extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="components"}get schemas(){return this.get("schemas")}set schemas(s){this.set("schemas",s)}get responses(){return this.get("responses")}set responses(s){this.set("responses",s)}get parameters(){return this.get("parameters")}set parameters(s){this.set("parameters",s)}get examples(){return this.get("examples")}set examples(s){this.set("examples",s)}get requestBodies(){return this.get("requestBodies")}set requestBodies(s){this.set("requestBodies",s)}get headers(){return this.get("headers")}set headers(s){this.set("headers",s)}get securitySchemes(){return this.get("securitySchemes")}set securitySchemes(s){this.set("securitySchemes",s)}get links(){return this.get("links")}set links(s){this.set("links",s)}get callbacks(){return this.get("callbacks")}set callbacks(s){this.set("callbacks",s)}}const Qp=Components;class Contact extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="contact"}get name(){return this.get("name")}set name(s){this.set("name",s)}get url(){return this.get("url")}set url(s){this.set("url",s)}get email(){return this.get("email")}set email(s){this.set("email",s)}}const Zp=Contact;class Discriminator extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="discriminator"}get propertyName(){return this.get("propertyName")}set propertyName(s){this.set("propertyName",s)}get mapping(){return this.get("mapping")}set mapping(s){this.set("mapping",s)}}const th=Discriminator;class Encoding extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="encoding"}get contentType(){return this.get("contentType")}set contentType(s){this.set("contentType",s)}get headers(){return this.get("headers")}set headers(s){this.set("headers",s)}get style(){return this.get("style")}set style(s){this.set("style",s)}get explode(){return this.get("explode")}set explode(s){this.set("explode",s)}get allowedReserved(){return this.get("allowedReserved")}set allowedReserved(s){this.set("allowedReserved",s)}}const rh=Encoding;class Example extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="example"}get summary(){return this.get("summary")}set summary(s){this.set("summary",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get value(){return this.get("value")}set value(s){this.set("value",s)}get externalValue(){return this.get("externalValue")}set externalValue(s){this.set("externalValue",s)}}const uh=Example;class ExternalDocumentation extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="externalDocumentation"}get description(){return this.get("description")}set description(s){this.set("description",s)}get url(){return this.get("url")}set url(s){this.set("url",s)}}const dh=ExternalDocumentation;class Header extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="header"}get required(){return this.hasKey("required")?this.get("required"):new Su.bd(!1)}set required(s){this.set("required",s)}get deprecated(){return this.hasKey("deprecated")?this.get("deprecated"):new Su.bd(!1)}set deprecated(s){this.set("deprecated",s)}get allowEmptyValue(){return this.get("allowEmptyValue")}set allowEmptyValue(s){this.set("allowEmptyValue",s)}get style(){return this.get("style")}set style(s){this.set("style",s)}get explode(){return this.get("explode")}set explode(s){this.set("explode",s)}get allowReserved(){return this.get("allowReserved")}set allowReserved(s){this.set("allowReserved",s)}get schema(){return this.get("schema")}set schema(s){this.set("schema",s)}get example(){return this.get("example")}set example(s){this.set("example",s)}get examples(){return this.get("examples")}set examples(s){this.set("examples",s)}get contentProp(){return this.get("content")}set contentProp(s){this.set("content",s)}}Object.defineProperty(Header.prototype,"description",{get(){return this.get("description")},set(s){this.set("description",s)},enumerable:!0});const fh=Header;class Info extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="info",this.classes.push("info")}get title(){return this.get("title")}set title(s){this.set("title",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get termsOfService(){return this.get("termsOfService")}set termsOfService(s){this.set("termsOfService",s)}get contact(){return this.get("contact")}set contact(s){this.set("contact",s)}get license(){return this.get("license")}set license(s){this.set("license",s)}get version(){return this.get("version")}set version(s){this.set("version",s)}}const vh=Info;class License extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="license"}get name(){return this.get("name")}set name(s){this.set("name",s)}get url(){return this.get("url")}set url(s){this.set("url",s)}}const _h=License;class Link extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="link"}get operationRef(){return this.get("operationRef")}set operationRef(s){this.set("operationRef",s)}get operationId(){return this.get("operationId")}set operationId(s){this.set("operationId",s)}get operation(){var s,o;return ju(this.operationRef)?null===(s=this.operationRef)||void 0===s?void 0:s.meta.get("operation"):ju(this.operationId)?null===(o=this.operationId)||void 0===o?void 0:o.meta.get("operation"):void 0}set operation(s){this.set("operation",s)}get parameters(){return this.get("parameters")}set parameters(s){this.set("parameters",s)}get requestBody(){return this.get("requestBody")}set requestBody(s){this.set("requestBody",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get server(){return this.get("server")}set server(s){this.set("server",s)}}const wh=Link;class MediaType extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="mediaType"}get schema(){return this.get("schema")}set schema(s){this.set("schema",s)}get example(){return this.get("example")}set example(s){this.set("example",s)}get examples(){return this.get("examples")}set examples(s){this.set("examples",s)}get encoding(){return this.get("encoding")}set encoding(s){this.set("encoding",s)}}const Oh=MediaType;class OAuthFlow extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="oAuthFlow"}get authorizationUrl(){return this.get("authorizationUrl")}set authorizationUrl(s){this.set("authorizationUrl",s)}get tokenUrl(){return this.get("tokenUrl")}set tokenUrl(s){this.set("tokenUrl",s)}get refreshUrl(){return this.get("refreshUrl")}set refreshUrl(s){this.set("refreshUrl",s)}get scopes(){return this.get("scopes")}set scopes(s){this.set("scopes",s)}}const jh=OAuthFlow;class OAuthFlows extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="oAuthFlows"}get implicit(){return this.get("implicit")}set implicit(s){this.set("implicit",s)}get password(){return this.get("password")}set password(s){this.set("password",s)}get clientCredentials(){return this.get("clientCredentials")}set clientCredentials(s){this.set("clientCredentials",s)}get authorizationCode(){return this.get("authorizationCode")}set authorizationCode(s){this.set("authorizationCode",s)}}const Ph=OAuthFlows;class Openapi extends Su.Om{constructor(s,o,i){super(s,o,i),this.element="openapi",this.classes.push("spec-version"),this.classes.push("version")}}const Ih=Openapi;class OpenApi3_0 extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="openApi3_0",this.classes.push("api")}get openapi(){return this.get("openapi")}set openapi(s){this.set("openapi",s)}get info(){return this.get("info")}set info(s){this.set("info",s)}get servers(){return this.get("servers")}set servers(s){this.set("servers",s)}get paths(){return this.get("paths")}set paths(s){this.set("paths",s)}get components(){return this.get("components")}set components(s){this.set("components",s)}get security(){return this.get("security")}set security(s){this.set("security",s)}get tags(){return this.get("tags")}set tags(s){this.set("tags",s)}get externalDocs(){return this.get("externalDocs")}set externalDocs(s){this.set("externalDocs",s)}}const Rh=OpenApi3_0;class Operation extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="operation"}get tags(){return this.get("tags")}set tags(s){this.set("tags",s)}get summary(){return this.get("summary")}set summary(s){this.set("summary",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}set externalDocs(s){this.set("externalDocs",s)}get externalDocs(){return this.get("externalDocs")}get operationId(){return this.get("operationId")}set operationId(s){this.set("operationId",s)}get parameters(){return this.get("parameters")}set parameters(s){this.set("parameters",s)}get requestBody(){return this.get("requestBody")}set requestBody(s){this.set("requestBody",s)}get responses(){return this.get("responses")}set responses(s){this.set("responses",s)}get callbacks(){return this.get("callbacks")}set callbacks(s){this.set("callbacks",s)}get deprecated(){return this.hasKey("deprecated")?this.get("deprecated"):new Su.bd(!1)}set deprecated(s){this.set("deprecated",s)}get security(){return this.get("security")}set security(s){this.set("security",s)}get servers(){return this.get("severs")}set servers(s){this.set("servers",s)}}const Dh=Operation;class Parameter extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="parameter"}get name(){return this.get("name")}set name(s){this.set("name",s)}get in(){return this.get("in")}set in(s){this.set("in",s)}get required(){return this.hasKey("required")?this.get("required"):new Su.bd(!1)}set required(s){this.set("required",s)}get deprecated(){return this.hasKey("deprecated")?this.get("deprecated"):new Su.bd(!1)}set deprecated(s){this.set("deprecated",s)}get allowEmptyValue(){return this.get("allowEmptyValue")}set allowEmptyValue(s){this.set("allowEmptyValue",s)}get style(){return this.get("style")}set style(s){this.set("style",s)}get explode(){return this.get("explode")}set explode(s){this.set("explode",s)}get allowReserved(){return this.get("allowReserved")}set allowReserved(s){this.set("allowReserved",s)}get schema(){return this.get("schema")}set schema(s){this.set("schema",s)}get example(){return this.get("example")}set example(s){this.set("example",s)}get examples(){return this.get("examples")}set examples(s){this.set("examples",s)}get contentProp(){return this.get("content")}set contentProp(s){this.set("content",s)}}Object.defineProperty(Parameter.prototype,"description",{get(){return this.get("description")},set(s){this.set("description",s)},enumerable:!0});const Lh=Parameter;class PathItem extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="pathItem"}get $ref(){return this.get("$ref")}set $ref(s){this.set("$ref",s)}get summary(){return this.get("summary")}set summary(s){this.set("summary",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get GET(){return this.get("get")}set GET(s){this.set("GET",s)}get PUT(){return this.get("put")}set PUT(s){this.set("PUT",s)}get POST(){return this.get("post")}set POST(s){this.set("POST",s)}get DELETE(){return this.get("delete")}set DELETE(s){this.set("DELETE",s)}get OPTIONS(){return this.get("options")}set OPTIONS(s){this.set("OPTIONS",s)}get HEAD(){return this.get("head")}set HEAD(s){this.set("HEAD",s)}get PATCH(){return this.get("patch")}set PATCH(s){this.set("PATCH",s)}get TRACE(){return this.get("trace")}set TRACE(s){this.set("TRACE",s)}get servers(){return this.get("servers")}set servers(s){this.set("servers",s)}get parameters(){return this.get("parameters")}set parameters(s){this.set("parameters",s)}}const Fh=PathItem;class Paths extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="paths"}}const Jh=Paths;class Reference extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="reference",this.classes.push("openapi-reference")}get $ref(){return this.get("$ref")}set $ref(s){this.set("$ref",s)}}const Hh=Reference;class RequestBody extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="requestBody"}get description(){return this.get("description")}set description(s){this.set("description",s)}get contentProp(){return this.get("content")}set contentProp(s){this.set("content",s)}get required(){return this.hasKey("required")?this.get("required"):new Su.bd(!1)}set required(s){this.set("required",s)}}const Kh=RequestBody;class Response_Response extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="response"}get description(){return this.get("description")}set description(s){this.set("description",s)}get headers(){return this.get("headers")}set headers(s){this.set("headers",s)}get contentProp(){return this.get("content")}set contentProp(s){this.set("content",s)}get links(){return this.get("links")}set links(s){this.set("links",s)}}const Gh=Response_Response;class Responses extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="responses"}get default(){return this.get("default")}set default(s){this.set("default",s)}}const Qh=Responses;const td=class UnsupportedOperationError extends Ko{};class JSONSchema extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="JSONSchemaDraft4"}get idProp(){return this.get("id")}set idProp(s){this.set("id",s)}get $schema(){return this.get("$schema")}set $schema(s){this.set("$schema",s)}get multipleOf(){return this.get("multipleOf")}set multipleOf(s){this.set("multipleOf",s)}get maximum(){return this.get("maximum")}set maximum(s){this.set("maximum",s)}get exclusiveMaximum(){return this.get("exclusiveMaximum")}set exclusiveMaximum(s){this.set("exclusiveMaximum",s)}get minimum(){return this.get("minimum")}set minimum(s){this.set("minimum",s)}get exclusiveMinimum(){return this.get("exclusiveMinimum")}set exclusiveMinimum(s){this.set("exclusiveMinimum",s)}get maxLength(){return this.get("maxLength")}set maxLength(s){this.set("maxLength",s)}get minLength(){return this.get("minLength")}set minLength(s){this.set("minLength",s)}get pattern(){return this.get("pattern")}set pattern(s){this.set("pattern",s)}get additionalItems(){return this.get("additionalItems")}set additionalItems(s){this.set("additionalItems",s)}get items(){return this.get("items")}set items(s){this.set("items",s)}get maxItems(){return this.get("maxItems")}set maxItems(s){this.set("maxItems",s)}get minItems(){return this.get("minItems")}set minItems(s){this.set("minItems",s)}get uniqueItems(){return this.get("uniqueItems")}set uniqueItems(s){this.set("uniqueItems",s)}get maxProperties(){return this.get("maxProperties")}set maxProperties(s){this.set("maxProperties",s)}get minProperties(){return this.get("minProperties")}set minProperties(s){this.set("minProperties",s)}get required(){return this.get("required")}set required(s){this.set("required",s)}get properties(){return this.get("properties")}set properties(s){this.set("properties",s)}get additionalProperties(){return this.get("additionalProperties")}set additionalProperties(s){this.set("additionalProperties",s)}get patternProperties(){return this.get("patternProperties")}set patternProperties(s){this.set("patternProperties",s)}get dependencies(){return this.get("dependencies")}set dependencies(s){this.set("dependencies",s)}get enum(){return this.get("enum")}set enum(s){this.set("enum",s)}get type(){return this.get("type")}set type(s){this.set("type",s)}get allOf(){return this.get("allOf")}set allOf(s){this.set("allOf",s)}get anyOf(){return this.get("anyOf")}set anyOf(s){this.set("anyOf",s)}get oneOf(){return this.get("oneOf")}set oneOf(s){this.set("oneOf",s)}get not(){return this.get("not")}set not(s){this.set("not",s)}get definitions(){return this.get("definitions")}set definitions(s){this.set("definitions",s)}get title(){return this.get("title")}set title(s){this.set("title",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get default(){return this.get("default")}set default(s){this.set("default",s)}get format(){return this.get("format")}set format(s){this.set("format",s)}get base(){return this.get("base")}set base(s){this.set("base",s)}get links(){return this.get("links")}set links(s){this.set("links",s)}get media(){return this.get("media")}set media(s){this.set("media",s)}get readOnly(){return this.get("readOnly")}set readOnly(s){this.set("readOnly",s)}}const sd=JSONSchema;class JSONReference extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="JSONReference",this.classes.push("json-reference")}get $ref(){return this.get("$ref")}set $ref(s){this.set("$ref",s)}}const id=JSONReference;class Media extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="media"}get binaryEncoding(){return this.get("binaryEncoding")}set binaryEncoding(s){this.set("binaryEncoding",s)}get type(){return this.get("type")}set type(s){this.set("type",s)}}const cd=Media;class LinkDescription extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="linkDescription"}get href(){return this.get("href")}set href(s){this.set("href",s)}get rel(){return this.get("rel")}set rel(s){this.set("rel",s)}get title(){return this.get("title")}set title(s){this.set("title",s)}get targetSchema(){return this.get("targetSchema")}set targetSchema(s){this.set("targetSchema",s)}get mediaType(){return this.get("mediaType")}set mediaType(s){this.set("mediaType",s)}get method(){return this.get("method")}set method(s){this.set("method",s)}get encType(){return this.get("encType")}set encType(s){this.set("encType",s)}get schema(){return this.get("schema")}set schema(s){this.set("schema",s)}}const ld=LinkDescription,emptyElement=s=>{const o=s.meta.length>0?cloneDeep(s.meta):void 0,i=s.attributes.length>0?cloneDeep(s.attributes):void 0;return new s.constructor(void 0,o,i)},cloneUnlessOtherwiseSpecified=(s,o)=>o.clone&&o.isMergeableElement(s)?deepmerge(emptyElement(s),s,o):s,ud={clone:!0,isMergeableElement:s=>Nu(s)||Mu(s),arrayElementMerge:(s,o,i)=>s.concat(o)["fantasy-land/map"]((s=>cloneUnlessOtherwiseSpecified(s,i))),objectElementMerge:(s,o,i)=>{const a=Nu(s)?emptyElement(s):emptyElement(o);return Nu(s)&&s.forEach(((s,o,u)=>{const _=cloneShallow(u);_.value=cloneUnlessOtherwiseSpecified(s,i),a.content.push(_)})),o.forEach(((o,u,_)=>{const w=serializers_value(u);let x;if(Nu(s)&&s.hasKey(w)&&i.isMergeableElement(o)){const a=s.get(w);x=cloneShallow(_),x.value=((s,o)=>{if("function"!=typeof o.customMerge)return deepmerge;const i=o.customMerge(s,o);return"function"==typeof i?i:deepmerge})(u,i)(a,o,i)}else x=cloneShallow(_),x.value=cloneUnlessOtherwiseSpecified(o,i);a.remove(w),a.content.push(x)})),a},customMerge:void 0,customMetaMerge:void 0,customAttributesMerge:void 0},deepmerge=(s,o,i)=>{var a,u,_;const w={...ud,...i};w.isMergeableElement=null!==(a=w.isMergeableElement)&&void 0!==a?a:ud.isMergeableElement,w.arrayElementMerge=null!==(u=w.arrayElementMerge)&&void 0!==u?u:ud.arrayElementMerge,w.objectElementMerge=null!==(_=w.objectElementMerge)&&void 0!==_?_:ud.objectElementMerge;const x=Mu(o);if(!(x===Mu(s)))return cloneUnlessOtherwiseSpecified(o,w);const C=x&&"function"==typeof w.arrayElementMerge?w.arrayElementMerge(s,o,w):w.objectElementMerge(s,o,w);return C.meta=(s=>"function"!=typeof s.customMetaMerge?s=>cloneDeep(s):s.customMetaMerge)(w)(s.meta,o.meta),C.attributes=(s=>"function"!=typeof s.customAttributesMerge?s=>cloneDeep(s):s.customAttributesMerge)(w)(s.attributes,o.attributes),C};deepmerge.all=(s,o)=>{if(!Array.isArray(s))throw new TypeError("First argument of deepmerge should be an array.");return 0===s.length?new Su.Sh:s.reduce(((s,i)=>deepmerge(s,i,o)),emptyElement(s[0]))};const dd=deepmerge;const md=class Visitor_Visitor{element;constructor(s){Object.assign(this,s)}copyMetaAndAttributes(s,o){(s.meta.length>0||o.meta.length>0)&&(o.meta=dd(o.meta,s.meta)),hasElementSourceMap(s)&&assignSourceMap(o,s),(s.attributes.length>0||s.meta.length>0)&&(o.attributes=dd(o.attributes,s.attributes))}};const yd=class FallbackVisitor extends md{enter(s){return this.element=cloneDeep(s),qu}},copyProps=(s,o,i=[])=>{const a=Object.getOwnPropertyDescriptors(o);for(let s of i)delete a[s];Object.defineProperties(s,a)},protoChain=(s,o=[s])=>{const i=Object.getPrototypeOf(s);return null===i?o:protoChain(i,[...o,i])},hardMixProtos=(s,o,i=[])=>{var a;const u=null!==(a=((...s)=>{if(0===s.length)return;let o;const i=s.map((s=>protoChain(s)));for(;i.every((s=>s.length>0));){const s=i.map((s=>s.pop())),a=s[0];if(!s.every((s=>s===a)))break;o=a}return o})(...s))&&void 0!==a?a:Object.prototype,_=Object.create(u),w=protoChain(u);for(let o of s){let s=protoChain(o);for(let o=s.length-1;o>=0;o--){let a=s[o];-1===w.indexOf(a)&&(copyProps(_,a,["constructor",...i]),w.push(a))}}return _.constructor=o,_},unique=s=>s.filter(((o,i)=>s.indexOf(o)==i)),getIngredientWithProp=(s,o)=>{const i=o.map((s=>protoChain(s)));let a=0,u=!0;for(;u;){u=!1;for(let _=o.length-1;_>=0;_--){const o=i[_][a];if(null!=o&&(u=!0,null!=Object.getOwnPropertyDescriptor(o,s)))return i[_][0]}a++}},proxyMix=(s,o=Object.prototype)=>new Proxy({},{getPrototypeOf:()=>o,setPrototypeOf(){throw Error("Cannot set prototype of Proxies created by ts-mixer")},getOwnPropertyDescriptor:(o,i)=>Object.getOwnPropertyDescriptor(getIngredientWithProp(i,s)||{},i),defineProperty(){throw new Error("Cannot define new properties on Proxies created by ts-mixer")},has:(i,a)=>void 0!==getIngredientWithProp(a,s)||void 0!==o[a],get:(i,a)=>(getIngredientWithProp(a,s)||o)[a],set(o,i,a){const u=getIngredientWithProp(i,s);if(void 0===u)throw new Error("Cannot set new properties on Proxies created by ts-mixer");return u[i]=a,!0},deleteProperty(){throw new Error("Cannot delete properties on Proxies created by ts-mixer")},ownKeys:()=>s.map(Object.getOwnPropertyNames).reduce(((s,o)=>o.concat(s.filter((s=>o.indexOf(s)<0)))))}),vd=null,_d="copy",Sd="copy",Ed="deep",wd=new WeakMap,getMixinsForClass=s=>wd.get(s),mergeObjectsOfDecorators=(s,o)=>{var i,a;const u=unique([...Object.getOwnPropertyNames(s),...Object.getOwnPropertyNames(o)]),_={};for(let w of u)_[w]=unique([...null!==(i=null==s?void 0:s[w])&&void 0!==i?i:[],...null!==(a=null==o?void 0:o[w])&&void 0!==a?a:[]]);return _},mergePropertyAndMethodDecorators=(s,o)=>{var i,a,u,_;return{property:mergeObjectsOfDecorators(null!==(i=null==s?void 0:s.property)&&void 0!==i?i:{},null!==(a=null==o?void 0:o.property)&&void 0!==a?a:{}),method:mergeObjectsOfDecorators(null!==(u=null==s?void 0:s.method)&&void 0!==u?u:{},null!==(_=null==o?void 0:o.method)&&void 0!==_?_:{})}},mergeDecorators=(s,o)=>{var i,a,u,_,w,x;return{class:unique([...null!==(i=null==s?void 0:s.class)&&void 0!==i?i:[],...null!==(a=null==o?void 0:o.class)&&void 0!==a?a:[]]),static:mergePropertyAndMethodDecorators(null!==(u=null==s?void 0:s.static)&&void 0!==u?u:{},null!==(_=null==o?void 0:o.static)&&void 0!==_?_:{}),instance:mergePropertyAndMethodDecorators(null!==(w=null==s?void 0:s.instance)&&void 0!==w?w:{},null!==(x=null==o?void 0:o.instance)&&void 0!==x?x:{})}},xd=new Map,deepDecoratorSearch=(...s)=>{const o=((...s)=>{var o;const i=new Set,a=new Set([...s]);for(;a.size>0;)for(let s of a){const u=protoChain(s.prototype).map((s=>s.constructor)),_=[...u,...null!==(o=getMixinsForClass(s))&&void 0!==o?o:[]].filter((s=>!i.has(s)));for(let s of _)a.add(s);i.add(s),a.delete(s)}return[...i]})(...s).map((s=>xd.get(s))).filter((s=>!!s));return 0==o.length?{}:1==o.length?o[0]:o.reduce(((s,o)=>mergeDecorators(s,o)))},getDecoratorsForClass=s=>{let o=xd.get(s);return o||(o={},xd.set(s,o)),o};function Mixin(...s){var o,i,a;const u=s.map((s=>s.prototype)),_=vd;if(null!==_){const s=u.map((s=>s[_])).filter((s=>"function"==typeof s)),combinedInitFunction=function(...o){for(let i of s)i.apply(this,o)},o={[_]:combinedInitFunction};u.push(o)}function MixedClass(...o){for(const i of s)copyProps(this,new i(...o));null!==_&&"function"==typeof this[_]&&this[_].apply(this,o)}var w,x;MixedClass.prototype="copy"===Sd?hardMixProtos(u,MixedClass):(w=u,x=MixedClass,proxyMix([...w,{constructor:x}])),Object.setPrototypeOf(MixedClass,"copy"===_d?hardMixProtos(s,null,["prototype"]):proxyMix(s,Function.prototype));let C=MixedClass;if("none"!==Ed){const u="deep"===Ed?deepDecoratorSearch(...s):((...s)=>{const o=s.map((s=>getDecoratorsForClass(s)));return 0===o.length?{}:1===o.length?o[0]:o.reduce(((s,o)=>mergeDecorators(s,o)))})(...s);for(let s of null!==(o=null==u?void 0:u.class)&&void 0!==o?o:[]){const o=s(C);o&&(C=o)}applyPropAndMethodDecorators(null!==(i=null==u?void 0:u.static)&&void 0!==i?i:{},C),applyPropAndMethodDecorators(null!==(a=null==u?void 0:u.instance)&&void 0!==a?a:{},C.prototype)}var j,L;return j=C,L=s,wd.set(j,L),C}const applyPropAndMethodDecorators=(s,o)=>{const i=s.property,a=s.method;if(i)for(let s in i)for(let a of i[s])a(o,s);if(a)for(let s in a)for(let i of a[s])i(o,s,Object.getOwnPropertyDescriptor(o,s))};const kd=_curry1((function allPass(s){return $a(Aa(Ec,0,Oc("length",s)),(function(){for(var o=0,i=s.length;o<i;){if(!s[o].apply(this,arguments))return!1;o+=1}return!0}))}));const Od=_curry1((function isNotEmpty(s){return!gp(s)}));const Ad=_curry2((function or(s,o){return s||o}));var Cd=dc($a(1,ou(au,_curry2((function either(s,o){return _isFunction(s)?function _either(){return s.apply(this,arguments)||o.apply(this,arguments)}:hc(Ad)(s,o)}))(cu,Mc))));const Id=kd([Jc,Cd,Od]);const Td=_curry2((function pick(s,o){for(var i={},a=0;a<s.length;)s[a]in o&&(i[s[a]]=o[s[a]]),a+=1;return i}));const Nd=class SpecificationVisitor extends md{specObj;passingOptionsNames=["specObj","parent"];constructor({specObj:s,...o}){super({...o}),this.specObj=s}retrievePassingOptions(){return Td(this.passingOptionsNames,this)}retrieveFixedFields(s){const o=Qu(["visitors",...s,"fixedFields"],this.specObj);return"object"==typeof o&&null!==o?Object.keys(o):[]}retrieveVisitor(s){return Qo(Mc,["visitors",...s],this.specObj)?Qu(["visitors",...s],this.specObj):Qu(["visitors",...s,"$visitor"],this.specObj)}retrieveVisitorInstance(s,o={}){const i=this.retrievePassingOptions();return new(this.retrieveVisitor(s))({...i,...o})}toRefractedElement(s,o,i={}){const a=this.retrieveVisitorInstance(s,i);return a instanceof yd&&(null==a?void 0:a.constructor)===yd?cloneDeep(o):(visitor_visit(o,a,i),a.element)}};const Md=class FixedFieldsVisitor extends Nd{specPath;ignoredFields;constructor({specPath:s,ignoredFields:o,...i}){super({...i}),this.specPath=s,this.ignoredFields=o||[]}ObjectElement(s){const o=this.specPath(s),i=this.retrieveFixedFields(o);return s.forEach(((s,a,u)=>{if(ju(a)&&i.includes(serializers_value(a))&&!this.ignoredFields.includes(serializers_value(a))){const i=this.toRefractedElement([...o,"fixedFields",serializers_value(a)],s),_=new Su.Pr(cloneDeep(a),i);this.copyMetaAndAttributes(u,_),_.classes.push("fixed-field"),this.element.content.push(_)}else this.ignoredFields.includes(serializers_value(a))||this.element.content.push(cloneDeep(u))})),this.copyMetaAndAttributes(s,this.element),qu}};const Rd=class ParentSchemaAwareVisitor{parent;constructor({parent:s}){this.parent=s}},Dd=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof sd||s(a)&&o("JSONSchemaDraft4",a)&&i("object",a))),Ld=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof id||s(a)&&o("JSONReference",a)&&i("object",a))),Fd=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof cd||s(a)&&o("media",a)&&i("object",a))),Bd=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof ld||s(a)&&o("linkDescription",a)&&i("object",a)));class JSONSchemaVisitor extends(Mixin(Md,Rd,yd)){constructor(s){super(s),this.element=new sd,this.specPath=fc(["document","objects","JSONSchema"])}get defaultDialectIdentifier(){return"http://json-schema.org/draft-04/schema#"}ObjectElement(s){return this.handleDialectIdentifier(s),this.handleSchemaIdentifier(s),this.parent=this.element,Md.prototype.ObjectElement.call(this,s)}handleDialectIdentifier(s){if(bc(this.parent)&&!ju(s.get("$schema")))this.element.setMetaProperty("inheritedDialectIdentifier",this.defaultDialectIdentifier);else if(Dd(this.parent)&&!ju(s.get("$schema"))){const s=Na(serializers_value(this.parent.meta.get("inheritedDialectIdentifier")),serializers_value(this.parent.$schema));this.element.setMetaProperty("inheritedDialectIdentifier",s)}}handleSchemaIdentifier(s,o="id"){const i=void 0!==this.parent?cloneDeep(this.parent.getMetaProperty("ancestorsSchemaIdentifiers",[])):new Su.wE,a=serializers_value(s.get(o));Id(a)&&i.push(a),this.element.setMetaProperty("ancestorsSchemaIdentifiers",i)}}const $d=JSONSchemaVisitor,isJSONReferenceLikeElement=s=>Nu(s)&&s.hasKey("$ref");class ItemsVisitor extends(Mixin(Nd,Rd,yd)){ObjectElement(s){const o=isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"];return this.element=this.toRefractedElement(o,s),qu}ArrayElement(s){return this.element=new Su.wE,this.element.classes.push("json-schema-items"),s.forEach((s=>{const o=isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"],i=this.toRefractedElement(o,s);this.element.push(i)})),this.copyMetaAndAttributes(s,this.element),qu}}const Ud=ItemsVisitor;const Vd=class RequiredVisitor extends yd{ArrayElement(s){const o=this.enter(s);return this.element.classes.push("json-schema-required"),o}};const Wd=class PatternedFieldsVisitor extends Nd{specPath;ignoredFields;fieldPatternPredicate=es_F;constructor({specPath:s,ignoredFields:o,fieldPatternPredicate:i,...a}){super({...a}),this.specPath=s,this.ignoredFields=o||[],"function"==typeof i&&(this.fieldPatternPredicate=i)}ObjectElement(s){return s.forEach(((s,o,i)=>{if(!this.ignoredFields.includes(serializers_value(o))&&this.fieldPatternPredicate(serializers_value(o))){const a=this.specPath(s),u=this.toRefractedElement(a,s),_=new Su.Pr(cloneDeep(o),u);this.copyMetaAndAttributes(i,_),_.classes.push("patterned-field"),this.element.content.push(_)}else this.ignoredFields.includes(serializers_value(o))||this.element.content.push(cloneDeep(i))})),this.copyMetaAndAttributes(s,this.element),qu}};const Jd=class MapVisitor extends Wd{constructor(s){super(s),this.fieldPatternPredicate=Id}};class PropertiesVisitor extends(Mixin(Jd,Rd,yd)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("json-schema-properties"),this.specPath=s=>isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"]}}const Hd=PropertiesVisitor;class PatternPropertiesVisitor extends(Mixin(Jd,Rd,yd)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("json-schema-patternProperties"),this.specPath=s=>isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"]}}const Kd=PatternPropertiesVisitor;class DependenciesVisitor extends(Mixin(Jd,Rd,yd)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("json-schema-dependencies"),this.specPath=s=>isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"]}}const Gd=DependenciesVisitor;const Yd=class EnumVisitor extends yd{ArrayElement(s){const o=this.enter(s);return this.element.classes.push("json-schema-enum"),o}};const Xd=class TypeVisitor extends yd{StringElement(s){const o=this.enter(s);return this.element.classes.push("json-schema-type"),o}ArrayElement(s){const o=this.enter(s);return this.element.classes.push("json-schema-type"),o}};class AllOfVisitor extends(Mixin(Nd,Rd,yd)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("json-schema-allOf")}ArrayElement(s){return s.forEach((s=>{const o=isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"],i=this.toRefractedElement(o,s);this.element.push(i)})),this.copyMetaAndAttributes(s,this.element),qu}}const Qd=AllOfVisitor;class AnyOfVisitor extends(Mixin(Nd,Rd,yd)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("json-schema-anyOf")}ArrayElement(s){return s.forEach((s=>{const o=isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"],i=this.toRefractedElement(o,s);this.element.push(i)})),this.copyMetaAndAttributes(s,this.element),qu}}const Zd=AnyOfVisitor;class OneOfVisitor extends(Mixin(Nd,Rd,yd)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("json-schema-oneOf")}ArrayElement(s){return s.forEach((s=>{const o=isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"],i=this.toRefractedElement(o,s);this.element.push(i)})),this.copyMetaAndAttributes(s,this.element),qu}}const ef=OneOfVisitor;class DefinitionsVisitor extends(Mixin(Jd,Rd,yd)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("json-schema-definitions"),this.specPath=s=>isJSONReferenceLikeElement(s)?["document","objects","JSONReference"]:["document","objects","JSONSchema"]}}const rf=DefinitionsVisitor;class LinksVisitor extends(Mixin(Nd,Rd,yd)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("json-schema-links")}ArrayElement(s){return s.forEach((s=>{const o=this.toRefractedElement(["document","objects","LinkDescription"],s);this.element.push(o)})),this.copyMetaAndAttributes(s,this.element),qu}}const of=LinksVisitor;class JSONReferenceVisitor extends(Mixin(Md,yd)){constructor(s){super(s),this.element=new id,this.specPath=fc(["document","objects","JSONReference"])}ObjectElement(s){const o=Md.prototype.ObjectElement.call(this,s);return ju(this.element.$ref)&&this.element.classes.push("reference-element"),o}}const af=JSONReferenceVisitor;const cf=class $RefVisitor extends yd{StringElement(s){const o=this.enter(s);return this.element.classes.push("reference-value"),o}};const lf=_curry3((function ifElse(s,o,i){return $a(Math.max(s.length,o.length,i.length),(function _ifElse(){return s.apply(this,arguments)?o.apply(this,arguments):i.apply(this,arguments)}))}));const uf=_curry1((function comparator(s){return function(o,i){return s(o,i)?-1:s(i,o)?1:0}}));var hf=_curry2((function sort(s,o){return Array.prototype.slice.call(o,0).sort(s)}));const df=hf;var mf=_curry1((function(s){return _nth(0,s)}));const gf=mf;const yf=_curry1(_reduced);const bf=dc(Ju);const _f=ou(yp,Od);function _toConsumableArray(s){return function _arrayWithoutHoles(s){if(Array.isArray(s))return _arrayLikeToArray(s)}(s)||function _iterableToArray(s){if("undefined"!=typeof Symbol&&null!=s[Symbol.iterator]||null!=s["@@iterator"])return Array.from(s)}(s)||function _unsupportedIterableToArray(s,o){if(s){if("string"==typeof s)return _arrayLikeToArray(s,o);var i={}.toString.call(s).slice(8,-1);return"Object"===i&&s.constructor&&(i=s.constructor.name),"Map"===i||"Set"===i?Array.from(s):"Arguments"===i||/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(i)?_arrayLikeToArray(s,o):void 0}}(s)||function _nonIterableSpread(){throw new TypeError("Invalid attempt to spread non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.")}()}function _arrayLikeToArray(s,o){(null==o||o>s.length)&&(o=s.length);for(var i=0,a=Array(o);i<o;i++)a[i]=s[i];return a}var Sf=pipe(df(uf((function(s,o){return s.length>o.length}))),gf,Da("length")),xf=za((function(s,o,i){var a=i.apply(void 0,_toConsumableArray(s));return bf(a)?yf(a):o}));const kf=lf(_f,(function dispatchImpl(s){var o=Sf(s);return $a(o,(function(){for(var o=arguments.length,i=new Array(o),a=0;a<o;a++)i[a]=arguments[a];return Aa(xf(i),void 0,s)}))}),gc);const Of=class AlternatingVisitor extends Nd{alternator;constructor({alternator:s,...o}){super({...o}),this.alternator=s}enter(s){const o=this.alternator.map((({predicate:s,specPath:o})=>lf(s,fc(o),gc))),i=kf(o)(s);return this.element=this.toRefractedElement(i,s),qu}};const Cf=class SchemaOrReferenceVisitor extends Of{constructor(s){super(s),this.alternator=[{predicate:isJSONReferenceLikeElement,specPath:["document","objects","JSONReference"]},{predicate:es_T,specPath:["document","objects","JSONSchema"]}]}};class MediaVisitor extends(Mixin(Md,yd)){constructor(s){super(s),this.element=new cd,this.specPath=fc(["document","objects","Media"])}}const jf=MediaVisitor;class LinkDescriptionVisitor extends(Mixin(Md,yd)){constructor(s){super(s),this.element=new ld,this.specPath=fc(["document","objects","LinkDescription"])}}const Pf=LinkDescriptionVisitor,Tf={visitors:{value:yd,JSONSchemaOrJSONReferenceVisitor:Cf,document:{objects:{JSONSchema:{$visitor:$d,fixedFields:{id:{$ref:"#/visitors/value"},$schema:{$ref:"#/visitors/value"},multipleOf:{$ref:"#/visitors/value"},maximum:{$ref:"#/visitors/value"},exclusiveMaximum:{$ref:"#/visitors/value"},minimum:{$ref:"#/visitors/value"},exclusiveMinimum:{$ref:"#/visitors/value"},maxLength:{$ref:"#/visitors/value"},minLength:{$ref:"#/visitors/value"},pattern:{$ref:"#/visitors/value"},additionalItems:Cf,items:Ud,maxItems:{$ref:"#/visitors/value"},minItems:{$ref:"#/visitors/value"},uniqueItems:{$ref:"#/visitors/value"},maxProperties:{$ref:"#/visitors/value"},minProperties:{$ref:"#/visitors/value"},required:Vd,properties:Hd,additionalProperties:Cf,patternProperties:Kd,dependencies:Gd,enum:Yd,type:Xd,allOf:Qd,anyOf:Zd,oneOf:ef,not:Cf,definitions:rf,title:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},default:{$ref:"#/visitors/value"},format:{$ref:"#/visitors/value"},base:{$ref:"#/visitors/value"},links:of,media:{$ref:"#/visitors/document/objects/Media"},readOnly:{$ref:"#/visitors/value"}}},JSONReference:{$visitor:af,fixedFields:{$ref:cf}},Media:{$visitor:jf,fixedFields:{binaryEncoding:{$ref:"#/visitors/value"},type:{$ref:"#/visitors/value"}}},LinkDescription:{$visitor:Pf,fixedFields:{href:{$ref:"#/visitors/value"},rel:{$ref:"#/visitors/value"},title:{$ref:"#/visitors/value"},targetSchema:Cf,mediaType:{$ref:"#/visitors/value"},method:{$ref:"#/visitors/value"},encType:{$ref:"#/visitors/value"},schema:Cf}}}}}},traversal_visitor_getNodeType=s=>{if(Cu(s))return`${s.element.charAt(0).toUpperCase()+s.element.slice(1)}Element`},Nf={JSONSchemaDraft4Element:["content"],JSONReferenceElement:["content"],MediaElement:["content"],LinkDescriptionElement:["content"],...np},Rf={namespace:s=>{const{base:o}=s;return o.register("jSONSchemaDraft4",sd),o.register("jSONReference",id),o.register("media",cd),o.register("linkDescription",ld),o}},Df=Rf,refractor_toolbox=()=>{const s=createNamespace(Df);return{predicates:{...ae,isStringElement:ju},namespace:s}},refractor_refract=(s,{specPath:o=["visitors","document","objects","JSONSchema","$visitor"],plugins:i=[],specificationObj:a=Tf}={})=>{const u=(0,Su.e)(s),_=dereference(a),w=new(Qu(o,_))({specObj:_});return visitor_visit(u,w),dispatchPluginsSync(w.element,i,{toolboxCreator:refractor_toolbox,visitorOptions:{keyMap:Nf,nodeTypeGetter:traversal_visitor_getNodeType}})},refractor_createRefractor=s=>(o,i={})=>refractor_refract(o,{specPath:s,...i});sd.refract=refractor_createRefractor(["visitors","document","objects","JSONSchema","$visitor"]),id.refract=refractor_createRefractor(["visitors","document","objects","JSONReference","$visitor"]),cd.refract=refractor_createRefractor(["visitors","document","objects","Media","$visitor"]),ld.refract=refractor_createRefractor(["visitors","document","objects","LinkDescription","$visitor"]);const Ff=class Schema_Schema extends sd{constructor(s,o,i){super(s,o,i),this.element="schema",this.classes.push("json-schema-draft-4")}get idProp(){throw new td("idProp getter in Schema class is not not supported.")}set idProp(s){throw new td("idProp setter in Schema class is not not supported.")}get $schema(){throw new td("$schema getter in Schema class is not not supported.")}set $schema(s){throw new td("$schema setter in Schema class is not not supported.")}get additionalItems(){return this.get("additionalItems")}set additionalItems(s){this.set("additionalItems",s)}get items(){return this.get("items")}set items(s){this.set("items",s)}get additionalProperties(){return this.get("additionalProperties")}set additionalProperties(s){this.set("additionalProperties",s)}get patternProperties(){throw new td("patternProperties getter in Schema class is not not supported.")}set patternProperties(s){throw new td("patternProperties setter in Schema class is not not supported.")}get dependencies(){throw new td("dependencies getter in Schema class is not not supported.")}set dependencies(s){throw new td("dependencies setter in Schema class is not not supported.")}get type(){return this.get("type")}set type(s){this.set("type",s)}get not(){return this.get("not")}set not(s){this.set("not",s)}get definitions(){throw new td("definitions getter in Schema class is not not supported.")}set definitions(s){throw new td("definitions setter in Schema class is not not supported.")}get base(){throw new td("base getter in Schema class is not not supported.")}set base(s){throw new td("base setter in Schema class is not not supported.")}get links(){throw new td("links getter in Schema class is not not supported.")}set links(s){throw new td("links setter in Schema class is not not supported.")}get media(){throw new td("media getter in Schema class is not not supported.")}set media(s){throw new td("media setter in Schema class is not not supported.")}get nullable(){return this.get("nullable")}set nullable(s){this.set("nullable",s)}get discriminator(){return this.get("discriminator")}set discriminator(s){this.set("discriminator",s)}get writeOnly(){return this.get("writeOnly")}set writeOnly(s){this.set("writeOnly",s)}get xml(){return this.get("xml")}set xml(s){this.set("xml",s)}get externalDocs(){return this.get("externalDocs")}set externalDocs(s){this.set("externalDocs",s)}get example(){return this.get("example")}set example(s){this.set("example",s)}get deprecated(){return this.get("deprecated")}set deprecated(s){this.set("deprecated",s)}};class SecurityRequirement extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="securityRequirement"}}const Vf=SecurityRequirement;class SecurityScheme extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="securityScheme"}get type(){return this.get("type")}set type(s){this.set("type",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get name(){return this.get("name")}set name(s){this.set("name",s)}get in(){return this.get("in")}set in(s){this.set("in",s)}get scheme(){return this.get("scheme")}set scheme(s){this.set("scheme",s)}get bearerFormat(){return this.get("bearerFormat")}set bearerFormat(s){this.set("bearerFormat",s)}get flows(){return this.get("flows")}set flows(s){this.set("flows",s)}get openIdConnectUrl(){return this.get("openIdConnectUrl")}set openIdConnectUrl(s){this.set("openIdConnectUrl",s)}}const Wf=SecurityScheme;class Server extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="server"}get url(){return this.get("url")}set url(s){this.set("url",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get variables(){return this.get("variables")}set variables(s){this.set("variables",s)}}const Jf=Server;class ServerVariable extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="serverVariable"}get enum(){return this.get("enum")}set enum(s){this.set("enum",s)}get default(){return this.get("default")}set default(s){this.set("default",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}}const Hf=ServerVariable;class Tag extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="tag"}get name(){return this.get("name")}set name(s){this.set("name",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get externalDocs(){return this.get("externalDocs")}set externalDocs(s){this.set("externalDocs",s)}}const Gf=Tag;class Xml extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="xml"}get name(){return this.get("name")}set name(s){this.set("name",s)}get namespace(){return this.get("namespace")}set namespace(s){this.set("namespace",s)}get prefix(){return this.get("prefix")}set prefix(s){this.set("prefix",s)}get attribute(){return this.get("attribute")}set attribute(s){this.set("attribute",s)}get wrapped(){return this.get("wrapped")}set wrapped(s){this.set("wrapped",s)}}const Xf=Xml;const Qf=class visitors_Visitor_Visitor{element;constructor(s={}){Object.assign(this,s)}copyMetaAndAttributes(s,o){(s.meta.length>0||o.meta.length>0)&&(o.meta=dd(o.meta,s.meta)),hasElementSourceMap(s)&&assignSourceMap(o,s),(s.attributes.length>0||s.meta.length>0)&&(o.attributes=dd(o.attributes,s.attributes))}};const em=class FallbackVisitor_FallbackVisitor extends Qf{enter(s){return this.element=cloneDeep(s),qu}};const tm=class SpecificationVisitor_SpecificationVisitor extends Qf{specObj;passingOptionsNames=["specObj","openApiGenericElement","openApiSemanticElement"];openApiGenericElement;openApiSemanticElement;constructor({specObj:s,passingOptionsNames:o,openApiGenericElement:i,openApiSemanticElement:a,...u}){super({...u}),this.specObj=s,this.openApiGenericElement=i,this.openApiSemanticElement=a,Array.isArray(o)&&(this.passingOptionsNames=o)}retrievePassingOptions(){return Td(this.passingOptionsNames,this)}retrieveFixedFields(s){const o=Qu(["visitors",...s,"fixedFields"],this.specObj);return"object"==typeof o&&null!==o?Object.keys(o):[]}retrieveVisitor(s){return Qo(Mc,["visitors",...s],this.specObj)?Qu(["visitors",...s],this.specObj):Qu(["visitors",...s,"$visitor"],this.specObj)}retrieveVisitorInstance(s,o={}){const i=this.retrievePassingOptions();return new(this.retrieveVisitor(s))({...i,...o})}toRefractedElement(s,o,i={}){const a=this.retrieveVisitorInstance(s,i);return a instanceof em&&(null==a?void 0:a.constructor)===em?cloneDeep(o):(visitor_visit(o,a,i),a.element)}};var rm=function(){function XTake(s,o){this.xf=o,this.n=s,this.i=0}return XTake.prototype["@@transducer/init"]=_xfBase_init,XTake.prototype["@@transducer/result"]=_xfBase_result,XTake.prototype["@@transducer/step"]=function(s,o){this.i+=1;var i=0===this.n?s:this.xf["@@transducer/step"](s,o);return this.n>=0&&this.i>=this.n?_reduced(i):i},XTake}();function _xtake(s){return function(o){return new rm(s,o)}}const nm=_curry2(_dispatchable(["take"],_xtake,(function take(s,o){return ja(0,s<0?1/0:s,o)})));var sm=_curry2((function(s,o){return na(nm(s.length,o),s)}));const om=sm,isReferenceLikeElement=s=>Nu(s)&&s.hasKey("$ref"),im=Nu,am=Nu,isOpenApiExtension=s=>ju(s.key)&&om("x-",serializers_value(s.key));const cm=class FixedFieldsVisitor_FixedFieldsVisitor extends tm{specPath;ignoredFields;canSupportSpecificationExtensions=!0;specificationExtensionPredicate=isOpenApiExtension;constructor({specPath:s,ignoredFields:o,canSupportSpecificationExtensions:i,specificationExtensionPredicate:a,...u}){super({...u}),this.specPath=s,this.ignoredFields=o||[],"boolean"==typeof i&&(this.canSupportSpecificationExtensions=i),"function"==typeof a&&(this.specificationExtensionPredicate=a)}ObjectElement(s){const o=this.specPath(s),i=this.retrieveFixedFields(o);return s.forEach(((s,a,u)=>{if(ju(a)&&i.includes(serializers_value(a))&&!this.ignoredFields.includes(serializers_value(a))){const i=this.toRefractedElement([...o,"fixedFields",serializers_value(a)],s),_=new Su.Pr(cloneDeep(a),i);this.copyMetaAndAttributes(u,_),_.classes.push("fixed-field"),this.element.content.push(_)}else if(this.canSupportSpecificationExtensions&&this.specificationExtensionPredicate(u)){const s=this.toRefractedElement(["document","extension"],u);this.element.content.push(s)}else this.ignoredFields.includes(serializers_value(a))||this.element.content.push(cloneDeep(u))})),this.copyMetaAndAttributes(s,this.element),qu}};class OpenApi3_0Visitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Rh,this.specPath=fc(["document","objects","OpenApi"]),this.canSupportSpecificationExtensions=!0}ObjectElement(s){return cm.prototype.ObjectElement.call(this,s)}}const lm=OpenApi3_0Visitor;class OpenapiVisitor extends(Mixin(tm,em)){StringElement(s){const o=new Ih(serializers_value(s));return this.copyMetaAndAttributes(s,o),this.element=o,qu}}const um=OpenapiVisitor;const pm=class SpecificationExtensionVisitor extends tm{MemberElement(s){return this.element=cloneDeep(s),this.element.classes.push("specification-extension"),qu}};class InfoVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new vh,this.specPath=fc(["document","objects","Info"]),this.canSupportSpecificationExtensions=!0}}const hm=InfoVisitor;const dm=class VersionVisitor extends em{StringElement(s){const o=super.enter(s);return this.element.classes.push("api-version"),this.element.classes.push("version"),o}};class ContactVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Zp,this.specPath=fc(["document","objects","Contact"]),this.canSupportSpecificationExtensions=!0}}const fm=ContactVisitor;class LicenseVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new _h,this.specPath=fc(["document","objects","License"]),this.canSupportSpecificationExtensions=!0}}const mm=LicenseVisitor;class LinkVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new wh,this.specPath=fc(["document","objects","Link"]),this.canSupportSpecificationExtensions=!0}ObjectElement(s){const o=cm.prototype.ObjectElement.call(this,s);return(ju(this.element.operationId)||ju(this.element.operationRef))&&this.element.classes.push("reference-element"),o}}const gm=LinkVisitor;const ym=class OperationRefVisitor extends em{StringElement(s){const o=super.enter(s);return this.element.classes.push("reference-value"),o}};const vm=class OperationIdVisitor extends em{StringElement(s){const o=super.enter(s);return this.element.classes.push("reference-value"),o}};const bm=class PatternedFieldsVisitor_PatternedFieldsVisitor extends tm{specPath;ignoredFields;fieldPatternPredicate=es_F;canSupportSpecificationExtensions=!1;specificationExtensionPredicate=isOpenApiExtension;constructor({specPath:s,ignoredFields:o,fieldPatternPredicate:i,canSupportSpecificationExtensions:a,specificationExtensionPredicate:u,..._}){super({..._}),this.specPath=s,this.ignoredFields=o||[],"function"==typeof i&&(this.fieldPatternPredicate=i),"boolean"==typeof a&&(this.canSupportSpecificationExtensions=a),"function"==typeof u&&(this.specificationExtensionPredicate=u)}ObjectElement(s){return s.forEach(((s,o,i)=>{if(this.canSupportSpecificationExtensions&&this.specificationExtensionPredicate(i)){const s=this.toRefractedElement(["document","extension"],i);this.element.content.push(s)}else if(!this.ignoredFields.includes(serializers_value(o))&&this.fieldPatternPredicate(serializers_value(o))){const a=this.specPath(s),u=this.toRefractedElement(a,s),_=new Su.Pr(cloneDeep(o),u);this.copyMetaAndAttributes(i,_),_.classes.push("patterned-field"),this.element.content.push(_)}else this.ignoredFields.includes(serializers_value(o))||this.element.content.push(cloneDeep(i))})),this.copyMetaAndAttributes(s,this.element),qu}};const _m=class MapVisitor_MapVisitor extends bm{constructor(s){super(s),this.fieldPatternPredicate=Id}};class LinkParameters extends Su.Sh{static primaryClass="link-parameters";constructor(s,o,i){super(s,o,i),this.classes.push(LinkParameters.primaryClass)}}const Sm=LinkParameters;class ParametersVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Sm,this.specPath=fc(["value"])}}const Em=ParametersVisitor;class ServerVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Jf,this.specPath=fc(["document","objects","Server"]),this.canSupportSpecificationExtensions=!0}}const wm=ServerVisitor;const xm=class UrlVisitor extends em{StringElement(s){const o=super.enter(s);return this.element.classes.push("server-url"),o}};class Servers extends Su.wE{static primaryClass="servers";constructor(s,o,i){super(s,o,i),this.classes.push(Servers.primaryClass)}}const km=Servers;class ServersVisitor extends(Mixin(tm,em)){constructor(s){super(s),this.element=new km}ArrayElement(s){return s.forEach((s=>{const o=im(s)?["document","objects","Server"]:["value"],i=this.toRefractedElement(o,s);this.element.push(i)})),this.copyMetaAndAttributes(s,this.element),qu}}const Om=ServersVisitor;class ServerVariableVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Hf,this.specPath=fc(["document","objects","ServerVariable"]),this.canSupportSpecificationExtensions=!0}}const Am=ServerVariableVisitor;class ServerVariables extends Su.Sh{static primaryClass="server-variables";constructor(s,o,i){super(s,o,i),this.classes.push(ServerVariables.primaryClass)}}const Cm=ServerVariables;class VariablesVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Cm,this.specPath=fc(["document","objects","ServerVariable"])}}const jm=VariablesVisitor;class MediaTypeVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Oh,this.specPath=fc(["document","objects","MediaType"]),this.canSupportSpecificationExtensions=!0}}const Pm=MediaTypeVisitor;const Im=class AlternatingVisitor_AlternatingVisitor extends tm{alternator;constructor({alternator:s,...o}){super({...o}),this.alternator=s||[]}enter(s){const o=this.alternator.map((({predicate:s,specPath:o})=>lf(s,fc(o),gc))),i=kf(o)(s);return this.element=this.toRefractedElement(i,s),qu}},Tm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Xp||s(a)&&o("callback",a)&&i("object",a))),Nm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Qp||s(a)&&o("components",a)&&i("object",a))),Mm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Zp||s(a)&&o("contact",a)&&i("object",a))),Rm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof uh||s(a)&&o("example",a)&&i("object",a))),Dm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof dh||s(a)&&o("externalDocumentation",a)&&i("object",a))),Lm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof fh||s(a)&&o("header",a)&&i("object",a))),Fm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof vh||s(a)&&o("info",a)&&i("object",a))),Bm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof _h||s(a)&&o("license",a)&&i("object",a))),$m=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof wh||s(a)&&o("link",a)&&i("object",a))),qm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Ih||s(a)&&o("openapi",a)&&i("string",a))),Um=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i,hasClass:a})=>u=>u instanceof Rh||s(u)&&o("openApi3_0",u)&&i("object",u)&&a("api",u))),Vm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Dh||s(a)&&o("operation",a)&&i("object",a))),zm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Lh||s(a)&&o("parameter",a)&&i("object",a))),Wm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Fh||s(a)&&o("pathItem",a)&&i("object",a))),Jm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Jh||s(a)&&o("paths",a)&&i("object",a))),Hm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Hh||s(a)&&o("reference",a)&&i("object",a))),Km=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Kh||s(a)&&o("requestBody",a)&&i("object",a))),Gm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Gh||s(a)&&o("response",a)&&i("object",a))),Ym=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Qh||s(a)&&o("responses",a)&&i("object",a))),Xm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Ff||s(a)&&o("schema",a)&&i("object",a))),isBooleanJsonSchemaElement=s=>Tu(s)&&s.classes.includes("boolean-json-schema"),Qm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Vf||s(a)&&o("securityRequirement",a)&&i("object",a))),Zm=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Wf||s(a)&&o("securityScheme",a)&&i("object",a))),eg=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Jf||s(a)&&o("server",a)&&i("object",a))),rg=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Hf||s(a)&&o("serverVariable",a)&&i("object",a))),ng=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Oh||s(a)&&o("mediaType",a)&&i("object",a))),sg=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i,hasClass:a})=>u=>u instanceof km||s(u)&&o("array",u)&&i("array",u)&&a("servers",u))),og=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof th||s(a)&&o("discriminator",a)&&i("object",a)));class SchemaVisitor extends(Mixin(Im,em)){constructor(s){super(s),this.alternator=[{predicate:isReferenceLikeElement,specPath:["document","objects","Reference"]},{predicate:es_T,specPath:["document","objects","Schema"]}]}ObjectElement(s){const o=Im.prototype.enter.call(this,s);return Hm(this.element)&&this.element.setMetaProperty("referenced-element","schema"),o}}const lg=SchemaVisitor;class ExamplesVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("examples"),this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Example"],this.canSupportSpecificationExtensions=!0}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","example")})),o}}const pg=ExamplesVisitor;class MediaTypeExamples extends Su.Sh{static primaryClass="media-type-examples";constructor(s,o,i){super(s,o,i),this.classes.push(MediaTypeExamples.primaryClass),this.classes.push("examples")}}const fg=MediaTypeExamples;const mg=class ExamplesVisitor_ExamplesVisitor extends pg{constructor(s){super(s),this.element=new fg}};class MediaTypeEncoding extends Su.Sh{static primaryClass="media-type-encoding";constructor(s,o,i){super(s,o,i),this.classes.push(MediaTypeEncoding.primaryClass)}}const gg=MediaTypeEncoding;class EncodingVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new gg,this.specPath=fc(["document","objects","Encoding"])}}const yg=EncodingVisitor;class SecurityRequirementVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Vf,this.specPath=fc(["value"])}}const _g=SecurityRequirementVisitor;class Security extends Su.wE{static primaryClass="security";constructor(s,o,i){super(s,o,i),this.classes.push(Security.primaryClass)}}const xg=Security;class SecurityVisitor extends(Mixin(tm,em)){constructor(s){super(s),this.element=new xg}ArrayElement(s){return s.forEach((s=>{if(Nu(s)){const o=this.toRefractedElement(["document","objects","SecurityRequirement"],s);this.element.push(o)}else this.element.push(cloneDeep(s))})),this.copyMetaAndAttributes(s,this.element),qu}}const kg=SecurityVisitor;class ComponentsVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Qp,this.specPath=fc(["document","objects","Components"]),this.canSupportSpecificationExtensions=!0}}const qg=ComponentsVisitor;class TagVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Gf,this.specPath=fc(["document","objects","Tag"]),this.canSupportSpecificationExtensions=!0}}const Ug=TagVisitor;class ReferenceVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Hh,this.specPath=fc(["document","objects","Reference"]),this.canSupportSpecificationExtensions=!1}ObjectElement(s){const o=cm.prototype.ObjectElement.call(this,s);return ju(this.element.$ref)&&this.element.classes.push("reference-element"),o}}const Vg=ReferenceVisitor;const zg=class $RefVisitor_$RefVisitor extends em{StringElement(s){const o=super.enter(s);return this.element.classes.push("reference-value"),o}};class ParameterVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Lh,this.specPath=fc(["document","objects","Parameter"]),this.canSupportSpecificationExtensions=!0}ObjectElement(s){const o=cm.prototype.ObjectElement.call(this,s);return Nu(this.element.contentProp)&&this.element.contentProp.filter(ng).forEach(((s,o)=>{s.setMetaProperty("media-type",serializers_value(o))})),o}}const Wg=ParameterVisitor;class SchemaVisitor_SchemaVisitor extends(Mixin(Im,em)){constructor(s){super(s),this.alternator=[{predicate:isReferenceLikeElement,specPath:["document","objects","Reference"]},{predicate:es_T,specPath:["document","objects","Schema"]}]}ObjectElement(s){const o=Im.prototype.enter.call(this,s);return Hm(this.element)&&this.element.setMetaProperty("referenced-element","schema"),o}}const Kg=SchemaVisitor_SchemaVisitor;class HeaderVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new fh,this.specPath=fc(["document","objects","Header"]),this.canSupportSpecificationExtensions=!0}}const Yg=HeaderVisitor;class header_SchemaVisitor_SchemaVisitor extends(Mixin(Im,em)){constructor(s){super(s),this.alternator=[{predicate:isReferenceLikeElement,specPath:["document","objects","Reference"]},{predicate:es_T,specPath:["document","objects","Schema"]}]}ObjectElement(s){const o=Im.prototype.enter.call(this,s);return Hm(this.element)&&this.element.setMetaProperty("referenced-element","schema"),o}}const Xg=header_SchemaVisitor_SchemaVisitor;class HeaderExamples extends Su.Sh{static primaryClass="header-examples";constructor(s,o,i){super(s,o,i),this.classes.push(HeaderExamples.primaryClass),this.classes.push("examples")}}const Zg=HeaderExamples;const ey=class header_ExamplesVisitor_ExamplesVisitor extends pg{constructor(s){super(s),this.element=new Zg}};class ContentVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("content"),this.specPath=fc(["document","objects","MediaType"])}}const ty=ContentVisitor;class HeaderContent extends Su.Sh{static primaryClass="header-content";constructor(s,o,i){super(s,o,i),this.classes.push(HeaderContent.primaryClass),this.classes.push("content")}}const ry=HeaderContent;const ny=class ContentVisitor_ContentVisitor extends ty{constructor(s){super(s),this.element=new ry}};class schema_SchemaVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Ff,this.specPath=fc(["document","objects","Schema"]),this.canSupportSpecificationExtensions=!0}}const sy=schema_SchemaVisitor,oy=Tf.visitors.document.objects.JSONSchema.fixedFields.allOf;const iy=class AllOfVisitor_AllOfVisitor extends oy{ArrayElement(s){const o=oy.prototype.ArrayElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","schema")})),o}},ay=Tf.visitors.document.objects.JSONSchema.fixedFields.anyOf;const cy=class AnyOfVisitor_AnyOfVisitor extends ay{ArrayElement(s){const o=ay.prototype.ArrayElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","schema")})),o}},ly=Tf.visitors.document.objects.JSONSchema.fixedFields.oneOf;const uy=class OneOfVisitor_OneOfVisitor extends ly{ArrayElement(s){const o=ly.prototype.ArrayElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","schema")})),o}},py=Tf.visitors.document.objects.JSONSchema.fixedFields.items;const hy=class ItemsVisitor_ItemsVisitor extends py{ObjectElement(s){const o=py.prototype.ObjectElement.call(this,s);return Hm(this.element)&&this.element.setMetaProperty("referenced-element","schema"),o}ArrayElement(s){return this.enter(s)}},dy=Tf.visitors.document.objects.JSONSchema.fixedFields.properties;const fy=class PropertiesVisitor_PropertiesVisitor extends dy{ObjectElement(s){const o=dy.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","schema")})),o}},my=Tf.visitors.document.objects.JSONSchema.fixedFields.type;const gy=class TypeVisitor_TypeVisitor extends my{ArrayElement(s){return this.enter(s)}},yy=Tf.visitors.JSONSchemaOrJSONReferenceVisitor;const vy=class SchemaOrReferenceVisitor_SchemaOrReferenceVisitor extends yy{ObjectElement(s){const o=yy.prototype.enter.call(this,s);return Hm(this.element)&&this.element.setMetaProperty("referenced-element","schema"),o}};class DiscriminatorVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new th,this.specPath=fc(["document","objects","Discriminator"]),this.canSupportSpecificationExtensions=!1}}const by=DiscriminatorVisitor;class DiscriminatorMapping extends Su.Sh{static primaryClass="discriminator-mapping";constructor(s,o,i){super(s,o,i),this.classes.push(DiscriminatorMapping.primaryClass)}}const _y=DiscriminatorMapping;class MappingVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new _y,this.specPath=fc(["value"])}}const Sy=MappingVisitor;class XmlVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Xf,this.specPath=fc(["document","objects","XML"]),this.canSupportSpecificationExtensions=!0}}const Ey=XmlVisitor;class ParameterExamples extends Su.Sh{static primaryClass="parameter-examples";constructor(s,o,i){super(s,o,i),this.classes.push(ParameterExamples.primaryClass),this.classes.push("examples")}}const wy=ParameterExamples;const xy=class parameter_ExamplesVisitor_ExamplesVisitor extends pg{constructor(s){super(s),this.element=new wy}};class ParameterContent extends Su.Sh{static primaryClass="parameter-content";constructor(s,o,i){super(s,o,i),this.classes.push(ParameterContent.primaryClass),this.classes.push("content")}}const ky=ParameterContent;const Oy=class parameter_ContentVisitor_ContentVisitor extends ty{constructor(s){super(s),this.element=new ky}};class ComponentsSchemas extends Su.Sh{static primaryClass="components-schemas";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsSchemas.primaryClass)}}const Ay=ComponentsSchemas;class SchemasVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Ay,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Schema"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","schema")})),o}}const Cy=SchemasVisitor;class ComponentsResponses extends Su.Sh{static primaryClass="components-responses";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsResponses.primaryClass)}}const jy=ComponentsResponses;class ResponsesVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new jy,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Response"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","response")})),this.element.filter(Gm).forEach(((s,o)=>{s.setMetaProperty("http-status-code",serializers_value(o))})),o}}const Py=ResponsesVisitor;class ComponentsParameters extends Su.Sh{static primaryClass="components-parameters";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsParameters.primaryClass),this.classes.push("parameters")}}const Iy=ComponentsParameters;class ParametersVisitor_ParametersVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Iy,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Parameter"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","parameter")})),o}}const Ty=ParametersVisitor_ParametersVisitor;class ComponentsExamples extends Su.Sh{static primaryClass="components-examples";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsExamples.primaryClass),this.classes.push("examples")}}const Ny=ComponentsExamples;class components_ExamplesVisitor_ExamplesVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Ny,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Example"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","example")})),o}}const My=components_ExamplesVisitor_ExamplesVisitor;class ComponentsRequestBodies extends Su.Sh{static primaryClass="components-request-bodies";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsRequestBodies.primaryClass)}}const Ry=ComponentsRequestBodies;class RequestBodiesVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Ry,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","RequestBody"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","requestBody")})),o}}const Dy=RequestBodiesVisitor;class ComponentsHeaders extends Su.Sh{static primaryClass="components-headers";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsHeaders.primaryClass)}}const Ly=ComponentsHeaders;class HeadersVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Ly,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Header"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","header")})),this.element.filter(Lm).forEach(((s,o)=>{s.setMetaProperty("header-name",serializers_value(o))})),o}}const Fy=HeadersVisitor;class ComponentsSecuritySchemes extends Su.Sh{static primaryClass="components-security-schemes";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsSecuritySchemes.primaryClass)}}const By=ComponentsSecuritySchemes;class SecuritySchemesVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new By,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","SecurityScheme"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","securityScheme")})),o}}const $y=SecuritySchemesVisitor;class ComponentsLinks extends Su.Sh{static primaryClass="components-links";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsLinks.primaryClass)}}const qy=ComponentsLinks;class LinksVisitor_LinksVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new qy,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Link"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","link")})),o}}const Uy=LinksVisitor_LinksVisitor;class ComponentsCallbacks extends Su.Sh{static primaryClass="components-callbacks";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsCallbacks.primaryClass)}}const Vy=ComponentsCallbacks;class CallbacksVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Vy,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Callback"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","callback")})),o}}const zy=CallbacksVisitor;class ExampleVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new uh,this.specPath=fc(["document","objects","Example"]),this.canSupportSpecificationExtensions=!0}ObjectElement(s){const o=cm.prototype.ObjectElement.call(this,s);return ju(this.element.externalValue)&&this.element.classes.push("reference-element"),o}}const Wy=ExampleVisitor;const Jy=class ExternalValueVisitor extends em{StringElement(s){const o=super.enter(s);return this.element.classes.push("reference-value"),o}};class ExternalDocumentationVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new dh,this.specPath=fc(["document","objects","ExternalDocumentation"]),this.canSupportSpecificationExtensions=!0}}const Hy=ExternalDocumentationVisitor;class encoding_EncodingVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new rh,this.specPath=fc(["document","objects","Encoding"]),this.canSupportSpecificationExtensions=!0}ObjectElement(s){const o=cm.prototype.ObjectElement.call(this,s);return Nu(this.element.headers)&&this.element.headers.filter(Lm).forEach(((s,o)=>{s.setMetaProperty("header-name",serializers_value(o))})),o}}const Ky=encoding_EncodingVisitor;class EncodingHeaders extends Su.Sh{static primaryClass="encoding-headers";constructor(s,o,i){super(s,o,i),this.classes.push(EncodingHeaders.primaryClass)}}const Gy=EncodingHeaders;class HeadersVisitor_HeadersVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Gy,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Header"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","header")})),this.element.forEach(((s,o)=>{if(!Lm(s))return;const i=serializers_value(o);s.setMetaProperty("headerName",i)})),o}}const Yy=HeadersVisitor_HeadersVisitor;class PathsVisitor extends(Mixin(bm,em)){constructor(s){super(s),this.element=new Jh,this.specPath=fc(["document","objects","PathItem"]),this.canSupportSpecificationExtensions=!0,this.fieldPatternPredicate=es_T}ObjectElement(s){const o=bm.prototype.ObjectElement.call(this,s);return this.element.filter(Wm).forEach(((s,o)=>{o.classes.push("openapi-path-template"),o.classes.push("path-template"),s.setMetaProperty("path",cloneDeep(o))})),o}}const Xy=PathsVisitor;class RequestBodyVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Kh,this.specPath=fc(["document","objects","RequestBody"])}ObjectElement(s){const o=cm.prototype.ObjectElement.call(this,s);return Nu(this.element.contentProp)&&this.element.contentProp.filter(ng).forEach(((s,o)=>{s.setMetaProperty("media-type",serializers_value(o))})),o}}const Qy=RequestBodyVisitor;class RequestBodyContent extends Su.Sh{static primaryClass="request-body-content";constructor(s,o,i){super(s,o,i),this.classes.push(RequestBodyContent.primaryClass),this.classes.push("content")}}const Zy=RequestBodyContent;const ev=class request_body_ContentVisitor_ContentVisitor extends ty{constructor(s){super(s),this.element=new Zy}};class CallbackVisitor extends(Mixin(bm,em)){constructor(s){super(s),this.element=new Xp,this.specPath=fc(["document","objects","PathItem"]),this.canSupportSpecificationExtensions=!0,this.fieldPatternPredicate=s=>/{(?<expression>[^}]{1,2083})}/.test(String(s))}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Wm).forEach(((s,o)=>{s.setMetaProperty("runtime-expression",serializers_value(o))})),o}}const tv=CallbackVisitor;class ResponseVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Gh,this.specPath=fc(["document","objects","Response"])}ObjectElement(s){const o=cm.prototype.ObjectElement.call(this,s);return Nu(this.element.contentProp)&&this.element.contentProp.filter(ng).forEach(((s,o)=>{s.setMetaProperty("media-type",serializers_value(o))})),Nu(this.element.headers)&&this.element.headers.filter(Lm).forEach(((s,o)=>{s.setMetaProperty("header-name",serializers_value(o))})),o}}const rv=ResponseVisitor;class ResponseHeaders extends Su.Sh{static primaryClass="response-headers";constructor(s,o,i){super(s,o,i),this.classes.push(ResponseHeaders.primaryClass)}}const nv=ResponseHeaders;class response_HeadersVisitor_HeadersVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new nv,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Header"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","header")})),this.element.forEach(((s,o)=>{if(!Lm(s))return;const i=serializers_value(o);s.setMetaProperty("header-name",i)})),o}}const sv=response_HeadersVisitor_HeadersVisitor;class ResponseContent extends Su.Sh{static primaryClass="response-content";constructor(s,o,i){super(s,o,i),this.classes.push(ResponseContent.primaryClass),this.classes.push("content")}}const ov=ResponseContent;const iv=class response_ContentVisitor_ContentVisitor extends ty{constructor(s){super(s),this.element=new ov}};class ResponseLinks extends Su.Sh{static primaryClass="response-links";constructor(s,o,i){super(s,o,i),this.classes.push(ResponseLinks.primaryClass)}}const av=ResponseLinks;class response_LinksVisitor_LinksVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new av,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Link"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","link")})),o}}const cv=response_LinksVisitor_LinksVisitor;function _isNumber(s){return"[object Number]"===Object.prototype.toString.call(s)}var lv=_curry2((function range(s,o){if(!_isNumber(s)||!_isNumber(o))throw new TypeError("Both arguments to range must be numbers");for(var i=Array(s<o?o-s:0),a=s<0?o+Math.abs(s):o-s,u=0;u<a;)i[u]=u+s,u+=1;return i}));const uv=lv;function hasOrAdd(s,o,i){var a,u=typeof s;switch(u){case"string":case"number":return 0===s&&1/s==-1/0?!!i._items["-0"]||(o&&(i._items["-0"]=!0),!1):null!==i._nativeSet?o?(a=i._nativeSet.size,i._nativeSet.add(s),i._nativeSet.size===a):i._nativeSet.has(s):u in i._items?s in i._items[u]||(o&&(i._items[u][s]=!0),!1):(o&&(i._items[u]={},i._items[u][s]=!0),!1);case"boolean":if(u in i._items){var _=s?1:0;return!!i._items[u][_]||(o&&(i._items[u][_]=!0),!1)}return o&&(i._items[u]=s?[!1,!0]:[!0,!1]),!1;case"function":return null!==i._nativeSet?o?(a=i._nativeSet.size,i._nativeSet.add(s),i._nativeSet.size===a):i._nativeSet.has(s):u in i._items?!!_includes(s,i._items[u])||(o&&i._items[u].push(s),!1):(o&&(i._items[u]=[s]),!1);case"undefined":return!!i._items[u]||(o&&(i._items[u]=!0),!1);case"object":if(null===s)return!!i._items.null||(o&&(i._items.null=!0),!1);default:return(u=Object.prototype.toString.call(s))in i._items?!!_includes(s,i._items[u])||(o&&i._items[u].push(s),!1):(o&&(i._items[u]=[s]),!1)}}const pv=function(){function _Set(){this._nativeSet="function"==typeof Set?new Set:null,this._items={}}return _Set.prototype.add=function(s){return!hasOrAdd(s,!0,this)},_Set.prototype.has=function(s){return hasOrAdd(s,!1,this)},_Set}();var hv=_curry2((function difference(s,o){for(var i=[],a=0,u=s.length,_=o.length,w=new pv,x=0;x<_;x+=1)w.add(o[x]);for(;a<u;)w.add(s[a])&&(i[i.length]=s[a]),a+=1;return i}));const dv=hv;class MixedFieldsVisitor extends(Mixin(cm,bm)){specPathFixedFields;specPathPatternedFields;constructor({specPathFixedFields:s,specPathPatternedFields:o,...i}){super({...i}),this.specPathFixedFields=s,this.specPathPatternedFields=o}ObjectElement(s){const{specPath:o,ignoredFields:i}=this;try{this.specPath=this.specPathFixedFields;const o=this.retrieveFixedFields(this.specPath(s));this.ignoredFields=[...i,...dv(s.keys(),o)],cm.prototype.ObjectElement.call(this,s),this.specPath=this.specPathPatternedFields,this.ignoredFields=o,bm.prototype.ObjectElement.call(this,s)}catch(s){throw this.specPath=o,s}return qu}}const fv=MixedFieldsVisitor;class responses_ResponsesVisitor extends(Mixin(fv,em)){constructor(s){super(s),this.element=new Qh,this.specPathFixedFields=fc(["document","objects","Responses"]),this.canSupportSpecificationExtensions=!0,this.specPathPatternedFields=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Response"],this.fieldPatternPredicate=s=>new RegExp(`^(1XX|2XX|3XX|4XX|5XX|${uv(100,600).join("|")})$`).test(String(s))}ObjectElement(s){const o=fv.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","response")})),this.element.filter(Gm).forEach(((s,o)=>{const i=cloneDeep(o);this.fieldPatternPredicate(serializers_value(i))&&s.setMetaProperty("http-status-code",i)})),o}}const mv=responses_ResponsesVisitor;class DefaultVisitor extends(Mixin(Im,em)){constructor(s){super(s),this.alternator=[{predicate:isReferenceLikeElement,specPath:["document","objects","Reference"]},{predicate:es_T,specPath:["document","objects","Response"]}]}ObjectElement(s){const o=Im.prototype.enter.call(this,s);return Hm(this.element)?this.element.setMetaProperty("referenced-element","response"):Gm(this.element)&&this.element.setMetaProperty("http-status-code","default"),o}}const gv=DefaultVisitor;class OperationVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Dh,this.specPath=fc(["document","objects","Operation"])}}const yv=OperationVisitor;class OperationTags extends Su.wE{static primaryClass="operation-tags";constructor(s,o,i){super(s,o,i),this.classes.push(OperationTags.primaryClass)}}const vv=OperationTags;const bv=class TagsVisitor extends em{constructor(s){super(s),this.element=new vv}ArrayElement(s){return this.element=this.element.concat(cloneDeep(s)),qu}};class OperationParameters extends Su.wE{static primaryClass="operation-parameters";constructor(s,o,i){super(s,o,i),this.classes.push(OperationParameters.primaryClass),this.classes.push("parameters")}}const _v=OperationParameters;class open_api_3_0_ParametersVisitor_ParametersVisitor extends(Mixin(tm,em)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("parameters")}ArrayElement(s){return s.forEach((s=>{const o=isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Parameter"],i=this.toRefractedElement(o,s);Hm(i)&&i.setMetaProperty("referenced-element","parameter"),this.element.push(i)})),this.copyMetaAndAttributes(s,this.element),qu}}const Sv=open_api_3_0_ParametersVisitor_ParametersVisitor;const Ev=class operation_ParametersVisitor_ParametersVisitor extends Sv{constructor(s){super(s),this.element=new _v}};const wv=class RequestBodyVisitor_RequestBodyVisitor extends Im{constructor(s){super(s),this.alternator=[{predicate:isReferenceLikeElement,specPath:["document","objects","Reference"]},{predicate:es_T,specPath:["document","objects","RequestBody"]}]}ObjectElement(s){const o=Im.prototype.enter.call(this,s);return Hm(this.element)&&this.element.setMetaProperty("referenced-element","requestBody"),o}};class OperationCallbacks extends Su.Sh{static primaryClass="operation-callbacks";constructor(s,o,i){super(s,o,i),this.classes.push(OperationCallbacks.primaryClass)}}const xv=OperationCallbacks;class CallbacksVisitor_CallbacksVisitor extends(Mixin(_m,em)){specPath;constructor(s){super(s),this.element=new xv,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","Callback"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(Hm).forEach((s=>{s.setMetaProperty("referenced-element","callback")})),o}}const kv=CallbacksVisitor_CallbacksVisitor;class OperationSecurity extends Su.wE{static primaryClass="operation-security";constructor(s,o,i){super(s,o,i),this.classes.push(OperationSecurity.primaryClass),this.classes.push("security")}}const Ov=OperationSecurity;class SecurityVisitor_SecurityVisitor extends(Mixin(tm,em)){constructor(s){super(s),this.element=new Ov}ArrayElement(s){return s.forEach((s=>{const o=Nu(s)?["document","objects","SecurityRequirement"]:["value"],i=this.toRefractedElement(o,s);this.element.push(i)})),this.copyMetaAndAttributes(s,this.element),qu}}const Av=SecurityVisitor_SecurityVisitor;class OperationServers extends Su.wE{static primaryClass="operation-servers";constructor(s,o,i){super(s,o,i),this.classes.push(OperationServers.primaryClass),this.classes.push("servers")}}const Cv=OperationServers;const jv=class ServersVisitor_ServersVisitor extends Om{constructor(s){super(s),this.element=new Cv}};class PathItemVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Fh,this.specPath=fc(["document","objects","PathItem"])}ObjectElement(s){const o=cm.prototype.ObjectElement.call(this,s);return this.element.filter(Vm).forEach(((s,o)=>{const i=cloneDeep(o);i.content=serializers_value(i).toUpperCase(),s.setMetaProperty("http-method",i)})),ju(this.element.$ref)&&this.element.classes.push("reference-element"),o}}const Pv=PathItemVisitor;const Iv=class path_item_$RefVisitor_$RefVisitor extends em{StringElement(s){const o=super.enter(s);return this.element.classes.push("reference-value"),o}};class PathItemServers extends Su.wE{static primaryClass="path-item-servers";constructor(s,o,i){super(s,o,i),this.classes.push(PathItemServers.primaryClass),this.classes.push("servers")}}const Tv=PathItemServers;const Nv=class path_item_ServersVisitor_ServersVisitor extends Om{constructor(s){super(s),this.element=new Tv}};class PathItemParameters extends Su.wE{static primaryClass="path-item-parameters";constructor(s,o,i){super(s,o,i),this.classes.push(PathItemParameters.primaryClass),this.classes.push("parameters")}}const Mv=PathItemParameters;const Rv=class path_item_ParametersVisitor_ParametersVisitor extends Sv{constructor(s){super(s),this.element=new Mv}};class SecuritySchemeVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Wf,this.specPath=fc(["document","objects","SecurityScheme"]),this.canSupportSpecificationExtensions=!0}}const Dv=SecuritySchemeVisitor;class OAuthFlowsVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Ph,this.specPath=fc(["document","objects","OAuthFlows"]),this.canSupportSpecificationExtensions=!0}}const Lv=OAuthFlowsVisitor;class OAuthFlowVisitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new jh,this.specPath=fc(["document","objects","OAuthFlow"]),this.canSupportSpecificationExtensions=!0}}const Fv=OAuthFlowVisitor;class OAuthFlowScopes extends Su.Sh{static primaryClass="oauth-flow-scopes";constructor(s,o,i){super(s,o,i),this.classes.push(OAuthFlowScopes.primaryClass)}}const Bv=OAuthFlowScopes;class ScopesVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Bv,this.specPath=fc(["value"])}}const $v=ScopesVisitor;class Tags extends Su.wE{static primaryClass="tags";constructor(s,o,i){super(s,o,i),this.classes.push(Tags.primaryClass)}}const qv=Tags;class TagsVisitor_TagsVisitor extends(Mixin(tm,em)){constructor(s){super(s),this.element=new qv}ArrayElement(s){return s.forEach((s=>{const o=am(s)?["document","objects","Tag"]:["value"],i=this.toRefractedElement(o,s);this.element.push(i)})),this.copyMetaAndAttributes(s,this.element),qu}}const Uv=TagsVisitor_TagsVisitor,{fixedFields:Vv}=Tf.visitors.document.objects.JSONSchema,zv={visitors:{value:em,document:{objects:{OpenApi:{$visitor:lm,fixedFields:{openapi:um,info:{$ref:"#/visitors/document/objects/Info"},servers:Om,paths:{$ref:"#/visitors/document/objects/Paths"},components:{$ref:"#/visitors/document/objects/Components"},security:kg,tags:Uv,externalDocs:{$ref:"#/visitors/document/objects/ExternalDocumentation"}}},Info:{$visitor:hm,fixedFields:{title:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},termsOfService:{$ref:"#/visitors/value"},contact:{$ref:"#/visitors/document/objects/Contact"},license:{$ref:"#/visitors/document/objects/License"},version:dm}},Contact:{$visitor:fm,fixedFields:{name:{$ref:"#/visitors/value"},url:{$ref:"#/visitors/value"},email:{$ref:"#/visitors/value"}}},License:{$visitor:mm,fixedFields:{name:{$ref:"#/visitors/value"},url:{$ref:"#/visitors/value"}}},Server:{$visitor:wm,fixedFields:{url:xm,description:{$ref:"#/visitors/value"},variables:jm}},ServerVariable:{$visitor:Am,fixedFields:{enum:{$ref:"#/visitors/value"},default:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"}}},Components:{$visitor:qg,fixedFields:{schemas:Cy,responses:Py,parameters:Ty,examples:My,requestBodies:Dy,headers:Fy,securitySchemes:$y,links:Uy,callbacks:zy}},Paths:{$visitor:Xy},PathItem:{$visitor:Pv,fixedFields:{$ref:Iv,summary:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},get:{$ref:"#/visitors/document/objects/Operation"},put:{$ref:"#/visitors/document/objects/Operation"},post:{$ref:"#/visitors/document/objects/Operation"},delete:{$ref:"#/visitors/document/objects/Operation"},options:{$ref:"#/visitors/document/objects/Operation"},head:{$ref:"#/visitors/document/objects/Operation"},patch:{$ref:"#/visitors/document/objects/Operation"},trace:{$ref:"#/visitors/document/objects/Operation"},servers:Nv,parameters:Rv}},Operation:{$visitor:yv,fixedFields:{tags:bv,summary:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},externalDocs:{$ref:"#/visitors/document/objects/ExternalDocumentation"},operationId:{$ref:"#/visitors/value"},parameters:Ev,requestBody:wv,responses:{$ref:"#/visitors/document/objects/Responses"},callbacks:kv,deprecated:{$ref:"#/visitors/value"},security:Av,servers:jv}},ExternalDocumentation:{$visitor:Hy,fixedFields:{description:{$ref:"#/visitors/value"},url:{$ref:"#/visitors/value"}}},Parameter:{$visitor:Wg,fixedFields:{name:{$ref:"#/visitors/value"},in:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},required:{$ref:"#/visitors/value"},deprecated:{$ref:"#/visitors/value"},allowEmptyValue:{$ref:"#/visitors/value"},style:{$ref:"#/visitors/value"},explode:{$ref:"#/visitors/value"},allowReserved:{$ref:"#/visitors/value"},schema:Kg,example:{$ref:"#/visitors/value"},examples:xy,content:Oy}},RequestBody:{$visitor:Qy,fixedFields:{description:{$ref:"#/visitors/value"},content:ev,required:{$ref:"#/visitors/value"}}},MediaType:{$visitor:Pm,fixedFields:{schema:lg,example:{$ref:"#/visitors/value"},examples:mg,encoding:yg}},Encoding:{$visitor:Ky,fixedFields:{contentType:{$ref:"#/visitors/value"},headers:Yy,style:{$ref:"#/visitors/value"},explode:{$ref:"#/visitors/value"},allowReserved:{$ref:"#/visitors/value"}}},Responses:{$visitor:mv,fixedFields:{default:gv}},Response:{$visitor:rv,fixedFields:{description:{$ref:"#/visitors/value"},headers:sv,content:iv,links:cv}},Callback:{$visitor:tv},Example:{$visitor:Wy,fixedFields:{summary:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},value:{$ref:"#/visitors/value"},externalValue:Jy}},Link:{$visitor:gm,fixedFields:{operationRef:ym,operationId:vm,parameters:Em,requestBody:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},server:{$ref:"#/visitors/document/objects/Server"}}},Header:{$visitor:Yg,fixedFields:{description:{$ref:"#/visitors/value"},required:{$ref:"#/visitors/value"},deprecated:{$ref:"#/visitors/value"},allowEmptyValue:{$ref:"#/visitors/value"},style:{$ref:"#/visitors/value"},explode:{$ref:"#/visitors/value"},allowReserved:{$ref:"#/visitors/value"},schema:Xg,example:{$ref:"#/visitors/value"},examples:ey,content:ny}},Tag:{$visitor:Ug,fixedFields:{name:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},externalDocs:{$ref:"#/visitors/document/objects/ExternalDocumentation"}}},Reference:{$visitor:Vg,fixedFields:{$ref:zg}},JSONSchema:{$ref:"#/visitors/document/objects/Schema"},JSONReference:{$ref:"#/visitors/document/objects/Reference"},Schema:{$visitor:sy,fixedFields:{title:Vv.title,multipleOf:Vv.multipleOf,maximum:Vv.maximum,exclusiveMaximum:Vv.exclusiveMaximum,minimum:Vv.minimum,exclusiveMinimum:Vv.exclusiveMinimum,maxLength:Vv.maxLength,minLength:Vv.minLength,pattern:Vv.pattern,maxItems:Vv.maxItems,minItems:Vv.minItems,uniqueItems:Vv.uniqueItems,maxProperties:Vv.maxProperties,minProperties:Vv.minProperties,required:Vv.required,enum:Vv.enum,type:gy,allOf:iy,anyOf:cy,oneOf:uy,not:vy,items:hy,properties:fy,additionalProperties:vy,description:Vv.description,format:Vv.format,default:Vv.default,nullable:{$ref:"#/visitors/value"},discriminator:{$ref:"#/visitors/document/objects/Discriminator"},writeOnly:{$ref:"#/visitors/value"},xml:{$ref:"#/visitors/document/objects/XML"},externalDocs:{$ref:"#/visitors/document/objects/ExternalDocumentation"},example:{$ref:"#/visitors/value"},deprecated:{$ref:"#/visitors/value"}}},Discriminator:{$visitor:by,fixedFields:{propertyName:{$ref:"#/visitors/value"},mapping:Sy}},XML:{$visitor:Ey,fixedFields:{name:{$ref:"#/visitors/value"},namespace:{$ref:"#/visitors/value"},prefix:{$ref:"#/visitors/value"},attribute:{$ref:"#/visitors/value"},wrapped:{$ref:"#/visitors/value"}}},SecurityScheme:{$visitor:Dv,fixedFields:{type:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"},name:{$ref:"#/visitors/value"},in:{$ref:"#/visitors/value"},scheme:{$ref:"#/visitors/value"},bearerFormat:{$ref:"#/visitors/value"},flows:{$ref:"#/visitors/document/objects/OAuthFlows"},openIdConnectUrl:{$ref:"#/visitors/value"}}},OAuthFlows:{$visitor:Lv,fixedFields:{implicit:{$ref:"#/visitors/document/objects/OAuthFlow"},password:{$ref:"#/visitors/document/objects/OAuthFlow"},clientCredentials:{$ref:"#/visitors/document/objects/OAuthFlow"},authorizationCode:{$ref:"#/visitors/document/objects/OAuthFlow"}}},OAuthFlow:{$visitor:Fv,fixedFields:{authorizationUrl:{$ref:"#/visitors/value"},tokenUrl:{$ref:"#/visitors/value"},refreshUrl:{$ref:"#/visitors/value"},scopes:$v}},SecurityRequirement:{$visitor:_g}},extension:{$visitor:pm}}}},src_traversal_visitor_getNodeType=s=>{if(Cu(s))return`${s.element.charAt(0).toUpperCase()+s.element.slice(1)}Element`},Wv={CallbackElement:["content"],ComponentsElement:["content"],ContactElement:["content"],DiscriminatorElement:["content"],Encoding:["content"],Example:["content"],ExternalDocumentationElement:["content"],HeaderElement:["content"],InfoElement:["content"],LicenseElement:["content"],MediaTypeElement:["content"],OAuthFlowElement:["content"],OAuthFlowsElement:["content"],OpenApi3_0Element:["content"],OperationElement:["content"],ParameterElement:["content"],PathItemElement:["content"],PathsElement:["content"],ReferenceElement:["content"],RequestBodyElement:["content"],ResponseElement:["content"],ResponsesElement:["content"],SchemaElement:["content"],SecurityRequirementElement:["content"],SecuritySchemeElement:["content"],ServerElement:["content"],ServerVariableElement:["content"],TagElement:["content"],...np},Jv={namespace:s=>{const{base:o}=s;return o.register("callback",Xp),o.register("components",Qp),o.register("contact",Zp),o.register("discriminator",th),o.register("encoding",rh),o.register("example",uh),o.register("externalDocumentation",dh),o.register("header",fh),o.register("info",vh),o.register("license",_h),o.register("link",wh),o.register("mediaType",Oh),o.register("oAuthFlow",jh),o.register("oAuthFlows",Ph),o.register("openapi",Ih),o.register("openApi3_0",Rh),o.register("operation",Dh),o.register("parameter",Lh),o.register("pathItem",Fh),o.register("paths",Jh),o.register("reference",Hh),o.register("requestBody",Kh),o.register("response",Gh),o.register("responses",Qh),o.register("schema",Ff),o.register("securityRequirement",Vf),o.register("securityScheme",Wf),o.register("server",Jf),o.register("serverVariable",Hf),o.register("tag",Gf),o.register("xml",Xf),o}},Hv=Jv,src_refractor_toolbox=()=>{const s=createNamespace(Hv);return{predicates:{...ce,isElement:Cu,isStringElement:ju,isArrayElement:Mu,isObjectElement:Nu,isMemberElement:Ru,includesClasses,hasElementSourceMap},namespace:s}},src_refractor_refract=(s,{specPath:o=["visitors","document","objects","OpenApi","$visitor"],plugins:i=[]}={})=>{const a=(0,Su.e)(s),u=dereference(zv),_=new(Qu(o,u))({specObj:u});return visitor_visit(a,_),dispatchPluginsSync(_.element,i,{toolboxCreator:src_refractor_toolbox,visitorOptions:{keyMap:Wv,nodeTypeGetter:src_traversal_visitor_getNodeType}})},src_refractor_createRefractor=s=>(o,i={})=>src_refractor_refract(o,{specPath:s,...i});Xp.refract=src_refractor_createRefractor(["visitors","document","objects","Callback","$visitor"]),Qp.refract=src_refractor_createRefractor(["visitors","document","objects","Components","$visitor"]),Zp.refract=src_refractor_createRefractor(["visitors","document","objects","Contact","$visitor"]),uh.refract=src_refractor_createRefractor(["visitors","document","objects","Example","$visitor"]),th.refract=src_refractor_createRefractor(["visitors","document","objects","Discriminator","$visitor"]),rh.refract=src_refractor_createRefractor(["visitors","document","objects","Encoding","$visitor"]),dh.refract=src_refractor_createRefractor(["visitors","document","objects","ExternalDocumentation","$visitor"]),fh.refract=src_refractor_createRefractor(["visitors","document","objects","Header","$visitor"]),vh.refract=src_refractor_createRefractor(["visitors","document","objects","Info","$visitor"]),_h.refract=src_refractor_createRefractor(["visitors","document","objects","License","$visitor"]),wh.refract=src_refractor_createRefractor(["visitors","document","objects","Link","$visitor"]),Oh.refract=src_refractor_createRefractor(["visitors","document","objects","MediaType","$visitor"]),jh.refract=src_refractor_createRefractor(["visitors","document","objects","OAuthFlow","$visitor"]),Ph.refract=src_refractor_createRefractor(["visitors","document","objects","OAuthFlows","$visitor"]),Ih.refract=src_refractor_createRefractor(["visitors","document","objects","OpenApi","fixedFields","openapi"]),Rh.refract=src_refractor_createRefractor(["visitors","document","objects","OpenApi","$visitor"]),Dh.refract=src_refractor_createRefractor(["visitors","document","objects","Operation","$visitor"]),Lh.refract=src_refractor_createRefractor(["visitors","document","objects","Parameter","$visitor"]),Fh.refract=src_refractor_createRefractor(["visitors","document","objects","PathItem","$visitor"]),Jh.refract=src_refractor_createRefractor(["visitors","document","objects","Paths","$visitor"]),Hh.refract=src_refractor_createRefractor(["visitors","document","objects","Reference","$visitor"]),Kh.refract=src_refractor_createRefractor(["visitors","document","objects","RequestBody","$visitor"]),Gh.refract=src_refractor_createRefractor(["visitors","document","objects","Response","$visitor"]),Qh.refract=src_refractor_createRefractor(["visitors","document","objects","Responses","$visitor"]),Ff.refract=src_refractor_createRefractor(["visitors","document","objects","Schema","$visitor"]),Vf.refract=src_refractor_createRefractor(["visitors","document","objects","SecurityRequirement","$visitor"]),Wf.refract=src_refractor_createRefractor(["visitors","document","objects","SecurityScheme","$visitor"]),Jf.refract=src_refractor_createRefractor(["visitors","document","objects","Server","$visitor"]),Hf.refract=src_refractor_createRefractor(["visitors","document","objects","ServerVariable","$visitor"]),Gf.refract=src_refractor_createRefractor(["visitors","document","objects","Tag","$visitor"]),Xf.refract=src_refractor_createRefractor(["visitors","document","objects","XML","$visitor"]);const Kv=class Callback_Callback extends Xp{};const Gv=class Components_Components extends Qp{get pathItems(){return this.get("pathItems")}set pathItems(s){this.set("pathItems",s)}};const Yv=class Contact_Contact extends Zp{};const Xv=class Discriminator_Discriminator extends th{};const Qv=class Encoding_Encoding extends rh{};const Zv=class Example_Example extends uh{};const eb=class ExternalDocumentation_ExternalDocumentation extends dh{};const tb=class Header_Header extends fh{get schema(){return this.get("schema")}set schema(s){this.set("schema",s)}};const nb=class Info_Info extends vh{get license(){return this.get("license")}set license(s){this.set("license",s)}get summary(){return this.get("summary")}set summary(s){this.set("summary",s)}};class JsonSchemaDialect extends Su.Om{static default=new JsonSchemaDialect("https://spec.openapis.org/oas/3.1/dialect/base");constructor(s,o,i){super(s,o,i),this.element="jsonSchemaDialect"}}const pb=JsonSchemaDialect;const mb=class License_License extends _h{get identifier(){return this.get("identifier")}set identifier(s){this.set("identifier",s)}};const yb=class Link_Link extends wh{};const _b=class MediaType_MediaType extends Oh{get schema(){return this.get("schema")}set schema(s){this.set("schema",s)}};const Sb=class OAuthFlow_OAuthFlow extends jh{};const wb=class OAuthFlows_OAuthFlows extends Ph{};const Ob=class Openapi_Openapi extends Ih{};class OpenApi3_1 extends Su.Sh{constructor(s,o,i){super(s,o,i),this.element="openApi3_1",this.classes.push("api")}get openapi(){return this.get("openapi")}set openapi(s){this.set("openapi",s)}get info(){return this.get("info")}set info(s){this.set("info",s)}get jsonSchemaDialect(){return this.get("jsonSchemaDialect")}set jsonSchemaDialect(s){this.set("jsonSchemaDialect",s)}get servers(){return this.get("servers")}set servers(s){this.set("servers",s)}get paths(){return this.get("paths")}set paths(s){this.set("paths",s)}get components(){return this.get("components")}set components(s){this.set("components",s)}get security(){return this.get("security")}set security(s){this.set("security",s)}get tags(){return this.get("tags")}set tags(s){this.set("tags",s)}get externalDocs(){return this.get("externalDocs")}set externalDocs(s){this.set("externalDocs",s)}get webhooks(){return this.get("webhooks")}set webhooks(s){this.set("webhooks",s)}}const Ab=OpenApi3_1;const Pb=class Operation_Operation extends Dh{get requestBody(){return this.get("requestBody")}set requestBody(s){this.set("requestBody",s)}};const Ib=class Parameter_Parameter extends Lh{get schema(){return this.get("schema")}set schema(s){this.set("schema",s)}};const Mb=class PathItem_PathItem extends Fh{get GET(){return this.get("get")}set GET(s){this.set("GET",s)}get PUT(){return this.get("put")}set PUT(s){this.set("PUT",s)}get POST(){return this.get("post")}set POST(s){this.set("POST",s)}get DELETE(){return this.get("delete")}set DELETE(s){this.set("DELETE",s)}get OPTIONS(){return this.get("options")}set OPTIONS(s){this.set("OPTIONS",s)}get HEAD(){return this.get("head")}set HEAD(s){this.set("HEAD",s)}get PATCH(){return this.get("patch")}set PATCH(s){this.set("PATCH",s)}get TRACE(){return this.get("trace")}set TRACE(s){this.set("TRACE",s)}};const Rb=class Paths_Paths extends Jh{};class Reference_Reference extends Hh{}Object.defineProperty(Reference_Reference.prototype,"description",{get(){return this.get("description")},set(s){this.set("description",s)},enumerable:!0}),Object.defineProperty(Reference_Reference.prototype,"summary",{get(){return this.get("summary")},set(s){this.set("summary",s)},enumerable:!0});const Lb=Reference_Reference;const qb=class RequestBody_RequestBody extends Kh{};const zb=class elements_Response_Response extends Gh{};const Qb=class Responses_Responses extends Qh{};const e_=class JSONSchema_JSONSchema extends sd{constructor(s,o,i){super(s,o,i),this.element="JSONSchemaDraft6"}get idProp(){throw new td("id keyword from Core vocabulary has been renamed to $id.")}set idProp(s){throw new td("id keyword from Core vocabulary has been renamed to $id.")}get $id(){return this.get("$id")}set $id(s){this.set("$id",s)}get exclusiveMaximum(){return this.get("exclusiveMaximum")}set exclusiveMaximum(s){this.set("exclusiveMaximum",s)}get exclusiveMinimum(){return this.get("exclusiveMinimum")}set exclusiveMinimum(s){this.set("exclusiveMinimum",s)}get containsProp(){return this.get("contains")}set containsProp(s){this.set("contains",s)}get items(){return this.get("items")}set items(s){this.set("items",s)}get propertyNames(){return this.get("propertyNames")}set propertyNames(s){this.set("propertyNames",s)}get const(){return this.get("const")}set const(s){this.set("const",s)}get not(){return this.get("not")}set not(s){this.set("not",s)}get examples(){return this.get("examples")}set examples(s){this.set("examples",s)}};const t_=class LinkDescription_LinkDescription extends ld{get hrefSchema(){return this.get("hrefSchema")}set hrefSchema(s){this.set("hrefSchema",s)}get targetSchema(){return this.get("targetSchema")}set targetSchema(s){this.set("targetSchema",s)}get schema(){throw new td("schema keyword from Hyper-Schema vocabulary has been renamed to submissionSchema.")}set schema(s){throw new td("schema keyword from Hyper-Schema vocabulary has been renamed to submissionSchema.")}get submissionSchema(){return this.get("submissionSchema")}set submissionSchema(s){this.set("submissionSchema",s)}get method(){throw new td("method keyword from Hyper-Schema vocabulary has been removed.")}set method(s){throw new td("method keyword from Hyper-Schema vocabulary has been removed.")}get encType(){throw new td("encType keyword from Hyper-Schema vocabulary has been renamed to submissionEncType.")}set encType(s){throw new td("encType keyword from Hyper-Schema vocabulary has been renamed to submissionEncType.")}get submissionEncType(){return this.get("submissionEncType")}set submissionEncType(s){this.set("submissionEncType",s)}};var r_=_curry3((function assocPath(s,o,i){if(0===s.length)return o;var a=s[0];if(s.length>1){var u=!Ju(i)&&_has(a,i)&&"object"==typeof i[a]?i[a]:Xo(s[1])?[]:{};o=assocPath(Array.prototype.slice.call(s,1),o,u)}return function _assoc(s,o,i){if(Xo(s)&&ca(i)){var a=[].concat(i);return a[s]=o,a}var u={};for(var _ in i)u[_]=i[_];return u[s]=o,u}(a,o,i)}));const n_=r_;var s_=_curry3((function remove(s,o,i){var a=Array.prototype.slice.call(i,0);return a.splice(s,o),a}));const o_=s_;var i_=_curry3((function assoc(s,o,i){return n_([s],o,i)}));const a_=i_;var c_=_curry2((function dissocPath(s,o){if(null==o)return o;switch(s.length){case 0:return o;case 1:return function _dissoc(s,o){if(null==o)return o;if(Xo(s)&&ca(o))return o_(s,1,o);var i={};for(var a in o)i[a]=o[a];return delete i[s],i}(s[0],o);default:var i=s[0],a=Array.prototype.slice.call(s,1);return null==o[i]?function _shallowCloneObject(s,o){if(Xo(s)&&ca(o))return[].concat(o);var i={};for(var a in o)i[a]=o[a];return i}(i,o):a_(i,dissocPath(a,o[i]),o)}}));const l_=c_;const u_=class json_schema_JSONSchemaVisitor extends $d{constructor(s){super(s),this.element=new e_}get defaultDialectIdentifier(){return"http://json-schema.org/draft-06/schema#"}BooleanElement(s){const o=this.enter(s);return this.element.classes.push("boolean-json-schema"),o}handleSchemaIdentifier(s,o="$id"){return super.handleSchemaIdentifier(s,o)}};const p_=class json_schema_ItemsVisitor_ItemsVisitor extends Ud{BooleanElement(s){return this.element=this.toRefractedElement(["document","objects","JSONSchema"],s),qu}};const h_=class json_schema_ExamplesVisitor_ExamplesVisitor extends yd{ArrayElement(s){const o=this.enter(s);return this.element.classes.push("json-schema-examples"),o}};const d_=class link_description_LinkDescriptionVisitor extends Pf{constructor(s){super(s),this.element=new t_}},f_=pipe(n_(["visitors","document","objects","JSONSchema","$visitor"],u_),l_(["visitors","document","objects","JSONSchema","fixedFields","id"]),n_(["visitors","document","objects","JSONSchema","fixedFields","$id"],Tf.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","contains"],Tf.visitors.JSONSchemaOrJSONReferenceVisitor),n_(["visitors","document","objects","JSONSchema","fixedFields","items"],p_),n_(["visitors","document","objects","JSONSchema","fixedFields","propertyNames"],Tf.visitors.JSONSchemaOrJSONReferenceVisitor),n_(["visitors","document","objects","JSONSchema","fixedFields","const"],Tf.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","examples"],h_),n_(["visitors","document","objects","LinkDescription","$visitor"],d_),n_(["visitors","document","objects","LinkDescription","fixedFields","hrefSchema"],Tf.visitors.JSONSchemaOrJSONReferenceVisitor),l_(["visitors","document","objects","LinkDescription","fixedFields","schema"]),n_(["visitors","document","objects","LinkDescription","fixedFields","submissionSchema"],Tf.visitors.JSONSchemaOrJSONReferenceVisitor),l_(["visitors","document","objects","LinkDescription","fixedFields","method"]),l_(["visitors","document","objects","LinkDescription","fixedFields","encType"]),n_(["visitors","document","objects","LinkDescription","fixedFields","submissionEncType"],Tf.visitors.value))(Tf),m_={JSONSchemaDraft6Element:["content"],JSONReferenceElement:["content"],MediaElement:["content"],LinkDescriptionElement:["content"],...np},g_=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof e_||s(a)&&o("JSONSchemaDraft6",a)&&i("object",a))),y_=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof t_||s(a)&&o("linkDescription",a)&&i("object",a))),v_={namespace:s=>{const{base:o}=s;return o.register("jSONSchemaDraft6",e_),o.register("jSONReference",id),o.register("media",cd),o.register("linkDescription",t_),o}},b_=v_,apidom_ns_json_schema_draft_6_src_refractor_toolbox=()=>{const s=createNamespace(b_);return{predicates:{...le,isStringElement:ju},namespace:s}},apidom_ns_json_schema_draft_6_src_refractor_refract=(s,{specPath:o=["visitors","document","objects","JSONSchema","$visitor"],plugins:i=[],specificationObj:a=f_}={})=>{const u=(0,Su.e)(s),_=dereference(a),w=new(Qu(o,_))({specObj:_});return visitor_visit(u,w),dispatchPluginsSync(w.element,i,{toolboxCreator:apidom_ns_json_schema_draft_6_src_refractor_toolbox,visitorOptions:{keyMap:m_,nodeTypeGetter:traversal_visitor_getNodeType}})},apidom_ns_json_schema_draft_6_src_refractor_createRefractor=s=>(o,i={})=>apidom_ns_json_schema_draft_6_src_refractor_refract(o,{specPath:s,...i});e_.refract=apidom_ns_json_schema_draft_6_src_refractor_createRefractor(["visitors","document","objects","JSONSchema","$visitor"]),t_.refract=apidom_ns_json_schema_draft_6_src_refractor_createRefractor(["visitors","document","objects","LinkDescription","$visitor"]);const S_=class elements_JSONSchema_JSONSchema extends e_{constructor(s,o,i){super(s,o,i),this.element="JSONSchemaDraft7"}get $comment(){return this.get("$comment")}set $comment(s){this.set("$comment",s)}get items(){return this.get("items")}set items(s){this.set("items",s)}get if(){return this.get("if")}set if(s){this.set("if",s)}get then(){return this.get("then")}set then(s){this.set("then",s)}get else(){return this.get("else")}set else(s){this.set("else",s)}get not(){return this.get("not")}set not(s){this.set("not",s)}get contentEncoding(){return this.get("contentEncoding")}set contentEncoding(s){this.set("contentEncoding",s)}get contentMediaType(){return this.get("contentMediaType")}set contentMediaType(s){this.set("contentMediaType",s)}get media(){throw new td('media keyword from Hyper-Schema vocabulary has been moved to validation vocabulary as "contentMediaType" / "contentEncoding"')}set media(s){throw new td('media keyword from Hyper-Schema vocabulary has been moved to validation vocabulary as "contentMediaType" / "contentEncoding"')}get writeOnly(){return this.get("writeOnly")}set writeOnly(s){this.set("writeOnly",s)}};const E_=class elements_LinkDescription_LinkDescription extends t_{get anchor(){return this.get("anchor")}set anchor(s){this.set("anchor",s)}get anchorPointer(){return this.get("anchorPointer")}set anchorPointer(s){this.set("anchorPointer",s)}get templatePointers(){return this.get("templatePointers")}set templatePointers(s){this.set("templatePointers",s)}get templateRequired(){return this.get("templateRequired")}set templateRequired(s){this.set("templateRequired",s)}get targetSchema(){return this.get("targetSchema")}set targetSchema(s){this.set("targetSchema",s)}get mediaType(){throw new td("mediaType keyword from Hyper-Schema vocabulary has been renamed to targetMediaType.")}set mediaType(s){throw new td("mediaType keyword from Hyper-Schema vocabulary has been renamed to targetMediaType.")}get targetMediaType(){return this.get("targetMediaType")}set targetMediaType(s){this.set("targetMediaType",s)}get targetHints(){return this.get("targetHints")}set targetHints(s){this.set("targetHints",s)}get description(){return this.get("description")}set description(s){this.set("description",s)}get $comment(){return this.get("$comment")}set $comment(s){this.set("$comment",s)}get hrefSchema(){return this.get("hrefSchema")}set hrefSchema(s){this.set("hrefSchema",s)}get headerSchema(){return this.get("headerSchema")}set headerSchema(s){this.set("headerSchema",s)}get submissionSchema(){return this.get("submissionSchema")}set submissionSchema(s){this.set("submissionSchema",s)}get submissionEncType(){throw new td("submissionEncType keyword from Hyper-Schema vocabulary has been renamed to submissionMediaType.")}set submissionEncType(s){throw new td("submissionEncType keyword from Hyper-Schema vocabulary has been renamed to submissionMediaType.")}get submissionMediaType(){return this.get("submissionMediaType")}set submissionMediaType(s){this.set("submissionMediaType",s)}};const w_=class visitors_json_schema_JSONSchemaVisitor extends u_{constructor(s){super(s),this.element=new S_}get defaultDialectIdentifier(){return"http://json-schema.org/draft-07/schema#"}};const x_=class json_schema_link_description_LinkDescriptionVisitor extends d_{constructor(s){super(s),this.element=new E_}},k_=pipe(n_(["visitors","document","objects","JSONSchema","$visitor"],w_),n_(["visitors","document","objects","JSONSchema","fixedFields","$comment"],f_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","if"],f_.visitors.JSONSchemaOrJSONReferenceVisitor),n_(["visitors","document","objects","JSONSchema","fixedFields","then"],f_.visitors.JSONSchemaOrJSONReferenceVisitor),n_(["visitors","document","objects","JSONSchema","fixedFields","else"],f_.visitors.JSONSchemaOrJSONReferenceVisitor),l_(["visitors","document","objects","JSONSchema","fixedFields","media"]),n_(["visitors","document","objects","JSONSchema","fixedFields","contentEncoding"],f_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","contentMediaType"],f_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","writeOnly"],f_.visitors.value),n_(["visitors","document","objects","LinkDescription","$visitor"],x_),n_(["visitors","document","objects","LinkDescription","fixedFields","anchor"],f_.visitors.value),n_(["visitors","document","objects","LinkDescription","fixedFields","anchorPointer"],f_.visitors.value),l_(["visitors","document","objects","LinkDescription","fixedFields","mediaType"]),n_(["visitors","document","objects","LinkDescription","fixedFields","targetMediaType"],f_.visitors.value),n_(["visitors","document","objects","LinkDescription","fixedFields","targetHints"],f_.visitors.value),n_(["visitors","document","objects","LinkDescription","fixedFields","description"],f_.visitors.value),n_(["visitors","document","objects","LinkDescription","fixedFields","$comment"],f_.visitors.value),n_(["visitors","document","objects","LinkDescription","fixedFields","headerSchema"],f_.visitors.JSONSchemaOrJSONReferenceVisitor),l_(["visitors","document","objects","LinkDescription","fixedFields","submissionEncType"]),n_(["visitors","document","objects","LinkDescription","fixedFields","submissionMediaType"],f_.visitors.value))(f_),O_={JSONSchemaDraft7Element:["content"],JSONReferenceElement:["content"],LinkDescriptionElement:["content"],...np},A_=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof S_||s(a)&&o("JSONSchemaDraft7",a)&&i("object",a))),C_=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof E_||s(a)&&o("linkDescription",a)&&i("object",a))),j_={namespace:s=>{const{base:o}=s;return o.register("jSONSchemaDraft7",S_),o.register("jSONReference",id),o.register("linkDescription",E_),o}},P_=j_,apidom_ns_json_schema_draft_7_src_refractor_toolbox=()=>{const s=createNamespace(P_);return{predicates:{...pe,isStringElement:ju},namespace:s}},apidom_ns_json_schema_draft_7_src_refractor_refract=(s,{specPath:o=["visitors","document","objects","JSONSchema","$visitor"],plugins:i=[],specificationObj:a=k_}={})=>{const u=(0,Su.e)(s),_=dereference(a),w=new(Qu(o,_))({specObj:_});return visitor_visit(u,w),dispatchPluginsSync(w.element,i,{toolboxCreator:apidom_ns_json_schema_draft_7_src_refractor_toolbox,visitorOptions:{keyMap:O_,nodeTypeGetter:traversal_visitor_getNodeType}})},apidom_ns_json_schema_draft_7_src_refractor_createRefractor=s=>(o,i={})=>apidom_ns_json_schema_draft_7_src_refractor_refract(o,{specPath:s,...i});S_.refract=apidom_ns_json_schema_draft_7_src_refractor_createRefractor(["visitors","document","objects","JSONSchema","$visitor"]),E_.refract=apidom_ns_json_schema_draft_7_src_refractor_createRefractor(["visitors","document","objects","LinkDescription","$visitor"]);const I_=class src_elements_JSONSchema_JSONSchema extends S_{constructor(s,o,i){super(s,o,i),this.element="JSONSchema201909"}get $vocabulary(){return this.get("$vocabulary")}set $vocabulary(s){this.set("$vocabulary",s)}get $anchor(){return this.get("$anchor")}set $anchor(s){this.set("$anchor",s)}get $recursiveAnchor(){return this.get("$recursiveAnchor")}set $recursiveAnchor(s){this.set("$recursiveAnchor",s)}get $recursiveRef(){return this.get("$recursiveRef")}set $recursiveRef(s){this.set("$recursiveRef",s)}get $ref(){return this.get("$ref")}set $ref(s){this.set("$ref",s)}get $defs(){return this.get("$defs")}set $defs(s){this.set("$defs",s)}get definitions(){throw new td("definitions keyword from Validation vocabulary has been renamed to $defs.")}set definitions(s){throw new td("definitions keyword from Validation vocabulary has been renamed to $defs.")}get not(){return this.get("not")}set not(s){this.set("not",s)}get if(){return this.get("if")}set if(s){this.set("if",s)}get then(){return this.get("then")}set then(s){this.set("then",s)}get else(){return this.get("else")}set else(s){this.set("else",s)}get dependentSchemas(){return this.get("dependentSchemas")}set dependentSchemas(s){this.set("dependentSchemas",s)}get dependencies(){throw new td("dependencies keyword from Validation vocabulary has been renamed to dependentSchemas.")}set dependencies(s){throw new td("dependencies keyword from Validation vocabulary has been renamed to dependentSchemas.")}get items(){return this.get("items")}set items(s){this.set("items",s)}get containsProp(){return this.get("contains")}set containsProp(s){this.set("contains",s)}get additionalProperties(){return this.get("additionalProperties")}set additionalProperties(s){this.set("additionalProperties",s)}get additionalItems(){return this.get("additionalItems")}set additionalItems(s){this.set("additionalItems",s)}get propertyNames(){return this.get("propertyNames")}set propertyNames(s){this.set("propertyNames",s)}get unevaluatedItems(){return this.get("unevaluatedItems")}set unevaluatedItems(s){this.set("unevaluatedItems",s)}get unevaluatedProperties(){return this.get("unevaluatedProperties")}set unevaluatedProperties(s){this.set("unevaluatedProperties",s)}get maxContains(){return this.get("maxContains")}set maxContains(s){this.set("maxContains",s)}get minContains(){return this.get("minContains")}set minContains(s){this.set("minContains",s)}get dependentRequired(){return this.get("dependentRequired")}set dependentRequired(s){this.set("dependentRequired",s)}get deprecated(){return this.get("deprecated")}set deprecated(s){this.set("deprecated",s)}get contentSchema(){return this.get("contentSchema")}set contentSchema(s){this.set("contentSchema",s)}};const T_=class src_elements_LinkDescription_LinkDescription extends E_{get targetSchema(){return this.get("targetSchema")}set targetSchema(s){this.set("targetSchema",s)}get hrefSchema(){return this.get("hrefSchema")}set hrefSchema(s){this.set("hrefSchema",s)}get headerSchema(){return this.get("headerSchema")}set headerSchema(s){this.set("headerSchema",s)}get submissionSchema(){return this.get("submissionSchema")}set submissionSchema(s){this.set("submissionSchema",s)}};const N_=class refractor_visitors_json_schema_JSONSchemaVisitor extends w_{constructor(s){super(s),this.element=new I_}get defaultDialectIdentifier(){return"https://json-schema.org/draft/2019-09/schema"}ObjectElement(s){this.handleDialectIdentifier(s),this.handleSchemaIdentifier(s),this.parent=this.element;const o=Md.prototype.ObjectElement.call(this,s);return ju(this.element.$ref)&&(this.element.classes.push("reference-element"),this.element.setMetaProperty("referenced-element","schema")),o}};const M_=class $vocabularyVisitor extends yd{ObjectElement(s){const o=super.enter(s);return this.element.classes.push("json-schema-$vocabulary"),o}};const R_=class $refVisitor extends yd{StringElement(s){const o=super.enter(s);return this.element.classes.push("reference-value"),o}};class $defsVisitor extends(Mixin(Jd,Rd,yd)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("json-schema-$defs"),this.specPath=fc(["document","objects","JSONSchema"])}}const D_=$defsVisitor;class json_schema_AllOfVisitor_AllOfVisitor extends(Mixin(Nd,Rd,yd)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("json-schema-allOf")}ArrayElement(s){return s.forEach((s=>{const o=this.toRefractedElement(["document","objects","JSONSchema"],s);this.element.push(o)})),this.copyMetaAndAttributes(s,this.element),qu}}const L_=json_schema_AllOfVisitor_AllOfVisitor;class json_schema_AnyOfVisitor_AnyOfVisitor extends(Mixin(Nd,Rd,yd)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("json-schema-anyOf")}ArrayElement(s){return s.forEach((s=>{const o=this.toRefractedElement(["document","objects","JSONSchema"],s);this.element.push(o)})),this.copyMetaAndAttributes(s,this.element),qu}}const F_=json_schema_AnyOfVisitor_AnyOfVisitor;class json_schema_OneOfVisitor_OneOfVisitor extends(Mixin(Nd,Rd,yd)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("json-schema-oneOf")}ArrayElement(s){return s.forEach((s=>{const o=this.toRefractedElement(["document","objects","JSONSchema"],s);this.element.push(o)})),this.copyMetaAndAttributes(s,this.element),qu}}const B_=json_schema_OneOfVisitor_OneOfVisitor;class DependentSchemasVisitor extends(Mixin(Jd,Rd,yd)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("json-schema-dependentSchemas"),this.specPath=fc(["document","objects","JSONSchema"])}}const $_=DependentSchemasVisitor;class visitors_json_schema_ItemsVisitor_ItemsVisitor extends(Mixin(Nd,Rd,yd)){ObjectElement(s){return this.element=this.toRefractedElement(["document","objects","JSONSchema"],s),qu}ArrayElement(s){return this.element=new Su.wE,this.element.classes.push("json-schema-items"),s.forEach((s=>{const o=this.toRefractedElement(["document","objects","JSONSchema"],s);this.element.push(o)})),this.copyMetaAndAttributes(s,this.element),qu}BooleanElement(s){return this.element=this.toRefractedElement(["document","objects","JSONSchema"],s),qu}}const q_=visitors_json_schema_ItemsVisitor_ItemsVisitor;class json_schema_PropertiesVisitor_PropertiesVisitor extends(Mixin(Jd,Rd,yd)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("json-schema-properties"),this.specPath=fc(["document","objects","JSONSchema"])}}const U_=json_schema_PropertiesVisitor_PropertiesVisitor;class PatternPropertiesVisitor_PatternPropertiesVisitor extends(Mixin(Jd,Rd,yd)){constructor(s){super(s),this.element=new Su.Sh,this.element.classes.push("json-schema-patternProperties"),this.specPath=fc(["document","objects","JSONSchema"])}}const V_=PatternPropertiesVisitor_PatternPropertiesVisitor;const z_=class DependentRequiredVisitor extends yd{ObjectElement(s){const o=super.enter(s);return this.element.classes.push("json-schema-dependentRequired"),o}};const W_=class visitors_json_schema_link_description_LinkDescriptionVisitor extends x_{constructor(s){super(s),this.element=new T_}},J_=pipe(n_(["visitors","document","objects","JSONSchema","$visitor"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","$vocabulary"],M_),n_(["visitors","document","objects","JSONSchema","fixedFields","$anchor"],k_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","$recursiveAnchor"],k_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","$recursiveRef"],k_.visitors.value),l_(["visitors","document","objects","JSONReference","$visitor"]),n_(["visitors","document","objects","JSONSchema","fixedFields","$ref"],R_),l_(["visitors","document","objects","JSONSchema","fixedFields","definitions"]),n_(["visitors","document","objects","JSONSchema","fixedFields","$defs"],D_),n_(["visitors","document","objects","JSONSchema","fixedFields","allOf"],L_),n_(["visitors","document","objects","JSONSchema","fixedFields","anyOf"],F_),n_(["visitors","document","objects","JSONSchema","fixedFields","oneOf"],B_),n_(["visitors","document","objects","JSONSchema","fixedFields","not"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","if"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","then"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","else"],N_),l_(["visitors","document","objects","JSONSchema","fixedFields","dependencies"]),n_(["visitors","document","objects","JSONSchema","fixedFields","dependentSchemas"],$_),n_(["visitors","document","objects","JSONSchema","fixedFields","items"],q_),n_(["visitors","document","objects","JSONSchema","fixedFields","contains"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","properties"],U_),n_(["visitors","document","objects","JSONSchema","fixedFields","patternProperties"],V_),n_(["visitors","document","objects","JSONSchema","fixedFields","additionalProperties"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","additionalItems"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","propertyNames"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","unevaluatedItems"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","unevaluatedProperties"],N_),n_(["visitors","document","objects","JSONSchema","fixedFields","maxContains"],k_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","minContains"],k_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","dependentRequired"],z_),n_(["visitors","document","objects","JSONSchema","fixedFields","deprecated"],k_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","contentSchema"],N_),n_(["visitors","document","objects","LinkDescription","$visitor"],W_),n_(["visitors","document","objects","LinkDescription","fixedFields","targetSchema"],N_),n_(["visitors","document","objects","LinkDescription","fixedFields","hrefSchema"],N_),n_(["visitors","document","objects","LinkDescription","fixedFields","headerSchema"],N_),n_(["visitors","document","objects","LinkDescription","fixedFields","submissionSchema"],N_))(k_),H_={JSONSchema201909Element:["content"],LinkDescriptionElement:["content"],...np},K_=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof I_||s(a)&&o("JSONSchema201909",a)&&i("object",a))),G_=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof T_||s(a)&&o("linkDescription",a)&&i("object",a))),Y_={namespace:s=>{const{base:o}=s;return o.register("jSONSchema201909",I_),o.register("linkDescription",T_),o}},X_=Y_,apidom_ns_json_schema_2019_09_src_refractor_toolbox=()=>{const s=createNamespace(X_);return{predicates:{...de,isStringElement:ju},namespace:s}},apidom_ns_json_schema_2019_09_src_refractor_refract=(s,{specPath:o=["visitors","document","objects","JSONSchema","$visitor"],plugins:i=[],specificationObj:a=J_}={})=>{const u=(0,Su.e)(s),_=dereference(a),w=new(Qu(o,_))({specObj:_});return visitor_visit(u,w),dispatchPluginsSync(w.element,i,{toolboxCreator:apidom_ns_json_schema_2019_09_src_refractor_toolbox,visitorOptions:{keyMap:H_,nodeTypeGetter:traversal_visitor_getNodeType}})},apidom_ns_json_schema_2019_09_src_refractor_createRefractor=s=>(o,i={})=>apidom_ns_json_schema_2019_09_src_refractor_refract(o,{specPath:s,...i});I_.refract=apidom_ns_json_schema_2019_09_src_refractor_createRefractor(["visitors","document","objects","JSONSchema","$visitor"]),T_.refract=apidom_ns_json_schema_2019_09_src_refractor_createRefractor(["visitors","document","objects","LinkDescription","$visitor"]);const Q_=class apidom_ns_json_schema_2020_12_src_elements_JSONSchema_JSONSchema extends I_{constructor(s,o,i){super(s,o,i),this.element="JSONSchema202012"}get $dynamicAnchor(){return this.get("$dynamicAnchor")}set $dynamicAnchor(s){this.set("$dynamicAnchor",s)}get $recursiveAnchor(){throw new td("$recursiveAnchor keyword from Core vocabulary has been renamed to $dynamicAnchor.")}set $recursiveAnchor(s){throw new td("$recursiveAnchor keyword from Core vocabulary has been renamed to $dynamicAnchor.")}get $dynamicRef(){return this.get("$dynamicRef")}set $dynamicRef(s){this.set("$dynamicRef",s)}get $recursiveRef(){throw new td("$recursiveRef keyword from Core vocabulary has been renamed to $dynamicRef.")}set $recursiveRef(s){throw new td("$recursiveRef keyword from Core vocabulary has been renamed to $dynamicRef.")}get prefixItems(){return this.get("prefixItems")}set prefixItems(s){this.set("prefixItems",s)}};const Z_=class apidom_ns_json_schema_2020_12_src_elements_LinkDescription_LinkDescription extends T_{get targetSchema(){return this.get("targetSchema")}set targetSchema(s){this.set("targetSchema",s)}get hrefSchema(){return this.get("hrefSchema")}set hrefSchema(s){this.set("hrefSchema",s)}get headerSchema(){return this.get("headerSchema")}set headerSchema(s){this.set("headerSchema",s)}get submissionSchema(){return this.get("submissionSchema")}set submissionSchema(s){this.set("submissionSchema",s)}};const eS=class src_refractor_visitors_json_schema_JSONSchemaVisitor extends N_{constructor(s){super(s),this.element=new Q_}get defaultDialectIdentifier(){return"https://json-schema.org/draft/2020-12/schema"}};class PrefixItemsVisitor extends(Mixin(Nd,Rd,yd)){constructor(s){super(s),this.element=new Su.wE,this.element.classes.push("json-schema-prefixItems")}ArrayElement(s){return s.forEach((s=>{const o=this.toRefractedElement(["document","objects","JSONSchema"],s);this.element.push(o)})),this.copyMetaAndAttributes(s,this.element),qu}}const tS=PrefixItemsVisitor;const rS=class refractor_visitors_json_schema_link_description_LinkDescriptionVisitor extends W_{constructor(s){super(s),this.element=new Z_}},nS=pipe(n_(["visitors","document","objects","JSONSchema","$visitor"],eS),l_(["visitors","document","objects","JSONSchema","fixedFields","$recursiveAnchor"]),n_(["visitors","document","objects","JSONSchema","fixedFields","$dynamicAnchor"],J_.visitors.value),l_(["visitors","document","objects","JSONSchema","fixedFields","$recursiveRef"]),n_(["visitors","document","objects","JSONSchema","fixedFields","$dynamicRef"],J_.visitors.value),n_(["visitors","document","objects","JSONSchema","fixedFields","not"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","if"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","then"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","else"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","prefixItems"],tS),n_(["visitors","document","objects","JSONSchema","fixedFields","items"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","contains"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","additionalProperties"],eS),l_(["visitors","document","objects","JSONSchema","fixedFields","additionalItems"]),n_(["visitors","document","objects","JSONSchema","fixedFields","propertyNames"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","unevaluatedItems"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","unevaluatedProperties"],eS),n_(["visitors","document","objects","JSONSchema","fixedFields","contentSchema"],eS),n_(["visitors","document","objects","LinkDescription","$visitor"],rS),n_(["visitors","document","objects","LinkDescription","fixedFields","targetSchema"],eS),n_(["visitors","document","objects","LinkDescription","fixedFields","hrefSchema"],eS),n_(["visitors","document","objects","LinkDescription","fixedFields","headerSchema"],eS),n_(["visitors","document","objects","LinkDescription","fixedFields","submissionSchema"],eS))(J_),sS={JSONSchema202012Element:["content"],LinkDescriptionElement:["content"],...np},oS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Q_||s(a)&&o("JSONSchema202012",a)&&i("object",a))),iS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Z_||s(a)&&o("linkDescription",a)&&i("object",a))),aS={namespace:s=>{const{base:o}=s;return o.register("jSONSchema202012",Q_),o.register("linkDescription",Z_),o}},cS=aS,apidom_ns_json_schema_2020_12_src_refractor_toolbox=()=>{const s=createNamespace(cS);return{predicates:{...fe,isStringElement:ju},namespace:s}},apidom_ns_json_schema_2020_12_src_refractor_refract=(s,{specPath:o=["visitors","document","objects","JSONSchema","$visitor"],plugins:i=[],specificationObj:a=nS}={})=>{const u=(0,Su.e)(s),_=dereference(a),w=new(Qu(o,_))({specObj:_});return visitor_visit(u,w),dispatchPluginsSync(w.element,i,{toolboxCreator:apidom_ns_json_schema_2020_12_src_refractor_toolbox,visitorOptions:{keyMap:sS,nodeTypeGetter:traversal_visitor_getNodeType}})},apidom_ns_json_schema_2020_12_src_refractor_createRefractor=s=>(o,i={})=>apidom_ns_json_schema_2020_12_src_refractor_refract(o,{specPath:s,...i});Q_.refract=apidom_ns_json_schema_2020_12_src_refractor_createRefractor(["visitors","document","objects","JSONSchema","$visitor"]),Z_.refract=apidom_ns_json_schema_2020_12_src_refractor_createRefractor(["visitors","document","objects","LinkDescription","$visitor"]);const lS=class elements_Schema_Schema extends Q_{constructor(s,o,i){super(s,o,i),this.element="schema"}get discriminator(){return this.get("discriminator")}set discriminator(s){this.set("discriminator",s)}get xml(){return this.get("xml")}set xml(s){this.set("xml",s)}get externalDocs(){return this.get("externalDocs")}set externalDocs(s){this.set("externalDocs",s)}get example(){return this.get("example")}set example(s){this.set("example",s)}};const uS=class SecurityRequirement_SecurityRequirement extends Vf{};const pS=class SecurityScheme_SecurityScheme extends Wf{};const hS=class Server_Server extends Jf{};const dS=class ServerVariable_ServerVariable extends Hf{};const fS=class Tag_Tag extends Gf{};const mS=class Xml_Xml extends Xf{};class OpenApi3_1Visitor extends(Mixin(cm,em)){constructor(s){super(s),this.element=new Ab,this.specPath=fc(["document","objects","OpenApi"]),this.canSupportSpecificationExtensions=!0,this.openApiSemanticElement=this.element}ObjectElement(s){return this.openApiGenericElement=s,cm.prototype.ObjectElement.call(this,s)}}const gS=OpenApi3_1Visitor,yS=zv.visitors.document.objects.Info.$visitor;const vS=class info_InfoVisitor extends yS{constructor(s){super(s),this.element=new nb}},bS=zv.visitors.document.objects.Contact.$visitor;const _S=class contact_ContactVisitor extends bS{constructor(s){super(s),this.element=new Yv}},SS=zv.visitors.document.objects.License.$visitor;const ES=class license_LicenseVisitor extends SS{constructor(s){super(s),this.element=new mb}},wS=zv.visitors.document.objects.Link.$visitor;const xS=class link_LinkVisitor extends wS{constructor(s){super(s),this.element=new yb}};class JsonSchemaDialectVisitor extends(Mixin(tm,em)){StringElement(s){const o=new pb(serializers_value(s));return this.copyMetaAndAttributes(s,o),this.element=o,qu}}const kS=JsonSchemaDialectVisitor,OS=zv.visitors.document.objects.Server.$visitor;const AS=class server_ServerVisitor extends OS{constructor(s){super(s),this.element=new hS}},CS=zv.visitors.document.objects.ServerVariable.$visitor;const jS=class server_variable_ServerVariableVisitor extends CS{constructor(s){super(s),this.element=new dS}},PS=zv.visitors.document.objects.MediaType.$visitor;const IS=class media_type_MediaTypeVisitor extends PS{constructor(s){super(s),this.element=new _b}},TS=zv.visitors.document.objects.SecurityRequirement.$visitor;const NS=class security_requirement_SecurityRequirementVisitor extends TS{constructor(s){super(s),this.element=new uS}},MS=zv.visitors.document.objects.Components.$visitor;const RS=class components_ComponentsVisitor extends MS{constructor(s){super(s),this.element=new Gv}},DS=zv.visitors.document.objects.Tag.$visitor;const LS=class tag_TagVisitor extends DS{constructor(s){super(s),this.element=new fS}},FS=zv.visitors.document.objects.Reference.$visitor;const BS=class reference_ReferenceVisitor extends FS{constructor(s){super(s),this.element=new Lb}},$S=zv.visitors.document.objects.Parameter.$visitor;const qS=class parameter_ParameterVisitor extends $S{constructor(s){super(s),this.element=new Ib}},US=zv.visitors.document.objects.Header.$visitor;const VS=class header_HeaderVisitor extends US{constructor(s){super(s),this.element=new tb}},zS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Kv||s(a)&&o("callback",a)&&i("object",a))),WS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Gv||s(a)&&o("components",a)&&i("object",a))),JS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Yv||s(a)&&o("contact",a)&&i("object",a))),HS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Zv||s(a)&&o("example",a)&&i("object",a))),KS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof eb||s(a)&&o("externalDocumentation",a)&&i("object",a))),GS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof tb||s(a)&&o("header",a)&&i("object",a))),YS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof nb||s(a)&&o("info",a)&&i("object",a))),XS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof pb||s(a)&&o("jsonSchemaDialect",a)&&i("string",a))),QS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof mb||s(a)&&o("license",a)&&i("object",a))),ZS=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof yb||s(a)&&o("link",a)&&i("object",a))),eE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Ob||s(a)&&o("openapi",a)&&i("string",a))),tE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i,hasClass:a})=>u=>u instanceof Ab||s(u)&&o("openApi3_1",u)&&i("object",u)&&a("api",u))),rE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Pb||s(a)&&o("operation",a)&&i("object",a))),nE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Ib||s(a)&&o("parameter",a)&&i("object",a))),sE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Mb||s(a)&&o("pathItem",a)&&i("object",a))),isPathItemElementExternal=s=>{if(!sE(s))return!1;if(!ju(s.$ref))return!1;const o=serializers_value(s.$ref);return"string"==typeof o&&o.length>0&&!o.startsWith("#")},oE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Rb||s(a)&&o("paths",a)&&i("object",a))),iE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Lb||s(a)&&o("reference",a)&&i("object",a))),isReferenceElementExternal=s=>{if(!iE(s))return!1;if(!ju(s.$ref))return!1;const o=serializers_value(s.$ref);return"string"==typeof o&&o.length>0&&!o.startsWith("#")},aE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof qb||s(a)&&o("requestBody",a)&&i("object",a))),cE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof zb||s(a)&&o("response",a)&&i("object",a))),lE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof Qb||s(a)&&o("responses",a)&&i("object",a))),uE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof lS||s(a)&&o("schema",a)&&i("object",a))),predicates_isBooleanJsonSchemaElement=s=>Tu(s)&&s.classes.includes("boolean-json-schema"),pE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof uS||s(a)&&o("securityRequirement",a)&&i("object",a))),hE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof pS||s(a)&&o("securityScheme",a)&&i("object",a))),dE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof hS||s(a)&&o("server",a)&&i("object",a))),fE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof dS||s(a)&&o("serverVariable",a)&&i("object",a))),mE=helpers((({hasBasicElementProps:s,isElementType:o,primitiveEq:i})=>a=>a instanceof _b||s(a)&&o("mediaType",a)&&i("object",a)));class open_api_3_1_schema_SchemaVisitor extends(Mixin(cm,Rd,em)){constructor(s){super(s),this.element=new lS,this.specPath=fc(["document","objects","Schema"]),this.canSupportSpecificationExtensions=!0,this.jsonSchemaDefaultDialect=pb.default,this.passingOptionsNames.push("parent")}ObjectElement(s){this.handleDialectIdentifier(s),this.handleSchemaIdentifier(s),this.parent=this.element;const o=cm.prototype.ObjectElement.call(this,s);return ju(this.element.$ref)&&(this.element.classes.push("reference-element"),this.element.setMetaProperty("referenced-element","schema")),o}BooleanElement(s){return eS.prototype.BooleanElement.call(this,s)}get defaultDialectIdentifier(){let s;return s=void 0!==this.openApiSemanticElement&&XS(this.openApiSemanticElement.jsonSchemaDialect)?serializers_value(this.openApiSemanticElement.jsonSchemaDialect):void 0!==this.openApiGenericElement&&ju(this.openApiGenericElement.get("jsonSchemaDialect"))?serializers_value(this.openApiGenericElement.get("jsonSchemaDialect")):serializers_value(this.jsonSchemaDefaultDialect),s}handleDialectIdentifier(s){return eS.prototype.handleDialectIdentifier.call(this,s)}handleSchemaIdentifier(s){return eS.prototype.handleSchemaIdentifier.call(this,s)}}const gE=open_api_3_1_schema_SchemaVisitor;const yE=class $defsVisitor_$defsVisitor extends D_{constructor(s){super(s),this.passingOptionsNames.push("parent")}};const vE=class schema_AllOfVisitor_AllOfVisitor extends L_{constructor(s){super(s),this.passingOptionsNames.push("parent")}};const bE=class schema_AnyOfVisitor_AnyOfVisitor extends F_{constructor(s){super(s),this.passingOptionsNames.push("parent")}};const _E=class schema_OneOfVisitor_OneOfVisitor extends B_{constructor(s){super(s),this.passingOptionsNames.push("parent")}};const SE=class DependentSchemasVisitor_DependentSchemasVisitor extends $_{constructor(s){super(s),this.passingOptionsNames.push("parent")}};const EE=class PrefixItemsVisitor_PrefixItemsVisitor extends tS{constructor(s){super(s),this.passingOptionsNames.push("parent")}};const wE=class schema_PropertiesVisitor_PropertiesVisitor extends U_{constructor(s){super(s),this.passingOptionsNames.push("parent")}};const xE=class schema_PatternPropertiesVisitor_PatternPropertiesVisitor extends V_{constructor(s){super(s),this.passingOptionsNames.push("parent")}},kE=zv.visitors.document.objects.Discriminator.$visitor;const OE=class distriminator_DiscriminatorVisitor extends kE{constructor(s){super(s),this.element=new Xv,this.canSupportSpecificationExtensions=!0}},AE=zv.visitors.document.objects.XML.$visitor;const CE=class xml_XmlVisitor extends AE{constructor(s){super(s),this.element=new mS}};class SchemasVisitor_SchemasVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new Ay,this.specPath=fc(["document","objects","Schema"])}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(uE).forEach(((s,o)=>{s.setMetaProperty("schemaName",serializers_value(o))})),o}}const jE=SchemasVisitor_SchemasVisitor;class ComponentsPathItems extends Su.Sh{static primaryClass="components-path-items";constructor(s,o,i){super(s,o,i),this.classes.push(ComponentsPathItems.primaryClass)}}const PE=ComponentsPathItems;class PathItemsVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new PE,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","PathItem"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(iE).forEach((s=>{s.setMetaProperty("referenced-element","pathItem")})),o}}const IE=PathItemsVisitor,TE=zv.visitors.document.objects.Example.$visitor;const NE=class example_ExampleVisitor extends TE{constructor(s){super(s),this.element=new Zv}},ME=zv.visitors.document.objects.ExternalDocumentation.$visitor;const RE=class external_documentation_ExternalDocumentationVisitor extends ME{constructor(s){super(s),this.element=new eb}},DE=zv.visitors.document.objects.Encoding.$visitor;const LE=class open_api_3_1_encoding_EncodingVisitor extends DE{constructor(s){super(s),this.element=new Qv}},FE=zv.visitors.document.objects.Paths.$visitor;const BE=class paths_PathsVisitor extends FE{constructor(s){super(s),this.element=new Rb}},$E=zv.visitors.document.objects.RequestBody.$visitor;const qE=class request_body_RequestBodyVisitor extends $E{constructor(s){super(s),this.element=new qb}},UE=zv.visitors.document.objects.Callback.$visitor;const VE=class callback_CallbackVisitor extends UE{constructor(s){super(s),this.element=new Kv,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","PathItem"]}ObjectElement(s){const o=UE.prototype.ObjectElement.call(this,s);return this.element.filter(iE).forEach((s=>{s.setMetaProperty("referenced-element","pathItem")})),o}},zE=zv.visitors.document.objects.Response.$visitor;const WE=class response_ResponseVisitor extends zE{constructor(s){super(s),this.element=new zb}},JE=zv.visitors.document.objects.Responses.$visitor;const HE=class open_api_3_1_responses_ResponsesVisitor extends JE{constructor(s){super(s),this.element=new Qb}},KE=zv.visitors.document.objects.Operation.$visitor;const GE=class operation_OperationVisitor extends KE{constructor(s){super(s),this.element=new Pb}},YE=zv.visitors.document.objects.PathItem.$visitor;const XE=class path_item_PathItemVisitor extends YE{constructor(s){super(s),this.element=new Mb}},QE=zv.visitors.document.objects.SecurityScheme.$visitor;const ZE=class security_scheme_SecuritySchemeVisitor extends QE{constructor(s){super(s),this.element=new pS}},ew=zv.visitors.document.objects.OAuthFlows.$visitor;const tw=class oauth_flows_OAuthFlowsVisitor extends ew{constructor(s){super(s),this.element=new wb}},rw=zv.visitors.document.objects.OAuthFlow.$visitor;const nw=class oauth_flow_OAuthFlowVisitor extends rw{constructor(s){super(s),this.element=new Sb}};class Webhooks extends Su.Sh{static primaryClass="webhooks";constructor(s,o,i){super(s,o,i),this.classes.push(Webhooks.primaryClass)}}const sw=Webhooks;class WebhooksVisitor extends(Mixin(_m,em)){constructor(s){super(s),this.element=new sw,this.specPath=s=>isReferenceLikeElement(s)?["document","objects","Reference"]:["document","objects","PathItem"]}ObjectElement(s){const o=_m.prototype.ObjectElement.call(this,s);return this.element.filter(iE).forEach((s=>{s.setMetaProperty("referenced-element","pathItem")})),this.element.filter(sE).forEach(((s,o)=>{s.setMetaProperty("webhook-name",serializers_value(o))})),o}}const ow=WebhooksVisitor,{JSONSchema:iw,LinkDescription:aw}=nS.visitors.document.objects,cw={visitors:{value:zv.visitors.value,document:{objects:{OpenApi:{$visitor:gS,fixedFields:{openapi:zv.visitors.document.objects.OpenApi.fixedFields.openapi,info:{$ref:"#/visitors/document/objects/Info"},jsonSchemaDialect:kS,servers:zv.visitors.document.objects.OpenApi.fixedFields.servers,paths:{$ref:"#/visitors/document/objects/Paths"},webhooks:ow,components:{$ref:"#/visitors/document/objects/Components"},security:zv.visitors.document.objects.OpenApi.fixedFields.security,tags:zv.visitors.document.objects.OpenApi.fixedFields.tags,externalDocs:{$ref:"#/visitors/document/objects/ExternalDocumentation"}}},Info:{$visitor:vS,fixedFields:{title:zv.visitors.document.objects.Info.fixedFields.title,description:zv.visitors.document.objects.Info.fixedFields.description,summary:{$ref:"#/visitors/value"},termsOfService:zv.visitors.document.objects.Info.fixedFields.termsOfService,contact:{$ref:"#/visitors/document/objects/Contact"},license:{$ref:"#/visitors/document/objects/License"},version:zv.visitors.document.objects.Info.fixedFields.version}},Contact:{$visitor:_S,fixedFields:{name:zv.visitors.document.objects.Contact.fixedFields.name,url:zv.visitors.document.objects.Contact.fixedFields.url,email:zv.visitors.document.objects.Contact.fixedFields.email}},License:{$visitor:ES,fixedFields:{name:zv.visitors.document.objects.License.fixedFields.name,identifier:{$ref:"#/visitors/value"},url:zv.visitors.document.objects.License.fixedFields.url}},Server:{$visitor:AS,fixedFields:{url:zv.visitors.document.objects.Server.fixedFields.url,description:zv.visitors.document.objects.Server.fixedFields.description,variables:zv.visitors.document.objects.Server.fixedFields.variables}},ServerVariable:{$visitor:jS,fixedFields:{enum:zv.visitors.document.objects.ServerVariable.fixedFields.enum,default:zv.visitors.document.objects.ServerVariable.fixedFields.default,description:zv.visitors.document.objects.ServerVariable.fixedFields.description}},Components:{$visitor:RS,fixedFields:{schemas:jE,responses:zv.visitors.document.objects.Components.fixedFields.responses,parameters:zv.visitors.document.objects.Components.fixedFields.parameters,examples:zv.visitors.document.objects.Components.fixedFields.examples,requestBodies:zv.visitors.document.objects.Components.fixedFields.requestBodies,headers:zv.visitors.document.objects.Components.fixedFields.headers,securitySchemes:zv.visitors.document.objects.Components.fixedFields.securitySchemes,links:zv.visitors.document.objects.Components.fixedFields.links,callbacks:zv.visitors.document.objects.Components.fixedFields.callbacks,pathItems:IE}},Paths:{$visitor:BE},PathItem:{$visitor:XE,fixedFields:{$ref:zv.visitors.document.objects.PathItem.fixedFields.$ref,summary:zv.visitors.document.objects.PathItem.fixedFields.summary,description:zv.visitors.document.objects.PathItem.fixedFields.description,get:{$ref:"#/visitors/document/objects/Operation"},put:{$ref:"#/visitors/document/objects/Operation"},post:{$ref:"#/visitors/document/objects/Operation"},delete:{$ref:"#/visitors/document/objects/Operation"},options:{$ref:"#/visitors/document/objects/Operation"},head:{$ref:"#/visitors/document/objects/Operation"},patch:{$ref:"#/visitors/document/objects/Operation"},trace:{$ref:"#/visitors/document/objects/Operation"},servers:zv.visitors.document.objects.PathItem.fixedFields.servers,parameters:zv.visitors.document.objects.PathItem.fixedFields.parameters}},Operation:{$visitor:GE,fixedFields:{tags:zv.visitors.document.objects.Operation.fixedFields.tags,summary:zv.visitors.document.objects.Operation.fixedFields.summary,description:zv.visitors.document.objects.Operation.fixedFields.description,externalDocs:{$ref:"#/visitors/document/objects/ExternalDocumentation"},operationId:zv.visitors.document.objects.Operation.fixedFields.operationId,parameters:zv.visitors.document.objects.Operation.fixedFields.parameters,requestBody:zv.visitors.document.objects.Operation.fixedFields.requestBody,responses:{$ref:"#/visitors/document/objects/Responses"},callbacks:zv.visitors.document.objects.Operation.fixedFields.callbacks,deprecated:zv.visitors.document.objects.Operation.fixedFields.deprecated,security:zv.visitors.document.objects.Operation.fixedFields.security,servers:zv.visitors.document.objects.Operation.fixedFields.servers}},ExternalDocumentation:{$visitor:RE,fixedFields:{description:zv.visitors.document.objects.ExternalDocumentation.fixedFields.description,url:zv.visitors.document.objects.ExternalDocumentation.fixedFields.url}},Parameter:{$visitor:qS,fixedFields:{name:zv.visitors.document.objects.Parameter.fixedFields.name,in:zv.visitors.document.objects.Parameter.fixedFields.in,description:zv.visitors.document.objects.Parameter.fixedFields.description,required:zv.visitors.document.objects.Parameter.fixedFields.required,deprecated:zv.visitors.document.objects.Parameter.fixedFields.deprecated,allowEmptyValue:zv.visitors.document.objects.Parameter.fixedFields.allowEmptyValue,style:zv.visitors.document.objects.Parameter.fixedFields.style,explode:zv.visitors.document.objects.Parameter.fixedFields.explode,allowReserved:zv.visitors.document.objects.Parameter.fixedFields.allowReserved,schema:{$ref:"#/visitors/document/objects/Schema"},example:zv.visitors.document.objects.Parameter.fixedFields.example,examples:zv.visitors.document.objects.Parameter.fixedFields.examples,content:zv.visitors.document.objects.Parameter.fixedFields.content}},RequestBody:{$visitor:qE,fixedFields:{description:zv.visitors.document.objects.RequestBody.fixedFields.description,content:zv.visitors.document.objects.RequestBody.fixedFields.content,required:zv.visitors.document.objects.RequestBody.fixedFields.required}},MediaType:{$visitor:IS,fixedFields:{schema:{$ref:"#/visitors/document/objects/Schema"},example:zv.visitors.document.objects.MediaType.fixedFields.example,examples:zv.visitors.document.objects.MediaType.fixedFields.examples,encoding:zv.visitors.document.objects.MediaType.fixedFields.encoding}},Encoding:{$visitor:LE,fixedFields:{contentType:zv.visitors.document.objects.Encoding.fixedFields.contentType,headers:zv.visitors.document.objects.Encoding.fixedFields.headers,style:zv.visitors.document.objects.Encoding.fixedFields.style,explode:zv.visitors.document.objects.Encoding.fixedFields.explode,allowReserved:zv.visitors.document.objects.Encoding.fixedFields.allowReserved}},Responses:{$visitor:HE,fixedFields:{default:zv.visitors.document.objects.Responses.fixedFields.default}},Response:{$visitor:WE,fixedFields:{description:zv.visitors.document.objects.Response.fixedFields.description,headers:zv.visitors.document.objects.Response.fixedFields.headers,content:zv.visitors.document.objects.Response.fixedFields.content,links:zv.visitors.document.objects.Response.fixedFields.links}},Callback:{$visitor:VE},Example:{$visitor:NE,fixedFields:{summary:zv.visitors.document.objects.Example.fixedFields.summary,description:zv.visitors.document.objects.Example.fixedFields.description,value:zv.visitors.document.objects.Example.fixedFields.value,externalValue:zv.visitors.document.objects.Example.fixedFields.externalValue}},Link:{$visitor:xS,fixedFields:{operationRef:zv.visitors.document.objects.Link.fixedFields.operationRef,operationId:zv.visitors.document.objects.Link.fixedFields.operationId,parameters:zv.visitors.document.objects.Link.fixedFields.parameters,requestBody:zv.visitors.document.objects.Link.fixedFields.requestBody,description:zv.visitors.document.objects.Link.fixedFields.description,server:{$ref:"#/visitors/document/objects/Server"}}},Header:{$visitor:VS,fixedFields:{description:zv.visitors.document.objects.Header.fixedFields.description,required:zv.visitors.document.objects.Header.fixedFields.required,deprecated:zv.visitors.document.objects.Header.fixedFields.deprecated,allowEmptyValue:zv.visitors.document.objects.Header.fixedFields.allowEmptyValue,style:zv.visitors.document.objects.Header.fixedFields.style,explode:zv.visitors.document.objects.Header.fixedFields.explode,allowReserved:zv.visitors.document.objects.Header.fixedFields.allowReserved,schema:{$ref:"#/visitors/document/objects/Schema"},example:zv.visitors.document.objects.Header.fixedFields.example,examples:zv.visitors.document.objects.Header.fixedFields.examples,content:zv.visitors.document.objects.Header.fixedFields.content}},Tag:{$visitor:LS,fixedFields:{name:zv.visitors.document.objects.Tag.fixedFields.name,description:zv.visitors.document.objects.Tag.fixedFields.description,externalDocs:{$ref:"#/visitors/document/objects/ExternalDocumentation"}}},Reference:{$visitor:BS,fixedFields:{$ref:zv.visitors.document.objects.Reference.fixedFields.$ref,summary:{$ref:"#/visitors/value"},description:{$ref:"#/visitors/value"}}},JSONSchema:{$ref:"#/visitors/document/objects/Schema"},LinkDescription:{...aw},Schema:{$visitor:gE,fixedFields:{...iw.fixedFields,$defs:yE,allOf:vE,anyOf:bE,oneOf:_E,not:{$ref:"#/visitors/document/objects/Schema"},if:{$ref:"#/visitors/document/objects/Schema"},then:{$ref:"#/visitors/document/objects/Schema"},else:{$ref:"#/visitors/document/objects/Schema"},dependentSchemas:SE,prefixItems:EE,items:{$ref:"#/visitors/document/objects/Schema"},contains:{$ref:"#/visitors/document/objects/Schema"},properties:wE,patternProperties:xE,additionalProperties:{$ref:"#/visitors/document/objects/Schema"},propertyNames:{$ref:"#/visitors/document/objects/Schema"},unevaluatedItems:{$ref:"#/visitors/document/objects/Schema"},unevaluatedProperties:{$ref:"#/visitors/document/objects/Schema"},contentSchema:{$ref:"#/visitors/document/objects/Schema"},discriminator:{$ref:"#/visitors/document/objects/Discriminator"},xml:{$ref:"#/visitors/document/objects/XML"},externalDocs:{$ref:"#/visitors/document/objects/ExternalDocumentation"},example:{$ref:"#/visitors/value"}}},Discriminator:{$visitor:OE,fixedFields:{propertyName:zv.visitors.document.objects.Discriminator.fixedFields.propertyName,mapping:zv.visitors.document.objects.Discriminator.fixedFields.mapping}},XML:{$visitor:CE,fixedFields:{name:zv.visitors.document.objects.XML.fixedFields.name,namespace:zv.visitors.document.objects.XML.fixedFields.namespace,prefix:zv.visitors.document.objects.XML.fixedFields.prefix,attribute:zv.visitors.document.objects.XML.fixedFields.attribute,wrapped:zv.visitors.document.objects.XML.fixedFields.wrapped}},SecurityScheme:{$visitor:ZE,fixedFields:{type:zv.visitors.document.objects.SecurityScheme.fixedFields.type,description:zv.visitors.document.objects.SecurityScheme.fixedFields.description,name:zv.visitors.document.objects.SecurityScheme.fixedFields.name,in:zv.visitors.document.objects.SecurityScheme.fixedFields.in,scheme:zv.visitors.document.objects.SecurityScheme.fixedFields.scheme,bearerFormat:zv.visitors.document.objects.SecurityScheme.fixedFields.bearerFormat,flows:{$ref:"#/visitors/document/objects/OAuthFlows"},openIdConnectUrl:zv.visitors.document.objects.SecurityScheme.fixedFields.openIdConnectUrl}},OAuthFlows:{$visitor:tw,fixedFields:{implicit:{$ref:"#/visitors/document/objects/OAuthFlow"},password:{$ref:"#/visitors/document/objects/OAuthFlow"},clientCredentials:{$ref:"#/visitors/document/objects/OAuthFlow"},authorizationCode:{$ref:"#/visitors/document/objects/OAuthFlow"}}},OAuthFlow:{$visitor:nw,fixedFields:{authorizationUrl:zv.visitors.document.objects.OAuthFlow.fixedFields.authorizationUrl,tokenUrl:zv.visitors.document.objects.OAuthFlow.fixedFields.tokenUrl,refreshUrl:zv.visitors.document.objects.OAuthFlow.fixedFields.refreshUrl,scopes:zv.visitors.document.objects.OAuthFlow.fixedFields.scopes}},SecurityRequirement:{$visitor:NS}},extension:{$visitor:zv.visitors.document.extension.$visitor}}}},apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType=s=>{if(Cu(s))return`${s.element.charAt(0).toUpperCase()+s.element.slice(1)}Element`},lw={CallbackElement:["content"],ComponentsElement:["content"],ContactElement:["content"],DiscriminatorElement:["content"],Encoding:["content"],Example:["content"],ExternalDocumentationElement:["content"],HeaderElement:["content"],InfoElement:["content"],LicenseElement:["content"],MediaTypeElement:["content"],OAuthFlowElement:["content"],OAuthFlowsElement:["content"],OpenApi3_1Element:["content"],OperationElement:["content"],ParameterElement:["content"],PathItemElement:["content"],PathsElement:["content"],ReferenceElement:["content"],RequestBodyElement:["content"],ResponseElement:["content"],ResponsesElement:["content"],SchemaElement:["content"],SecurityRequirementElement:["content"],SecuritySchemeElement:["content"],ServerElement:["content"],ServerVariableElement:["content"],TagElement:["content"],...np},uw={namespace:s=>{const{base:o}=s;return o.register("callback",Kv),o.register("components",Gv),o.register("contact",Yv),o.register("discriminator",Xv),o.register("encoding",Qv),o.register("example",Zv),o.register("externalDocumentation",eb),o.register("header",tb),o.register("info",nb),o.register("jsonSchemaDialect",pb),o.register("license",mb),o.register("link",yb),o.register("mediaType",_b),o.register("oAuthFlow",Sb),o.register("oAuthFlows",wb),o.register("openapi",Ob),o.register("openApi3_1",Ab),o.register("operation",Pb),o.register("parameter",Ib),o.register("pathItem",Mb),o.register("paths",Rb),o.register("reference",Lb),o.register("requestBody",qb),o.register("response",zb),o.register("responses",Qb),o.register("schema",lS),o.register("securityRequirement",uS),o.register("securityScheme",pS),o.register("server",hS),o.register("serverVariable",dS),o.register("tag",fS),o.register("xml",mS),o}},pw=uw,ancestorLineageToJSONPointer=s=>{const o=s.reduce(((o,i,a)=>{if(Ru(i)){const s=String(serializers_value(i.key));o.push(s)}else if(Mu(s[a-2])){const u=String(s[a-2].content.indexOf(i));o.push(u)}return o}),[]);return es_compile(o)},apidom_ns_openapi_3_1_src_refractor_toolbox=()=>{const s=createNamespace(pw);return{predicates:{...ye,isElement:Cu,isStringElement:ju,isArrayElement:Mu,isObjectElement:Nu,isMemberElement:Ru,isServersElement:sg,includesClasses,hasElementSourceMap},ancestorLineageToJSONPointer,namespace:s}},apidom_ns_openapi_3_1_src_refractor_refract=(s,{specPath:o=["visitors","document","objects","OpenApi","$visitor"],plugins:i=[]}={})=>{const a=(0,Su.e)(s),u=dereference(cw),_=new(Qu(o,u))({specObj:u});return visitor_visit(a,_),dispatchPluginsSync(_.element,i,{toolboxCreator:apidom_ns_openapi_3_1_src_refractor_toolbox,visitorOptions:{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType}})},apidom_ns_openapi_3_1_src_refractor_createRefractor=s=>(o,i={})=>apidom_ns_openapi_3_1_src_refractor_refract(o,{specPath:s,...i});Kv.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Callback","$visitor"]),Gv.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Components","$visitor"]),Yv.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Contact","$visitor"]),Zv.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Example","$visitor"]),Xv.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Discriminator","$visitor"]),Qv.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Encoding","$visitor"]),eb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","ExternalDocumentation","$visitor"]),tb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Header","$visitor"]),nb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Info","$visitor"]),pb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","OpenApi","fixedFields","jsonSchemaDialect"]),mb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","License","$visitor"]),yb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Link","$visitor"]),_b.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","MediaType","$visitor"]),Sb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","OAuthFlow","$visitor"]),wb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","OAuthFlows","$visitor"]),Ob.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","OpenApi","fixedFields","openapi"]),Ab.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","OpenApi","$visitor"]),Pb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Operation","$visitor"]),Ib.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Parameter","$visitor"]),Mb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","PathItem","$visitor"]),Rb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Paths","$visitor"]),Lb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Reference","$visitor"]),qb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","RequestBody","$visitor"]),zb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Response","$visitor"]),Qb.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Responses","$visitor"]),lS.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Schema","$visitor"]),uS.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","SecurityRequirement","$visitor"]),pS.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","SecurityScheme","$visitor"]),hS.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Server","$visitor"]),dS.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","ServerVariable","$visitor"]),fS.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","Tag","$visitor"]),mS.refract=apidom_ns_openapi_3_1_src_refractor_createRefractor(["visitors","document","objects","XML","$visitor"]);const hw=class NotImplementedError extends td{};const dw=class MediaTypes extends Array{unknownMediaType="application/octet-stream";filterByFormat(){throw new hw("filterByFormat method in MediaTypes class is not yet implemented.")}findBy(){throw new hw("findBy method in MediaTypes class is not yet implemented.")}latest(){throw new hw("latest method in MediaTypes class is not yet implemented.")}};class OpenAPIMediaTypes extends dw{filterByFormat(s="generic"){const o="generic"===s?"openapi;version":s;return this.filter((s=>s.includes(o)))}findBy(s="3.1.0",o="generic"){const i="generic"===o?`vnd.oai.openapi;version=${s}`:`vnd.oai.openapi+${o};version=${s}`;return this.find((s=>s.includes(i)))||this.unknownMediaType}latest(s="generic"){return Ba(this.filterByFormat(s))}}const fw=new OpenAPIMediaTypes("application/vnd.oai.openapi;version=3.1.0","application/vnd.oai.openapi+json;version=3.1.0","application/vnd.oai.openapi+yaml;version=3.1.0");const mw=class src_Reference_Reference{uri;depth;value;refSet;errors;constructor({uri:s,depth:o=0,refSet:i,value:a}){this.uri=s,this.value=a,this.depth=o,this.refSet=i,this.errors=[]}};const gw=class ReferenceSet{rootRef;refs;circular;constructor({refs:s=[],circular:o=!1}={}){this.refs=[],this.circular=o,s.forEach(this.add.bind(this))}get size(){return this.refs.length}add(s){return this.has(s)||(this.refs.push(s),this.rootRef=void 0===this.rootRef?s:this.rootRef,s.refSet=this),this}merge(s){for(const o of s.values())this.add(o);return this}has(s){const o=Jc(s)?s:s.uri;return _c(this.find((s=>s.uri===o)))}find(s){return this.refs.find(s)}*values(){yield*this.refs}clean(){this.refs.forEach((s=>{s.refSet=void 0})),this.rootRef=void 0,this.refs.length=0}};function _identity(s){return s}const yw=_curry1(_identity),vw={parse:{mediaType:"text/plain",parsers:[],parserOpts:{}},resolve:{baseURI:"",resolvers:[],resolverOpts:{},strategies:[],strategyOpts:{},internal:!0,external:!0,maxDepth:1/0},dereference:{strategies:[],strategyOpts:{},refSet:null,maxDepth:1/0,circular:"ignore",circularReplacer:yw,immutable:!0},bundle:{strategies:[],refSet:null,maxDepth:1/0}};const bw=_curry2((function lens(s,o){return function(i){return function(a){return cc((function(s){return o(s,a)}),i(s(a)))}}}));var Identity=function(s){return{value:s,map:function(o){return Identity(o(s))}}},_w=_curry3((function over(s,o,i){return s((function(s){return Identity(o(s))}))(i).value}));const Sw=_w;const Ew=na(""),ww=bw(Qu(["resolve","baseURI"]),n_(["resolve","baseURI"])),baseURIDefault=s=>Ew(s)?url_cwd():s,util_merge=(s,o)=>{const i=up(s,o);return Sw(ww,baseURIDefault,i)};const xw=class File_File{uri;mediaType;data;parseResult;constructor({uri:s,mediaType:o="text/plain",data:i,parseResult:a}){this.uri=s,this.mediaType=o,this.data=i,this.parseResult=a}get extension(){return Jc(this.uri)?(s=>{const o=s.lastIndexOf(".");return o>=0?s.substring(o).toLowerCase():""})(this.uri):""}toString(){if("string"==typeof this.data)return this.data;if(this.data instanceof ArrayBuffer||["ArrayBuffer"].includes(ra(this.data))||ArrayBuffer.isView(this.data)){return new TextDecoder("utf-8").decode(this.data)}return String(this.data)}};const kw=class PluginError extends Ko{plugin;constructor(s,o){super(s,{cause:o.cause}),this.plugin=o.plugin}},plugins_filter=async(s,o,i)=>{const a=await Promise.all(i.map(_p([s],o)));return i.filter(((s,o)=>a[o]))},run=async(s,o,i)=>{let a;for(const u of i)try{const i=await u[s].call(u,...o);return{plugin:u,result:i}}catch(s){a=new kw("Error while running plugin",{cause:s,plugin:u})}return Promise.reject(a)};const Ow=class DereferenceError extends Ko{};const Aw=class UnmatchedDereferenceStrategyError extends Ow{},dereferenceApiDOM=async(s,o)=>{let i=s,a=!1;if(!$u(s)){const o=cloneShallow(s);o.classes.push("result"),i=new Au([o]),a=!0}const u=new xw({uri:o.resolve.baseURI,parseResult:i,mediaType:o.parse.mediaType}),_=await plugins_filter("canDereference",[u,o],o.dereference.strategies);if(gp(_))throw new Aw(u.uri);try{const{result:s}=await run("dereference",[u,o],_);return a?s.get(0):s}catch(s){throw new Ow(`Error while dereferencing file "${u.uri}"`,{cause:s})}};const Cw=class ParseError extends Ko{};const jw=class ParserError extends Cw{};const Pw=class Parser_Parser{name;allowEmpty;sourceMap;fileExtensions;mediaTypes;constructor({name:s,allowEmpty:o=!0,sourceMap:i=!1,fileExtensions:a=[],mediaTypes:u=[]}){this.name=s,this.allowEmpty=o,this.sourceMap=i,this.fileExtensions=a,this.mediaTypes=u}};const Iw=class BinaryParser extends Pw{constructor(s){super({...null!=s?s:{},name:"binary"})}canParse(s){return 0===this.fileExtensions.length||this.fileExtensions.includes(s.extension)}parse(s){try{const o=unescape(encodeURIComponent(s.toString())),i=btoa(o),a=new Au;if(0!==i.length){const s=new Su.Om(i);s.classes.push("result"),a.push(s)}return a}catch(o){throw new jw(`Error parsing "${s.uri}"`,{cause:o})}}};const Tw=class ResolveStrategy{name;constructor({name:s}){this.name=s}};const Nw=class OpenAPI3_1ResolveStrategy extends Tw{constructor(s){super({...null!=s?s:{},name:"openapi-3-1"})}canResolve(s,o){const i=o.dereference.strategies.find((s=>"openapi-3-1"===s.name));return void 0!==i&&i.canDereference(s,o)}async resolve(s,o){const i=o.dereference.strategies.find((s=>"openapi-3-1"===s.name));if(void 0===i)throw new Aw('"openapi-3-1" dereference strategy is not available.');const a=new gw,u=util_merge(o,{resolve:{internal:!1},dereference:{refSet:a}});return await i.dereference(s,u),a}};const Mw=class Resolver{name;constructor({name:s}){this.name=s}};const Rw=class HTTPResolver extends Mw{timeout;redirects;withCredentials;constructor(s){const{name:o="http-resolver",timeout:i=5e3,redirects:a=5,withCredentials:u=!1}=null!=s?s:{};super({name:o}),this.timeout=i,this.redirects=a,this.withCredentials=u}canRead(s){return isHttpUrl(s.uri)}};const Dw=class ResolveError extends Ko{};const Lw=class ResolverError extends Dw{},{AbortController:Fw,AbortSignal:Bw}=globalThis;void 0===globalThis.AbortController&&(globalThis.AbortController=Fw),void 0===globalThis.AbortSignal&&(globalThis.AbortSignal=Bw);const $w=class HTTPResolverSwaggerClient extends Rw{swaggerHTTPClient=http_http;swaggerHTTPClientConfig;constructor({swaggerHTTPClient:s=http_http,swaggerHTTPClientConfig:o={},...i}={}){super({...i,name:"http-swagger-client"}),this.swaggerHTTPClient=s,this.swaggerHTTPClientConfig=o}getHttpClient(){return this.swaggerHTTPClient}async read(s){const o=this.getHttpClient(),i=new AbortController,{signal:a}=i,u=setTimeout((()=>{i.abort()}),this.timeout),_=this.getHttpClient().withCredentials||this.withCredentials?"include":"same-origin",w=0===this.redirects?"error":"follow",x=this.redirects>0?this.redirects:void 0;try{return(await o({url:s.uri,signal:a,userFetch:async(s,o)=>{let i=await fetch(s,o);try{i.headers.delete("Content-Type")}catch{i=new Response(i.body,{...i,headers:new Headers(i.headers)}),i.headers.delete("Content-Type")}return i},credentials:_,redirect:w,follow:x,...this.swaggerHTTPClientConfig})).text.arrayBuffer()}catch(o){throw new Lw(`Error downloading "${s.uri}"`,{cause:o})}finally{clearTimeout(u)}}},transformers_from=(s,o=Ep)=>{if(Jc(s))try{return o.fromRefract(JSON.parse(s))}catch{}return fu(s)&&Yu("element",s)?o.fromRefract(s):o.toElement(s)};const qw=class JSONParser extends Pw{constructor(s={}){super({name:"json-swagger-client",mediaTypes:["application/json"],...s})}async canParse(s){const o=0===this.fileExtensions.length||this.fileExtensions.includes(s.extension),i=this.mediaTypes.includes(s.mediaType);if(!o)return!1;if(i)return!0;if(!i)try{return JSON.parse(s.toString()),!0}catch(s){return!1}return!1}async parse(s){if(this.sourceMap)throw new jw("json-swagger-client parser plugin doesn't support sourceMaps option");const o=new Au,i=s.toString();if(this.allowEmpty&&""===i.trim())return o;try{const s=transformers_from(JSON.parse(i));return s.classes.push("result"),o.push(s),o}catch(o){throw new jw(`Error parsing "${s.uri}"`,{cause:o})}}};const Uw=class YAMLParser extends Pw{constructor(s={}){super({name:"yaml-1-2-swagger-client",mediaTypes:["text/yaml","application/yaml"],...s})}async canParse(s){const o=0===this.fileExtensions.length||this.fileExtensions.includes(s.extension),i=this.mediaTypes.includes(s.mediaType);if(!o)return!1;if(i)return!0;if(!i)try{return fn.load(s.toString(),{schema:rn}),!0}catch(s){return!1}return!1}async parse(s){if(this.sourceMap)throw new jw("yaml-1-2-swagger-client parser plugin doesn't support sourceMaps option");const o=new Au,i=s.toString();try{const s=fn.load(i,{schema:rn});if(this.allowEmpty&&void 0===s)return o;const a=transformers_from(s);return a.classes.push("result"),o.push(a),o}catch(o){throw new jw(`Error parsing "${s.uri}"`,{cause:o})}}};const Vw=class OpenAPIJSON3_1Parser extends Pw{detectionRegExp=/"openapi"\s*:\s*"(?<version_json>3\.1\.(?:[1-9]\d*|0))"/;constructor(s={}){super({name:"openapi-json-3-1-swagger-client",mediaTypes:new OpenAPIMediaTypes(...fw.filterByFormat("generic"),...fw.filterByFormat("json")),...s})}async canParse(s){const o=0===this.fileExtensions.length||this.fileExtensions.includes(s.extension),i=this.mediaTypes.includes(s.mediaType);if(!o)return!1;if(i)return!0;if(!i)try{const o=s.toString();return JSON.parse(o),this.detectionRegExp.test(o)}catch(s){return!1}return!1}async parse(s){if(this.sourceMap)throw new jw("openapi-json-3-1-swagger-client parser plugin doesn't support sourceMaps option");const o=new Au,i=s.toString();if(this.allowEmpty&&""===i.trim())return o;try{const s=JSON.parse(i),a=Ab.refract(s,this.refractorOpts);return a.classes.push("result"),o.push(a),o}catch(o){throw new jw(`Error parsing "${s.uri}"`,{cause:o})}}};const zw=class OpenAPIYAML31Parser extends Pw{detectionRegExp=/(?<YAML>^(["']?)openapi\2\s*:\s*(["']?)(?<version_yaml>3\.1\.(?:[1-9]\d*|0))\3(?:\s+|$))|(?<JSON>"openapi"\s*:\s*"(?<version_json>3\.1\.(?:[1-9]\d*|0))")/m;constructor(s={}){super({name:"openapi-yaml-3-1-swagger-client",mediaTypes:new OpenAPIMediaTypes(...fw.filterByFormat("generic"),...fw.filterByFormat("yaml")),...s})}async canParse(s){const o=0===this.fileExtensions.length||this.fileExtensions.includes(s.extension),i=this.mediaTypes.includes(s.mediaType);if(!o)return!1;if(i)return!0;if(!i)try{const o=s.toString();return fn.load(o),this.detectionRegExp.test(o)}catch(s){return!1}return!1}async parse(s){if(this.sourceMap)throw new jw("openapi-yaml-3-1-swagger-client parser plugin doesn't support sourceMaps option");const o=new Au,i=s.toString();try{const s=fn.load(i,{schema:rn});if(this.allowEmpty&&void 0===s)return o;const a=Ab.refract(s,this.refractorOpts);return a.classes.push("result"),o.push(a),o}catch(o){throw new jw(`Error parsing "${s.uri}"`,{cause:o})}}};const Ww=_curry3((function propEq(s,o,i){return na(s,Da(o,i))}));const Jw=class DereferenceStrategy{name;constructor({name:s}){this.name=s}};const Hw=_curry2((function none(s,o){return xu(_complement(s),o)}));var Kw=__webpack_require__(8068);const Gw=class ElementIdentityError extends Go{value;constructor(s,o){super(s,o),void 0!==o&&(this.value=o.value)}};class IdentityManager{uuid;identityMap;constructor({length:s=6}={}){this.uuid=new Kw({length:s}),this.identityMap=new WeakMap}identify(s){if(!Cu(s))throw new Gw("Cannot not identify the element. `element` is neither structurally compatible nor a subclass of an Element class.",{value:s});if(s.meta.hasKey("id")&&ju(s.meta.get("id"))&&!s.meta.get("id").equals(""))return s.id;if(this.identityMap.has(s))return this.identityMap.get(s);const o=new Su.Om(this.generateId());return this.identityMap.set(s,o),o}forget(s){return!!this.identityMap.has(s)&&(this.identityMap.delete(s),!0)}generateId(){return this.uuid.randomUUID()}}new IdentityManager;const Yw=_curry3((function pathOr(s,o,i){return Na(s,_path(o,i))})),traversal_find=(s,o)=>{const i=new PredicateVisitor({predicate:s,returnOnTrue:qu});return visitor_visit(o,i),Yw(void 0,[0],i.result)};const Xw=class JsonSchema$anchorError extends Ko{};const Qw=class EvaluationJsonSchema$anchorError extends Xw{};const Zw=class InvalidJsonSchema$anchorError extends Xw{constructor(s){super(`Invalid JSON Schema $anchor "${s}".`)}},isAnchor=s=>/^[A-Za-z_][A-Za-z_0-9.-]*$/.test(s),uriToAnchor=s=>{const o=getHash(s);return tp("#",o)},$anchor_evaluate=(s,o)=>{const i=(s=>{if(!isAnchor(s))throw new Zw(s);return s})(s),a=traversal_find((s=>uE(s)&&serializers_value(s.$anchor)===i),o);if(bc(a))throw new Qw(`Evaluation failed on token: "${i}"`);return a},traversal_filter=(s,o)=>{const i=new PredicateVisitor({predicate:s});return visitor_visit(o,i),new Su.G6(i.result)};const ex=class JsonSchemaUriError extends Ko{};const tx=class EvaluationJsonSchemaUriError extends ex{},resolveSchema$refField=(s,o)=>{if(void 0===o.$ref)return;const i=getHash(serializers_value(o.$ref)),a=serializers_value(o.meta.get("ancestorsSchemaIdentifiers")),u=Aa(((s,o)=>resolve(s,sanitize(stripHash(o)))),s,[...a,serializers_value(o.$ref)]);return`${u}${"#"===i?"":i}`},refractToSchemaElement=s=>{if(refractToSchemaElement.cache.has(s))return refractToSchemaElement.cache.get(s);const o=lS.refract(s);return refractToSchemaElement.cache.set(s,o),o};refractToSchemaElement.cache=new WeakMap;const maybeRefractToSchemaElement=s=>isPrimitiveElement(s)?refractToSchemaElement(s):s,uri_evaluate=(s,o)=>{const{cache:i}=uri_evaluate,a=stripHash(s),isSchemaElementWith$id=s=>uE(s)&&void 0!==s.$id;if(!i.has(o)){const s=traversal_filter(isSchemaElementWith$id,o);i.set(o,Array.from(s))}const u=i.get(o).find((s=>{const o=((s,o)=>{if(void 0===o.$id)return;const i=serializers_value(o.meta.get("ancestorsSchemaIdentifiers"));return Aa(((s,o)=>resolve(s,sanitize(stripHash(o)))),s,i)})(a,s);return o===a}));if(bc(u))throw new tx(`Evaluation failed on URI: "${s}"`);return isAnchor(uriToAnchor(s))?$anchor_evaluate(uriToAnchor(s),u):apidom_evaluate(u,fromURIReference(s))};uri_evaluate.cache=new WeakMap;const rx=class MaximumDereferenceDepthError extends Ow{};const nx=class MaximumResolveDepthError extends Dw{};const sx=class UnmatchedResolverError extends Lw{},apidom_reference_src_parse=async(s,o)=>{const i=new xw({uri:sanitize(stripHash(s)),mediaType:o.parse.mediaType}),a=await(async(s,o)=>{const i=o.resolve.resolvers.map((s=>{const i=Object.create(s);return Object.assign(i,o.resolve.resolverOpts)})),a=await plugins_filter("canRead",[s,o],i);if(gp(a))throw new sx(s.uri);try{const{result:o}=await run("read",[s],a);return o}catch(o){throw new Dw(`Error while reading file "${s.uri}"`,{cause:o})}})(i,o);return(async(s,o)=>{const i=o.parse.parsers.map((s=>{const i=Object.create(s);return Object.assign(i,o.parse.parserOpts)})),a=await plugins_filter("canParse",[s,o],i);if(gp(a))throw new sx(s.uri);try{const{plugin:i,result:u}=await run("parse",[s,o],a);return!i.allowEmpty&&u.isEmpty?Promise.reject(new Cw(`Error while parsing file "${s.uri}". File is empty.`)):u}catch(o){throw new Cw(`Error while parsing file "${s.uri}"`,{cause:o})}})(new xw({...i,data:a}),o)};class AncestorLineage extends Array{includesCycle(s){return this.filter((o=>o.has(s))).length>1}includes(s,o){return s instanceof Set?super.includes(s,o):this.some((o=>o.has(s)))}findItem(s){for(const o of this)for(const i of o)if(Cu(i)&&s(i))return i}}const ox=visitor_visit[Symbol.for("nodejs.util.promisify.custom")],ix=new IdentityManager,mutationReplacer=(s,o,i,a)=>{Ru(a)?a.value=s:Array.isArray(a)&&(a[i]=s)};class OpenAPI3_1DereferenceVisitor{indirections;namespace;reference;options;ancestors;refractCache;allOfDiscriminatorMapping;constructor({reference:s,namespace:o,options:i,indirections:a=[],ancestors:u=new AncestorLineage,refractCache:_=new Map,allOfDiscriminatorMapping:w=new Map}){this.indirections=a,this.namespace=o,this.reference=s,this.options=i,this.ancestors=new AncestorLineage(...u),this.refractCache=_,this.allOfDiscriminatorMapping=w}toBaseURI(s){return resolve(this.reference.uri,sanitize(stripHash(s)))}async toReference(s){if(this.reference.depth>=this.options.resolve.maxDepth)throw new nx(`Maximum resolution depth of ${this.options.resolve.maxDepth} has been exceeded by file "${this.reference.uri}"`);const o=this.toBaseURI(s),{refSet:i}=this.reference;if(i.has(o))return i.find(Ww(o,"uri"));const a=await apidom_reference_src_parse(unsanitize(o),{...this.options,parse:{...this.options.parse,mediaType:"text/plain"}}),u=new mw({uri:o,value:cloneDeep(a),depth:this.reference.depth+1});if(i.add(u),this.options.dereference.immutable){const s=new mw({uri:`immutable://${o}`,value:a,depth:this.reference.depth+1});i.add(s)}return u}toAncestorLineage(s){const o=new Set(s.filter(Cu));return[new AncestorLineage(...this.ancestors,o),o]}OpenApi3_1Element={leave:(s,o,i,a,u,_)=>{var w;if(null===(w=this.options.dereference.strategyOpts["openapi-3-1"])||void 0===w||!w.dereferenceDiscriminatorMapping)return;const x=cloneShallow(s);return x.setMetaProperty("allOfDiscriminatorMapping",Object.fromEntries(this.allOfDiscriminatorMapping)),_.replaceWith(x,mutationReplacer),i?void 0:x}};async ReferenceElement(s,o,i,a,u,_){if(this.indirections.includes(s))return!1;const[w,x]=this.toAncestorLineage([...u,i]),C=this.toBaseURI(serializers_value(s.$ref)),j=stripHash(this.reference.uri)===C,L=!j;if(!this.options.resolve.internal&&j)return!1;if(!this.options.resolve.external&&L)return!1;const B=await this.toReference(serializers_value(s.$ref)),$=resolve(C,serializers_value(s.$ref));this.indirections.push(s);const U=fromURIReference($);let V=apidom_evaluate(B.value.result,U);if(V.id=ix.identify(V),isPrimitiveElement(V)){const o=serializers_value(s.meta.get("referenced-element")),i=`${o}-${serializers_value(ix.identify(V))}`;if(this.refractCache.has(i))V=this.refractCache.get(i);else if(isReferenceLikeElement(V))V=Lb.refract(V),V.setMetaProperty("referenced-element",o),this.refractCache.set(i,V);else{V=this.namespace.getElementClass(o).refract(V),this.refractCache.set(i,V)}}if(s===V)throw new Ko("Recursive Reference Object detected");if(this.indirections.length>this.options.dereference.maxDepth)throw new rx(`Maximum dereference depth of "${this.options.dereference.maxDepth}" has been exceeded in file "${this.reference.uri}"`);if(w.includes(V)){if(B.refSet.circular=!0,"error"===this.options.dereference.circular)throw new Ko("Circular reference detected");if("replace"===this.options.dereference.circular){var z,Y;const o=new Su.sI(V.id,{type:"reference",uri:B.uri,$ref:serializers_value(s.$ref)}),a=(null!==(z=null===(Y=this.options.dereference.strategyOpts["openapi-3-1"])||void 0===Y?void 0:Y.circularReplacer)&&void 0!==z?z:this.options.dereference.circularReplacer)(o);return _.replaceWith(a,mutationReplacer),!i&&a}}const Z=stripHash(B.refSet.rootRef.uri)!==B.uri,ee=["error","replace"].includes(this.options.dereference.circular);if((L||Z||iE(V)||ee)&&!w.includesCycle(V)){x.add(s);const o=new OpenAPI3_1DereferenceVisitor({reference:B,namespace:this.namespace,indirections:[...this.indirections],options:this.options,refractCache:this.refractCache,ancestors:w,allOfDiscriminatorMapping:this.allOfDiscriminatorMapping});V=await ox(V,o,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType}),x.delete(s)}this.indirections.pop();const ie=cloneShallow(V);return ie.setMetaProperty("id",ix.generateId()),ie.setMetaProperty("ref-fields",{$ref:serializers_value(s.$ref),description:serializers_value(s.description),summary:serializers_value(s.summary)}),ie.setMetaProperty("ref-origin",B.uri),ie.setMetaProperty("ref-referencing-element-id",cloneDeep(ix.identify(s))),Nu(V)&&Nu(ie)&&(s.hasKey("description")&&"description"in V&&(ie.remove("description"),ie.set("description",s.get("description"))),s.hasKey("summary")&&"summary"in V&&(ie.remove("summary"),ie.set("summary",s.get("summary")))),_.replaceWith(ie,mutationReplacer),!i&&ie}async PathItemElement(s,o,i,a,u,_){if(!ju(s.$ref))return;if(this.indirections.includes(s))return!1;const[w,x]=this.toAncestorLineage([...u,i]),C=this.toBaseURI(serializers_value(s.$ref)),j=stripHash(this.reference.uri)===C,L=!j;if(!this.options.resolve.internal&&j)return;if(!this.options.resolve.external&&L)return;const B=await this.toReference(serializers_value(s.$ref)),$=resolve(C,serializers_value(s.$ref));this.indirections.push(s);const U=fromURIReference($);let V=apidom_evaluate(B.value.result,U);if(V.id=ix.identify(V),isPrimitiveElement(V)){const s=`path-item-${serializers_value(ix.identify(V))}`;this.refractCache.has(s)?V=this.refractCache.get(s):(V=Mb.refract(V),this.refractCache.set(s,V))}if(s===V)throw new Ko("Recursive Path Item Object reference detected");if(this.indirections.length>this.options.dereference.maxDepth)throw new rx(`Maximum dereference depth of "${this.options.dereference.maxDepth}" has been exceeded in file "${this.reference.uri}"`);if(w.includes(V)){if(B.refSet.circular=!0,"error"===this.options.dereference.circular)throw new Ko("Circular reference detected");if("replace"===this.options.dereference.circular){var z,Y;const o=new Su.sI(V.id,{type:"path-item",uri:B.uri,$ref:serializers_value(s.$ref)}),a=(null!==(z=null===(Y=this.options.dereference.strategyOpts["openapi-3-1"])||void 0===Y?void 0:Y.circularReplacer)&&void 0!==z?z:this.options.dereference.circularReplacer)(o);return _.replaceWith(a,mutationReplacer),!i&&a}}const Z=stripHash(B.refSet.rootRef.uri)!==B.uri,ee=["error","replace"].includes(this.options.dereference.circular);if((L||Z||sE(V)&&ju(V.$ref)||ee)&&!w.includesCycle(V)){x.add(s);const o=new OpenAPI3_1DereferenceVisitor({reference:B,namespace:this.namespace,indirections:[...this.indirections],options:this.options,refractCache:this.refractCache,ancestors:w,allOfDiscriminatorMapping:this.allOfDiscriminatorMapping});V=await ox(V,o,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType}),x.delete(s)}if(this.indirections.pop(),sE(V)){const o=new Mb([...V.content],cloneDeep(V.meta),cloneDeep(V.attributes));o.setMetaProperty("id",ix.generateId()),s.forEach(((s,i,a)=>{o.remove(serializers_value(i)),o.content.push(a)})),o.remove("$ref"),o.setMetaProperty("ref-fields",{$ref:serializers_value(s.$ref)}),o.setMetaProperty("ref-origin",B.uri),o.setMetaProperty("ref-referencing-element-id",cloneDeep(ix.identify(s))),V=o}return _.replaceWith(V,mutationReplacer),i?void 0:V}async LinkElement(s,o,i,a,u,_){if(!ju(s.operationRef)&&!ju(s.operationId))return;if(ju(s.operationRef)&&ju(s.operationId))throw new Ko("LinkElement operationRef and operationId fields are mutually exclusive.");let w;if(ju(s.operationRef)){var x;const o=fromURIReference(serializers_value(s.operationRef)),a=this.toBaseURI(serializers_value(s.operationRef)),u=stripHash(this.reference.uri)===a,C=!u;if(!this.options.resolve.internal&&u)return;if(!this.options.resolve.external&&C)return;const j=await this.toReference(serializers_value(s.operationRef));if(w=apidom_evaluate(j.value.result,o),isPrimitiveElement(w)){const s=`operation-${serializers_value(ix.identify(w))}`;this.refractCache.has(s)?w=this.refractCache.get(s):(w=Pb.refract(w),this.refractCache.set(s,w))}w=cloneShallow(w),w.setMetaProperty("ref-origin",j.uri);const L=cloneShallow(s);return null===(x=L.operationRef)||void 0===x||x.meta.set("operation",w),_.replaceWith(L,mutationReplacer),i?void 0:L}if(ju(s.operationId)){var C;const o=serializers_value(s.operationId),a=await this.toReference(unsanitize(this.reference.uri));if(w=traversal_find((s=>rE(s)&&Cu(s.operationId)&&s.operationId.equals(o)),a.value.result),bc(w))throw new Ko(`OperationElement(operationId=${o}) not found.`);const u=cloneShallow(s);return null===(C=u.operationId)||void 0===C||C.meta.set("operation",w),_.replaceWith(u,mutationReplacer),i?void 0:u}}async ExampleElement(s,o,i,a,u,_){if(!ju(s.externalValue))return;if(s.hasKey("value")&&ju(s.externalValue))throw new Ko("ExampleElement value and externalValue fields are mutually exclusive.");const w=this.toBaseURI(serializers_value(s.externalValue)),x=stripHash(this.reference.uri)===w,C=!x;if(!this.options.resolve.internal&&x)return;if(!this.options.resolve.external&&C)return;const j=await this.toReference(serializers_value(s.externalValue)),L=cloneShallow(j.value.result);L.setMetaProperty("ref-origin",j.uri);const B=cloneShallow(s);return B.value=L,_.replaceWith(B,mutationReplacer),i?void 0:B}async MemberElement(s,o,i,a,u,_){var w;const x=u[u.length-1];if(!Nu(x)||!x.classes.contains("discriminator-mapping"))return;if(null===(w=this.options.dereference.strategyOpts["openapi-3-1"])||void 0===w||!w.dereferenceDiscriminatorMapping)return!1;if(!ju(s.key)||!ju(s.value))return!1;if(this.indirections.includes(s))return!1;this.indirections.push(s);const[C,j]=this.toAncestorLineage([...u,i]),L=[...j].findLast(uE),B=cloneDeep(L.getMetaProperty("ancestorsSchemaIdentifiers")),$=serializers_value(s.value),U=/^[a-zA-Z0-9\\.\\-_]+$/.test($)?`#/components/schemas/${$}`:$,V=new lS({$ref:U});V.setMetaProperty("ancestorsSchemaIdentifiers",B),j.add(V);const z=new OpenAPI3_1DereferenceVisitor({reference:this.reference,namespace:this.namespace,indirections:[...this.indirections],options:this.options,refractCache:this.refractCache,ancestors:C,allOfDiscriminatorMapping:this.allOfDiscriminatorMapping}),Y=await ox(V,z,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType});j.delete(V),this.indirections.pop();const Z=cloneShallow(s);return Z.value.setMetaProperty("ref-schema",Y),_.replaceWith(Z,mutationReplacer),i?void 0:Z}async SchemaElement(s,o,i,a,u,_){if(!ju(s.$ref))return;if(this.indirections.includes(s))return!1;const[w,x]=this.toAncestorLineage([...u,i]);let C=await this.toReference(unsanitize(this.reference.uri)),{uri:j}=C;const L=resolveSchema$refField(j,s),B=stripHash(L),$=new xw({uri:B}),U=Hw((s=>s.canRead($)),this.options.resolve.resolvers),V=!U;let z,Y=stripHash(this.reference.uri)===L,Z=!Y;this.indirections.push(s);try{if(U||V){j=this.toBaseURI(L);const s=L,o=maybeRefractToSchemaElement(C.value.result);if(z=uri_evaluate(s,o),z=maybeRefractToSchemaElement(z),z.id=ix.identify(z),!this.options.resolve.internal&&Y)return;if(!this.options.resolve.external&&Z)return}else{if(j=this.toBaseURI(L),Y=stripHash(this.reference.uri)===j,Z=!Y,!this.options.resolve.internal&&Y)return;if(!this.options.resolve.external&&Z)return;C=await this.toReference(unsanitize(L));const s=fromURIReference(L),o=maybeRefractToSchemaElement(C.value.result);z=apidom_evaluate(o,s),z=maybeRefractToSchemaElement(z),z.id=ix.identify(z)}}catch(s){if(!(V&&s instanceof tx))throw s;if(isAnchor(uriToAnchor(L))){if(Y=stripHash(this.reference.uri)===j,Z=!Y,!this.options.resolve.internal&&Y)return;if(!this.options.resolve.external&&Z)return;C=await this.toReference(unsanitize(L));const s=uriToAnchor(L),o=maybeRefractToSchemaElement(C.value.result);z=$anchor_evaluate(s,o),z=maybeRefractToSchemaElement(z),z.id=ix.identify(z)}else{if(j=this.toBaseURI(L),Y=stripHash(this.reference.uri)===j,Z=!Y,!this.options.resolve.internal&&Y)return;if(!this.options.resolve.external&&Z)return;C=await this.toReference(unsanitize(L));const s=fromURIReference(L),o=maybeRefractToSchemaElement(C.value.result);z=apidom_evaluate(o,s),z=maybeRefractToSchemaElement(z),z.id=ix.identify(z)}}if(s===z)throw new Ko("Recursive Schema Object reference detected");if(this.indirections.length>this.options.dereference.maxDepth)throw new rx(`Maximum dereference depth of "${this.options.dereference.maxDepth}" has been exceeded in file "${this.reference.uri}"`);if(w.includes(z)){if(C.refSet.circular=!0,"error"===this.options.dereference.circular)throw new Ko("Circular reference detected");if("replace"===this.options.dereference.circular){var ee,ie;const o=new Su.sI(z.id,{type:"json-schema",uri:C.uri,$ref:serializers_value(s.$ref)}),a=(null!==(ee=null===(ie=this.options.dereference.strategyOpts["openapi-3-1"])||void 0===ie?void 0:ie.circularReplacer)&&void 0!==ee?ee:this.options.dereference.circularReplacer)(o);return _.replaceWith(a,mutationReplacer),!i&&a}}const ae=stripHash(C.refSet.rootRef.uri)!==C.uri,ce=["error","replace"].includes(this.options.dereference.circular);if((Z||ae||uE(z)&&ju(z.$ref)||ce)&&!w.includesCycle(z)){x.add(s);const o=new OpenAPI3_1DereferenceVisitor({reference:C,namespace:this.namespace,indirections:[...this.indirections],options:this.options,refractCache:this.refractCache,ancestors:w,allOfDiscriminatorMapping:this.allOfDiscriminatorMapping});z=await ox(z,o,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType}),x.delete(s)}if(this.indirections.pop(),predicates_isBooleanJsonSchemaElement(z)){const o=cloneDeep(z);return o.setMetaProperty("id",ix.generateId()),o.setMetaProperty("ref-fields",{$ref:serializers_value(s.$ref),$refBaseURI:L}),o.setMetaProperty("ref-origin",C.uri),o.setMetaProperty("ref-referencing-element-id",cloneDeep(ix.identify(s))),_.replaceWith(o,mutationReplacer),!i&&o}if(uE(z)){var le;const o=new lS([...z.content],cloneDeep(z.meta),cloneDeep(z.attributes));if(o.setMetaProperty("id",ix.generateId()),s.forEach(((s,i,a)=>{o.remove(serializers_value(i)),o.content.push(a)})),o.remove("$ref"),o.setMetaProperty("ref-fields",{$ref:serializers_value(s.$ref),$refBaseURI:L}),o.setMetaProperty("ref-origin",C.uri),o.setMetaProperty("ref-referencing-element-id",cloneDeep(ix.identify(s))),null!==(le=this.options.dereference.strategyOpts["openapi-3-1"])&&void 0!==le&&le.dereferenceDiscriminatorMapping){var pe;const s=u[u.length-1],i=[...x].findLast(uE),a=null==i?void 0:i.getMetaProperty("schemaName"),_=serializers_value(o.getMetaProperty("schemaName"));if(_&&a&&null!=s&&null!==(pe=s.classes)&&void 0!==pe&&pe.contains("json-schema-allOf")){var de;const s=null!==(de=this.allOfDiscriminatorMapping.get(_))&&void 0!==de?de:[];s.push(i),this.allOfDiscriminatorMapping.set(_,s)}}z=o}return _.replaceWith(z,mutationReplacer),i?void 0:z}}const ax=OpenAPI3_1DereferenceVisitor,cx=visitor_visit[Symbol.for("nodejs.util.promisify.custom")];const lx=class OpenAPI3_1DereferenceStrategy extends Jw{constructor(s){super({...null!=s?s:{},name:"openapi-3-1"})}canDereference(s){var o;return"text/plain"!==s.mediaType?fw.includes(s.mediaType):tE(null===(o=s.parseResult)||void 0===o?void 0:o.result)}async dereference(s,o){var i;const a=createNamespace(pw),u=null!==(i=o.dereference.refSet)&&void 0!==i?i:new gw,_=new gw;let w,x=u;u.has(s.uri)?w=u.find(Ww(s.uri,"uri")):(w=new mw({uri:s.uri,value:s.parseResult}),u.add(w)),o.dereference.immutable&&(u.refs.map((s=>new mw({...s,value:cloneDeep(s.value)}))).forEach((s=>_.add(s))),w=_.find((o=>o.uri===s.uri)),x=_);const C=new ax({reference:w,namespace:a,options:o}),j=await cx(x.rootRef.value,C,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType});return o.dereference.immutable&&_.refs.filter((s=>s.uri.startsWith("immutable://"))).map((s=>new mw({...s,uri:s.uri.replace(/^immutable:\/\//,"")}))).forEach((s=>u.add(s))),null===o.dereference.refSet&&u.clean(),_.clean(),j}},to_path=s=>{const o=(s=>s.slice(2))(s);return o.reduce(((s,i,a)=>{if(Ru(i)){const o=String(serializers_value(i.key));s.push(o)}else if(Mu(o[a-2])){const u=o[a-2].content.indexOf(i);s.push(u)}return s}),[])};const ux=class ModelPropertyMacroVisitor{modelPropertyMacro;options;SchemaElement={leave:(s,o,i,a,u)=>{void 0!==s.properties&&Nu(s.properties)&&s.properties.forEach((o=>{if(Nu(o))try{const s=this.modelPropertyMacro(serializers_value(o));o.set("default",s)}catch(o){var a,_;const w=new Error(o,{cause:o});w.fullPath=[...to_path([...u,i,s]),"properties"],null===(a=this.options.dereference.dereferenceOpts)||void 0===a||null===(a=a.errors)||void 0===a||null===(_=a.push)||void 0===_||_.call(a,w)}}))}};constructor({modelPropertyMacro:s,options:o}){this.modelPropertyMacro=s,this.options=o}};var px=function(){function XUniqWith(s,o){this.xf=o,this.pred=s,this.items=[]}return XUniqWith.prototype["@@transducer/init"]=_xfBase_init,XUniqWith.prototype["@@transducer/result"]=_xfBase_result,XUniqWith.prototype["@@transducer/step"]=function(s,o){return _includesWith(this.pred,o,this.items)?s:(this.items.push(o),this.xf["@@transducer/step"](s,o))},XUniqWith}();function _xuniqWith(s){return function(o){return new px(s,o)}}var hx=_curry2(_dispatchable([],_xuniqWith,(function(s,o){for(var i,a=0,u=o.length,_=[];a<u;)_includesWith(s,i=o[a],_)||(_[_.length]=i),a+=1;return _})));const dx=hx;const fx=class all_of_AllOfVisitor{options;SchemaElement={leave(s,o,i,a,u){if(void 0===s.allOf)return;if(!Mu(s.allOf)){var _,w;const o=new TypeError("allOf must be an array");return o.fullPath=[...to_path([...u,i,s]),"allOf"],void(null===(_=this.options.dereference.dereferenceOpts)||void 0===_||null===(_=_.errors)||void 0===_||null===(w=_.push)||void 0===w||w.call(_,o))}if(s.allOf.isEmpty)return void s.remove("allOf");if(!s.allOf.content.every(uE)){var x,C;const o=new TypeError("Elements in allOf must be objects");return o.fullPath=[...to_path([...u,i,s]),"allOf"],void(null===(x=this.options.dereference.dereferenceOpts)||void 0===x||null===(x=x.errors)||void 0===x||null===(C=x.push)||void 0===C||C.call(x,o))}for(;s.hasKey("allOf");){const{allOf:o}=s;s.remove("allOf");const i=dd.all([...o.content,s],{customMerge:s=>"enum"===serializers_value(s)?(s,o)=>{if(includesClasses(["json-schema-enum"],s)&&includesClasses(["json-schema-enum"],o)){const areElementsEqual=(s,o)=>!(Mu(s)||Mu(o)||Nu(s)||Nu(o))&&s.equals(serializers_value(o)),i=cloneShallow(s);return i.content=dx(areElementsEqual)([...s.content,...o.content]),i}return dd(s,o)}:dd});if(s.hasKey("$$ref")||i.remove("$$ref"),s.hasKey("example")){const o=i.getMember("example");o&&(o.value=s.get("example"))}if(s.hasKey("examples")){const o=i.getMember("examples");o&&(o.value=s.get("examples"))}s.content=i.content}}};constructor({options:s}){this.options=s}};const mx=class ParameterMacroVisitor{parameterMacro;options;#n;OperationElement={enter:s=>{this.#n=s},leave:()=>{this.#n=void 0}};ParameterElement={leave:(s,o,i,a,u)=>{const _=this.#n?serializers_value(this.#n):null,w=serializers_value(s);try{const o=this.parameterMacro(_,w);s.set("default",o)}catch(s){var x,C;const o=new Error(s,{cause:s});o.fullPath=to_path([...u,i]),null===(x=this.options.dereference.dereferenceOpts)||void 0===x||null===(x=x.errors)||void 0===x||null===(C=x.push)||void 0===C||C.call(x,o)}}};constructor({parameterMacro:s,options:o}){this.parameterMacro=s,this.options=o}},get_root_cause=s=>{if(null==s.cause)return s;let{cause:o}=s;for(;null!=o.cause;)o=o.cause;return o};const gx=class SchemaRefError extends Go{},{wrapError:yx}=Xl,vx=visitor_visit[Symbol.for("nodejs.util.promisify.custom")],bx=new IdentityManager,dereference_mutationReplacer=(s,o,i,a)=>{Ru(a)?a.value=s:Array.isArray(a)&&(a[i]=s)};class OpenAPI3_1SwaggerClientDereferenceVisitor extends ax{useCircularStructures;allowMetaPatches;basePath;constructor({allowMetaPatches:s=!0,useCircularStructures:o=!1,basePath:i=null,...a}){super(a),this.allowMetaPatches=s,this.useCircularStructures=o,this.basePath=i}async ReferenceElement(s,o,i,a,u,_){try{if(this.indirections.includes(s))return!1;const[o,a]=this.toAncestorLineage([...u,i]),j=this.toBaseURI(serializers_value(s.$ref)),L=stripHash(this.reference.uri)===j,B=!L;if(!this.options.resolve.internal&&L)return!1;if(!this.options.resolve.external&&B)return!1;const $=await this.toReference(serializers_value(s.$ref)),U=resolve(j,serializers_value(s.$ref));this.indirections.push(s);const V=fromURIReference(U);let z=apidom_evaluate($.value.result,V);if(z.id=bx.identify(z),isPrimitiveElement(z)){const o=serializers_value(s.meta.get("referenced-element")),i=`${o}-${serializers_value(bx.identify(z))}`;if(this.refractCache.has(i))z=this.refractCache.get(i);else if(isReferenceLikeElement(z))z=Lb.refract(z),z.setMetaProperty("referenced-element",o),this.refractCache.set(i,z);else{z=this.namespace.getElementClass(o).refract(z),this.refractCache.set(i,z)}}if(s===z)throw new Ko("Recursive Reference Object detected");if(this.indirections.length>this.options.dereference.maxDepth)throw new rx(`Maximum dereference depth of "${this.options.dereference.maxDepth}" has been exceeded in file "${this.reference.uri}"`);if(o.includes(z)){if($.refSet.circular=!0,"error"===this.options.dereference.circular)throw new Ko("Circular reference detected");if("replace"===this.options.dereference.circular){var w,x;const o=new Su.sI(z.id,{type:"reference",uri:$.uri,$ref:serializers_value(s.$ref),baseURI:U,referencingElement:s}),a=(null!==(w=null===(x=this.options.dereference.strategyOpts["openapi-3-1"])||void 0===x?void 0:x.circularReplacer)&&void 0!==w?w:this.options.dereference.circularReplacer)(o);return _.replaceWith(o,dereference_mutationReplacer),!i&&a}}const Y=stripHash($.refSet.rootRef.uri)!==$.uri,Z=["error","replace"].includes(this.options.dereference.circular);if((B||Y||iE(z)||Z)&&!o.includesCycle(z)){var C;a.add(s);const _=new OpenAPI3_1SwaggerClientDereferenceVisitor({reference:$,namespace:this.namespace,indirections:[...this.indirections],options:this.options,refractCache:this.refractCache,ancestors:o,allowMetaPatches:this.allowMetaPatches,useCircularStructures:this.useCircularStructures,basePath:null!==(C=this.basePath)&&void 0!==C?C:[...to_path([...u,i,s]),"$ref"]});z=await vx(z,_,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType}),a.delete(s)}this.indirections.pop();const ee=cloneShallow(z);if(ee.setMetaProperty("ref-fields",{$ref:serializers_value(s.$ref),description:serializers_value(s.description),summary:serializers_value(s.summary)}),ee.setMetaProperty("ref-origin",$.uri),ee.setMetaProperty("ref-referencing-element-id",cloneDeep(bx.identify(s))),Nu(z)&&(s.hasKey("description")&&"description"in z&&(ee.remove("description"),ee.set("description",s.get("description"))),s.hasKey("summary")&&"summary"in z&&(ee.remove("summary"),ee.set("summary",s.get("summary")))),this.allowMetaPatches&&Nu(ee)&&!ee.hasKey("$$ref")){const s=resolve(j,U);ee.set("$$ref",s)}return _.replaceWith(ee,dereference_mutationReplacer),!i&&ee}catch(o){var j,L,B;const a=get_root_cause(o),_=yx(a,{baseDoc:this.reference.uri,$ref:serializers_value(s.$ref),pointer:fromURIReference(serializers_value(s.$ref)),fullPath:null!==(j=this.basePath)&&void 0!==j?j:[...to_path([...u,i,s]),"$ref"]});return void(null===(L=this.options.dereference.dereferenceOpts)||void 0===L||null===(L=L.errors)||void 0===L||null===(B=L.push)||void 0===B||B.call(L,_))}}async PathItemElement(s,o,i,a,u,_){try{if(!ju(s.$ref))return;if(this.indirections.includes(s))return!1;if(includesClasses(["cycle"],s.$ref))return!1;const[o,a]=this.toAncestorLineage([...u,i]),j=this.toBaseURI(serializers_value(s.$ref)),L=stripHash(this.reference.uri)===j,B=!L;if(!this.options.resolve.internal&&L)return;if(!this.options.resolve.external&&B)return;const $=await this.toReference(serializers_value(s.$ref)),U=resolve(j,serializers_value(s.$ref));this.indirections.push(s);const V=fromURIReference(U);let z=apidom_evaluate($.value.result,V);if(z.id=bx.identify(z),isPrimitiveElement(z)){const s=`path-item-${serializers_value(bx.identify(z))}`;this.refractCache.has(s)?z=this.refractCache.get(s):(z=Mb.refract(z),this.refractCache.set(s,z))}if(s===z)throw new Ko("Recursive Path Item Object reference detected");if(this.indirections.length>this.options.dereference.maxDepth)throw new rx(`Maximum dereference depth of "${this.options.dereference.maxDepth}" has been exceeded in file "${this.reference.uri}"`);if(o.includes(z)){if($.refSet.circular=!0,"error"===this.options.dereference.circular)throw new Ko("Circular reference detected");if("replace"===this.options.dereference.circular){var w,x;const o=new Su.sI(z.id,{type:"path-item",uri:$.uri,$ref:serializers_value(s.$ref),baseURI:U,referencingElement:s}),a=(null!==(w=null===(x=this.options.dereference.strategyOpts["openapi-3-1"])||void 0===x?void 0:x.circularReplacer)&&void 0!==w?w:this.options.dereference.circularReplacer)(o);return _.replaceWith(o,dereference_mutationReplacer),!i&&a}}const Y=stripHash($.refSet.rootRef.uri)!==$.uri,Z=["error","replace"].includes(this.options.dereference.circular);if((B||Y||sE(z)&&ju(z.$ref)||Z)&&!o.includesCycle(z)){var C;a.add(s);const _=new OpenAPI3_1SwaggerClientDereferenceVisitor({reference:$,namespace:this.namespace,indirections:[...this.indirections],options:this.options,ancestors:o,allowMetaPatches:this.allowMetaPatches,useCircularStructures:this.useCircularStructures,basePath:null!==(C=this.basePath)&&void 0!==C?C:[...to_path([...u,i,s]),"$ref"]});z=await vx(z,_,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType}),a.delete(s)}if(this.indirections.pop(),sE(z)){const o=new Mb([...z.content],cloneDeep(z.meta),cloneDeep(z.attributes));if(s.forEach(((s,i,a)=>{o.remove(serializers_value(i)),o.content.push(a)})),o.remove("$ref"),o.setMetaProperty("ref-fields",{$ref:serializers_value(s.$ref)}),o.setMetaProperty("ref-origin",$.uri),o.setMetaProperty("ref-referencing-element-id",cloneDeep(bx.identify(s))),this.allowMetaPatches&&void 0===o.get("$$ref")){const s=resolve(j,U);o.set("$$ref",s)}z=o}return _.replaceWith(z,dereference_mutationReplacer),i?void 0:z}catch(o){var j,L,B;const a=get_root_cause(o),_=yx(a,{baseDoc:this.reference.uri,$ref:serializers_value(s.$ref),pointer:fromURIReference(serializers_value(s.$ref)),fullPath:null!==(j=this.basePath)&&void 0!==j?j:[...to_path([...u,i,s]),"$ref"]});return void(null===(L=this.options.dereference.dereferenceOpts)||void 0===L||null===(L=L.errors)||void 0===L||null===(B=L.push)||void 0===B||B.call(L,_))}}async SchemaElement(s,o,i,a,u,_){try{if(!ju(s.$ref))return;if(this.indirections.includes(s))return!1;const[o,a]=this.toAncestorLineage([...u,i]);let j=await this.toReference(unsanitize(this.reference.uri)),{uri:L}=j;const B=resolveSchema$refField(L,s),$=stripHash(B),U=new xw({uri:$}),V=!this.options.resolve.resolvers.some((s=>s.canRead(U))),z=!V;let Y,Z=stripHash(this.reference.uri)===B,ee=!Z;this.indirections.push(s);try{if(V||z){L=this.toBaseURI(B);const s=B,o=maybeRefractToSchemaElement(j.value.result);if(Y=uri_evaluate(s,o),Y=maybeRefractToSchemaElement(Y),Y.id=bx.identify(Y),!this.options.resolve.internal&&Z)return;if(!this.options.resolve.external&&ee)return}else{if(L=this.toBaseURI(B),Z=stripHash(this.reference.uri)===L,ee=!Z,!this.options.resolve.internal&&Z)return;if(!this.options.resolve.external&&ee)return;j=await this.toReference(unsanitize(B));const s=fromURIReference(B),o=maybeRefractToSchemaElement(j.value.result);Y=apidom_evaluate(o,s),Y=maybeRefractToSchemaElement(Y),Y.id=bx.identify(Y)}}catch(s){if(!(z&&s instanceof tx))throw s;if(isAnchor(uriToAnchor(B))){if(Z=stripHash(this.reference.uri)===L,ee=!Z,!this.options.resolve.internal&&Z)return;if(!this.options.resolve.external&&ee)return;j=await this.toReference(unsanitize(B));const s=uriToAnchor(B),o=maybeRefractToSchemaElement(j.value.result);Y=$anchor_evaluate(s,o),Y=maybeRefractToSchemaElement(Y),Y.id=bx.identify(Y)}else{if(L=this.toBaseURI(serializers_value(B)),Z=stripHash(this.reference.uri)===L,ee=!Z,!this.options.resolve.internal&&Z)return;if(!this.options.resolve.external&&ee)return;j=await this.toReference(unsanitize(B));const s=fromURIReference(B),o=maybeRefractToSchemaElement(j.value.result);Y=apidom_evaluate(o,s),Y=maybeRefractToSchemaElement(Y),Y.id=bx.identify(Y)}}if(s===Y)throw new Ko("Recursive Schema Object reference detected");if(this.indirections.length>this.options.dereference.maxDepth)throw new rx(`Maximum dereference depth of "${this.options.dereference.maxDepth}" has been exceeded in file "${this.reference.uri}"`);if(o.includes(Y)){if(j.refSet.circular=!0,"error"===this.options.dereference.circular)throw new Ko("Circular reference detected");if("replace"===this.options.dereference.circular){var w,x;const o=new Su.sI(Y.id,{type:"json-schema",uri:j.uri,$ref:serializers_value(s.$ref),baseURI:resolve(L,B),referencingElement:s}),a=(null!==(w=null===(x=this.options.dereference.strategyOpts["openapi-3-1"])||void 0===x?void 0:x.circularReplacer)&&void 0!==w?w:this.options.dereference.circularReplacer)(o);return _.replaceWith(a,dereference_mutationReplacer),!i&&a}}const ie=stripHash(j.refSet.rootRef.uri)!==j.uri,ae=["error","replace"].includes(this.options.dereference.circular);if((ee||ie||uE(Y)&&ju(Y.$ref)||ae)&&!o.includesCycle(Y)){var C;a.add(s);const _=new OpenAPI3_1SwaggerClientDereferenceVisitor({reference:j,namespace:this.namespace,indirections:[...this.indirections],options:this.options,useCircularStructures:this.useCircularStructures,allowMetaPatches:this.allowMetaPatches,ancestors:o,basePath:null!==(C=this.basePath)&&void 0!==C?C:[...to_path([...u,i,s]),"$ref"]});Y=await vx(Y,_,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType}),a.delete(s)}if(this.indirections.pop(),predicates_isBooleanJsonSchemaElement(Y)){const o=cloneDeep(Y);return o.setMetaProperty("ref-fields",{$ref:serializers_value(s.$ref)}),o.setMetaProperty("ref-origin",j.uri),o.setMetaProperty("ref-referencing-element-id",cloneDeep(bx.identify(s))),_.replaceWith(o,dereference_mutationReplacer),!i&&o}if(uE(Y)){const o=new lS([...Y.content],cloneDeep(Y.meta),cloneDeep(Y.attributes));if(s.forEach(((s,i,a)=>{o.remove(serializers_value(i)),o.content.push(a)})),o.remove("$ref"),o.setMetaProperty("ref-fields",{$ref:serializers_value(s.$ref)}),o.setMetaProperty("ref-origin",j.uri),o.setMetaProperty("ref-referencing-element-id",cloneDeep(bx.identify(s))),this.allowMetaPatches&&void 0===o.get("$$ref")){const s=resolve(L,B);o.set("$$ref",s)}Y=o}return _.replaceWith(Y,dereference_mutationReplacer),i?void 0:Y}catch(o){var j,L,B;const a=get_root_cause(o),_=new gx(`Could not resolve reference: ${a.message}`,{baseDoc:this.reference.uri,$ref:serializers_value(s.$ref),fullPath:null!==(j=this.basePath)&&void 0!==j?j:[...to_path([...u,i,s]),"$ref"],cause:a});return void(null===(L=this.options.dereference.dereferenceOpts)||void 0===L||null===(L=L.errors)||void 0===L||null===(B=L.push)||void 0===B||B.call(L,_))}}async LinkElement(){}async ExampleElement(s,o,i,a,u,_){try{return await super.ExampleElement(s,o,i,a,u,_)}catch(o){var w,x,C;const a=get_root_cause(o),_=yx(a,{baseDoc:this.reference.uri,externalValue:serializers_value(s.externalValue),fullPath:null!==(w=this.basePath)&&void 0!==w?w:[...to_path([...u,i,s]),"externalValue"]});return void(null===(x=this.options.dereference.dereferenceOpts)||void 0===x||null===(x=x.errors)||void 0===x||null===(C=x.push)||void 0===C||C.call(x,_))}}}const _x=OpenAPI3_1SwaggerClientDereferenceVisitor,Sx=mergeAll[Symbol.for("nodejs.util.promisify.custom")];const Ex=class RootVisitor{constructor({parameterMacro:s,modelPropertyMacro:o,mode:i,options:a,...u}){const _=[];_.push(new _x({...u,options:a})),"function"==typeof o&&_.push(new ux({modelPropertyMacro:o,options:a})),"strict"!==i&&_.push(new fx({options:a})),"function"==typeof s&&_.push(new mx({parameterMacro:s,options:a}));const w=Sx(_,{nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType});Object.assign(this,w)}},wx=visitor_visit[Symbol.for("nodejs.util.promisify.custom")];const xx=class OpenAPI3_1SwaggerClientDereferenceStrategy extends lx{allowMetaPatches;parameterMacro;modelPropertyMacro;mode;ancestors;constructor({allowMetaPatches:s=!1,parameterMacro:o=null,modelPropertyMacro:i=null,mode:a="non-strict",ancestors:u=[],..._}={}){super({..._}),this.name="openapi-3-1-swagger-client",this.allowMetaPatches=s,this.parameterMacro=o,this.modelPropertyMacro=i,this.mode=a,this.ancestors=[...u]}async dereference(s,o){var i;const a=createNamespace(pw),u=null!==(i=o.dereference.refSet)&&void 0!==i?i:new gw,_=new gw;let w,x=u;u.has(s.uri)?w=u.find((o=>o.uri===s.uri)):(w=new mw({uri:s.uri,value:s.parseResult}),u.add(w)),o.dereference.immutable&&(u.refs.map((s=>new mw({...s,value:cloneDeep(s.value)}))).forEach((s=>_.add(s))),w=_.find((o=>o.uri===s.uri)),x=_);const C=new Ex({reference:w,namespace:a,options:o,allowMetaPatches:this.allowMetaPatches,ancestors:this.ancestors,modelPropertyMacro:this.modelPropertyMacro,mode:this.mode,parameterMacro:this.parameterMacro}),j=await wx(x.rootRef.value,C,{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType});return o.dereference.immutable&&_.refs.filter((s=>s.uri.startsWith("immutable://"))).map((s=>new mw({...s,uri:s.uri.replace(/^immutable:\/\//,"")}))).forEach((s=>u.add(s))),null===o.dereference.refSet&&u.clean(),_.clean(),j}},circularReplacer=s=>{const o=serializers_value(s.meta.get("baseURI")),i=s.meta.get("referencingElement");return new Su.Sh({$ref:o},cloneDeep(i.meta),cloneDeep(i.attributes))},resolveOpenAPI31Strategy=async s=>{const{spec:o,timeout:i,redirects:a,requestInterceptor:u,responseInterceptor:_,pathDiscriminator:w=[],allowMetaPatches:x=!1,useCircularStructures:C=!1,skipNormalization:j=!1,parameterMacro:L=null,modelPropertyMacro:B=null,mode:$="non-strict",strategies:U}=s;try{const{cache:V}=resolveOpenAPI31Strategy,z=U.find((s=>s.match(o))),Y=isHttpUrl(url_cwd())?url_cwd():Ll,Z=options_retrievalURI(s),ee=resolve(Y,Z);let ie;V.has(o)?ie=V.get(o):(ie=Ab.refract(o),ie.classes.push("result"),V.set(o,ie));const ae=new Au([ie]),ce=es_compile(w),le=""===ce?"":`#${ce}`,pe=apidom_evaluate(ie,ce),de=new mw({uri:ee,value:ae}),fe=new gw({refs:[de]});""!==ce&&(fe.rootRef=void 0);const ye=[new Set([pe])],be=[],_e=await(async(s,o={})=>{const i=util_merge(vw,o);return dereferenceApiDOM(s,i)})(pe,{resolve:{baseURI:`${ee}${le}`,resolvers:[new $w({timeout:i||1e4,redirects:a||10})],resolverOpts:{swaggerHTTPClientConfig:{requestInterceptor:u,responseInterceptor:_}},strategies:[new Nw]},parse:{mediaType:fw.latest(),parsers:[new Vw({allowEmpty:!1,sourceMap:!1}),new zw({allowEmpty:!1,sourceMap:!1}),new qw({allowEmpty:!1,sourceMap:!1}),new Uw({allowEmpty:!1,sourceMap:!1}),new Iw({allowEmpty:!1,sourceMap:!1})]},dereference:{maxDepth:100,strategies:[new xx({allowMetaPatches:x,useCircularStructures:C,parameterMacro:L,modelPropertyMacro:B,mode:$,ancestors:ye})],refSet:fe,dereferenceOpts:{errors:be},immutable:!1,circular:C?"ignore":"replace",circularReplacer:C?vw.dereference.circularReplacer:circularReplacer}}),Se=((s,o,i)=>new xp({element:i}).transclude(s,o))(pe,_e,ie),we=j?Se:z.normalize(Se);return{spec:serializers_value(we),errors:be}}catch(s){if(s instanceof Wp)return{spec:o,errors:[]};throw s}};resolveOpenAPI31Strategy.cache=new WeakMap;const kx=resolveOpenAPI31Strategy;function _clone(s,o,i){if(i||(i=new Ox),function _isPrimitive(s){var o=typeof s;return null==s||"object"!=o&&"function"!=o}(s))return s;var a=function copy(a){var u=i.get(s);if(u)return u;for(var _ in i.set(s,a),s)Object.prototype.hasOwnProperty.call(s,_)&&(a[_]=o?_clone(s[_],!0,i):s[_]);return a};switch(ra(s)){case"Object":return a(Object.create(Object.getPrototypeOf(s)));case"Array":return a(Array(s.length));case"Date":return new Date(s.valueOf());case"RegExp":return _cloneRegExp(s);case"Int8Array":case"Uint8Array":case"Uint8ClampedArray":case"Int16Array":case"Uint16Array":case"Int32Array":case"Uint32Array":case"Float32Array":case"Float64Array":case"BigInt64Array":case"BigUint64Array":return s.slice();default:return s}}var Ox=function(){function _ObjectMap(){this.map={},this.length=0}return _ObjectMap.prototype.set=function(s,o){var i=this.hash(s),a=this.map[i];a||(this.map[i]=a=[]),a.push([s,o]),this.length+=1},_ObjectMap.prototype.hash=function(s){var o=[];for(var i in s)o.push(Object.prototype.toString.call(s[i]));return o.join()},_ObjectMap.prototype.get=function(s){if(this.length<=180)for(var o in this.map)for(var i=this.map[o],a=0;a<i.length;a+=1){if((_=i[a])[0]===s)return _[1]}else{var u=this.hash(s);if(i=this.map[u])for(a=0;a<i.length;a+=1){var _;if((_=i[a])[0]===s)return _[1]}}},_ObjectMap}(),Ax=function(){function XReduceBy(s,o,i,a){this.valueFn=s,this.valueAcc=o,this.keyFn=i,this.xf=a,this.inputs={}}return XReduceBy.prototype["@@transducer/init"]=_xfBase_init,XReduceBy.prototype["@@transducer/result"]=function(s){var o;for(o in this.inputs)if(_has(o,this.inputs)&&(s=this.xf["@@transducer/step"](s,this.inputs[o]))["@@transducer/reduced"]){s=s["@@transducer/value"];break}return this.inputs=null,this.xf["@@transducer/result"](s)},XReduceBy.prototype["@@transducer/step"]=function(s,o){var i=this.keyFn(o);return this.inputs[i]=this.inputs[i]||[i,_clone(this.valueAcc,!1)],this.inputs[i][1]=this.valueFn(this.inputs[i][1],o),s},XReduceBy}();function _xreduceBy(s,o,i){return function(a){return new Ax(s,o,i,a)}}var Cx=_curryN(4,[],_dispatchable([],_xreduceBy,(function reduceBy(s,o,i,a){var u=_xwrap((function(a,u){var _=i(u),w=s(_has(_,a)?a[_]:_clone(o,!1),u);return w&&w["@@transducer/reduced"]?_reduced(a):(a[_]=w,a)}));return wa(u,{},a)})));const jx=_curry2(_checkForMethod("groupBy",Cx((function(s,o){return s.push(o),s}),[])));const Px=class NormalizeStorage{internalStore;constructor(s,o,i){this.storageElement=s,this.storageField=o,this.storageSubField=i}get store(){if(!this.internalStore){let s=this.storageElement.get(this.storageField);Nu(s)||(s=new Su.Sh,this.storageElement.set(this.storageField,s));let o=s.get(this.storageSubField);Mu(o)||(o=new Su.wE,s.set(this.storageSubField,o)),this.internalStore=o}return this.internalStore}append(s){this.includes(s)||this.store.push(s)}includes(s){return this.store.includes(s)}},removeSpaces=s=>s.replace(/\s/g,""),normalize_operation_ids_replaceSpecialCharsWithUnderscore=s=>s.replace(/\W/gi,"_"),normalizeOperationId=(s,o,i)=>{const a=removeSpaces(s);return a.length>0?normalize_operation_ids_replaceSpecialCharsWithUnderscore(a):((s,o)=>`${normalize_operation_ids_replaceSpecialCharsWithUnderscore(removeSpaces(o.toLowerCase()))}${normalize_operation_ids_replaceSpecialCharsWithUnderscore(removeSpaces(s))}`)(o,i)},normalize_operation_ids=({storageField:s="x-normalized",operationIdNormalizer:o=normalizeOperationId}={})=>i=>{const{predicates:a,ancestorLineageToJSONPointer:u,namespace:_}=i,w=[],x=[],C=[];let j;return{visitor:{OpenApi3_1Element:{enter(o){j=new Px(o,s,"operation-ids")},leave(){const s=jx((s=>serializers_value(s.operationId)),x);Object.entries(s).forEach((([s,o])=>{Array.isArray(o)&&(o.length<=1||o.forEach(((o,i)=>{const a=`${s}${i+1}`;o.operationId=new _.elements.String(a)})))})),C.forEach((s=>{if(void 0===s.operationId)return;const o=String(serializers_value(s.operationId)),i=x.find((s=>serializers_value(s.meta.get("originalOperationId"))===o));void 0!==i&&(s.operationId=cloneDeep.safe(i.operationId),s.meta.set("originalOperationId",o),s.set("__originalOperationId",o))})),x.length=0,C.length=0,j=void 0}},PathItemElement:{enter(s){const o=Na("path",serializers_value(s.meta.get("path")));w.push(o)},leave(){w.pop()}},OperationElement:{enter(s,i,a,C,L){if(void 0===s.operationId)return;const B=u([...L,a,s]);if(j.includes(B))return;const $=String(serializers_value(s.operationId)),U=Ba(w),V=Na("method",serializers_value(s.meta.get("http-method"))),z=o($,U,V);$!==z&&(s.operationId=new _.elements.String(z),s.set("__originalOperationId",$),s.meta.set("originalOperationId",$),x.push(s),j.append(B))}},LinkElement:{leave(s){a.isLinkElement(s)&&void 0!==s.operationId&&C.push(s)}}}}},normalize_parameters=({storageField:s="x-normalized"}={})=>o=>{const{predicates:i,ancestorLineageToJSONPointer:a}=o,parameterEquals=(s,o)=>!!i.isParameterElement(s)&&(!!i.isParameterElement(o)&&(!!i.isStringElement(s.name)&&(!!i.isStringElement(s.in)&&(!!i.isStringElement(o.name)&&(!!i.isStringElement(o.in)&&(serializers_value(s.name)===serializers_value(o.name)&&serializers_value(s.in)===serializers_value(o.in))))))),u=[];let _;return{visitor:{OpenApi3_1Element:{enter(o){_=new Px(o,s,"parameters")},leave(){_=void 0}},PathItemElement:{enter(s,o,a,_,w){if(w.some(i.isComponentsElement))return;const{parameters:x}=s;i.isArrayElement(x)?u.push([...x.content]):u.push([])},leave(){u.pop()}},OperationElement:{leave(s,o,i,w,x){const C=Ba(u);if(!Array.isArray(C)||0===C.length)return;const j=a([...x,i,s]);if(_.includes(j))return;const L=Yw([],["parameters","content"],s),B=dx(parameterEquals,[...L,...C]);s.parameters=new _v(B),_.append(j)}}}}},normalize_security_requirements=({storageField:s="x-normalized"}={})=>o=>{const{predicates:i,ancestorLineageToJSONPointer:a}=o;let u,_;return{visitor:{OpenApi3_1Element:{enter(o){_=new Px(o,s,"security-requirements"),i.isArrayElement(o.security)&&(u=o.security)},leave(){_=void 0,u=void 0}},OperationElement:{leave(s,o,w,x,C){if(C.some(i.isComponentsElement))return;const j=a([...C,w,s]);if(_.includes(j))return;var L;void 0===s.security&&void 0!==u&&(s.security=new Ov(null===(L=u)||void 0===L?void 0:L.content),_.append(j))}}}}},normalize_parameter_examples=({storageField:s="x-normalized"}={})=>o=>{const{predicates:i,ancestorLineageToJSONPointer:a}=o;let u;return{visitor:{OpenApi3_1Element:{enter(o){u=new Px(o,s,"parameter-examples")},leave(){u=void 0}},ParameterElement:{leave(s,o,_,w,x){var C,j;if(x.some(i.isComponentsElement))return;if(void 0===s.schema||!i.isSchemaElement(s.schema))return;if(void 0===(null===(C=s.schema)||void 0===C?void 0:C.example)&&void 0===(null===(j=s.schema)||void 0===j?void 0:j.examples))return;const L=a([...x,_,s]);if(!u.includes(L)){if(void 0!==s.examples&&i.isObjectElement(s.examples)){const o=s.examples.map((s=>cloneDeep.safe(s.value)));return void 0!==s.schema.examples&&(s.schema.set("examples",o),u.append(L)),void(void 0!==s.schema.example&&(s.schema.set("example",o[0]),u.append(L)))}void 0!==s.example&&(void 0!==s.schema.examples&&(s.schema.set("examples",[cloneDeep(s.example)]),u.append(L)),void 0!==s.schema.example&&(s.schema.set("example",cloneDeep(s.example)),u.append(L)))}}}}}},normalize_header_examples=({storageField:s="x-normalized"}={})=>o=>{const{predicates:i,ancestorLineageToJSONPointer:a}=o;let u;return{visitor:{OpenApi3_1Element:{enter(o){u=new Px(o,s,"header-examples")},leave(){u=void 0}},HeaderElement:{leave(s,o,_,w,x){var C,j;if(x.some(i.isComponentsElement))return;if(void 0===s.schema||!i.isSchemaElement(s.schema))return;if(void 0===(null===(C=s.schema)||void 0===C?void 0:C.example)&&void 0===(null===(j=s.schema)||void 0===j?void 0:j.examples))return;const L=a([...x,_,s]);if(!u.includes(L)){if(void 0!==s.examples&&i.isObjectElement(s.examples)){const o=s.examples.map((s=>cloneDeep.safe(s.value)));return void 0!==s.schema.examples&&(s.schema.set("examples",o),u.append(L)),void(void 0!==s.schema.example&&(s.schema.set("example",o[0]),u.append(L)))}void 0!==s.example&&(void 0!==s.schema.examples&&(s.schema.set("examples",[cloneDeep(s.example)]),u.append(L)),void 0!==s.schema.example&&(s.schema.set("example",cloneDeep(s.example)),u.append(L)))}}}}}},openapi_3_1_apidom_normalize=s=>{if(!Nu(s))return s;const o=[normalize_operation_ids({operationIdNormalizer:(s,o,i)=>opId({operationId:s},o,i,{v2OperationIdCompatibilityMode:!1})}),normalize_parameters(),normalize_security_requirements(),normalize_parameter_examples(),normalize_header_examples()];return dispatchPluginsSync(s,o,{toolboxCreator:apidom_ns_openapi_3_1_src_refractor_toolbox,visitorOptions:{keyMap:lw,nodeTypeGetter:apidom_ns_openapi_3_1_src_traversal_visitor_getNodeType}})},Ix={name:"openapi-3-1-apidom",match:s=>isOpenAPI31(s),normalize(s){if(!Cu(s)&&fu(s)&&!s.$$normalized){const i=(o=openapi_3_1_apidom_normalize,s=>{const i=Ab.refract(s);i.classes.push("result");const a=o(i),u=serializers_value(a);return kx.cache.set(u,a),serializers_value(a)})(s);return i.$$normalized=!0,i}var o;return Cu(s)?openapi_3_1_apidom_normalize(s):s},resolve:async s=>kx(s)},Tx=Ix,makeResolve=s=>async o=>(async s=>{const{spec:o,requestInterceptor:i,responseInterceptor:a}=s,u=options_retrievalURI(s),_=options_httpClient(s),w=o||await makeFetchJSON(_,{requestInterceptor:i,responseInterceptor:a})(u),x={...s,spec:w};return s.strategies.find((s=>s.match(w))).resolve(x)})({...s,...o}),Nx=makeResolve({strategies:[_u,vu,gu]});const server_url_template=(s,o,i,a,u)=>{if(s===Pp.SEM_PRE){if(!1===Array.isArray(u))throw new Error("parser's user data must be an array");u.push(["server-url-template",jp.charsToString(o,i,a)])}return Pp.SEM_OK},callbacks_server_variable=(s,o,i,a,u)=>{if(s===Pp.SEM_PRE){if(!1===Array.isArray(u))throw new Error("parser's user data must be an array");u.push(["server-variable",jp.charsToString(o,i,a)])}return Pp.SEM_OK},server_variable_name=(s,o,i,a,u)=>{if(s===Pp.SEM_PRE){if(!1===Array.isArray(u))throw new Error("parser's user data must be an array");u.push(["server-variable-name",jp.charsToString(o,i,a)])}return Pp.SEM_OK},callbacks_literals=(s,o,i,a,u)=>{if(s===Pp.SEM_PRE){if(!1===Array.isArray(u))throw new Error("parser's user data must be an array");u.push(["literals",jp.charsToString(o,i,a)])}return Pp.SEM_OK},Mx=new function server_url_templating_grammar(){this.grammarObject="grammarObject",this.rules=[],this.rules[0]={name:"server-url-template",lower:"server-url-template",index:0,isBkr:!1},this.rules[1]={name:"server-variable",lower:"server-variable",index:1,isBkr:!1},this.rules[2]={name:"server-variable-name",lower:"server-variable-name",index:2,isBkr:!1},this.rules[3]={name:"literals",lower:"literals",index:3,isBkr:!1},this.rules[4]={name:"DIGIT",lower:"digit",index:4,isBkr:!1},this.rules[5]={name:"HEXDIG",lower:"hexdig",index:5,isBkr:!1},this.rules[6]={name:"pct-encoded",lower:"pct-encoded",index:6,isBkr:!1},this.rules[7]={name:"ucschar",lower:"ucschar",index:7,isBkr:!1},this.rules[8]={name:"iprivate",lower:"iprivate",index:8,isBkr:!1},this.udts=[],this.rules[0].opcodes=[],this.rules[0].opcodes[0]={type:3,min:1,max:1/0},this.rules[0].opcodes[1]={type:1,children:[2,3]},this.rules[0].opcodes[2]={type:4,index:3},this.rules[0].opcodes[3]={type:4,index:1},this.rules[1].opcodes=[],this.rules[1].opcodes[0]={type:2,children:[1,2,3]},this.rules[1].opcodes[1]={type:7,string:[123]},this.rules[1].opcodes[2]={type:4,index:2},this.rules[1].opcodes[3]={type:7,string:[125]},this.rules[2].opcodes=[],this.rules[2].opcodes[0]={type:3,min:1,max:1/0},this.rules[2].opcodes[1]={type:1,children:[2,3,4]},this.rules[2].opcodes[2]={type:5,min:0,max:122},this.rules[2].opcodes[3]={type:6,string:[124]},this.rules[2].opcodes[4]={type:5,min:126,max:1114111},this.rules[3].opcodes=[],this.rules[3].opcodes[0]={type:3,min:1,max:1/0},this.rules[3].opcodes[1]={type:1,children:[2,3,4,5,6,7,8,9,10,11,12,13]},this.rules[3].opcodes[2]={type:6,string:[33]},this.rules[3].opcodes[3]={type:5,min:35,max:36},this.rules[3].opcodes[4]={type:5,min:38,max:59},this.rules[3].opcodes[5]={type:6,string:[61]},this.rules[3].opcodes[6]={type:5,min:63,max:91},this.rules[3].opcodes[7]={type:6,string:[93]},this.rules[3].opcodes[8]={type:6,string:[95]},this.rules[3].opcodes[9]={type:5,min:97,max:122},this.rules[3].opcodes[10]={type:6,string:[126]},this.rules[3].opcodes[11]={type:4,index:7},this.rules[3].opcodes[12]={type:4,index:8},this.rules[3].opcodes[13]={type:4,index:6},this.rules[4].opcodes=[],this.rules[4].opcodes[0]={type:5,min:48,max:57},this.rules[5].opcodes=[],this.rules[5].opcodes[0]={type:1,children:[1,2,3,4,5,6,7]},this.rules[5].opcodes[1]={type:4,index:4},this.rules[5].opcodes[2]={type:7,string:[97]},this.rules[5].opcodes[3]={type:7,string:[98]},this.rules[5].opcodes[4]={type:7,string:[99]},this.rules[5].opcodes[5]={type:7,string:[100]},this.rules[5].opcodes[6]={type:7,string:[101]},this.rules[5].opcodes[7]={type:7,string:[102]},this.rules[6].opcodes=[],this.rules[6].opcodes[0]={type:2,children:[1,2,3]},this.rules[6].opcodes[1]={type:7,string:[37]},this.rules[6].opcodes[2]={type:4,index:5},this.rules[6].opcodes[3]={type:4,index:5},this.rules[7].opcodes=[],this.rules[7].opcodes[0]={type:1,children:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]},this.rules[7].opcodes[1]={type:5,min:160,max:55295},this.rules[7].opcodes[2]={type:5,min:63744,max:64975},this.rules[7].opcodes[3]={type:5,min:65008,max:65519},this.rules[7].opcodes[4]={type:5,min:65536,max:131069},this.rules[7].opcodes[5]={type:5,min:131072,max:196605},this.rules[7].opcodes[6]={type:5,min:196608,max:262141},this.rules[7].opcodes[7]={type:5,min:262144,max:327677},this.rules[7].opcodes[8]={type:5,min:327680,max:393213},this.rules[7].opcodes[9]={type:5,min:393216,max:458749},this.rules[7].opcodes[10]={type:5,min:458752,max:524285},this.rules[7].opcodes[11]={type:5,min:524288,max:589821},this.rules[7].opcodes[12]={type:5,min:589824,max:655357},this.rules[7].opcodes[13]={type:5,min:655360,max:720893},this.rules[7].opcodes[14]={type:5,min:720896,max:786429},this.rules[7].opcodes[15]={type:5,min:786432,max:851965},this.rules[7].opcodes[16]={type:5,min:851968,max:917501},this.rules[7].opcodes[17]={type:5,min:921600,max:983037},this.rules[8].opcodes=[],this.rules[8].opcodes[0]={type:1,children:[1,2,3]},this.rules[8].opcodes[1]={type:5,min:57344,max:63743},this.rules[8].opcodes[2]={type:5,min:983040,max:1048573},this.rules[8].opcodes[3]={type:5,min:1048576,max:1114109},this.toString=function toString(){let s="";return s+="; OpenAPI Server URL templating ABNF syntax\n",s+="server-url-template    = 1*( literals / server-variable ) ; variant of https://www.rfc-editor.org/rfc/rfc6570#section-2\n",s+='server-variable        = "{" server-variable-name "}"\n',s+="server-variable-name   = 1*( %x00-7A / %x7C / %x7E-10FFFF ) ; every UTF8 character except { and } (from OpenAPI)\n",s+="\n",s+="; https://www.rfc-editor.org/rfc/rfc6570#section-2.1\n",s+="; https://www.rfc-editor.org/errata/eid6937\n",s+="literals               = 1*( %x21 / %x23-24 / %x26-3B / %x3D / %x3F-5B\n",s+="                       / %x5D / %x5F / %x61-7A / %x7E / ucschar / iprivate\n",s+="                       / pct-encoded)\n",s+="                            ; any Unicode character except: CTL, SP,\n",s+='                            ;  DQUOTE, "%" (aside from pct-encoded),\n',s+='                            ;  "<", ">", "\\", "^", "`", "{", "|", "}"\n',s+="\n",s+="; https://www.rfc-editor.org/rfc/rfc6570#section-1.5\n",s+="DIGIT          =  %x30-39             ; 0-9\n",s+='HEXDIG         =  DIGIT / "A" / "B" / "C" / "D" / "E" / "F" ; case-insensitive\n',s+="\n",s+='pct-encoded    =  "%" HEXDIG HEXDIG\n',s+="\n",s+="ucschar        =  %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF\n",s+="               /  %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD\n",s+="               /  %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD\n",s+="               /  %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD\n",s+="               /  %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD\n",s+="               /  %xD0000-DFFFD / %xE1000-EFFFD\n",s+="\n",s+="iprivate       =  %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD\n",'; OpenAPI Server URL templating ABNF syntax\nserver-url-template    = 1*( literals / server-variable ) ; variant of https://www.rfc-editor.org/rfc/rfc6570#section-2\nserver-variable        = "{" server-variable-name "}"\nserver-variable-name   = 1*( %x00-7A / %x7C / %x7E-10FFFF ) ; every UTF8 character except { and } (from OpenAPI)\n\n; https://www.rfc-editor.org/rfc/rfc6570#section-2.1\n; https://www.rfc-editor.org/errata/eid6937\nliterals               = 1*( %x21 / %x23-24 / %x26-3B / %x3D / %x3F-5B\n                       / %x5D / %x5F / %x61-7A / %x7E / ucschar / iprivate\n                       / pct-encoded)\n                            ; any Unicode character except: CTL, SP,\n                            ;  DQUOTE, "%" (aside from pct-encoded),\n                            ;  "<", ">", "\\", "^", "`", "{", "|", "}"\n\n; https://www.rfc-editor.org/rfc/rfc6570#section-1.5\nDIGIT          =  %x30-39             ; 0-9\nHEXDIG         =  DIGIT / "A" / "B" / "C" / "D" / "E" / "F" ; case-insensitive\n\npct-encoded    =  "%" HEXDIG HEXDIG\n\nucschar        =  %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF\n               /  %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD\n               /  %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD\n               /  %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD\n               /  %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD\n               /  %xD0000-DFFFD / %xE1000-EFFFD\n\niprivate       =  %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD\n'}},openapi_server_url_templating_es_parse=s=>{const o=new kp;o.ast=new Op,o.ast.callbacks["server-url-template"]=server_url_template,o.ast.callbacks["server-variable"]=callbacks_server_variable,o.ast.callbacks["server-variable-name"]=server_variable_name,o.ast.callbacks.literals=callbacks_literals;return{result:o.parse(Mx,"server-url-template",s),ast:o.ast}},openapi_server_url_templating_es_test=(s,{strict:o=!1}={})=>{try{const i=openapi_server_url_templating_es_parse(s);if(!i.result.success)return!1;const a=[];i.ast.translate(a);const u=a.some((([s])=>"server-variable"===s));if(!o&&!u)try{return new URL(s,"https://vladimirgorej.com"),!0}catch{return!1}return!o||u}catch{return!1}},encodeServerVariable=s=>(s=>{try{return"string"==typeof s&&decodeURIComponent(s)!==s}catch{return!1}})(s)?s:encodeURIComponent(s).replace(/%5B/g,"[").replace(/%5D/g,"]"),Rx=["literals","server-variable-name"],es_substitute=(s,o,i={})=>{const a={...{encoder:encodeServerVariable},...i},u=openapi_server_url_templating_es_parse(s);if(!u.result.success)return s;const _=[];u.ast.translate(_);const w=_.filter((([s])=>Rx.includes(s))).map((([s,i])=>"server-variable-name"===s?Object.hasOwn(o,i)?a.encoder(o[i],i):`{${i}}`:i));return w.join("")};function path_templating_grammar(){this.grammarObject="grammarObject",this.rules=[],this.rules[0]={name:"path-template",lower:"path-template",index:0,isBkr:!1},this.rules[1]={name:"path-segment",lower:"path-segment",index:1,isBkr:!1},this.rules[2]={name:"slash",lower:"slash",index:2,isBkr:!1},this.rules[3]={name:"path-literal",lower:"path-literal",index:3,isBkr:!1},this.rules[4]={name:"template-expression",lower:"template-expression",index:4,isBkr:!1},this.rules[5]={name:"template-expression-param-name",lower:"template-expression-param-name",index:5,isBkr:!1},this.rules[6]={name:"pchar",lower:"pchar",index:6,isBkr:!1},this.rules[7]={name:"unreserved",lower:"unreserved",index:7,isBkr:!1},this.rules[8]={name:"pct-encoded",lower:"pct-encoded",index:8,isBkr:!1},this.rules[9]={name:"sub-delims",lower:"sub-delims",index:9,isBkr:!1},this.rules[10]={name:"ALPHA",lower:"alpha",index:10,isBkr:!1},this.rules[11]={name:"DIGIT",lower:"digit",index:11,isBkr:!1},this.rules[12]={name:"HEXDIG",lower:"hexdig",index:12,isBkr:!1},this.udts=[],this.rules[0].opcodes=[],this.rules[0].opcodes[0]={type:2,children:[1,2,6]},this.rules[0].opcodes[1]={type:4,index:2},this.rules[0].opcodes[2]={type:3,min:0,max:1/0},this.rules[0].opcodes[3]={type:2,children:[4,5]},this.rules[0].opcodes[4]={type:4,index:1},this.rules[0].opcodes[5]={type:4,index:2},this.rules[0].opcodes[6]={type:3,min:0,max:1},this.rules[0].opcodes[7]={type:4,index:1},this.rules[1].opcodes=[],this.rules[1].opcodes[0]={type:3,min:1,max:1/0},this.rules[1].opcodes[1]={type:1,children:[2,3]},this.rules[1].opcodes[2]={type:4,index:3},this.rules[1].opcodes[3]={type:4,index:4},this.rules[2].opcodes=[],this.rules[2].opcodes[0]={type:7,string:[47]},this.rules[3].opcodes=[],this.rules[3].opcodes[0]={type:3,min:1,max:1/0},this.rules[3].opcodes[1]={type:4,index:6},this.rules[4].opcodes=[],this.rules[4].opcodes[0]={type:2,children:[1,2,3]},this.rules[4].opcodes[1]={type:7,string:[123]},this.rules[4].opcodes[2]={type:4,index:5},this.rules[4].opcodes[3]={type:7,string:[125]},this.rules[5].opcodes=[],this.rules[5].opcodes[0]={type:3,min:1,max:1/0},this.rules[5].opcodes[1]={type:1,children:[2,3,4]},this.rules[5].opcodes[2]={type:5,min:0,max:122},this.rules[5].opcodes[3]={type:6,string:[124]},this.rules[5].opcodes[4]={type:5,min:126,max:1114111},this.rules[6].opcodes=[],this.rules[6].opcodes[0]={type:1,children:[1,2,3,4,5]},this.rules[6].opcodes[1]={type:4,index:7},this.rules[6].opcodes[2]={type:4,index:8},this.rules[6].opcodes[3]={type:4,index:9},this.rules[6].opcodes[4]={type:7,string:[58]},this.rules[6].opcodes[5]={type:7,string:[64]},this.rules[7].opcodes=[],this.rules[7].opcodes[0]={type:1,children:[1,2,3,4,5,6]},this.rules[7].opcodes[1]={type:4,index:10},this.rules[7].opcodes[2]={type:4,index:11},this.rules[7].opcodes[3]={type:7,string:[45]},this.rules[7].opcodes[4]={type:7,string:[46]},this.rules[7].opcodes[5]={type:7,string:[95]},this.rules[7].opcodes[6]={type:7,string:[126]},this.rules[8].opcodes=[],this.rules[8].opcodes[0]={type:2,children:[1,2,3]},this.rules[8].opcodes[1]={type:7,string:[37]},this.rules[8].opcodes[2]={type:4,index:12},this.rules[8].opcodes[3]={type:4,index:12},this.rules[9].opcodes=[],this.rules[9].opcodes[0]={type:1,children:[1,2,3,4,5,6,7,8,9,10,11]},this.rules[9].opcodes[1]={type:7,string:[33]},this.rules[9].opcodes[2]={type:7,string:[36]},this.rules[9].opcodes[3]={type:7,string:[38]},this.rules[9].opcodes[4]={type:7,string:[39]},this.rules[9].opcodes[5]={type:7,string:[40]},this.rules[9].opcodes[6]={type:7,string:[41]},this.rules[9].opcodes[7]={type:7,string:[42]},this.rules[9].opcodes[8]={type:7,string:[43]},this.rules[9].opcodes[9]={type:7,string:[44]},this.rules[9].opcodes[10]={type:7,string:[59]},this.rules[9].opcodes[11]={type:7,string:[61]},this.rules[10].opcodes=[],this.rules[10].opcodes[0]={type:1,children:[1,2]},this.rules[10].opcodes[1]={type:5,min:65,max:90},this.rules[10].opcodes[2]={type:5,min:97,max:122},this.rules[11].opcodes=[],this.rules[11].opcodes[0]={type:5,min:48,max:57},this.rules[12].opcodes=[],this.rules[12].opcodes[0]={type:1,children:[1,2,3,4,5,6,7]},this.rules[12].opcodes[1]={type:4,index:11},this.rules[12].opcodes[2]={type:7,string:[97]},this.rules[12].opcodes[3]={type:7,string:[98]},this.rules[12].opcodes[4]={type:7,string:[99]},this.rules[12].opcodes[5]={type:7,string:[100]},this.rules[12].opcodes[6]={type:7,string:[101]},this.rules[12].opcodes[7]={type:7,string:[102]},this.toString=function toString(){let s="";return s+="; OpenAPI Path Templating ABNF syntax\n",s+="; variant of https://datatracker.ietf.org/doc/html/rfc3986#section-3.3\n",s+="path-template                  = slash *( path-segment slash ) [ path-segment ]\n",s+="path-segment                   = 1*( path-literal / template-expression )\n",s+='slash                          = "/"\n',s+="path-literal                   = 1*pchar\n",s+='template-expression            = "{" template-expression-param-name "}"\n',s+="template-expression-param-name = 1*( %x00-7A / %x7C / %x7E-10FFFF ) ; every UTF8 character except { and } (from OpenAPI)\n",s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc3986#section-3.3\n",s+='pchar               = unreserved / pct-encoded / sub-delims / ":" / "@"\n',s+='unreserved          = ALPHA / DIGIT / "-" / "." / "_" / "~"\n',s+="                    ; https://datatracker.ietf.org/doc/html/rfc3986#section-2.3\n",s+='pct-encoded         = "%" HEXDIG HEXDIG\n',s+="                    ; https://datatracker.ietf.org/doc/html/rfc3986#section-2.1\n",s+='sub-delims          = "!" / "$" / "&" / "\'" / "(" / ")"\n',s+='                    / "*" / "+" / "," / ";" / "="\n',s+="                    ; https://datatracker.ietf.org/doc/html/rfc3986#section-2.2\n",s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc5234#appendix-B.1\n",s+="ALPHA               = %x41-5A / %x61-7A   ; A-Z / a-z\n",s+="DIGIT               = %x30-39            ; 0-9\n",s+='HEXDIG              = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"\n','; OpenAPI Path Templating ABNF syntax\n; variant of https://datatracker.ietf.org/doc/html/rfc3986#section-3.3\npath-template                  = slash *( path-segment slash ) [ path-segment ]\npath-segment                   = 1*( path-literal / template-expression )\nslash                          = "/"\npath-literal                   = 1*pchar\ntemplate-expression            = "{" template-expression-param-name "}"\ntemplate-expression-param-name = 1*( %x00-7A / %x7C / %x7E-10FFFF ) ; every UTF8 character except { and } (from OpenAPI)\n\n; https://datatracker.ietf.org/doc/html/rfc3986#section-3.3\npchar               = unreserved / pct-encoded / sub-delims / ":" / "@"\nunreserved          = ALPHA / DIGIT / "-" / "." / "_" / "~"\n                    ; https://datatracker.ietf.org/doc/html/rfc3986#section-2.3\npct-encoded         = "%" HEXDIG HEXDIG\n                    ; https://datatracker.ietf.org/doc/html/rfc3986#section-2.1\nsub-delims          = "!" / "$" / "&" / "\'" / "(" / ")"\n                    / "*" / "+" / "," / ";" / "="\n                    ; https://datatracker.ietf.org/doc/html/rfc3986#section-2.2\n\n; https://datatracker.ietf.org/doc/html/rfc5234#appendix-B.1\nALPHA               = %x41-5A / %x61-7A   ; A-Z / a-z\nDIGIT               = %x30-39            ; 0-9\nHEXDIG              = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"\n'}}const callbacks_slash=(s,o,i,a,u)=>(s===Pp.SEM_PRE?u.push(["slash",jp.charsToString(o,i,a)]):Pp.SEM_POST,Pp.SEM_OK),path_template=(s,o,i,a,u)=>{if(s===Pp.SEM_PRE){if(!1===Array.isArray(u))throw new Error("parser's user data must be an array");u.push(["path-template",jp.charsToString(o,i,a)])}return Pp.SEM_OK},path_literal=(s,o,i,a,u)=>(s===Pp.SEM_PRE?u.push(["path-literal",jp.charsToString(o,i,a)]):Pp.SEM_POST,Pp.SEM_OK),template_expression=(s,o,i,a,u)=>(s===Pp.SEM_PRE?u.push(["template-expression",jp.charsToString(o,i,a)]):Pp.SEM_POST,Pp.SEM_OK),template_expression_param_name=(s,o,i,a,u)=>(s===Pp.SEM_PRE?u.push(["template-expression-param-name",jp.charsToString(o,i,a)]):Pp.SEM_POST,Pp.SEM_OK),Dx=new path_templating_grammar,openapi_path_templating_es_parse=s=>{const o=new kp;o.ast=new Op,o.ast.callbacks["path-template"]=path_template,o.ast.callbacks.slash=callbacks_slash,o.ast.callbacks["path-literal"]=path_literal,o.ast.callbacks["template-expression"]=template_expression,o.ast.callbacks["template-expression-param-name"]=template_expression_param_name;return{result:o.parse(Dx,"path-template",s),ast:o.ast}},encodePathComponent=s=>(s=>{try{return"string"==typeof s&&decodeURIComponent(s)!==s}catch{return!1}})(s)?s:encodeURIComponent(s).replace(/%5B/g,"[").replace(/%5D/g,"]"),Lx=["slash","path-literal","template-expression-param-name"],es_resolve=(s,o,i={})=>{const a={...{encoder:encodePathComponent},...i},u=openapi_path_templating_es_parse(s);if(!u.result.success)return s;const _=[];u.ast.translate(_);const w=_.filter((([s])=>Lx.includes(s))).map((([s,i])=>"template-expression-param-name"===s?Object.prototype.hasOwnProperty.call(o,i)?a.encoder(o[i],i):`{${i}}`:i));return w.join("")},Fx=(new path_templating_grammar,new kp,{body:function bodyBuilder({req:s,value:o}){void 0!==o&&(s.body=o)},header:function headerBuilder({req:s,parameter:o,value:i}){s.headers=s.headers||{},void 0!==i&&(s.headers[o.name]=i)},query:function queryBuilder({req:s,value:o,parameter:i}){s.query=s.query||{},!1===o&&"boolean"===i.type&&(o="false");0===o&&["number","integer"].indexOf(i.type)>-1&&(o="0");if(o)s.query[i.name]={collectionFormat:i.collectionFormat,value:o};else if(i.allowEmptyValue&&void 0!==o){const o=i.name;s.query[o]=s.query[o]||{},s.query[o].allowEmptyValue=!0}},path:function pathBuilder({req:s,value:o,parameter:i,baseURL:a}){if(void 0!==o){const u=s.url.replace(a,""),_=es_resolve(u,{[i.name]:o});s.url=a+_}},formData:function formDataBuilder({req:s,value:o,parameter:i}){!1===o&&"boolean"===i.type&&(o="false");0===o&&["number","integer"].indexOf(i.type)>-1&&(o="0");if(o)s.form=s.form||{},s.form[i.name]={collectionFormat:i.collectionFormat,value:o};else if(i.allowEmptyValue&&void 0!==o){s.form=s.form||{};const o=i.name;s.form[o]=s.form[o]||{},s.form[o].allowEmptyValue=!0}}});function serialize(s,o){return o.includes("application/json")?"string"==typeof s?s:(Array.isArray(s)&&(s=s.map((s=>{try{return JSON.parse(s)}catch(o){return s}}))),JSON.stringify(s)):String(s)}function grammar_grammar(){this.grammarObject="grammarObject",this.rules=[],this.rules[0]={name:"lenient-cookie-string",lower:"lenient-cookie-string",index:0,isBkr:!1},this.rules[1]={name:"lenient-cookie-entry",lower:"lenient-cookie-entry",index:1,isBkr:!1},this.rules[2]={name:"lenient-cookie-pair",lower:"lenient-cookie-pair",index:2,isBkr:!1},this.rules[3]={name:"lenient-cookie-pair-invalid",lower:"lenient-cookie-pair-invalid",index:3,isBkr:!1},this.rules[4]={name:"lenient-cookie-name",lower:"lenient-cookie-name",index:4,isBkr:!1},this.rules[5]={name:"lenient-cookie-value",lower:"lenient-cookie-value",index:5,isBkr:!1},this.rules[6]={name:"lenient-quoted-value",lower:"lenient-quoted-value",index:6,isBkr:!1},this.rules[7]={name:"lenient-quoted-char",lower:"lenient-quoted-char",index:7,isBkr:!1},this.rules[8]={name:"lenient-cookie-octet",lower:"lenient-cookie-octet",index:8,isBkr:!1},this.rules[9]={name:"cookie-string",lower:"cookie-string",index:9,isBkr:!1},this.rules[10]={name:"cookie-pair",lower:"cookie-pair",index:10,isBkr:!1},this.rules[11]={name:"cookie-name",lower:"cookie-name",index:11,isBkr:!1},this.rules[12]={name:"cookie-value",lower:"cookie-value",index:12,isBkr:!1},this.rules[13]={name:"cookie-octet",lower:"cookie-octet",index:13,isBkr:!1},this.rules[14]={name:"OWS",lower:"ows",index:14,isBkr:!1},this.rules[15]={name:"token",lower:"token",index:15,isBkr:!1},this.rules[16]={name:"tchar",lower:"tchar",index:16,isBkr:!1},this.rules[17]={name:"CHAR",lower:"char",index:17,isBkr:!1},this.rules[18]={name:"CTL",lower:"ctl",index:18,isBkr:!1},this.rules[19]={name:"separators",lower:"separators",index:19,isBkr:!1},this.rules[20]={name:"SP",lower:"sp",index:20,isBkr:!1},this.rules[21]={name:"HT",lower:"ht",index:21,isBkr:!1},this.rules[22]={name:"ALPHA",lower:"alpha",index:22,isBkr:!1},this.rules[23]={name:"DIGIT",lower:"digit",index:23,isBkr:!1},this.rules[24]={name:"DQUOTE",lower:"dquote",index:24,isBkr:!1},this.rules[25]={name:"WSP",lower:"wsp",index:25,isBkr:!1},this.rules[26]={name:"HTAB",lower:"htab",index:26,isBkr:!1},this.rules[27]={name:"CRLF",lower:"crlf",index:27,isBkr:!1},this.rules[28]={name:"CR",lower:"cr",index:28,isBkr:!1},this.rules[29]={name:"LF",lower:"lf",index:29,isBkr:!1},this.udts=[],this.rules[0].opcodes=[],this.rules[0].opcodes[0]={type:2,children:[1,2]},this.rules[0].opcodes[1]={type:4,index:1},this.rules[0].opcodes[2]={type:3,min:0,max:1/0},this.rules[0].opcodes[3]={type:2,children:[4,5,6]},this.rules[0].opcodes[4]={type:7,string:[59]},this.rules[0].opcodes[5]={type:4,index:14},this.rules[0].opcodes[6]={type:4,index:1},this.rules[1].opcodes=[],this.rules[1].opcodes[0]={type:1,children:[1,2]},this.rules[1].opcodes[1]={type:4,index:2},this.rules[1].opcodes[2]={type:4,index:3},this.rules[2].opcodes=[],this.rules[2].opcodes[0]={type:2,children:[1,2,3,4,5,6,7]},this.rules[2].opcodes[1]={type:4,index:14},this.rules[2].opcodes[2]={type:4,index:4},this.rules[2].opcodes[3]={type:4,index:14},this.rules[2].opcodes[4]={type:7,string:[61]},this.rules[2].opcodes[5]={type:4,index:14},this.rules[2].opcodes[6]={type:4,index:5},this.rules[2].opcodes[7]={type:4,index:14},this.rules[3].opcodes=[],this.rules[3].opcodes[0]={type:2,children:[1,2,4]},this.rules[3].opcodes[1]={type:4,index:14},this.rules[3].opcodes[2]={type:3,min:1,max:1/0},this.rules[3].opcodes[3]={type:4,index:16},this.rules[3].opcodes[4]={type:4,index:14},this.rules[4].opcodes=[],this.rules[4].opcodes[0]={type:3,min:1,max:1/0},this.rules[4].opcodes[1]={type:1,children:[2,3,4]},this.rules[4].opcodes[2]={type:5,min:33,max:58},this.rules[4].opcodes[3]={type:6,string:[60]},this.rules[4].opcodes[4]={type:5,min:62,max:126},this.rules[5].opcodes=[],this.rules[5].opcodes[0]={type:1,children:[1,6]},this.rules[5].opcodes[1]={type:2,children:[2,3]},this.rules[5].opcodes[2]={type:4,index:6},this.rules[5].opcodes[3]={type:3,min:0,max:1},this.rules[5].opcodes[4]={type:3,min:0,max:1/0},this.rules[5].opcodes[5]={type:4,index:8},this.rules[5].opcodes[6]={type:3,min:0,max:1/0},this.rules[5].opcodes[7]={type:4,index:8},this.rules[6].opcodes=[],this.rules[6].opcodes[0]={type:2,children:[1,2,4]},this.rules[6].opcodes[1]={type:4,index:24},this.rules[6].opcodes[2]={type:3,min:0,max:1/0},this.rules[6].opcodes[3]={type:4,index:7},this.rules[6].opcodes[4]={type:4,index:24},this.rules[7].opcodes=[],this.rules[7].opcodes[0]={type:1,children:[1,2]},this.rules[7].opcodes[1]={type:5,min:32,max:33},this.rules[7].opcodes[2]={type:5,min:35,max:126},this.rules[8].opcodes=[],this.rules[8].opcodes[0]={type:1,children:[1,2,3]},this.rules[8].opcodes[1]={type:5,min:33,max:43},this.rules[8].opcodes[2]={type:5,min:45,max:58},this.rules[8].opcodes[3]={type:5,min:60,max:126},this.rules[9].opcodes=[],this.rules[9].opcodes[0]={type:2,children:[1,2]},this.rules[9].opcodes[1]={type:4,index:10},this.rules[9].opcodes[2]={type:3,min:0,max:1/0},this.rules[9].opcodes[3]={type:2,children:[4,5,6]},this.rules[9].opcodes[4]={type:7,string:[59]},this.rules[9].opcodes[5]={type:4,index:20},this.rules[9].opcodes[6]={type:4,index:10},this.rules[10].opcodes=[],this.rules[10].opcodes[0]={type:2,children:[1,2,3]},this.rules[10].opcodes[1]={type:4,index:11},this.rules[10].opcodes[2]={type:7,string:[61]},this.rules[10].opcodes[3]={type:4,index:12},this.rules[11].opcodes=[],this.rules[11].opcodes[0]={type:4,index:15},this.rules[12].opcodes=[],this.rules[12].opcodes[0]={type:1,children:[1,6]},this.rules[12].opcodes[1]={type:2,children:[2,3,5]},this.rules[12].opcodes[2]={type:4,index:24},this.rules[12].opcodes[3]={type:3,min:0,max:1/0},this.rules[12].opcodes[4]={type:4,index:13},this.rules[12].opcodes[5]={type:4,index:24},this.rules[12].opcodes[6]={type:3,min:0,max:1/0},this.rules[12].opcodes[7]={type:4,index:13},this.rules[13].opcodes=[],this.rules[13].opcodes[0]={type:1,children:[1,2,3,4,5]},this.rules[13].opcodes[1]={type:6,string:[33]},this.rules[13].opcodes[2]={type:5,min:35,max:43},this.rules[13].opcodes[3]={type:5,min:45,max:58},this.rules[13].opcodes[4]={type:5,min:60,max:91},this.rules[13].opcodes[5]={type:5,min:93,max:126},this.rules[14].opcodes=[],this.rules[14].opcodes[0]={type:3,min:0,max:1/0},this.rules[14].opcodes[1]={type:2,children:[2,4]},this.rules[14].opcodes[2]={type:3,min:0,max:1},this.rules[14].opcodes[3]={type:4,index:27},this.rules[14].opcodes[4]={type:4,index:25},this.rules[15].opcodes=[],this.rules[15].opcodes[0]={type:3,min:1,max:1/0},this.rules[15].opcodes[1]={type:4,index:16},this.rules[16].opcodes=[],this.rules[16].opcodes[0]={type:1,children:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]},this.rules[16].opcodes[1]={type:7,string:[33]},this.rules[16].opcodes[2]={type:7,string:[35]},this.rules[16].opcodes[3]={type:7,string:[36]},this.rules[16].opcodes[4]={type:7,string:[37]},this.rules[16].opcodes[5]={type:7,string:[38]},this.rules[16].opcodes[6]={type:7,string:[39]},this.rules[16].opcodes[7]={type:7,string:[42]},this.rules[16].opcodes[8]={type:7,string:[43]},this.rules[16].opcodes[9]={type:7,string:[45]},this.rules[16].opcodes[10]={type:7,string:[46]},this.rules[16].opcodes[11]={type:7,string:[94]},this.rules[16].opcodes[12]={type:7,string:[95]},this.rules[16].opcodes[13]={type:7,string:[96]},this.rules[16].opcodes[14]={type:7,string:[124]},this.rules[16].opcodes[15]={type:7,string:[126]},this.rules[16].opcodes[16]={type:4,index:23},this.rules[16].opcodes[17]={type:4,index:22},this.rules[17].opcodes=[],this.rules[17].opcodes[0]={type:5,min:1,max:127},this.rules[18].opcodes=[],this.rules[18].opcodes[0]={type:1,children:[1,2]},this.rules[18].opcodes[1]={type:5,min:0,max:31},this.rules[18].opcodes[2]={type:6,string:[127]},this.rules[19].opcodes=[],this.rules[19].opcodes[0]={type:1,children:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]},this.rules[19].opcodes[1]={type:7,string:[40]},this.rules[19].opcodes[2]={type:7,string:[41]},this.rules[19].opcodes[3]={type:7,string:[60]},this.rules[19].opcodes[4]={type:7,string:[62]},this.rules[19].opcodes[5]={type:7,string:[64]},this.rules[19].opcodes[6]={type:7,string:[44]},this.rules[19].opcodes[7]={type:7,string:[59]},this.rules[19].opcodes[8]={type:7,string:[58]},this.rules[19].opcodes[9]={type:7,string:[92]},this.rules[19].opcodes[10]={type:6,string:[34]},this.rules[19].opcodes[11]={type:7,string:[47]},this.rules[19].opcodes[12]={type:7,string:[91]},this.rules[19].opcodes[13]={type:7,string:[93]},this.rules[19].opcodes[14]={type:7,string:[63]},this.rules[19].opcodes[15]={type:7,string:[61]},this.rules[19].opcodes[16]={type:7,string:[123]},this.rules[19].opcodes[17]={type:7,string:[125]},this.rules[19].opcodes[18]={type:4,index:20},this.rules[19].opcodes[19]={type:4,index:21},this.rules[20].opcodes=[],this.rules[20].opcodes[0]={type:6,string:[32]},this.rules[21].opcodes=[],this.rules[21].opcodes[0]={type:6,string:[9]},this.rules[22].opcodes=[],this.rules[22].opcodes[0]={type:1,children:[1,2]},this.rules[22].opcodes[1]={type:5,min:65,max:90},this.rules[22].opcodes[2]={type:5,min:97,max:122},this.rules[23].opcodes=[],this.rules[23].opcodes[0]={type:5,min:48,max:57},this.rules[24].opcodes=[],this.rules[24].opcodes[0]={type:6,string:[34]},this.rules[25].opcodes=[],this.rules[25].opcodes[0]={type:1,children:[1,2]},this.rules[25].opcodes[1]={type:4,index:20},this.rules[25].opcodes[2]={type:4,index:26},this.rules[26].opcodes=[],this.rules[26].opcodes[0]={type:6,string:[9]},this.rules[27].opcodes=[],this.rules[27].opcodes[0]={type:2,children:[1,2]},this.rules[27].opcodes[1]={type:4,index:28},this.rules[27].opcodes[2]={type:4,index:29},this.rules[28].opcodes=[],this.rules[28].opcodes[0]={type:6,string:[13]},this.rules[29].opcodes=[],this.rules[29].opcodes[0]={type:6,string:[10]},this.toString=function toString(){let s="";return s+="; Lenient version of https://datatracker.ietf.org/doc/html/rfc6265#section-4.2.1\n",s+='lenient-cookie-string        = lenient-cookie-entry *( ";" OWS lenient-cookie-entry )\n',s+="lenient-cookie-entry         = lenient-cookie-pair / lenient-cookie-pair-invalid\n",s+='lenient-cookie-pair          = OWS lenient-cookie-name OWS "=" OWS lenient-cookie-value OWS\n',s+='lenient-cookie-pair-invalid  = OWS 1*tchar OWS ; Allow for standalone entries like "fizz" to be ignored\n',s+='lenient-cookie-name          = 1*( %x21-3A / %x3C / %x3E-7E ) ; Allow all printable US-ASCII except "="\n',s+="lenient-cookie-value         = lenient-quoted-value [ *lenient-cookie-octet ] / *lenient-cookie-octet\n",s+="lenient-quoted-value         = DQUOTE *( lenient-quoted-char ) DQUOTE\n",s+="lenient-quoted-char          = %x20-21 / %x23-7E ; Allow all printable US-ASCII except DQUOTE\n",s+="lenient-cookie-octet         = %x21-2B / %x2D-3A / %x3C-7E\n",s+="                             ; Allow all printable characters except CTLs, semicolon and SP\n",s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc6265#section-4.2.1\n",s+='cookie-string     = cookie-pair *( ";" SP cookie-pair )\n',s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc6265#section-4.1.1\n",s+="; https://www.rfc-editor.org/errata/eid5518\n",s+='cookie-pair       = cookie-name "=" cookie-value\n',s+="cookie-name       = token\n",s+="cookie-value      = ( DQUOTE *cookie-octet DQUOTE ) / *cookie-octet\n",s+="                  ; https://www.rfc-editor.org/errata/eid8242\n",s+="cookie-octet      = %x21 / %x23-2B / %x2D-3A / %x3C-5B / %x5D-7E\n",s+="                       ; US-ASCII characters excluding CTLs,\n",s+="                       ; whitespace, DQUOTE, comma, semicolon,\n",s+="                       ; and backslash\n",s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc6265#section-2.2\n",s+='OWS            = *( [ CRLF ] WSP ) ; "optional" whitespace\n',s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc9110#section-5.6.2\n",s+="token          = 1*(tchar)\n",s+='tchar          = "!" / "#" / "$" / "%" / "&" / "\'" / "*"\n',s+='                 / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"\n',s+="                 / DIGIT / ALPHA\n",s+="                 ; any VCHAR, except delimiters\n",s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc2616#section-2.2\n",s+="CHAR           = %x01-7F ; any US-ASCII character (octets 0 - 127)\n",s+="CTL            = %x00-1F / %x7F ; any US-ASCII control character\n",s+='separators     = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\\" / %x22 / "/" / "[" / "]" / "?" / "=" / "{" / "}" / SP / HT\n',s+="SP             = %x20 ; US-ASCII SP, space (32)\n",s+="HT             = %x09 ; US-ASCII HT, horizontal-tab (9)\n",s+="\n",s+="; https://datatracker.ietf.org/doc/html/rfc5234#appendix-B.1\n",s+="ALPHA          =  %x41-5A / %x61-7A ; A-Z / a-z\n",s+="DIGIT          =  %x30-39 ; 0-9\n",s+='DQUOTE         =  %x22 ; " (Double Quote)\n',s+="WSP            =  SP / HTAB ; white space\n",s+="HTAB           =  %x09 ; horizontal tab\n",s+="CRLF           =  CR LF ; Internet standard newline\n",s+="CR             =  %x0D ; carriage return\n",s+="LF             =  %x0A ; linefeed\n",'; Lenient version of https://datatracker.ietf.org/doc/html/rfc6265#section-4.2.1\nlenient-cookie-string        = lenient-cookie-entry *( ";" OWS lenient-cookie-entry )\nlenient-cookie-entry         = lenient-cookie-pair / lenient-cookie-pair-invalid\nlenient-cookie-pair          = OWS lenient-cookie-name OWS "=" OWS lenient-cookie-value OWS\nlenient-cookie-pair-invalid  = OWS 1*tchar OWS ; Allow for standalone entries like "fizz" to be ignored\nlenient-cookie-name          = 1*( %x21-3A / %x3C / %x3E-7E ) ; Allow all printable US-ASCII except "="\nlenient-cookie-value         = lenient-quoted-value [ *lenient-cookie-octet ] / *lenient-cookie-octet\nlenient-quoted-value         = DQUOTE *( lenient-quoted-char ) DQUOTE\nlenient-quoted-char          = %x20-21 / %x23-7E ; Allow all printable US-ASCII except DQUOTE\nlenient-cookie-octet         = %x21-2B / %x2D-3A / %x3C-7E\n                             ; Allow all printable characters except CTLs, semicolon and SP\n\n; https://datatracker.ietf.org/doc/html/rfc6265#section-4.2.1\ncookie-string     = cookie-pair *( ";" SP cookie-pair )\n\n; https://datatracker.ietf.org/doc/html/rfc6265#section-4.1.1\n; https://www.rfc-editor.org/errata/eid5518\ncookie-pair       = cookie-name "=" cookie-value\ncookie-name       = token\ncookie-value      = ( DQUOTE *cookie-octet DQUOTE ) / *cookie-octet\n                  ; https://www.rfc-editor.org/errata/eid8242\ncookie-octet      = %x21 / %x23-2B / %x2D-3A / %x3C-5B / %x5D-7E\n                       ; US-ASCII characters excluding CTLs,\n                       ; whitespace, DQUOTE, comma, semicolon,\n                       ; and backslash\n\n; https://datatracker.ietf.org/doc/html/rfc6265#section-2.2\nOWS            = *( [ CRLF ] WSP ) ; "optional" whitespace\n\n; https://datatracker.ietf.org/doc/html/rfc9110#section-5.6.2\ntoken          = 1*(tchar)\ntchar          = "!" / "#" / "$" / "%" / "&" / "\'" / "*"\n                 / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"\n                 / DIGIT / ALPHA\n                 ; any VCHAR, except delimiters\n\n; https://datatracker.ietf.org/doc/html/rfc2616#section-2.2\nCHAR           = %x01-7F ; any US-ASCII character (octets 0 - 127)\nCTL            = %x00-1F / %x7F ; any US-ASCII control character\nseparators     = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\\" / %x22 / "/" / "[" / "]" / "?" / "=" / "{" / "}" / SP / HT\nSP             = %x20 ; US-ASCII SP, space (32)\nHT             = %x09 ; US-ASCII HT, horizontal-tab (9)\n\n; https://datatracker.ietf.org/doc/html/rfc5234#appendix-B.1\nALPHA          =  %x41-5A / %x61-7A ; A-Z / a-z\nDIGIT          =  %x30-39 ; 0-9\nDQUOTE         =  %x22 ; " (Double Quote)\nWSP            =  SP / HTAB ; white space\nHTAB           =  %x09 ; horizontal tab\nCRLF           =  CR LF ; Internet standard newline\nCR             =  %x0D ; carriage return\nLF             =  %x0A ; linefeed\n'}}new grammar_grammar;const utils_percentEncodeChar=s=>{if("string"!=typeof s||1!==[...s].length)throw new TypeError("Input must be a single character string.");const o=s.codePointAt(0);return o<=127?`%${o.toString(16).toUpperCase().padStart(2,"0")}`:encodeURIComponent(s)},utils_isQuoted=s=>s.length>=2&&s.startsWith('"')&&s.endsWith('"'),utils_unquote=s=>utils_isQuoted(s)?s.slice(1,-1):s,utils_quote=s=>`"${s}"`,utils_identity=s=>s,Bx=new kp,$x=new grammar_grammar,test_cookie_value=(s,{strict:o=!0,quoted:i=null}={})=>{try{const a=o?"cookie-value":"lenient-cookie-value",u=Bx.parse($x,a,s);return"boolean"==typeof i?u.success&&i===utils_isQuoted(s):u.success}catch{return!1}},base64_browser=s=>{const o=(new TextEncoder).encode(s).reduce(((s,o)=>s+String.fromCharCode(o)),"");return btoa(o)},cookie_value_strict_base64=(s,o=base64_browser)=>{const i=String(s);if(test_cookie_value(i))return i;const a=utils_isQuoted(i),u=o(a?utils_unquote(i):i);return a?utils_quote(u):u},base64url_browser=s=>(s=>s.replace(/\+/g,"-").replace(/\//g,"_").replace(/=+$/g,""))(base64_browser(s)),cookie_value_strict_base64url=s=>cookie_value_strict_base64(s,base64url_browser),qx=new kp,Ux=new grammar_grammar,test_cookie_name=(s,{strict:o=!0}={})=>{try{const i=o?"cookie-name":"lenient-cookie-name";return qx.parse(Ux,i,s).success}catch{return!1}},cookie_name_strict=s=>{if(!test_cookie_name(s))throw new TypeError(`Invalid cookie name: ${s}`)},cookie_value_strict=s=>{if(!test_cookie_value(s))throw new TypeError(`Invalid cookie value: ${s}`)},Vx={encoders:{name:utils_identity,value:cookie_value_strict_base64url},validators:{name:cookie_name_strict,value:cookie_value_strict}},set_cookie_serialize=(s,o,i={})=>{const a={...Vx,...i,encoders:{...Vx.encoders,...i.encoders},validators:{...Vx.validators,...i.validators}},u=a.encoders.name(s),_=a.encoders.value(o);return a.validators.name(u),a.validators.value(_),`${u}=${_}`},cookie_serialize=(s,o={})=>(Array.isArray(s)?s:"object"==typeof s&&null!==s?Object.entries(s):[]).map((([s,i])=>set_cookie_serialize(s,i,o))).join("; "),zx=new kp,Wx=new grammar_grammar,cookie_value_strict_percent=s=>{const o=String(s);if(test_cookie_value(o))return o;const i=utils_isQuoted(o),a=i?utils_unquote(o):o;let u="";for(const s of a)u+=zx.parse(Wx,"cookie-octet",s).success?s:utils_percentEncodeChar(s);return i?utils_quote(u):u},Jx=(new kp,new grammar_grammar,s=>{if(!test_cookie_name(s,{strict:!1}))throw new TypeError(`Invalid cookie name: ${s}`)}),valuePercentEncoder=s=>cookie_value_strict_percent(s).replace(/[=&]/gu,(s=>"="===s?"%3D":"%26")),helpers_cookie_serialize=(s,o={})=>cookie_serialize(s,up({encoders:{name:utils_identity,value:valuePercentEncoder},validators:{name:Jx,value:cookie_value_strict}},o));function parameter_builders_path({req:s,value:o,parameter:i,baseURL:a}){const{name:u,style:_,explode:w,content:x}=i;if(void 0===o)return;const C=s.url.replace(a,"");let j;if(x){const s=Object.keys(x)[0];j=es_resolve(C,{[u]:o},{encoder:o=>encodeCharacters(serialize(o,s))})}else j=es_resolve(C,{[u]:o},{encoder:s=>stylize({key:i.name,value:s,style:_||"simple",explode:null!=w&&w,escape:"reserved"})});s.url=a+j}function query({req:s,value:o,parameter:i}){if(s.query=s.query||{},void 0!==o&&i.content){const a=serialize(o,Object.keys(i.content)[0]);if(a)s.query[i.name]=a;else if(i.allowEmptyValue){const o=i.name;s.query[o]=s.query[o]||{},s.query[o].allowEmptyValue=!0}}else if(!1===o&&(o="false"),0===o&&(o="0"),o){const{style:a,explode:u,allowReserved:_}=i;s.query[i.name]={value:o,serializationOption:{style:a,explode:u,allowReserved:_}}}else if(i.allowEmptyValue&&void 0!==o){const o=i.name;s.query[o]=s.query[o]||{},s.query[o].allowEmptyValue=!0}}const Hx=["accept","authorization","content-type"];function parameter_builders_header({req:s,parameter:o,value:i}){if(s.headers=s.headers||{},!(Hx.indexOf(o.name.toLowerCase())>-1))if(void 0!==i&&o.content){const a=Object.keys(o.content)[0];s.headers[o.name]=serialize(i,a)}else void 0===i||Array.isArray(i)&&0===i.length||(s.headers[o.name]=stylize({key:o.name,value:i,style:o.style||"simple",explode:void 0!==o.explode&&o.explode,escape:!1}))}function cookie({req:s,parameter:o,value:i}){const{name:a}=o;if(s.headers=s.headers||{},void 0!==i&&o.content){const u=serialize(i,Object.keys(o.content)[0]);s.headers.Cookie=helpers_cookie_serialize({[a]:u})}else if(void 0!==i&&(!Array.isArray(i)||0!==i.length)){var u;const _=stylize({key:o.name,value:i,escape:!1,style:o.style||"form",explode:null!==(u=o.explode)&&void 0!==u&&u}),w=Array.isArray(i)&&o.explode?`${a}=${_}`:_;s.headers.Cookie=helpers_cookie_serialize({[a]:w})}}const Kx="undefined"!=typeof globalThis?globalThis:"undefined"!=typeof self?self:window,{btoa:Gx}=Kx,Yx=Gx;function buildRequest(s,o){const{operation:i,requestBody:a,securities:u,spec:_,attachContentTypeForEmptyPayload:w}=s;let{requestContentType:x}=s;o=function applySecurities({request:s,securities:o={},operation:i={},spec:a}){var u;const _={...s},{authorized:w={}}=o,x=i.security||a.security||[],C=w&&!!Object.keys(w).length,j=(null==a||null===(u=a.components)||void 0===u?void 0:u.securitySchemes)||{};if(_.headers=_.headers||{},_.query=_.query||{},!Object.keys(o).length||!C||!x||Array.isArray(i.security)&&!i.security.length)return s;return x.forEach((s=>{Object.keys(s).forEach((s=>{const o=w[s],i=j[s];if(!o)return;const a=o.value||o,{type:u}=i;if(o)if("apiKey"===u)"query"===i.in&&(_.query[i.name]=a),"header"===i.in&&(_.headers[i.name]=a),"cookie"===i.in&&(_.cookies[i.name]=a);else if("http"===u){if(/^basic$/i.test(i.scheme)){const s=a.username||"",o=a.password||"",i=Yx(`${s}:${o}`);_.headers.Authorization=`Basic ${i}`}/^bearer$/i.test(i.scheme)&&(_.headers.Authorization=`Bearer ${a}`)}else if("oauth2"===u||"openIdConnect"===u){const s=o.token||{},a=s[i["x-tokenName"]||"access_token"];let u=s.token_type;u&&"bearer"!==u.toLowerCase()||(u="Bearer"),_.headers.Authorization=`${u} ${a}`}}))})),_}({request:o,securities:u,operation:i,spec:_});const C=i.requestBody||{},j=Object.keys(C.content||{}),L=x&&j.indexOf(x)>-1;if(a||w){if(x&&L)o.headers["Content-Type"]=x;else if(!x){const s=j[0];s&&(o.headers["Content-Type"]=s,x=s)}}else x&&L&&(o.headers["Content-Type"]=x);if(!s.responseContentType&&i.responses){const s=Object.entries(i.responses).filter((([s,o])=>{const i=parseInt(s,10);return i>=200&&i<300&&fu(o.content)})).reduce(((s,[,o])=>s.concat(Object.keys(o.content))),[]);s.length>0&&(o.headers.accept=s.join(", "))}if(a)if(x){if(j.indexOf(x)>-1)if("application/x-www-form-urlencoded"===x||"multipart/form-data"===x)if("object"==typeof a){var B,$;const s=null!==(B=null===($=C.content[x])||void 0===$?void 0:$.encoding)&&void 0!==B?B:{};o.form={},Object.keys(a).forEach((i=>{let u;try{u=JSON.parse(a[i])}catch{u=a[i]}o.form[i]={value:u,encoding:s[i]||{}}}))}else if("string"==typeof a){var U,V;const s=null!==(U=null===(V=C.content[x])||void 0===V?void 0:V.encoding)&&void 0!==U?U:{};try{o.form={};const i=JSON.parse(a);Object.entries(i).forEach((([i,a])=>{o.form[i]={value:a,encoding:s[i]||{}}}))}catch{o.form=a}}else o.form=a;else o.body=a}else o.body=a;return o}function build_request_buildRequest(s,o){const{spec:i,operation:a,securities:u,requestContentType:_,responseContentType:w,attachContentTypeForEmptyPayload:x}=s;if(o=function build_request_applySecurities({request:s,securities:o={},operation:i={},spec:a}){const u={...s},{authorized:_={},specSecurity:w=[]}=o,x=i.security||w,C=_&&!!Object.keys(_).length,j=a.securityDefinitions;if(u.headers=u.headers||{},u.query=u.query||{},!Object.keys(o).length||!C||!x||Array.isArray(i.security)&&!i.security.length)return s;return x.forEach((s=>{Object.keys(s).forEach((s=>{const o=_[s];if(!o)return;const{token:i}=o,a=o.value||o,w=j[s],{type:x}=w,C=w["x-tokenName"]||"access_token",L=i&&i[C];let B=i&&i.token_type;if(o)if("apiKey"===x){const s="query"===w.in?"query":"headers";u[s]=u[s]||{},u[s][w.name]=a}else if("basic"===x)if(a.header)u.headers.authorization=a.header;else{const s=a.username||"",o=a.password||"";a.base64=Yx(`${s}:${o}`),u.headers.authorization=`Basic ${a.base64}`}else"oauth2"===x&&L&&(B=B&&"bearer"!==B.toLowerCase()?B:"Bearer",u.headers.authorization=`${B} ${L}`)}))})),u}({request:o,securities:u,operation:a,spec:i}),o.body||o.form||x)_?o.headers["Content-Type"]=_:Array.isArray(a.consumes)?[o.headers["Content-Type"]]=a.consumes:Array.isArray(i.consumes)?[o.headers["Content-Type"]]=i.consumes:a.parameters&&a.parameters.filter((s=>"file"===s.type)).length?o.headers["Content-Type"]="multipart/form-data":a.parameters&&a.parameters.filter((s=>"formData"===s.in)).length&&(o.headers["Content-Type"]="application/x-www-form-urlencoded");else if(_){const s=a.parameters&&a.parameters.filter((s=>"body"===s.in)).length>0,i=a.parameters&&a.parameters.filter((s=>"formData"===s.in)).length>0;(s||i)&&(o.headers["Content-Type"]=_)}return!w&&Array.isArray(a.produces)&&a.produces.length>0&&(o.headers.accept=a.produces.join(", ")),o}function idFromPathMethodLegacy(s,o){return`${o.toLowerCase()}-${s}`}const arrayOrEmpty=s=>Array.isArray(s)?s:[],findObjectOrArraySchema=(s,{recurse:o=!0,depth:i=1}={})=>{if(fu(s)){if("object"===s.type||"array"===s.type||Array.isArray(s.type)&&(s.type.includes("object")||s.type.includes("array")))return s;if(!(i>Bl)&&o){const a=Array.isArray(s.oneOf)?s.oneOf.find((s=>findObjectOrArraySchema(s,{recurse:o,depth:i+1}))):void 0;if(a)return a;const u=Array.isArray(s.anyOf)?s.anyOf.find((s=>findObjectOrArraySchema(s,{recurse:o,depth:i+1}))):void 0;if(u)return u}}},parseJsonObjectOrArray=({value:s,silentFail:o=!1})=>{try{const i=JSON.parse(s);if(fu(i)||Array.isArray(i))return i;if(!o)throw new Error("Expected JSON serialized object or array")}catch{if(!o)throw new Error("Could not parse parameter value string as JSON Object or JSON Array")}return s},parseURIReference=s=>{try{return new URL(s)}catch{const o=new URL(s,Ll),i=String(s).startsWith("/")?o.pathname:o.pathname.substring(1);return{hash:o.hash,host:"",hostname:"",href:"",origin:"",password:"",pathname:i,port:"",protocol:"",search:o.search,searchParams:o.searchParams}}};class OperationNotFoundError extends Go{}const Xx={buildRequest:execute_buildRequest};function execute_execute({http:s,fetch:o,spec:i,operationId:a,pathName:u,method:_,parameters:w,securities:x,...C}){const j=s||o||http_http;u&&_&&!a&&(a=idFromPathMethodLegacy(u,_));const L=Xx.buildRequest({spec:i,operationId:a,parameters:w,securities:x,http:j,...C});return L.body&&(fu(L.body)||Array.isArray(L.body))&&(L.body=JSON.stringify(L.body)),j(L)}function execute_buildRequest(s){const{spec:o,operationId:i,responseContentType:a,scheme:u,requestInterceptor:_,responseInterceptor:w,contextUrl:x,userFetch:C,server:j,serverVariables:L,http:B,signal:$,serverVariableEncoder:U}=s;let{parameters:V,parameterBuilders:z,baseURL:Y}=s;const Z=isOpenAPI3(o);z||(z=Z?be:Fx);let ee={url:"",credentials:B&&B.withCredentials?"include":"same-origin",headers:{},cookies:{}};$&&(ee.signal=$),_&&(ee.requestInterceptor=_),w&&(ee.responseInterceptor=w),C&&(ee.userFetch=C);const ie=function getOperationRaw(s,o){return s&&s.paths?function findOperation(s,o){return function eachOperation(s,o,i){if(!s||"object"!=typeof s||!s.paths||"object"!=typeof s.paths)return null;const{paths:a}=s;for(const u in a)for(const _ in a[u]){if("PARAMETERS"===_.toUpperCase())continue;const w=a[u][_];if(!w||"object"!=typeof w)continue;const x={spec:s,pathName:u,method:_.toUpperCase(),operation:w},C=o(x);if(i&&C)return x}}(s,o,!0)||null}(s,(({pathName:s,method:i,operation:a})=>{if(!a||"object"!=typeof a)return!1;const u=a.operationId;return[opId(a,s,i),idFromPathMethodLegacy(s,i),u].some((s=>s&&s===o))})):null}(o,i);if(!ie)throw new OperationNotFoundError(`Operation ${i} not found`);const{operation:ae={},method:ce,pathName:le}=ie;if(Y=null!=Y?Y:function baseUrl(s){const o=isOpenAPI3(s.spec);return o?function oas3BaseUrl({spec:s,pathName:o,method:i,server:a,contextUrl:u,serverVariables:_={},serverVariableEncoder:w}){var x,C;let j,L=[],B="";const $=null==s||null===(x=s.paths)||void 0===x||null===(x=x[o])||void 0===x||null===(x=x[(i||"").toLowerCase()])||void 0===x?void 0:x.servers,U=null==s||null===(C=s.paths)||void 0===C||null===(C=C[o])||void 0===C?void 0:C.servers,V=null==s?void 0:s.servers;L=isNonEmptyServerList($)?$:isNonEmptyServerList(U)?U:isNonEmptyServerList(V)?V:[Fl],a&&(j=L.find((s=>s.url===a)),j&&(B=a));B||([j]=L,B=j.url);if(openapi_server_url_templating_es_test(B,{strict:!0})){const s=Object.entries({...j.variables}).reduce(((s,[o,i])=>(s[o]=i.default,s)),{});B=es_substitute(B,{...s,..._},{encoder:"function"==typeof w?w:yw})}return function buildOas3UrlWithContext(s="",o=""){const i=parseURIReference(s&&o?resolve(o,s):s),a=parseURIReference(o),u=stripNonAlpha(i.protocol)||stripNonAlpha(a.protocol),_=i.host||a.host,w=i.pathname;let x;x=u&&_?`${u}://${_+w}`:w;return"/"===x[x.length-1]?x.slice(0,-1):x}(B,u)}(s):function swagger2BaseUrl({spec:s,scheme:o,contextUrl:i=""}){const a=parseURIReference(i),u=Array.isArray(s.schemes)?s.schemes[0]:null,_=o||u||stripNonAlpha(a.protocol)||"http",w=s.host||a.host||"",x=s.basePath||"";let C;C=_&&w?`${_}://${w+x}`:x;return"/"===C[C.length-1]?C.slice(0,-1):C}(s)}({spec:o,scheme:u,contextUrl:x,server:j,serverVariables:L,pathName:le,method:ce,serverVariableEncoder:U}),ee.url+=Y,!i)return delete ee.cookies,ee;ee.url+=le,ee.method=`${ce}`.toUpperCase(),V=V||{};const pe=o.paths[le]||{};a&&(ee.headers.accept=a);const de=(s=>{const o={};s.forEach((s=>{o[s.in]||(o[s.in]={}),o[s.in][s.name]=s}));const i=[];return Object.keys(o).forEach((s=>{Object.keys(o[s]).forEach((a=>{i.push(o[s][a])}))})),i})([].concat(arrayOrEmpty(ae.parameters)).concat(arrayOrEmpty(pe.parameters)));de.forEach((s=>{const i=z[s.in];let a;if("body"===s.in&&s.schema&&s.schema.properties&&(a=V),a=s&&s.name&&V[s.name],void 0===a?a=s&&s.name&&V[`${s.in}.${s.name}`]:((s,o)=>o.filter((o=>o.name===s)))(s.name,de).length>1&&console.warn(`Parameter '${s.name}' is ambiguous because the defined spec has more than one parameter with the name: '${s.name}' and the passed-in parameter values did not define an 'in' value.`),null!==a){if(void 0!==s.default&&void 0===a&&(a=s.default),void 0===a&&s.required&&!s.allowEmptyValue)throw new Error(`Required parameter ${s.name} is not provided`);Z&&"string"==typeof a&&(Yu("type",s.schema)&&"string"==typeof s.schema.type&&findObjectOrArraySchema(s.schema,{recurse:!1})?a=parseJsonObjectOrArray({value:a,silentFail:!1}):(Yu("type",s.schema)&&Array.isArray(s.schema.type)&&findObjectOrArraySchema(s.schema,{recurse:!1})||!Yu("type",s.schema)&&findObjectOrArraySchema(s.schema,{recurse:!0}))&&(a=parseJsonObjectOrArray({value:a,silentFail:!0}))),i&&i({req:ee,parameter:s,value:a,operation:ae,spec:o,baseURL:Y})}}));const fe={...s,operation:ae};if(ee=Z?buildRequest(fe,ee):build_request_buildRequest(fe,ee),ee.cookies&&Object.keys(ee.cookies).length>0){const s=helpers_cookie_serialize(ee.cookies);Id(ee.headers.Cookie)?ee.headers.Cookie+=`; ${s}`:ee.headers.Cookie=s}return ee.cookies&&delete ee.cookies,serializeRequest(ee)}const stripNonAlpha=s=>s?s.replace(/\W/g,""):null;const isNonEmptyServerList=s=>Array.isArray(s)&&s.length>0;const makeResolveSubtree=s=>async(o,i,a={})=>(async(s,o,i={})=>{const{returnEntireTree:a,baseDoc:u,requestInterceptor:_,responseInterceptor:w,parameterMacro:x,modelPropertyMacro:C,useCircularStructures:j,strategies:L}=i,B={spec:s,pathDiscriminator:o,baseDoc:u,requestInterceptor:_,responseInterceptor:w,parameterMacro:x,modelPropertyMacro:C,useCircularStructures:j,strategies:L},$=L.find((o=>o.match(s))).normalize(s),U=await Nx({spec:$,...B,allowMetaPatches:!0,skipNormalization:!isOpenAPI31(s)});return!a&&Array.isArray(o)&&o.length&&(U.spec=o.reduce(((s,o)=>null==s?void 0:s[o]),U.spec)||null),U})(o,i,{...s,...a}),Qx=(makeResolveSubtree({strategies:[_u,vu,gu]}),(s,o)=>(...i)=>{s(...i);const a=o.getConfigs().withCredentials;o.fn.fetch.withCredentials=a});function swagger_client({configs:s,getConfigs:o}){return{fn:{fetch:(i=http_http,a=s.preFetch,u=s.postFetch,u=u||(s=>s),a=a||(s=>s),s=>("string"==typeof s&&(s={url:s}),s=serializeRequest(s),s=a(s),u(i(s)))),buildRequest:execute_buildRequest,execute:execute_execute,resolve:makeResolve({strategies:[Tx,_u,vu,gu]}),resolveSubtree:async(s,i,a={})=>{const u=o(),_={modelPropertyMacro:u.modelPropertyMacro,parameterMacro:u.parameterMacro,requestInterceptor:u.requestInterceptor,responseInterceptor:u.responseInterceptor,strategies:[Tx,_u,vu,gu]};return makeResolveSubtree(_)(s,i,a)},serializeRes:serializeResponse,opId},statePlugins:{configs:{wrapActions:{loaded:Qx}}}};var i,a,u}function util(){return{fn:{shallowEqualKeys,sanitizeUrl}}}var Zx=__webpack_require__(40961),tk=(__webpack_require__(78418),Re.version.startsWith("19")),rk=Symbol.for(tk?"react.transitional.element":"react.element"),nk=Symbol.for("react.portal"),sk=Symbol.for("react.fragment"),ok=Symbol.for("react.strict_mode"),lk=Symbol.for("react.profiler"),uk=Symbol.for("react.consumer"),pk=Symbol.for("react.context"),fk=Symbol.for("react.forward_ref"),mk=Symbol.for("react.suspense"),yk=Symbol.for("react.suspense_list"),vk=Symbol.for("react.memo"),_k=Symbol.for("react.lazy"),wk=fk,xk=vk;function typeOf(s){if("object"==typeof s&&null!==s){const{$$typeof:o}=s;switch(o){case rk:switch(s=s.type){case sk:case lk:case ok:case mk:case yk:return s;default:switch(s=s&&s.$$typeof){case pk:case fk:case _k:case vk:case uk:return s;default:return o}}case nk:return o}}}function pureFinalPropsSelectorFactory(s,o,i,a,{areStatesEqual:u,areOwnPropsEqual:_,areStatePropsEqual:w}){let x,C,j,L,B,$=!1;function handleSubsequentCalls($,U){const V=!_(U,C),z=!u($,x,U,C);return x=$,C=U,V&&z?function handleNewPropsAndNewState(){return j=s(x,C),o.dependsOnOwnProps&&(L=o(a,C)),B=i(j,L,C),B}():V?function handleNewProps(){return s.dependsOnOwnProps&&(j=s(x,C)),o.dependsOnOwnProps&&(L=o(a,C)),B=i(j,L,C),B}():z?function handleNewState(){const o=s(x,C),a=!w(o,j);return j=o,a&&(B=i(j,L,C)),B}():B}return function pureFinalPropsSelector(u,_){return $?handleSubsequentCalls(u,_):function handleFirstCall(u,_){return x=u,C=_,j=s(x,C),L=o(a,C),B=i(j,L,C),$=!0,B}(u,_)}}function wrapMapToPropsConstant(s){return function initConstantSelector(o){const i=s(o);function constantSelector(){return i}return constantSelector.dependsOnOwnProps=!1,constantSelector}}function getDependsOnOwnProps(s){return s.dependsOnOwnProps?Boolean(s.dependsOnOwnProps):1!==s.length}function wrapMapToPropsFunc(s,o){return function initProxySelector(o,{displayName:i}){const a=function mapToPropsProxy(s,o){return a.dependsOnOwnProps?a.mapToProps(s,o):a.mapToProps(s,void 0)};return a.dependsOnOwnProps=!0,a.mapToProps=function detectFactoryAndVerify(o,i){a.mapToProps=s,a.dependsOnOwnProps=getDependsOnOwnProps(s);let u=a(o,i);return"function"==typeof u&&(a.mapToProps=u,a.dependsOnOwnProps=getDependsOnOwnProps(u),u=a(o,i)),u},a}}function createInvalidArgFactory(s,o){return(i,a)=>{throw new Error(`Invalid value of type ${typeof s} for ${o} argument when connecting component ${a.wrappedComponentName}.`)}}function defaultMergeProps(s,o,i){return{...i,...s,...o}}function defaultNoopBatch(s){s()}var Ak={notify(){},get:()=>[]};function createSubscription(s,o){let i,a=Ak,u=0,_=!1;function handleChangeWrapper(){w.onStateChange&&w.onStateChange()}function trySubscribe(){u++,i||(i=o?o.addNestedSub(handleChangeWrapper):s.subscribe(handleChangeWrapper),a=function createListenerCollection(){let s=null,o=null;return{clear(){s=null,o=null},notify(){defaultNoopBatch((()=>{let o=s;for(;o;)o.callback(),o=o.next}))},get(){const o=[];let i=s;for(;i;)o.push(i),i=i.next;return o},subscribe(i){let a=!0;const u=o={callback:i,next:null,prev:o};return u.prev?u.prev.next=u:s=u,function unsubscribe(){a&&null!==s&&(a=!1,u.next?u.next.prev=u.prev:o=u.prev,u.prev?u.prev.next=u.next:s=u.next)}}}}())}function tryUnsubscribe(){u--,i&&0===u&&(i(),i=void 0,a.clear(),a=Ak)}const w={addNestedSub:function addNestedSub(s){trySubscribe();const o=a.subscribe(s);let i=!1;return()=>{i||(i=!0,o(),tryUnsubscribe())}},notifyNestedSubs:function notifyNestedSubs(){a.notify()},handleChangeWrapper,isSubscribed:function isSubscribed(){return _},trySubscribe:function trySubscribeSelf(){_||(_=!0,trySubscribe())},tryUnsubscribe:function tryUnsubscribeSelf(){_&&(_=!1,tryUnsubscribe())},getListeners:()=>a};return w}var Bk=(()=>!("undefined"==typeof window||void 0===window.document||void 0===window.document.createElement))(),qk=(()=>"undefined"!=typeof navigator&&"ReactNative"===navigator.product)(),Vk=(()=>Bk||qk?Re.useLayoutEffect:Re.useEffect)();function is(s,o){return s===o?0!==s||0!==o||1/s==1/o:s!=s&&o!=o}function shallowEqual(s,o){if(is(s,o))return!0;if("object"!=typeof s||null===s||"object"!=typeof o||null===o)return!1;const i=Object.keys(s),a=Object.keys(o);if(i.length!==a.length)return!1;for(let a=0;a<i.length;a++)if(!Object.prototype.hasOwnProperty.call(o,i[a])||!is(s[i[a]],o[i[a]]))return!1;return!0}var zk={childContextTypes:!0,contextType:!0,contextTypes:!0,defaultProps:!0,displayName:!0,getDefaultProps:!0,getDerivedStateFromError:!0,getDerivedStateFromProps:!0,mixins:!0,propTypes:!0,type:!0},eO={name:!0,length:!0,prototype:!0,caller:!0,callee:!0,arguments:!0,arity:!0},tO={$$typeof:!0,compare:!0,defaultProps:!0,displayName:!0,propTypes:!0,type:!0},rO={[wk]:{$$typeof:!0,render:!0,defaultProps:!0,displayName:!0,propTypes:!0},[xk]:tO};function getStatics(s){return function isMemo(s){return typeOf(s)===vk}(s)?tO:rO[s.$$typeof]||zk}var nO=Object.defineProperty,sO=Object.getOwnPropertyNames,oO=Object.getOwnPropertySymbols,iO=Object.getOwnPropertyDescriptor,aO=Object.getPrototypeOf,cO=Object.prototype;function hoistNonReactStatics(s,o){if("string"!=typeof o){if(cO){const i=aO(o);i&&i!==cO&&hoistNonReactStatics(s,i)}let i=sO(o);oO&&(i=i.concat(oO(o)));const a=getStatics(s),u=getStatics(o);for(let _=0;_<i.length;++_){const w=i[_];if(!(eO[w]||u&&u[w]||a&&a[w])){const i=iO(o,w);try{nO(s,w,i)}catch(s){}}}}return s}var lO=Symbol.for("react-redux-context"),uO="undefined"!=typeof globalThis?globalThis:{};function getContext(){if(!Re.createContext)return{};const s=uO[lO]??=new Map;let o=s.get(Re.createContext);return o||(o=Re.createContext(null),s.set(Re.createContext,o)),o}var pO=getContext(),hO=[null,null];function captureWrapperProps(s,o,i,a,u,_){s.current=a,i.current=!1,u.current&&(u.current=null,_())}function strictEqual(s,o){return s===o}var dO=function connect(s,o,i,{pure:a,areStatesEqual:u=strictEqual,areOwnPropsEqual:_=shallowEqual,areStatePropsEqual:w=shallowEqual,areMergedPropsEqual:x=shallowEqual,forwardRef:C=!1,context:j=pO}={}){const L=j,B=function mapStateToPropsFactory(s){return s?"function"==typeof s?wrapMapToPropsFunc(s):createInvalidArgFactory(s,"mapStateToProps"):wrapMapToPropsConstant((()=>({})))}(s),$=function mapDispatchToPropsFactory(s){return s&&"object"==typeof s?wrapMapToPropsConstant((o=>function react_redux_bindActionCreators(s,o){const i={};for(const a in s){const u=s[a];"function"==typeof u&&(i[a]=(...s)=>o(u(...s)))}return i}(s,o))):s?"function"==typeof s?wrapMapToPropsFunc(s):createInvalidArgFactory(s,"mapDispatchToProps"):wrapMapToPropsConstant((s=>({dispatch:s})))}(o),U=function mergePropsFactory(s){return s?"function"==typeof s?function wrapMergePropsFunc(s){return function initMergePropsProxy(o,{displayName:i,areMergedPropsEqual:a}){let u,_=!1;return function mergePropsProxy(o,i,w){const x=s(o,i,w);return _?a(x,u)||(u=x):(_=!0,u=x),u}}}(s):createInvalidArgFactory(s,"mergeProps"):()=>defaultMergeProps}(i),V=Boolean(s);return s=>{const o=s.displayName||s.name||"Component",i=`Connect(${o})`,a={shouldHandleStateChanges:V,displayName:i,wrappedComponentName:o,WrappedComponent:s,initMapStateToProps:B,initMapDispatchToProps:$,initMergeProps:U,areStatesEqual:u,areStatePropsEqual:w,areOwnPropsEqual:_,areMergedPropsEqual:x};function ConnectFunction(o){const[i,u,_]=Re.useMemo((()=>{const{reactReduxForwardedRef:s,...i}=o;return[o.context,s,i]}),[o]),w=Re.useMemo((()=>L),[i,L]),x=Re.useContext(w),C=Boolean(o.store)&&Boolean(o.store.getState)&&Boolean(o.store.dispatch),j=Boolean(x)&&Boolean(x.store);const B=C?o.store:x.store,$=j?x.getServerState:B.getState,U=Re.useMemo((()=>function finalPropsSelectorFactory(s,{initMapStateToProps:o,initMapDispatchToProps:i,initMergeProps:a,...u}){return pureFinalPropsSelectorFactory(o(s,u),i(s,u),a(s,u),s,u)}(B.dispatch,a)),[B]),[z,Y]=Re.useMemo((()=>{if(!V)return hO;const s=createSubscription(B,C?void 0:x.subscription),o=s.notifyNestedSubs.bind(s);return[s,o]}),[B,C,x]),Z=Re.useMemo((()=>C?x:{...x,subscription:z}),[C,x,z]),ee=Re.useRef(void 0),ie=Re.useRef(_),ae=Re.useRef(void 0),ce=Re.useRef(!1),le=Re.useRef(!1),pe=Re.useRef(void 0);Vk((()=>(le.current=!0,()=>{le.current=!1})),[]);const de=Re.useMemo((()=>()=>ae.current&&_===ie.current?ae.current:U(B.getState(),_)),[B,_]),fe=Re.useMemo((()=>s=>z?function subscribeUpdates(s,o,i,a,u,_,w,x,C,j,L){if(!s)return()=>{};let B=!1,$=null;const checkForUpdates=()=>{if(B||!x.current)return;const s=o.getState();let i,U;try{i=a(s,u.current)}catch(s){U=s,$=s}U||($=null),i===_.current?w.current||j():(_.current=i,C.current=i,w.current=!0,L())};return i.onStateChange=checkForUpdates,i.trySubscribe(),checkForUpdates(),()=>{if(B=!0,i.tryUnsubscribe(),i.onStateChange=null,$)throw $}}(V,B,z,U,ie,ee,ce,le,ae,Y,s):()=>{}),[z]);let ye;!function useIsomorphicLayoutEffectWithArgs(s,o,i){Vk((()=>s(...o)),i)}(captureWrapperProps,[ie,ee,ce,_,ae,Y]);try{ye=Re.useSyncExternalStore(fe,de,$?()=>U($(),_):de)}catch(s){throw pe.current&&(s.message+=`\nThe error may be correlated with this previous error:\n${pe.current.stack}\n\n`),s}Vk((()=>{pe.current=void 0,ae.current=void 0,ee.current=ye}));const be=Re.useMemo((()=>Re.createElement(s,{...ye,ref:u})),[u,s,ye]);return Re.useMemo((()=>V?Re.createElement(w.Provider,{value:Z},be):be),[w,be,Z])}const j=Re.memo(ConnectFunction);if(j.WrappedComponent=s,j.displayName=ConnectFunction.displayName=i,C){const o=Re.forwardRef((function forwardConnectRef(s,o){return Re.createElement(j,{...s,reactReduxForwardedRef:o})}));return o.displayName=i,o.WrappedComponent=s,hoistNonReactStatics(o,s)}return hoistNonReactStatics(j,s)}};var fO=function Provider(s){const{children:o,context:i,serverState:a,store:u}=s,_=Re.useMemo((()=>{const s=createSubscription(u);return{store:u,subscription:s,getServerState:a?()=>a:void 0}}),[u,a]),w=Re.useMemo((()=>u.getState()),[u]);Vk((()=>{const{subscription:s}=_;return s.onStateChange=s.notifyNestedSubs,s.trySubscribe(),w!==u.getState()&&s.notifyNestedSubs(),()=>{s.tryUnsubscribe(),s.onStateChange=void 0}}),[_,w]);const x=i||pO;return Re.createElement(x.Provider,{value:_},o)};var mO=__webpack_require__(83488),gO=__webpack_require__.n(mO);const withSystem=s=>o=>{const{fn:i}=s();class WithSystem extends Re.Component{render(){return Re.createElement(o,Mn()({},s(),this.props,this.context))}}return WithSystem.displayName=`WithSystem(${i.getDisplayName(o)})`,WithSystem},withRoot=(s,o)=>i=>{const{fn:a}=s();class WithRoot extends Re.Component{render(){return Re.createElement(fO,{store:o},Re.createElement(i,Mn()({},this.props,this.context)))}}return WithRoot.displayName=`WithRoot(${a.getDisplayName(i)})`,WithRoot},withConnect=(s,o,i)=>compose(i?withRoot(s,i):gO(),dO(((i,a)=>{const u={...a,...s()},_=o.prototype?.mapStateToProps||(s=>({state:s}));return _(i,u)})),withSystem(s))(o),handleProps=(s,o,i,a)=>{for(const u in o){const _=o[u];"function"==typeof _&&_(i[u],a[u],s())}},withMappedContainer=(s,o,i)=>(o,a)=>{const{fn:u}=s(),_=i(o,"root");class WithMappedContainer extends Re.Component{constructor(o,i){super(o,i),handleProps(s,a,o,{})}UNSAFE_componentWillReceiveProps(o){handleProps(s,a,o,this.props)}render(){const s=Gt()(this.props,a?Object.keys(a):[]);return Re.createElement(_,s)}}return WithMappedContainer.displayName=`WithMappedContainer(${u.getDisplayName(_)})`,WithMappedContainer},render=(s,o,i,a)=>u=>{const _=i(s,o,a)("App","root"),{createRoot:w}=Zx;w(u).render(Re.createElement(_,null))},getComponent=(s,o,i)=>(a,u,_={})=>{if("string"!=typeof a)throw new TypeError("Need a string, to fetch a component. Was given a "+typeof a);const w=i(a);return w?u?"root"===u?withConnect(s,w,o()):withConnect(s,w):w:(_.failSilently||s().log.warn("Could not find component:",a),null)},getDisplayName=s=>s.displayName||s.name||"Component",view=({getComponents:s,getStore:o,getSystem:i})=>{const a=(u=getComponent(i,o,s),Pt(u,((...s)=>JSON.stringify(s))));var u;const _=(s=>utils_memoizeN(s,((...s)=>s)))(withMappedContainer(i,0,a));return{rootInjects:{getComponent:a,makeMappedContainer:_,render:render(i,o,getComponent,s)},fn:{getDisplayName}}},view_legacy=({React:s,getSystem:o,getStore:i,getComponents:a})=>{const u={},_=parseInt(s?.version,10);return _>=16&&_<18&&(u.render=((s,o,i,a)=>u=>{const _=i(s,o,a)("App","root");Zx.render(Re.createElement(_,null),u)})(o,i,getComponent,a)),{rootInjects:u}};function downloadUrlPlugin(s){let{fn:o}=s;const i={download:s=>({errActions:i,specSelectors:a,specActions:u,getConfigs:_})=>{let{fetch:w}=o;const x=_();function next(o){if(o instanceof Error||o.status>=400)return u.updateLoadingStatus("failed"),i.newThrownErr(Object.assign(new Error((o.message||o.statusText)+" "+s),{source:"fetch"})),void(!o.status&&o instanceof Error&&function checkPossibleFailReasons(){try{let o;if("URL"in lt?o=new URL(s):(o=document.createElement("a"),o.href=s),"https:"!==o.protocol&&"https:"===lt.location.protocol){const s=Object.assign(new Error(`Possible mixed-content issue? The page was loaded over https:// but a ${o.protocol}// URL was specified. Check that you are not attempting to load mixed content.`),{source:"fetch"});return void i.newThrownErr(s)}if(o.origin!==lt.location.origin){const s=Object.assign(new Error(`Possible cross-origin (CORS) issue? The URL origin (${o.origin}) does not match the page (${lt.location.origin}). Check the server returns the correct 'Access-Control-Allow-*' headers.`),{source:"fetch"});i.newThrownErr(s)}}catch(s){return}}());u.updateLoadingStatus("success"),u.updateSpec(o.text),a.url()!==s&&u.updateUrl(s)}s=s||a.url(),u.updateLoadingStatus("loading"),i.clear({source:"fetch"}),w({url:s,loadSpec:!0,requestInterceptor:x.requestInterceptor||(s=>s),responseInterceptor:x.responseInterceptor||(s=>s),credentials:"same-origin",headers:{Accept:"application/json,*/*"}}).then(next,next)},updateLoadingStatus:s=>{let o=[null,"loading","failed","success","failedConfig"];return-1===o.indexOf(s)&&console.error(`Error: ${s} is not one of ${JSON.stringify(o)}`),{type:"spec_update_loading_status",payload:s}}};let a={loadingStatus:Ut((s=>s||(0,ze.Map)()),(s=>s.get("loadingStatus")||null))};return{statePlugins:{spec:{actions:i,reducers:{spec_update_loading_status:(s,o)=>"string"==typeof o.payload?s.set("loadingStatus",o.payload):s},selectors:a}}}}function arrayLikeToArray_arrayLikeToArray(s,o){(null==o||o>s.length)&&(o=s.length);for(var i=0,a=Array(o);i<o;i++)a[i]=s[i];return a}function toConsumableArray_toConsumableArray(s){return function arrayWithoutHoles_arrayWithoutHoles(s){if(Array.isArray(s))return arrayLikeToArray_arrayLikeToArray(s)}(s)||function iterableToArray_iterableToArray(s){if("undefined"!=typeof Symbol&&null!=s[Symbol.iterator]||null!=s["@@iterator"])return Array.from(s)}(s)||function unsupportedIterableToArray_unsupportedIterableToArray(s,o){if(s){if("string"==typeof s)return arrayLikeToArray_arrayLikeToArray(s,o);var i={}.toString.call(s).slice(8,-1);return"Object"===i&&s.constructor&&(i=s.constructor.name),"Map"===i||"Set"===i?Array.from(s):"Arguments"===i||/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(i)?arrayLikeToArray_arrayLikeToArray(s,o):void 0}}(s)||function nonIterableSpread_nonIterableSpread(){throw new TypeError("Invalid attempt to spread non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.")}()}function typeof_typeof(s){return typeof_typeof="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(s){return typeof s}:function(s){return s&&"function"==typeof Symbol&&s.constructor===Symbol&&s!==Symbol.prototype?"symbol":typeof s},typeof_typeof(s)}function toPropertyKey(s){var o=function toPrimitive(s,o){if("object"!=typeof_typeof(s)||!s)return s;var i=s[Symbol.toPrimitive];if(void 0!==i){var a=i.call(s,o||"default");if("object"!=typeof_typeof(a))return a;throw new TypeError("@@toPrimitive must return a primitive value.")}return("string"===o?String:Number)(s)}(s,"string");return"symbol"==typeof_typeof(o)?o:o+""}function defineProperty_defineProperty(s,o,i){return(o=toPropertyKey(o))in s?Object.defineProperty(s,o,{value:i,enumerable:!0,configurable:!0,writable:!0}):s[o]=i,s}function extends_extends(){return extends_extends=Object.assign?Object.assign.bind():function(s){for(var o=1;o<arguments.length;o++){var i=arguments[o];for(var a in i)({}).hasOwnProperty.call(i,a)&&(s[a]=i[a])}return s},extends_extends.apply(null,arguments)}function create_element_ownKeys(s,o){var i=Object.keys(s);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(s);o&&(a=a.filter((function(o){return Object.getOwnPropertyDescriptor(s,o).enumerable}))),i.push.apply(i,a)}return i}function _objectSpread(s){for(var o=1;o<arguments.length;o++){var i=null!=arguments[o]?arguments[o]:{};o%2?create_element_ownKeys(Object(i),!0).forEach((function(o){defineProperty_defineProperty(s,o,i[o])})):Object.getOwnPropertyDescriptors?Object.defineProperties(s,Object.getOwnPropertyDescriptors(i)):create_element_ownKeys(Object(i)).forEach((function(o){Object.defineProperty(s,o,Object.getOwnPropertyDescriptor(i,o))}))}return s}var yO={};function createStyleObject(s){var o=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{},i=arguments.length>2?arguments[2]:void 0;return function getClassNameCombinations(s){if(0===s.length||1===s.length)return s;var o=s.join(".");return yO[o]||(yO[o]=function powerSetPermutations(s){var o=s.length;return 0===o||1===o?s:2===o?[s[0],s[1],"".concat(s[0],".").concat(s[1]),"".concat(s[1],".").concat(s[0])]:3===o?[s[0],s[1],s[2],"".concat(s[0],".").concat(s[1]),"".concat(s[0],".").concat(s[2]),"".concat(s[1],".").concat(s[0]),"".concat(s[1],".").concat(s[2]),"".concat(s[2],".").concat(s[0]),"".concat(s[2],".").concat(s[1]),"".concat(s[0],".").concat(s[1],".").concat(s[2]),"".concat(s[0],".").concat(s[2],".").concat(s[1]),"".concat(s[1],".").concat(s[0],".").concat(s[2]),"".concat(s[1],".").concat(s[2],".").concat(s[0]),"".concat(s[2],".").concat(s[0],".").concat(s[1]),"".concat(s[2],".").concat(s[1],".").concat(s[0])]:o>=4?[s[0],s[1],s[2],s[3],"".concat(s[0],".").concat(s[1]),"".concat(s[0],".").concat(s[2]),"".concat(s[0],".").concat(s[3]),"".concat(s[1],".").concat(s[0]),"".concat(s[1],".").concat(s[2]),"".concat(s[1],".").concat(s[3]),"".concat(s[2],".").concat(s[0]),"".concat(s[2],".").concat(s[1]),"".concat(s[2],".").concat(s[3]),"".concat(s[3],".").concat(s[0]),"".concat(s[3],".").concat(s[1]),"".concat(s[3],".").concat(s[2]),"".concat(s[0],".").concat(s[1],".").concat(s[2]),"".concat(s[0],".").concat(s[1],".").concat(s[3]),"".concat(s[0],".").concat(s[2],".").concat(s[1]),"".concat(s[0],".").concat(s[2],".").concat(s[3]),"".concat(s[0],".").concat(s[3],".").concat(s[1]),"".concat(s[0],".").concat(s[3],".").concat(s[2]),"".concat(s[1],".").concat(s[0],".").concat(s[2]),"".concat(s[1],".").concat(s[0],".").concat(s[3]),"".concat(s[1],".").concat(s[2],".").concat(s[0]),"".concat(s[1],".").concat(s[2],".").concat(s[3]),"".concat(s[1],".").concat(s[3],".").concat(s[0]),"".concat(s[1],".").concat(s[3],".").concat(s[2]),"".concat(s[2],".").concat(s[0],".").concat(s[1]),"".concat(s[2],".").concat(s[0],".").concat(s[3]),"".concat(s[2],".").concat(s[1],".").concat(s[0]),"".concat(s[2],".").concat(s[1],".").concat(s[3]),"".concat(s[2],".").concat(s[3],".").concat(s[0]),"".concat(s[2],".").concat(s[3],".").concat(s[1]),"".concat(s[3],".").concat(s[0],".").concat(s[1]),"".concat(s[3],".").concat(s[0],".").concat(s[2]),"".concat(s[3],".").concat(s[1],".").concat(s[0]),"".concat(s[3],".").concat(s[1],".").concat(s[2]),"".concat(s[3],".").concat(s[2],".").concat(s[0]),"".concat(s[3],".").concat(s[2],".").concat(s[1]),"".concat(s[0],".").concat(s[1],".").concat(s[2],".").concat(s[3]),"".concat(s[0],".").concat(s[1],".").concat(s[3],".").concat(s[2]),"".concat(s[0],".").concat(s[2],".").concat(s[1],".").concat(s[3]),"".concat(s[0],".").concat(s[2],".").concat(s[3],".").concat(s[1]),"".concat(s[0],".").concat(s[3],".").concat(s[1],".").concat(s[2]),"".concat(s[0],".").concat(s[3],".").concat(s[2],".").concat(s[1]),"".concat(s[1],".").concat(s[0],".").concat(s[2],".").concat(s[3]),"".concat(s[1],".").concat(s[0],".").concat(s[3],".").concat(s[2]),"".concat(s[1],".").concat(s[2],".").concat(s[0],".").concat(s[3]),"".concat(s[1],".").concat(s[2],".").concat(s[3],".").concat(s[0]),"".concat(s[1],".").concat(s[3],".").concat(s[0],".").concat(s[2]),"".concat(s[1],".").concat(s[3],".").concat(s[2],".").concat(s[0]),"".concat(s[2],".").concat(s[0],".").concat(s[1],".").concat(s[3]),"".concat(s[2],".").concat(s[0],".").concat(s[3],".").concat(s[1]),"".concat(s[2],".").concat(s[1],".").concat(s[0],".").concat(s[3]),"".concat(s[2],".").concat(s[1],".").concat(s[3],".").concat(s[0]),"".concat(s[2],".").concat(s[3],".").concat(s[0],".").concat(s[1]),"".concat(s[2],".").concat(s[3],".").concat(s[1],".").concat(s[0]),"".concat(s[3],".").concat(s[0],".").concat(s[1],".").concat(s[2]),"".concat(s[3],".").concat(s[0],".").concat(s[2],".").concat(s[1]),"".concat(s[3],".").concat(s[1],".").concat(s[0],".").concat(s[2]),"".concat(s[3],".").concat(s[1],".").concat(s[2],".").concat(s[0]),"".concat(s[3],".").concat(s[2],".").concat(s[0],".").concat(s[1]),"".concat(s[3],".").concat(s[2],".").concat(s[1],".").concat(s[0])]:void 0}(s)),yO[o]}(s.filter((function(s){return"token"!==s}))).reduce((function(s,o){return _objectSpread(_objectSpread({},s),i[o])}),o)}function createClassNameString(s){return s.join(" ")}function createElement(s){var o=s.node,i=s.stylesheet,a=s.style,u=void 0===a?{}:a,_=s.useInlineStyles,w=s.key,x=o.properties,C=o.type,j=o.tagName,L=o.value;if("text"===C)return L;if(j){var B,$=function createChildren(s,o){var i=0;return function(a){return i+=1,a.map((function(a,u){return createElement({node:a,stylesheet:s,useInlineStyles:o,key:"code-segment-".concat(i,"-").concat(u)})}))}}(i,_);if(_){var U=Object.keys(i).reduce((function(s,o){return o.split(".").forEach((function(o){s.includes(o)||s.push(o)})),s}),[]),V=x.className&&x.className.includes("token")?["token"]:[],z=x.className&&V.concat(x.className.filter((function(s){return!U.includes(s)})));B=_objectSpread(_objectSpread({},x),{},{className:createClassNameString(z)||void 0,style:createStyleObject(x.className,Object.assign({},x.style,u),i)})}else B=_objectSpread(_objectSpread({},x),{},{className:createClassNameString(x.className)});var Y=$(o.children);return Re.createElement(j,extends_extends({key:w},B),Y)}}var vO=["language","children","style","customStyle","codeTagProps","useInlineStyles","showLineNumbers","showInlineLineNumbers","startingLineNumber","lineNumberContainerStyle","lineNumberStyle","wrapLines","wrapLongLines","lineProps","renderer","PreTag","CodeTag","code","astGenerator"];function highlight_ownKeys(s,o){var i=Object.keys(s);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(s);o&&(a=a.filter((function(o){return Object.getOwnPropertyDescriptor(s,o).enumerable}))),i.push.apply(i,a)}return i}function highlight_objectSpread(s){for(var o=1;o<arguments.length;o++){var i=null!=arguments[o]?arguments[o]:{};o%2?highlight_ownKeys(Object(i),!0).forEach((function(o){defineProperty_defineProperty(s,o,i[o])})):Object.getOwnPropertyDescriptors?Object.defineProperties(s,Object.getOwnPropertyDescriptors(i)):highlight_ownKeys(Object(i)).forEach((function(o){Object.defineProperty(s,o,Object.getOwnPropertyDescriptor(i,o))}))}return s}var bO=/\n/g;function AllLineNumbers(s){var o=s.codeString,i=s.codeStyle,a=s.containerStyle,u=void 0===a?{float:"left",paddingRight:"10px"}:a,_=s.numberStyle,w=void 0===_?{}:_,x=s.startingLineNumber;return Re.createElement("code",{style:Object.assign({},i,u)},function getAllLineNumbers(s){var o=s.lines,i=s.startingLineNumber,a=s.style;return o.map((function(s,o){var u=o+i;return Re.createElement("span",{key:"line-".concat(o),className:"react-syntax-highlighter-line-number",style:"function"==typeof a?a(u):a},"".concat(u,"\n"))}))}({lines:o.replace(/\n$/,"").split("\n"),style:w,startingLineNumber:x}))}function getInlineLineNumber(s,o){return{type:"element",tagName:"span",properties:{key:"line-number--".concat(s),className:["comment","linenumber","react-syntax-highlighter-line-number"],style:o},children:[{type:"text",value:s}]}}function assembleLineNumberStyles(s,o,i){var a,u={display:"inline-block",minWidth:(a=i,"".concat(a.toString().length,".25em")),paddingRight:"1em",textAlign:"right",userSelect:"none"},_="function"==typeof s?s(o):s;return highlight_objectSpread(highlight_objectSpread({},u),_)}function createLineElement(s){var o=s.children,i=s.lineNumber,a=s.lineNumberStyle,u=s.largestLineNumber,_=s.showInlineLineNumbers,w=s.lineProps,x=void 0===w?{}:w,C=s.className,j=void 0===C?[]:C,L=s.showLineNumbers,B=s.wrapLongLines,$=s.wrapLines,U=void 0!==$&&$?highlight_objectSpread({},"function"==typeof x?x(i):x):{};if(U.className=U.className?[].concat(toConsumableArray_toConsumableArray(U.className.trim().split(/\s+/)),toConsumableArray_toConsumableArray(j)):j,i&&_){var V=assembleLineNumberStyles(a,i,u);o.unshift(getInlineLineNumber(i,V))}return B&L&&(U.style=highlight_objectSpread({display:"flex"},U.style)),{type:"element",tagName:"span",properties:U,children:o}}function flattenCodeTree(s){var o=arguments.length>1&&void 0!==arguments[1]?arguments[1]:[],i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:[];void 0===s.length&&(s=[s]);for(var a=0;a<s.length;a++){var u=s[a];if("text"===u.type)i.push(createLineElement({children:[u],className:toConsumableArray_toConsumableArray(new Set(o))}));else if(u.children){var _,w=o.concat((null===(_=u.properties)||void 0===_?void 0:_.className)||[]);flattenCodeTree(u.children,w).forEach((function(s){return i.push(s)}))}}return i}function processLines(s,o,i,a,u,_,w,x,C){var j,L=flattenCodeTree(s.value),B=[],$=-1,U=0;function createLine(s,_){var j=arguments.length>2&&void 0!==arguments[2]?arguments[2]:[];return o||j.length>0?function createWrappedLine(s,_){return createLineElement({children:s,lineNumber:_,lineNumberStyle:x,largestLineNumber:w,showInlineLineNumbers:u,lineProps:i,className:arguments.length>2&&void 0!==arguments[2]?arguments[2]:[],showLineNumbers:a,wrapLongLines:C,wrapLines:o})}(s,_,j):function createUnwrappedLine(s,o){if(a&&o&&u){var i=assembleLineNumberStyles(x,o,w);s.unshift(getInlineLineNumber(o,i))}return s}(s,_)}for(var V=function _loop(){var s=L[U],o=s.children[0].value,i=function getNewLines(s){return s.match(bO)}(o);if(i){var u=o.split("\n");u.forEach((function(o,i){var w=a&&B.length+_,x={type:"text",value:"".concat(o,"\n")};if(0===i){var C=createLine(L.slice($+1,U).concat(createLineElement({children:[x],className:s.properties.className})),w);B.push(C)}else if(i===u.length-1){var j=L[U+1]&&L[U+1].children&&L[U+1].children[0],V={type:"text",value:"".concat(o)};if(j){var z=createLineElement({children:[V],className:s.properties.className});L.splice(U+1,0,z)}else{var Y=createLine([V],w,s.properties.className);B.push(Y)}}else{var Z=createLine([x],w,s.properties.className);B.push(Z)}})),$=U}U++};U<L.length;)V();if($!==L.length-1){var z=L.slice($+1,L.length);if(z&&z.length){var Y=createLine(z,a&&B.length+_);B.push(Y)}}return o?B:(j=[]).concat.apply(j,B)}function defaultRenderer(s){var o=s.rows,i=s.stylesheet,a=s.useInlineStyles;return o.map((function(s,o){return createElement({node:s,stylesheet:i,useInlineStyles:a,key:"code-segment-".concat(o)})}))}function isHighlightJs(s){return s&&void 0!==s.highlightAuto}var _O=__webpack_require__(43768),SO=function highlight(s,o){return function SyntaxHighlighter(i){var a,u,_=i.language,w=i.children,x=i.style,C=void 0===x?o:x,j=i.customStyle,L=void 0===j?{}:j,B=i.codeTagProps,$=void 0===B?{className:_?"language-".concat(_):void 0,style:highlight_objectSpread(highlight_objectSpread({},C['code[class*="language-"]']),C['code[class*="language-'.concat(_,'"]')])}:B,U=i.useInlineStyles,V=void 0===U||U,z=i.showLineNumbers,Y=void 0!==z&&z,Z=i.showInlineLineNumbers,ee=void 0===Z||Z,ie=i.startingLineNumber,ae=void 0===ie?1:ie,ce=i.lineNumberContainerStyle,le=i.lineNumberStyle,pe=void 0===le?{}:le,de=i.wrapLines,fe=i.wrapLongLines,ye=void 0!==fe&&fe,be=i.lineProps,_e=void 0===be?{}:be,Se=i.renderer,we=i.PreTag,xe=void 0===we?"pre":we,Pe=i.CodeTag,Te=void 0===Pe?"code":Pe,$e=i.code,qe=void 0===$e?(Array.isArray(w)?w[0]:w)||"":$e,ze=i.astGenerator,We=function _objectWithoutProperties(s,o){if(null==s)return{};var i,a,u=function _objectWithoutPropertiesLoose(s,o){if(null==s)return{};var i={};for(var a in s)if({}.hasOwnProperty.call(s,a)){if(-1!==o.indexOf(a))continue;i[a]=s[a]}return i}(s,o);if(Object.getOwnPropertySymbols){var _=Object.getOwnPropertySymbols(s);for(a=0;a<_.length;a++)i=_[a],-1===o.indexOf(i)&&{}.propertyIsEnumerable.call(s,i)&&(u[i]=s[i])}return u}(i,vO);ze=ze||s;var He=Y?Re.createElement(AllLineNumbers,{containerStyle:ce,codeStyle:$.style||{},numberStyle:pe,startingLineNumber:ae,codeString:qe}):null,Ye=C.hljs||C['pre[class*="language-"]']||{backgroundColor:"#fff"},Xe=isHighlightJs(ze)?"hljs":"prismjs",Qe=V?Object.assign({},We,{style:Object.assign({},Ye,L)}):Object.assign({},We,{className:We.className?"".concat(Xe," ").concat(We.className):Xe,style:Object.assign({},L)});if($.style=highlight_objectSpread(ye?{whiteSpace:"pre-wrap"}:{whiteSpace:"pre"},$.style),!ze)return Re.createElement(xe,Qe,He,Re.createElement(Te,$,qe));(void 0===de&&Se||ye)&&(de=!0),Se=Se||defaultRenderer;var et=[{type:"text",value:qe}],tt=function getCodeTree(s){var o=s.astGenerator,i=s.language,a=s.code,u=s.defaultCodeValue;if(isHighlightJs(o)){var _=function(s,o){return-1!==s.listLanguages().indexOf(o)}(o,i);return"text"===i?{value:u,language:"text"}:_?o.highlight(i,a):o.highlightAuto(a)}try{return i&&"text"!==i?{value:o.highlight(a,i)}:{value:u}}catch(s){return{value:u}}}({astGenerator:ze,language:_,code:qe,defaultCodeValue:et});null===tt.language&&(tt.value=et);var rt=processLines(tt,de,_e,Y,ee,ae,ae+(null!==(a=null===(u=qe.match(/\n/g))||void 0===u?void 0:u.length)&&void 0!==a?a:0),pe,ye);return Re.createElement(xe,Qe,Re.createElement(Te,$,!ee&&He,Se({rows:rt,stylesheet:C,useInlineStyles:V})))}}(_O,{});SO.registerLanguage=_O.registerLanguage;const EO=SO;var wO=__webpack_require__(95089);const xO=__webpack_require__.n(wO)();var kO=__webpack_require__(65772);const OO=__webpack_require__.n(kO)();var AO=__webpack_require__(17285);const CO=__webpack_require__.n(AO)();var jO=__webpack_require__(35344);const PO=__webpack_require__.n(jO)();var IO=__webpack_require__(17533);const TO=__webpack_require__.n(IO)();var NO=__webpack_require__(73402);const MO=__webpack_require__.n(NO)();var RO=__webpack_require__(26571);const DO=__webpack_require__.n(RO)(),after_load=()=>{EO.registerLanguage("json",OO),EO.registerLanguage("js",xO),EO.registerLanguage("xml",CO),EO.registerLanguage("yaml",TO),EO.registerLanguage("http",MO),EO.registerLanguage("bash",PO),EO.registerLanguage("powershell",DO),EO.registerLanguage("javascript",xO)},LO={hljs:{display:"block",overflowX:"auto",padding:"0.5em",background:"#333",color:"white"},"hljs-name":{fontWeight:"bold"},"hljs-strong":{fontWeight:"bold"},"hljs-code":{fontStyle:"italic",color:"#888"},"hljs-emphasis":{fontStyle:"italic"},"hljs-tag":{color:"#62c8f3"},"hljs-variable":{color:"#ade5fc"},"hljs-template-variable":{color:"#ade5fc"},"hljs-selector-id":{color:"#ade5fc"},"hljs-selector-class":{color:"#ade5fc"},"hljs-string":{color:"#a2fca2"},"hljs-bullet":{color:"#d36363"},"hljs-type":{color:"#ffa"},"hljs-title":{color:"#ffa"},"hljs-section":{color:"#ffa"},"hljs-attribute":{color:"#ffa"},"hljs-quote":{color:"#ffa"},"hljs-built_in":{color:"#ffa"},"hljs-builtin-name":{color:"#ffa"},"hljs-number":{color:"#d36363"},"hljs-symbol":{color:"#d36363"},"hljs-keyword":{color:"#fcc28c"},"hljs-selector-tag":{color:"#fcc28c"},"hljs-literal":{color:"#fcc28c"},"hljs-comment":{color:"#888"},"hljs-deletion":{color:"#333",backgroundColor:"#fc9b9b"},"hljs-regexp":{color:"#c6b4f0"},"hljs-link":{color:"#c6b4f0"},"hljs-meta":{color:"#fc9b9b"},"hljs-addition":{backgroundColor:"#a2fca2",color:"#333"}},FO={agate:LO,arta:{hljs:{display:"block",overflowX:"auto",padding:"0.5em",background:"#222",color:"#aaa"},"hljs-subst":{color:"#aaa"},"hljs-section":{color:"#fff",fontWeight:"bold"},"hljs-comment":{color:"#444"},"hljs-quote":{color:"#444"},"hljs-meta":{color:"#444"},"hljs-string":{color:"#ffcc33"},"hljs-symbol":{color:"#ffcc33"},"hljs-bullet":{color:"#ffcc33"},"hljs-regexp":{color:"#ffcc33"},"hljs-number":{color:"#00cc66"},"hljs-addition":{color:"#00cc66"},"hljs-built_in":{color:"#32aaee"},"hljs-builtin-name":{color:"#32aaee"},"hljs-literal":{color:"#32aaee"},"hljs-type":{color:"#32aaee"},"hljs-template-variable":{color:"#32aaee"},"hljs-attribute":{color:"#32aaee"},"hljs-link":{color:"#32aaee"},"hljs-keyword":{color:"#6644aa"},"hljs-selector-tag":{color:"#6644aa"},"hljs-name":{color:"#6644aa"},"hljs-selector-id":{color:"#6644aa"},"hljs-selector-class":{color:"#6644aa"},"hljs-title":{color:"#bb1166"},"hljs-variable":{color:"#bb1166"},"hljs-deletion":{color:"#bb1166"},"hljs-template-tag":{color:"#bb1166"},"hljs-doctag":{fontWeight:"bold"},"hljs-strong":{fontWeight:"bold"},"hljs-emphasis":{fontStyle:"italic"}},monokai:{hljs:{display:"block",overflowX:"auto",padding:"0.5em",background:"#272822",color:"#ddd"},"hljs-tag":{color:"#f92672"},"hljs-keyword":{color:"#f92672",fontWeight:"bold"},"hljs-selector-tag":{color:"#f92672",fontWeight:"bold"},"hljs-literal":{color:"#f92672",fontWeight:"bold"},"hljs-strong":{color:"#f92672"},"hljs-name":{color:"#f92672"},"hljs-code":{color:"#66d9ef"},"hljs-class .hljs-title":{color:"white"},"hljs-attribute":{color:"#bf79db"},"hljs-symbol":{color:"#bf79db"},"hljs-regexp":{color:"#bf79db"},"hljs-link":{color:"#bf79db"},"hljs-string":{color:"#a6e22e"},"hljs-bullet":{color:"#a6e22e"},"hljs-subst":{color:"#a6e22e"},"hljs-title":{color:"#a6e22e",fontWeight:"bold"},"hljs-section":{color:"#a6e22e",fontWeight:"bold"},"hljs-emphasis":{color:"#a6e22e"},"hljs-type":{color:"#a6e22e",fontWeight:"bold"},"hljs-built_in":{color:"#a6e22e"},"hljs-builtin-name":{color:"#a6e22e"},"hljs-selector-attr":{color:"#a6e22e"},"hljs-selector-pseudo":{color:"#a6e22e"},"hljs-addition":{color:"#a6e22e"},"hljs-variable":{color:"#a6e22e"},"hljs-template-tag":{color:"#a6e22e"},"hljs-template-variable":{color:"#a6e22e"},"hljs-comment":{color:"#75715e"},"hljs-quote":{color:"#75715e"},"hljs-deletion":{color:"#75715e"},"hljs-meta":{color:"#75715e"},"hljs-doctag":{fontWeight:"bold"},"hljs-selector-id":{fontWeight:"bold"}},nord:{hljs:{display:"block",overflowX:"auto",padding:"0.5em",background:"#2E3440",color:"#D8DEE9"},"hljs-subst":{color:"#D8DEE9"},"hljs-selector-tag":{color:"#81A1C1"},"hljs-selector-id":{color:"#8FBCBB",fontWeight:"bold"},"hljs-selector-class":{color:"#8FBCBB"},"hljs-selector-attr":{color:"#8FBCBB"},"hljs-selector-pseudo":{color:"#88C0D0"},"hljs-addition":{backgroundColor:"rgba(163, 190, 140, 0.5)"},"hljs-deletion":{backgroundColor:"rgba(191, 97, 106, 0.5)"},"hljs-built_in":{color:"#8FBCBB"},"hljs-type":{color:"#8FBCBB"},"hljs-class":{color:"#8FBCBB"},"hljs-function":{color:"#88C0D0"},"hljs-function > .hljs-title":{color:"#88C0D0"},"hljs-keyword":{color:"#81A1C1"},"hljs-literal":{color:"#81A1C1"},"hljs-symbol":{color:"#81A1C1"},"hljs-number":{color:"#B48EAD"},"hljs-regexp":{color:"#EBCB8B"},"hljs-string":{color:"#A3BE8C"},"hljs-title":{color:"#8FBCBB"},"hljs-params":{color:"#D8DEE9"},"hljs-bullet":{color:"#81A1C1"},"hljs-code":{color:"#8FBCBB"},"hljs-emphasis":{fontStyle:"italic"},"hljs-formula":{color:"#8FBCBB"},"hljs-strong":{fontWeight:"bold"},"hljs-link:hover":{textDecoration:"underline"},"hljs-quote":{color:"#4C566A"},"hljs-comment":{color:"#4C566A"},"hljs-doctag":{color:"#8FBCBB"},"hljs-meta":{color:"#5E81AC"},"hljs-meta-keyword":{color:"#5E81AC"},"hljs-meta-string":{color:"#A3BE8C"},"hljs-attr":{color:"#8FBCBB"},"hljs-attribute":{color:"#D8DEE9"},"hljs-builtin-name":{color:"#81A1C1"},"hljs-name":{color:"#81A1C1"},"hljs-section":{color:"#88C0D0"},"hljs-tag":{color:"#81A1C1"},"hljs-variable":{color:"#D8DEE9"},"hljs-template-variable":{color:"#D8DEE9"},"hljs-template-tag":{color:"#5E81AC"},"abnf .hljs-attribute":{color:"#88C0D0"},"abnf .hljs-symbol":{color:"#EBCB8B"},"apache .hljs-attribute":{color:"#88C0D0"},"apache .hljs-section":{color:"#81A1C1"},"arduino .hljs-built_in":{color:"#88C0D0"},"aspectj .hljs-meta":{color:"#D08770"},"aspectj > .hljs-title":{color:"#88C0D0"},"bnf .hljs-attribute":{color:"#8FBCBB"},"clojure .hljs-name":{color:"#88C0D0"},"clojure .hljs-symbol":{color:"#EBCB8B"},"coq .hljs-built_in":{color:"#88C0D0"},"cpp .hljs-meta-string":{color:"#8FBCBB"},"css .hljs-built_in":{color:"#88C0D0"},"css .hljs-keyword":{color:"#D08770"},"diff .hljs-meta":{color:"#8FBCBB"},"ebnf .hljs-attribute":{color:"#8FBCBB"},"glsl .hljs-built_in":{color:"#88C0D0"},"groovy .hljs-meta:not(:first-child)":{color:"#D08770"},"haxe .hljs-meta":{color:"#D08770"},"java .hljs-meta":{color:"#D08770"},"ldif .hljs-attribute":{color:"#8FBCBB"},"lisp .hljs-name":{color:"#88C0D0"},"lua .hljs-built_in":{color:"#88C0D0"},"moonscript .hljs-built_in":{color:"#88C0D0"},"nginx .hljs-attribute":{color:"#88C0D0"},"nginx .hljs-section":{color:"#5E81AC"},"pf .hljs-built_in":{color:"#88C0D0"},"processing .hljs-built_in":{color:"#88C0D0"},"scss .hljs-keyword":{color:"#81A1C1"},"stylus .hljs-keyword":{color:"#81A1C1"},"swift .hljs-meta":{color:"#D08770"},"vim .hljs-built_in":{color:"#88C0D0",fontStyle:"italic"},"yaml .hljs-meta":{color:"#D08770"}},obsidian:{hljs:{display:"block",overflowX:"auto",padding:"0.5em",background:"#282b2e",color:"#e0e2e4"},"hljs-keyword":{color:"#93c763",fontWeight:"bold"},"hljs-selector-tag":{color:"#93c763",fontWeight:"bold"},"hljs-literal":{color:"#93c763",fontWeight:"bold"},"hljs-selector-id":{color:"#93c763"},"hljs-number":{color:"#ffcd22"},"hljs-attribute":{color:"#668bb0"},"hljs-code":{color:"white"},"hljs-class .hljs-title":{color:"white"},"hljs-section":{color:"white",fontWeight:"bold"},"hljs-regexp":{color:"#d39745"},"hljs-link":{color:"#d39745"},"hljs-meta":{color:"#557182"},"hljs-tag":{color:"#8cbbad"},"hljs-name":{color:"#8cbbad",fontWeight:"bold"},"hljs-bullet":{color:"#8cbbad"},"hljs-subst":{color:"#8cbbad"},"hljs-emphasis":{color:"#8cbbad"},"hljs-type":{color:"#8cbbad",fontWeight:"bold"},"hljs-built_in":{color:"#8cbbad"},"hljs-selector-attr":{color:"#8cbbad"},"hljs-selector-pseudo":{color:"#8cbbad"},"hljs-addition":{color:"#8cbbad"},"hljs-variable":{color:"#8cbbad"},"hljs-template-tag":{color:"#8cbbad"},"hljs-template-variable":{color:"#8cbbad"},"hljs-string":{color:"#ec7600"},"hljs-symbol":{color:"#ec7600"},"hljs-comment":{color:"#818e96"},"hljs-quote":{color:"#818e96"},"hljs-deletion":{color:"#818e96"},"hljs-selector-class":{color:"#A082BD"},"hljs-doctag":{fontWeight:"bold"},"hljs-title":{fontWeight:"bold"},"hljs-strong":{fontWeight:"bold"}},"tomorrow-night":{"hljs-comment":{color:"#969896"},"hljs-quote":{color:"#969896"},"hljs-variable":{color:"#cc6666"},"hljs-template-variable":{color:"#cc6666"},"hljs-tag":{color:"#cc6666"},"hljs-name":{color:"#cc6666"},"hljs-selector-id":{color:"#cc6666"},"hljs-selector-class":{color:"#cc6666"},"hljs-regexp":{color:"#cc6666"},"hljs-deletion":{color:"#cc6666"},"hljs-number":{color:"#de935f"},"hljs-built_in":{color:"#de935f"},"hljs-builtin-name":{color:"#de935f"},"hljs-literal":{color:"#de935f"},"hljs-type":{color:"#de935f"},"hljs-params":{color:"#de935f"},"hljs-meta":{color:"#de935f"},"hljs-link":{color:"#de935f"},"hljs-attribute":{color:"#f0c674"},"hljs-string":{color:"#b5bd68"},"hljs-symbol":{color:"#b5bd68"},"hljs-bullet":{color:"#b5bd68"},"hljs-addition":{color:"#b5bd68"},"hljs-title":{color:"#81a2be"},"hljs-section":{color:"#81a2be"},"hljs-keyword":{color:"#b294bb"},"hljs-selector-tag":{color:"#b294bb"},hljs:{display:"block",overflowX:"auto",background:"#1d1f21",color:"#c5c8c6",padding:"0.5em"},"hljs-emphasis":{fontStyle:"italic"},"hljs-strong":{fontWeight:"bold"}},idea:{hljs:{display:"block",overflowX:"auto",padding:"0.5em",color:"#000",background:"#fff"},"hljs-subst":{fontWeight:"normal",color:"#000"},"hljs-title":{fontWeight:"normal",color:"#000"},"hljs-comment":{color:"#808080",fontStyle:"italic"},"hljs-quote":{color:"#808080",fontStyle:"italic"},"hljs-meta":{color:"#808000"},"hljs-tag":{background:"#efefef"},"hljs-section":{fontWeight:"bold",color:"#000080"},"hljs-name":{fontWeight:"bold",color:"#000080"},"hljs-literal":{fontWeight:"bold",color:"#000080"},"hljs-keyword":{fontWeight:"bold",color:"#000080"},"hljs-selector-tag":{fontWeight:"bold",color:"#000080"},"hljs-type":{fontWeight:"bold",color:"#000080"},"hljs-selector-id":{fontWeight:"bold",color:"#000080"},"hljs-selector-class":{fontWeight:"bold",color:"#000080"},"hljs-attribute":{fontWeight:"bold",color:"#0000ff"},"hljs-number":{fontWeight:"normal",color:"#0000ff"},"hljs-regexp":{fontWeight:"normal",color:"#0000ff"},"hljs-link":{fontWeight:"normal",color:"#0000ff"},"hljs-string":{color:"#008000",fontWeight:"bold"},"hljs-symbol":{color:"#000",background:"#d0eded",fontStyle:"italic"},"hljs-bullet":{color:"#000",background:"#d0eded",fontStyle:"italic"},"hljs-formula":{color:"#000",background:"#d0eded",fontStyle:"italic"},"hljs-doctag":{textDecoration:"underline"},"hljs-variable":{color:"#660e7a"},"hljs-template-variable":{color:"#660e7a"},"hljs-addition":{background:"#baeeba"},"hljs-deletion":{background:"#ffc8bd"},"hljs-emphasis":{fontStyle:"italic"},"hljs-strong":{fontWeight:"bold"}}},BO=LO,components_SyntaxHighlighter=({language:s,className:o="",getConfigs:i,syntaxHighlighting:a={},children:u=""})=>{const _=i().syntaxHighlight.theme,{styles:w,defaultStyle:x}=a,C=w?.[_]??x;return Re.createElement(EO,{language:s,className:o,style:C},u)};var $O=__webpack_require__(5419),qO=__webpack_require__.n($O);const components_HighlightCode=({fileName:s="response.txt",className:o,downloadable:i,getComponent:a,canCopy:u,language:_,children:w})=>{const x=(0,Re.useRef)(null),C=a("SyntaxHighlighter",!0),handlePreventYScrollingBeyondElement=s=>{const{target:o,deltaY:i}=s,{scrollHeight:a,offsetHeight:u,scrollTop:_}=o;a>u&&(0===_&&i<0||u+_>=a&&i>0)&&s.preventDefault()};return(0,Re.useEffect)((()=>{const s=Array.from(x.current.childNodes).filter((s=>!!s.nodeType&&s.classList.contains("microlight")));return s.forEach((s=>s.addEventListener("mousewheel",handlePreventYScrollingBeyondElement,{passive:!1}))),()=>{s.forEach((s=>s.removeEventListener("mousewheel",handlePreventYScrollingBeyondElement)))}}),[w,o,_]),Re.createElement("div",{className:"highlight-code",ref:x},u&&Re.createElement("div",{className:"copy-to-clipboard"},Re.createElement(Hn.CopyToClipboard,{text:w},Re.createElement("button",null))),i?Re.createElement("button",{className:"download-contents",onClick:()=>{qO()(w,s)}},"Download"):null,Re.createElement(C,{language:_,className:Jn()(o,"microlight"),renderPlainText:({children:s,PlainTextViewer:i})=>Re.createElement(i,{className:o},s)},w))},components_PlainTextViewer=({className:s="",children:o})=>Re.createElement("pre",{className:Jn()("microlight",s)},o),wrap_components_SyntaxHighlighter=(s,o)=>({renderPlainText:i,children:a,...u})=>{const _=o.getConfigs().syntaxHighlight.activated,w=o.getComponent("PlainTextViewer");return _||"function"!=typeof i?_?Re.createElement(s,u,a):Re.createElement(w,null,a):i({children:a,PlainTextViewer:w})},SyntaxHighlightingPlugin1=()=>({afterLoad:after_load,rootInjects:{syntaxHighlighting:{styles:FO,defaultStyle:BO}},components:{SyntaxHighlighter:components_SyntaxHighlighter,HighlightCode:components_HighlightCode,PlainTextViewer:components_PlainTextViewer}}),SyntaxHighlightingPlugin2=()=>({wrapComponents:{SyntaxHighlighter:wrap_components_SyntaxHighlighter}}),syntax_highlighting=()=>[SyntaxHighlightingPlugin1,SyntaxHighlightingPlugin2],versions_after_load=()=>{const{GIT_DIRTY:s,GIT_COMMIT:o,PACKAGE_VERSION:i,BUILD_TIME:a}={PACKAGE_VERSION:"5.31.0",GIT_COMMIT:"gcf11271c",GIT_DIRTY:!0,BUILD_TIME:"Thu, 11 Dec 2025 15:56:57 GMT"};lt.versions=lt.versions||{},lt.versions.swaggerUI={version:i,gitRevision:o,gitDirty:s,buildTimestamp:a}},versions=()=>({afterLoad:versions_after_load});var UO=__webpack_require__(47248),VO=__webpack_require__.n(UO);const zO=console.error,withErrorBoundary=s=>o=>{const{getComponent:i,fn:a}=s(),u=i("ErrorBoundary"),_=a.getDisplayName(o);class WithErrorBoundary extends Re.Component{render(){return Re.createElement(u,{targetName:_,getComponent:i,fn:a},Re.createElement(o,Mn()({},this.props,this.context)))}}var w;return WithErrorBoundary.displayName=`WithErrorBoundary(${_})`,(w=o).prototype&&w.prototype.isReactComponent&&(WithErrorBoundary.prototype.mapStateToProps=o.prototype.mapStateToProps),WithErrorBoundary},fallback=({name:s})=>Re.createElement("div",{className:"fallback"},"😱 ",Re.createElement("i",null,"Could not render ","t"===s?"this component":s,", see the console."));class ErrorBoundary extends Re.Component{static defaultProps={targetName:"this component",getComponent:()=>fallback,fn:{componentDidCatch:zO},children:null};static getDerivedStateFromError(s){return{hasError:!0,error:s}}constructor(...s){super(...s),this.state={hasError:!1,error:null}}componentDidCatch(s,o){this.props.fn.componentDidCatch(s,o)}render(){const{getComponent:s,targetName:o,children:i}=this.props;if(this.state.hasError){const i=s("Fallback");return Re.createElement(i,{name:o})}return i}}const WO=ErrorBoundary,safe_render=({componentList:s=[],fullOverride:o=!1}={})=>({getSystem:i})=>{const a=o?s:["App","BaseLayout","VersionPragmaFilter","InfoContainer","ServersContainer","SchemesContainer","AuthorizeBtnContainer","FilterContainer","Operations","OperationContainer","parameters","responses","OperationServers","Models","ModelWrapper",...s],u=VO()(a,Array(a.length).fill(((s,{fn:o})=>o.withErrorBoundary(s))));return{fn:{componentDidCatch:zO,withErrorBoundary:withErrorBoundary(i)},components:{ErrorBoundary:WO,Fallback:fallback},wrapComponents:u}};class App extends Re.Component{getLayout(){const{getComponent:s,layoutSelectors:o}=this.props,i=o.current(),a=s(i,!0);return a||(()=>Re.createElement("h1",null,' No layout defined for "',i,'" '))}render(){const s=this.getLayout();return Re.createElement(s,null)}}const JO=App;class AuthorizationPopup extends Re.Component{close=()=>{let{authActions:s}=this.props;s.showDefinitions(!1)};render(){let{authSelectors:s,authActions:o,getComponent:i,errSelectors:a,specSelectors:u,fn:{AST:_={}}}=this.props,w=s.shownDefinitions();const x=i("auths"),C=i("CloseIcon");return Re.createElement("div",{className:"dialog-ux"},Re.createElement("div",{className:"backdrop-ux"}),Re.createElement("div",{className:"modal-ux"},Re.createElement("div",{className:"modal-dialog-ux"},Re.createElement("div",{className:"modal-ux-inner"},Re.createElement("div",{className:"modal-ux-header"},Re.createElement("h3",null,"Available authorizations"),Re.createElement("button",{type:"button",className:"close-modal",onClick:this.close},Re.createElement(C,null))),Re.createElement("div",{className:"modal-ux-content"},w.valueSeq().map(((w,C)=>Re.createElement(x,{key:C,AST:_,definitions:w,getComponent:i,errSelectors:a,authSelectors:s,authActions:o,specSelectors:u}))))))))}}class AuthorizeBtn extends Re.Component{render(){let{isAuthorized:s,showPopup:o,onClick:i,getComponent:a}=this.props;const u=a("authorizationPopup",!0),_=a("LockAuthIcon",!0),w=a("UnlockAuthIcon",!0);return Re.createElement("div",{className:"auth-wrapper"},Re.createElement("button",{className:s?"btn authorize locked":"btn authorize unlocked",onClick:i},Re.createElement("span",null,"Authorize"),s?Re.createElement(_,null):Re.createElement(w,null)),o&&Re.createElement(u,null))}}class AuthorizeBtnContainer extends Re.Component{render(){const{authActions:s,authSelectors:o,specSelectors:i,getComponent:a}=this.props,u=i.securityDefinitions(),_=o.definitionsToAuthorize(),w=a("authorizeBtn");return u?Re.createElement(w,{onClick:()=>s.showDefinitions(_),isAuthorized:!!o.authorized().size,showPopup:!!o.shownDefinitions(),getComponent:a}):null}}class AuthorizeOperationBtn extends Re.Component{onClick=s=>{s.stopPropagation();let{onClick:o}=this.props;o&&o()};render(){let{isAuthorized:s,getComponent:o}=this.props;const i=o("LockAuthOperationIcon",!0),a=o("UnlockAuthOperationIcon",!0);return Re.createElement("button",{className:"authorization__btn","aria-label":s?"authorization button locked":"authorization button unlocked",onClick:this.onClick},s?Re.createElement(i,{className:"locked"}):Re.createElement(a,{className:"unlocked"}))}}class Auths extends Re.Component{constructor(s,o){super(s,o),this.state={}}onAuthChange=s=>{let{name:o}=s;this.setState({[o]:s})};submitAuth=s=>{s.preventDefault();let{authActions:o}=this.props;o.authorizeWithPersistOption(this.state)};logoutClick=s=>{s.preventDefault();let{authActions:o,definitions:i}=this.props,a=i.map(((s,o)=>o)).toArray();this.setState(a.reduce(((s,o)=>(s[o]="",s)),{})),o.logoutWithPersistOption(a)};close=s=>{s.preventDefault();let{authActions:o}=this.props;o.showDefinitions(!1)};render(){let{definitions:s,getComponent:o,authSelectors:i,errSelectors:a}=this.props;const u=o("AuthItem"),_=o("oauth2",!0),w=o("Button");let x=i.authorized(),C=s.filter(((s,o)=>!!x.get(o))),j=s.filter((s=>"oauth2"!==s.get("type"))),L=s.filter((s=>"oauth2"===s.get("type")));return Re.createElement("div",{className:"auth-container"},!!j.size&&Re.createElement("form",{onSubmit:this.submitAuth},j.map(((s,_)=>Re.createElement(u,{key:_,schema:s,name:_,getComponent:o,onAuthChange:this.onAuthChange,authorized:x,errSelectors:a,authSelectors:i}))).toArray(),Re.createElement("div",{className:"auth-btn-wrapper"},j.size===C.size?Re.createElement(w,{className:"btn modal-btn auth",onClick:this.logoutClick,"aria-label":"Remove authorization"},"Logout"):Re.createElement(w,{type:"submit",className:"btn modal-btn auth authorize","aria-label":"Apply credentials"},"Authorize"),Re.createElement(w,{className:"btn modal-btn auth btn-done",onClick:this.close},"Close"))),L&&L.size?Re.createElement("div",null,Re.createElement("div",{className:"scope-def"},Re.createElement("p",null,"Scopes are used to grant an application different levels of access to data on behalf of the end user. Each API may declare one or more scopes."),Re.createElement("p",null,"API requires the following scopes. Select which ones you want to grant to Swagger UI.")),s.filter((s=>"oauth2"===s.get("type"))).map(((s,o)=>Re.createElement("div",{key:o},Re.createElement(_,{authorized:x,schema:s,name:o})))).toArray()):null)}}class auth_item_Auths extends Re.Component{render(){let{schema:s,name:o,getComponent:i,onAuthChange:a,authorized:u,errSelectors:_,authSelectors:w}=this.props;const x=i("apiKeyAuth"),C=i("basicAuth");let j;const L=s.get("type");switch(L){case"apiKey":j=Re.createElement(x,{key:o,schema:s,name:o,errSelectors:_,authorized:u,getComponent:i,onChange:a,authSelectors:w});break;case"basic":j=Re.createElement(C,{key:o,schema:s,name:o,errSelectors:_,authorized:u,getComponent:i,onChange:a,authSelectors:w});break;default:j=Re.createElement("div",{key:o},"Unknown security definition type ",L)}return Re.createElement("div",{key:`${o}-jump`},j)}}class AuthError extends Re.Component{render(){let{error:s}=this.props,o=s.get("level"),i=s.get("message"),a=s.get("source");return Re.createElement("div",{className:"errors"},Re.createElement("b",null,a," ",o),Re.createElement("span",null,i))}}class ApiKeyAuth extends Re.Component{constructor(s,o){super(s,o);let{name:i,schema:a}=this.props,u=this.getValue();this.state={name:i,schema:a,value:u}}getValue(){let{name:s,authorized:o}=this.props;return o&&o.getIn([s,"value"])}onChange=s=>{let{onChange:o}=this.props,i=s.target.value,a=Object.assign({},this.state,{value:i});this.setState(a),o(a)};render(){let{schema:s,getComponent:o,errSelectors:i,name:a,authSelectors:u}=this.props;const _=o("Input"),w=o("Row"),x=o("Col"),C=o("authError"),j=o("Markdown",!0),L=o("JumpToPath",!0),B=u.selectAuthPath(a);let $=this.getValue(),U=i.allErrors().filter((s=>s.get("authId")===a));return Re.createElement("div",null,Re.createElement("h4",null,Re.createElement("code",null,a||s.get("name"))," (apiKey)",Re.createElement(L,{path:B})),$&&Re.createElement("h6",null,"Authorized"),Re.createElement(w,null,Re.createElement(j,{source:s.get("description")})),Re.createElement(w,null,Re.createElement("p",null,"Name: ",Re.createElement("code",null,s.get("name")))),Re.createElement(w,null,Re.createElement("p",null,"In: ",Re.createElement("code",null,s.get("in")))),Re.createElement(w,null,Re.createElement("label",{htmlFor:"api_key_value"},"Value:"),$?Re.createElement("code",null," ****** "):Re.createElement(x,null,Re.createElement(_,{id:"api_key_value",type:"text",onChange:this.onChange,autoFocus:!0}))),U.valueSeq().map(((s,o)=>Re.createElement(C,{error:s,key:o}))))}}class BasicAuth extends Re.Component{constructor(s,o){super(s,o);let{schema:i,name:a}=this.props,u=this.getValue().username;this.state={name:a,schema:i,value:u?{username:u}:{}}}getValue(){let{authorized:s,name:o}=this.props;return s&&s.getIn([o,"value"])||{}}onChange=s=>{let{onChange:o}=this.props,{value:i,name:a}=s.target,u=this.state.value;u[a]=i,this.setState({value:u}),o(this.state)};render(){let{schema:s,getComponent:o,name:i,errSelectors:a,authSelectors:u}=this.props;const _=o("Input"),w=o("Row"),x=o("Col"),C=o("authError"),j=o("JumpToPath",!0),L=o("Markdown",!0),B=u.selectAuthPath(i);let $=this.getValue().username,U=a.allErrors().filter((s=>s.get("authId")===i));return Re.createElement("div",null,Re.createElement("h4",null,"Basic authorization",Re.createElement(j,{path:B})),$&&Re.createElement("h6",null,"Authorized"),Re.createElement(w,null,Re.createElement(L,{source:s.get("description")})),Re.createElement(w,null,Re.createElement("label",{htmlFor:"auth_username"},"Username:"),$?Re.createElement("code",null," ",$," "):Re.createElement(x,null,Re.createElement(_,{id:"auth_username",type:"text",required:"required",name:"username",onChange:this.onChange,autoFocus:!0}))),Re.createElement(w,null,Re.createElement("label",{htmlFor:"auth_password"},"Password:"),$?Re.createElement("code",null," ****** "):Re.createElement(x,null,Re.createElement(_,{id:"auth_password",autoComplete:"new-password",name:"password",type:"password",onChange:this.onChange}))),U.valueSeq().map(((s,o)=>Re.createElement(C,{error:s,key:o}))))}}function example_Example(s){const{example:o,showValue:i,getComponent:a}=s,u=a("Markdown",!0),_=a("HighlightCode",!0);return o&&ze.Map.isMap(o)?Re.createElement("div",{className:"example"},o.get("description")?Re.createElement("section",{className:"example__section"},Re.createElement("div",{className:"example__section-header"},"Example Description"),Re.createElement("p",null,Re.createElement(u,{source:o.get("description")}))):null,i&&o.has("value")?Re.createElement("section",{className:"example__section"},Re.createElement("div",{className:"example__section-header"},"Example Value"),Re.createElement(_,null,stringify(o.get("value")))):null):null}class ExamplesSelect extends Re.PureComponent{static defaultProps={examples:(0,ze.Map)({}),onSelect:(...s)=>console.log("DEBUG: ExamplesSelect was not given an onSelect callback",...s),currentExampleKey:null,showLabels:!0};_onSelect=(s,{isSyntheticChange:o=!1}={})=>{"function"==typeof this.props.onSelect&&this.props.onSelect(s,{isSyntheticChange:o})};_onDomSelect=s=>{if("function"==typeof this.props.onSelect){const o=s.target.selectedOptions[0].getAttribute("value");this._onSelect(o,{isSyntheticChange:!1})}};getCurrentExample=()=>{const{examples:s,currentExampleKey:o}=this.props,i=s.get(o),a=s.keySeq().first(),u=s.get(a);return i||u||(0,ze.Map)({})};componentDidMount(){const{onSelect:s,examples:o}=this.props;if("function"==typeof s){const s=o.first(),i=o.keyOf(s);this._onSelect(i,{isSyntheticChange:!0})}}UNSAFE_componentWillReceiveProps(s){const{currentExampleKey:o,examples:i}=s;if(i!==this.props.examples&&!i.has(o)){const s=i.first(),o=i.keyOf(s);this._onSelect(o,{isSyntheticChange:!0})}}render(){const{examples:s,currentExampleKey:o,isValueModified:i,isModifiedValueAvailable:a,showLabels:u}=this.props;return Re.createElement("div",{className:"examples-select"},u?Re.createElement("span",{className:"examples-select__section-label"},"Examples: "):null,Re.createElement("select",{className:"examples-select-element",onChange:this._onDomSelect,value:a&&i?"__MODIFIED__VALUE__":o||""},a?Re.createElement("option",{value:"__MODIFIED__VALUE__"},"[Modified value]"):null,s.map(((s,o)=>Re.createElement("option",{key:o,value:o},ze.Map.isMap(s)&&s.get("summary")||o))).valueSeq()))}}const stringifyUnlessList=s=>ze.List.isList(s)?s:stringify(s);class ExamplesSelectValueRetainer extends Re.PureComponent{static defaultProps={userHasEditedBody:!1,examples:(0,ze.Map)({}),currentNamespace:"__DEFAULT__NAMESPACE__",setRetainRequestBodyValueFlag:()=>{},onSelect:(...s)=>console.log("ExamplesSelectValueRetainer: no `onSelect` function was provided",...s),updateValue:(...s)=>console.log("ExamplesSelectValueRetainer: no `updateValue` function was provided",...s)};constructor(s){super(s);const o=this._getCurrentExampleValue();this.state={[s.currentNamespace]:(0,ze.Map)({lastUserEditedValue:this.props.currentUserInputValue,lastDownstreamValue:o,isModifiedValueSelected:this.props.userHasEditedBody||this.props.currentUserInputValue!==o})}}componentWillUnmount(){this.props.setRetainRequestBodyValueFlag(!1)}_getStateForCurrentNamespace=()=>{const{currentNamespace:s}=this.props;return(this.state[s]||(0,ze.Map)()).toObject()};_setStateForCurrentNamespace=s=>{const{currentNamespace:o}=this.props;return this._setStateForNamespace(o,s)};_setStateForNamespace=(s,o)=>{const i=(this.state[s]||(0,ze.Map)()).mergeDeep(o);return this.setState({[s]:i})};_isCurrentUserInputSameAsExampleValue=()=>{const{currentUserInputValue:s}=this.props;return this._getCurrentExampleValue()===s};_getValueForExample=(s,o)=>{const{examples:i}=o||this.props;return stringifyUnlessList((i||(0,ze.Map)({})).getIn([s,"value"]))};_getCurrentExampleValue=s=>{const{currentKey:o}=s||this.props;return this._getValueForExample(o,s||this.props)};_onExamplesSelect=(s,{isSyntheticChange:o}={},...i)=>{const{onSelect:a,updateValue:u,currentUserInputValue:_,userHasEditedBody:w}=this.props,{lastUserEditedValue:x}=this._getStateForCurrentNamespace(),C=this._getValueForExample(s);if("__MODIFIED__VALUE__"===s)return u(stringifyUnlessList(x)),this._setStateForCurrentNamespace({isModifiedValueSelected:!0});"function"==typeof a&&a(s,{isSyntheticChange:o},...i),this._setStateForCurrentNamespace({lastDownstreamValue:C,isModifiedValueSelected:o&&w||!!_&&_!==C}),o||"function"==typeof u&&u(stringifyUnlessList(C))};UNSAFE_componentWillReceiveProps(s){const{currentUserInputValue:o,examples:i,onSelect:a,userHasEditedBody:u}=s,{lastUserEditedValue:_,lastDownstreamValue:w}=this._getStateForCurrentNamespace(),x=this._getValueForExample(s.currentKey,s),C=i.filter((s=>ze.Map.isMap(s)&&(s.get("value")===o||stringify(s.get("value"))===o)));if(C.size){let o;o=C.has(s.currentKey)?s.currentKey:C.keySeq().first(),a(o,{isSyntheticChange:!0})}else o!==this.props.currentUserInputValue&&o!==_&&o!==w&&(this.props.setRetainRequestBodyValueFlag(!0),this._setStateForNamespace(s.currentNamespace,{lastUserEditedValue:s.currentUserInputValue,isModifiedValueSelected:u||o!==x}))}render(){const{currentUserInputValue:s,examples:o,currentKey:i,getComponent:a,userHasEditedBody:u}=this.props,{lastDownstreamValue:_,lastUserEditedValue:w,isModifiedValueSelected:x}=this._getStateForCurrentNamespace(),C=a("ExamplesSelect");return Re.createElement(C,{examples:o,currentExampleKey:i,onSelect:this._onExamplesSelect,isModifiedValueAvailable:!!w&&w!==_,isValueModified:void 0!==s&&x&&s!==this._getCurrentExampleValue()||u})}}function oauth2_authorize_authorize({auth:s,authActions:o,errActions:i,configs:a,authConfigs:u={},currentServer:_}){let{schema:w,scopes:x,name:C,clientId:j}=s,L=w.get("flow"),B=[];switch(L){case"password":return void o.authorizePassword(s);case"application":case"clientCredentials":case"client_credentials":return void o.authorizeApplication(s);case"accessCode":case"authorizationCode":case"authorization_code":B.push("response_type=code");break;case"implicit":B.push("response_type=token")}"string"==typeof j&&B.push("client_id="+encodeURIComponent(j));let $=a.oauth2RedirectUrl;if(void 0===$)return void i.newAuthErr({authId:C,source:"validation",level:"error",message:"oauth2RedirectUrl configuration is not passed. Oauth2 authorization cannot be performed."});B.push("redirect_uri="+encodeURIComponent($));let U=[];if(Array.isArray(x)?U=x:We().List.isList(x)&&(U=x.toArray()),U.length>0){let s=u.scopeSeparator||" ";B.push("scope="+encodeURIComponent(U.join(s)))}let V=utils_btoa(new Date);if(B.push("state="+encodeURIComponent(V)),void 0!==u.realm&&B.push("realm="+encodeURIComponent(u.realm)),("authorizationCode"===L||"authorization_code"===L||"accessCode"===L)&&u.usePkceWithAuthorizationCodeGrant){const o=function generateCodeVerifier(){return b64toB64UrlEncoded(xt()(32).toString("base64"))}(),i=function createCodeChallenge(s){return b64toB64UrlEncoded(Ot()("sha256").update(s).digest("base64"))}(o);B.push("code_challenge="+i),B.push("code_challenge_method=S256"),s.codeVerifier=o}let{additionalQueryStringParams:z}=u;for(let s in z)void 0!==z[s]&&B.push([s,z[s]].map(encodeURIComponent).join("="));const Y=w.get("authorizationUrl");let Z;Z=_?Nt()(sanitizeUrl(Y),_,!0).toString():sanitizeUrl(Y);let ee,ie=[Z,B.join("&")].join("string"!=typeof Y||Y.includes("?")?"&":"?");ee="implicit"===L?o.preAuthorizeImplicit:u.useBasicAuthenticationWithAccessCodeGrant?o.authorizeAccessCodeWithBasicAuthentication:o.authorizeAccessCodeWithFormParams,o.authPopup(ie,{auth:s,state:V,redirectUrl:$,callback:ee,errCb:i.newAuthErr})}class Oauth2 extends Re.Component{constructor(s,o){super(s,o);let{name:i,schema:a,authorized:u,authSelectors:_}=this.props,w=u&&u.get(i),x=_.getConfigs()||{},C=w&&w.get("username")||"",j=w&&w.get("clientId")||x.clientId||"",L=w&&w.get("clientSecret")||x.clientSecret||"",B=w&&w.get("passwordType")||"basic",$=w&&w.get("scopes")||x.scopes||[];"string"==typeof $&&($=$.split(x.scopeSeparator||" ")),this.state={appName:x.appName,name:i,schema:a,scopes:$,clientId:j,clientSecret:L,username:C,password:"",passwordType:B}}close=s=>{s.preventDefault();let{authActions:o}=this.props;o.showDefinitions(!1)};authorize=()=>{let{authActions:s,errActions:o,getConfigs:i,authSelectors:a,oas3Selectors:u}=this.props,_=i(),w=a.getConfigs();o.clear({authId:name,type:"auth",source:"auth"}),oauth2_authorize_authorize({auth:this.state,currentServer:u.serverEffectiveValue(u.selectedServer()),authActions:s,errActions:o,configs:_,authConfigs:w})};onScopeChange=s=>{let{target:o}=s,{checked:i}=o,a=o.dataset.value;if(i&&-1===this.state.scopes.indexOf(a)){let s=this.state.scopes.concat([a]);this.setState({scopes:s})}else!i&&this.state.scopes.indexOf(a)>-1&&this.setState({scopes:this.state.scopes.filter((s=>s!==a))})};onInputChange=s=>{let{target:{dataset:{name:o},value:i}}=s,a={[o]:i};this.setState(a)};selectScopes=s=>{s.target.dataset.all?this.setState({scopes:Array.from((this.props.schema.get("allowedScopes")||this.props.schema.get("scopes")).keys())}):this.setState({scopes:[]})};logout=s=>{s.preventDefault();let{authActions:o,errActions:i,name:a}=this.props;i.clear({authId:a,type:"auth",source:"auth"}),o.logoutWithPersistOption([a])};render(){let{schema:s,getComponent:o,authSelectors:i,errSelectors:a,name:u,specSelectors:_}=this.props;const w=o("Input"),x=o("Row"),C=o("Col"),j=o("Button"),L=o("authError"),B=o("JumpToPath",!0),$=o("Markdown",!0),U=o("InitializedInput"),{isOAS3:V}=_;let z=V()?s.get("openIdConnectUrl"):null;const Y="implicit",Z="password",ee=V()?z?"authorization_code":"authorizationCode":"accessCode",ie=V()?z?"client_credentials":"clientCredentials":"application",ae=i.selectAuthPath(u);let ce=!!(i.getConfigs()||{}).usePkceWithAuthorizationCodeGrant,le=s.get("flow"),pe=le===ee&&ce?le+" with PKCE":le,de=s.get("allowedScopes")||s.get("scopes"),fe=!!i.authorized().get(u),ye=a.allErrors().filter((s=>s.get("authId")===u)),be=!ye.filter((s=>"validation"===s.get("source"))).size,_e=s.get("description");return Re.createElement("div",null,Re.createElement("h4",null,u," (OAuth2, ",pe,") ",Re.createElement(B,{path:ae})),this.state.appName?Re.createElement("h5",null,"Application: ",this.state.appName," "):null,_e&&Re.createElement($,{source:s.get("description")}),fe&&Re.createElement("h6",null,"Authorized"),z&&Re.createElement("p",null,"OpenID Connect URL: ",Re.createElement("code",null,z)),(le===Y||le===ee)&&Re.createElement("p",null,"Authorization URL: ",Re.createElement("code",null,s.get("authorizationUrl"))),(le===Z||le===ee||le===ie)&&Re.createElement("p",null,"Token URL:",Re.createElement("code",null," ",s.get("tokenUrl"))),Re.createElement("p",{className:"flow"},"Flow: ",Re.createElement("code",null,pe)),le!==Z?null:Re.createElement(x,null,Re.createElement(x,null,Re.createElement("label",{htmlFor:"oauth_username"},"username:"),fe?Re.createElement("code",null," ",this.state.username," "):Re.createElement(C,{tablet:10,desktop:10},Re.createElement("input",{id:"oauth_username",type:"text","data-name":"username",onChange:this.onInputChange,autoFocus:!0}))),Re.createElement(x,null,Re.createElement("label",{htmlFor:"oauth_password"},"password:"),fe?Re.createElement("code",null," ****** "):Re.createElement(C,{tablet:10,desktop:10},Re.createElement("input",{id:"oauth_password",type:"password","data-name":"password",onChange:this.onInputChange}))),Re.createElement(x,null,Re.createElement("label",{htmlFor:"password_type"},"Client credentials location:"),fe?Re.createElement("code",null," ",this.state.passwordType," "):Re.createElement(C,{tablet:10,desktop:10},Re.createElement("select",{id:"password_type","data-name":"passwordType",onChange:this.onInputChange},Re.createElement("option",{value:"basic"},"Authorization header"),Re.createElement("option",{value:"request-body"},"Request body"))))),(le===ie||le===Y||le===ee||le===Z)&&(!fe||fe&&this.state.clientId)&&Re.createElement(x,null,Re.createElement("label",{htmlFor:`client_id_${le}`},"client_id:"),fe?Re.createElement("code",null," ****** "):Re.createElement(C,{tablet:10,desktop:10},Re.createElement(U,{id:`client_id_${le}`,type:"text",required:le===Z,initialValue:this.state.clientId,"data-name":"clientId",onChange:this.onInputChange}))),(le===ie||le===ee||le===Z)&&Re.createElement(x,null,Re.createElement("label",{htmlFor:`client_secret_${le}`},"client_secret:"),fe?Re.createElement("code",null," ****** "):Re.createElement(C,{tablet:10,desktop:10},Re.createElement(U,{id:`client_secret_${le}`,initialValue:this.state.clientSecret,type:"password","data-name":"clientSecret",onChange:this.onInputChange}))),!fe&&de&&de.size?Re.createElement("div",{className:"scopes"},Re.createElement("h2",null,"Scopes:",Re.createElement("a",{onClick:this.selectScopes,"data-all":!0},"select all"),Re.createElement("a",{onClick:this.selectScopes},"select none")),de.map(((s,o)=>Re.createElement(x,{key:o},Re.createElement("div",{className:"checkbox"},Re.createElement(w,{"data-value":o,id:`${o}-${le}-checkbox-${this.state.name}`,disabled:fe,checked:this.state.scopes.includes(o),type:"checkbox",onChange:this.onScopeChange}),Re.createElement("label",{htmlFor:`${o}-${le}-checkbox-${this.state.name}`},Re.createElement("span",{className:"item"}),Re.createElement("div",{className:"text"},Re.createElement("p",{className:"name"},o),Re.createElement("p",{className:"description"},s))))))).toArray()):null,ye.valueSeq().map(((s,o)=>Re.createElement(L,{error:s,key:o}))),Re.createElement("div",{className:"auth-btn-wrapper"},be&&(fe?Re.createElement(j,{className:"btn modal-btn auth authorize",onClick:this.logout,"aria-label":"Remove authorization"},"Logout"):Re.createElement(j,{className:"btn modal-btn auth authorize",onClick:this.authorize,"aria-label":"Apply given OAuth2 credentials"},"Authorize")),Re.createElement(j,{className:"btn modal-btn auth btn-done",onClick:this.close},"Close")))}}class Clear extends Re.Component{onClick=()=>{let{specActions:s,path:o,method:i}=this.props;s.clearResponse(o,i),s.clearRequest(o,i)};render(){return Re.createElement("button",{className:"btn btn-clear opblock-control__btn",onClick:this.onClick},"Clear")}}const live_response_Headers=({headers:s})=>Re.createElement("div",null,Re.createElement("h5",null,"Response headers"),Re.createElement("pre",{className:"microlight"},s)),Duration=({duration:s})=>Re.createElement("div",null,Re.createElement("h5",null,"Request duration"),Re.createElement("pre",{className:"microlight"},s," ms"));class LiveResponse extends Re.Component{shouldComponentUpdate(s){return this.props.response!==s.response||this.props.path!==s.path||this.props.method!==s.method||this.props.displayRequestDuration!==s.displayRequestDuration}render(){const{response:s,getComponent:o,getConfigs:i,displayRequestDuration:a,specSelectors:u,path:_,method:w}=this.props,{showMutatedRequest:x,requestSnippetsEnabled:C}=i(),j=x?u.mutatedRequestFor(_,w):u.requestFor(_,w),L=s.get("status"),B=j.get("url"),$=s.get("headers").toJS(),U=s.get("notDocumented"),V=s.get("error"),z=s.get("text"),Y=s.get("duration"),Z=Object.keys($),ee=$["content-type"]||$["Content-Type"],ie=o("responseBody"),ae=Z.map((s=>{var o=Array.isArray($[s])?$[s].join():$[s];return Re.createElement("span",{className:"headerline",key:s}," ",s,": ",o," ")})),ce=0!==ae.length,le=o("Markdown",!0),pe=o("RequestSnippets",!0),de=o("curl",!0);return Re.createElement("div",null,j&&C?Re.createElement(pe,{request:j}):Re.createElement(de,{request:j}),B&&Re.createElement("div",null,Re.createElement("div",{className:"request-url"},Re.createElement("h4",null,"Request URL"),Re.createElement("pre",{className:"microlight"},B))),Re.createElement("h4",null,"Server response"),Re.createElement("table",{className:"responses-table live-responses-table"},Re.createElement("thead",null,Re.createElement("tr",{className:"responses-header"},Re.createElement("td",{className:"col_header response-col_status"},"Code"),Re.createElement("td",{className:"col_header response-col_description"},"Details"))),Re.createElement("tbody",null,Re.createElement("tr",{className:"response"},Re.createElement("td",{className:"response-col_status"},L,U?Re.createElement("div",{className:"response-undocumented"},Re.createElement("i",null," Undocumented ")):null),Re.createElement("td",{className:"response-col_description"},V?Re.createElement(le,{source:`${""!==s.get("name")?`${s.get("name")}: `:""}${s.get("message")}`}):null,z?Re.createElement(ie,{content:z,contentType:ee,url:B,headers:$,getConfigs:i,getComponent:o}):null,ce?Re.createElement(live_response_Headers,{headers:ae}):null,a&&Y?Re.createElement(Duration,{duration:Y}):null)))))}}class OnlineValidatorBadge extends Re.Component{constructor(s,o){super(s,o);let{getConfigs:i}=s,{validatorUrl:a}=i();this.state={url:this.getDefinitionUrl(),validatorUrl:void 0===a?"https://validator.swagger.io/validator":a}}getDefinitionUrl=()=>{let{specSelectors:s}=this.props;return new(Nt())(s.url(),lt.location).toString()};UNSAFE_componentWillReceiveProps(s){let{getConfigs:o}=s,{validatorUrl:i}=o();this.setState({url:this.getDefinitionUrl(),validatorUrl:void 0===i?"https://validator.swagger.io/validator":i})}render(){let{getConfigs:s}=this.props,{spec:o}=s(),i=sanitizeUrl(this.state.validatorUrl);return"object"==typeof o&&Object.keys(o).length?null:this.state.url&&requiresValidationURL(this.state.validatorUrl)&&requiresValidationURL(this.state.url)?Re.createElement("span",{className:"float-right"},Re.createElement("a",{target:"_blank",rel:"noopener noreferrer",href:`${i}/debug?url=${encodeURIComponent(this.state.url)}`},Re.createElement(ValidatorImage,{src:`${i}?url=${encodeURIComponent(this.state.url)}`,alt:"Online validator badge"}))):null}}class ValidatorImage extends Re.Component{constructor(s){super(s),this.state={loaded:!1,error:!1}}componentDidMount(){const s=new Image;s.onload=()=>{this.setState({loaded:!0})},s.onerror=()=>{this.setState({error:!0})},s.src=this.props.src}UNSAFE_componentWillReceiveProps(s){if(s.src!==this.props.src){const o=new Image;o.onload=()=>{this.setState({loaded:!0})},o.onerror=()=>{this.setState({error:!0})},o.src=s.src}}render(){return this.state.error?Re.createElement("img",{alt:"Error"}):this.state.loaded?Re.createElement("img",{src:this.props.src,alt:this.props.alt}):null}}class Operations extends Re.Component{render(){let{specSelectors:s}=this.props;const o=s.taggedOperations();return 0===o.size?Re.createElement("h3",null," No operations defined in spec!"):Re.createElement("div",null,o.map(this.renderOperationTag).toArray(),o.size<1?Re.createElement("h3",null," No operations defined in spec! "):null)}renderOperationTag=(s,o)=>{const{specSelectors:i,getComponent:a,oas3Selectors:u,layoutSelectors:_,layoutActions:w,getConfigs:x}=this.props,C=i.validOperationMethods(),j=a("OperationContainer",!0),L=a("OperationTag"),B=s.get("operations");return Re.createElement(L,{key:"operation-"+o,tagObj:s,tag:o,oas3Selectors:u,layoutSelectors:_,layoutActions:w,getConfigs:x,getComponent:a,specUrl:i.url()},Re.createElement("div",{className:"operation-tag-content"},B.map((s=>{const i=s.get("path"),a=s.get("method"),u=We().List(["paths",i,a]);return-1===C.indexOf(a)?null:Re.createElement(j,{key:`${i}-${a}`,specPath:u,op:s,path:i,method:a,tag:o})})).toArray()))}}class OperationTag extends Re.Component{static defaultProps={tagObj:We().fromJS({}),tag:""};render(){const{tagObj:s,tag:o,children:i,oas3Selectors:a,layoutSelectors:u,layoutActions:_,getConfigs:w,getComponent:x,specUrl:C}=this.props;let{docExpansion:j,deepLinking:L}=w();const B=x("Collapse"),$=x("Markdown",!0),U=x("DeepLink"),V=x("Link"),z=x("ArrowUpIcon"),Y=x("ArrowDownIcon");let Z,ee=s.getIn(["tagDetails","description"],null),ie=s.getIn(["tagDetails","externalDocs","description"]),ae=s.getIn(["tagDetails","externalDocs","url"]);Z=isFunc(a)&&isFunc(a.selectedServer)?safeBuildUrl(ae,C,{selectedServer:a.selectedServer()}):ae;let ce=["operations-tag",o],le=u.isShown(ce,"full"===j||"list"===j);return Re.createElement("div",{className:le?"opblock-tag-section is-open":"opblock-tag-section"},Re.createElement("h3",{onClick:()=>_.show(ce,!le),className:ee?"opblock-tag":"opblock-tag no-desc",id:ce.map((s=>escapeDeepLinkPath(s))).join("-"),"data-tag":o,"data-is-open":le},Re.createElement(U,{enabled:L,isShown:le,path:createDeepLinkPath(o),text:o}),ee?Re.createElement("small",null,Re.createElement($,{source:ee})):Re.createElement("small",null),Z?Re.createElement("div",{className:"info__externaldocs"},Re.createElement("small",null,Re.createElement(V,{href:sanitizeUrl(Z),onClick:s=>s.stopPropagation(),target:"_blank"},ie||Z))):null,Re.createElement("button",{"aria-expanded":le,className:"expand-operation",title:le?"Collapse operation":"Expand operation",onClick:()=>_.show(ce,!le)},le?Re.createElement(z,{className:"arrow"}):Re.createElement(Y,{className:"arrow"}))),Re.createElement(B,{isOpened:le},i))}}class operation_Operation extends Re.PureComponent{static defaultProps={operation:null,response:null,request:null,specPath:(0,ze.List)(),summary:""};render(){let{specPath:s,response:o,request:i,toggleShown:a,onTryoutClick:u,onResetClick:_,onCancelClick:w,onExecute:x,fn:C,getComponent:j,getConfigs:L,specActions:B,specSelectors:$,authActions:U,authSelectors:V,oas3Actions:z,oas3Selectors:Y}=this.props,Z=this.props.operation,{deprecated:ee,isShown:ie,path:ae,method:ce,op:le,tag:pe,operationId:de,allowTryItOut:fe,displayRequestDuration:ye,tryItOutEnabled:be,executeInProgress:_e}=Z.toJS(),{description:Se,externalDocs:we,schemes:xe}=le;const Pe=we?safeBuildUrl(we.url,$.url(),{selectedServer:Y.selectedServer()}):"";let Te=Z.getIn(["op"]),$e=Te.get("responses"),qe=function getList(s,o){if(!We().Iterable.isIterable(s))return We().List();let i=s.getIn(Array.isArray(o)?o:[o]);return We().List.isList(i)?i:We().List()}(Te,["parameters"]),ze=$.operationScheme(ae,ce),He=["operations",pe,de],Ye=getExtensions(Te);const Xe=j("responses"),Qe=j("parameters"),et=j("execute"),tt=j("clear"),rt=j("Collapse"),nt=j("Markdown",!0),st=j("schemes"),ot=j("OperationServers"),it=j("OperationExt"),at=j("OperationSummary"),ct=j("Link"),{showExtensions:lt}=L();if($e&&o&&o.size>0){let s=!$e.get(String(o.get("status")))&&!$e.get("default");o=o.set("notDocumented",s)}let ut=[ae,ce];const pt=$.validationErrors([ae,ce]);return Re.createElement("div",{className:ee?"opblock opblock-deprecated":ie?`opblock opblock-${ce} is-open`:`opblock opblock-${ce}`,id:escapeDeepLinkPath(He.join("-"))},Re.createElement(at,{operationProps:Z,isShown:ie,toggleShown:a,getComponent:j,authActions:U,authSelectors:V,specPath:s}),Re.createElement(rt,{isOpened:ie},Re.createElement("div",{className:"opblock-body"},Te&&Te.size||null===Te?null:Re.createElement(rolling_load,{height:"32px",width:"32px",className:"opblock-loading-animation"}),ee&&Re.createElement("h4",{className:"opblock-title_normal"}," Warning: Deprecated"),Se&&Re.createElement("div",{className:"opblock-description-wrapper"},Re.createElement("div",{className:"opblock-description"},Re.createElement(nt,{source:Se}))),Pe?Re.createElement("div",{className:"opblock-external-docs-wrapper"},Re.createElement("h4",{className:"opblock-title_normal"},"Find more details"),Re.createElement("div",{className:"opblock-external-docs"},we.description&&Re.createElement("span",{className:"opblock-external-docs__description"},Re.createElement(nt,{source:we.description})),Re.createElement(ct,{target:"_blank",className:"opblock-external-docs__link",href:sanitizeUrl(Pe)},Pe))):null,Te&&Te.size?Re.createElement(Qe,{parameters:qe,specPath:s.push("parameters"),operation:Te,onChangeKey:ut,onTryoutClick:u,onResetClick:_,onCancelClick:w,tryItOutEnabled:be,allowTryItOut:fe,fn:C,getComponent:j,specActions:B,specSelectors:$,pathMethod:[ae,ce],getConfigs:L,oas3Actions:z,oas3Selectors:Y}):null,be?Re.createElement(ot,{getComponent:j,path:ae,method:ce,operationServers:Te.get("servers"),pathServers:$.paths().getIn([ae,"servers"]),getSelectedServer:Y.selectedServer,setSelectedServer:z.setSelectedServer,setServerVariableValue:z.setServerVariableValue,getServerVariable:Y.serverVariableValue,getEffectiveServerValue:Y.serverEffectiveValue}):null,be&&fe&&xe&&xe.size?Re.createElement("div",{className:"opblock-schemes"},Re.createElement(st,{schemes:xe,path:ae,method:ce,specActions:B,currentScheme:ze})):null,!be||!fe||pt.length<=0?null:Re.createElement("div",{className:"validation-errors errors-wrapper"},"Please correct the following validation errors and try again.",Re.createElement("ul",null,pt.map(((s,o)=>Re.createElement("li",{key:o}," ",s," "))))),Re.createElement("div",{className:be&&o&&fe?"btn-group":"execute-wrapper"},be&&fe?Re.createElement(et,{operation:Te,specActions:B,specSelectors:$,oas3Selectors:Y,oas3Actions:z,path:ae,method:ce,onExecute:x,disabled:_e}):null,be&&o&&fe?Re.createElement(tt,{specActions:B,path:ae,method:ce}):null),_e?Re.createElement("div",{className:"loading-container"},Re.createElement("div",{className:"loading"})):null,$e?Re.createElement(Xe,{responses:$e,request:i,tryItOutResponse:o,getComponent:j,getConfigs:L,specSelectors:$,oas3Actions:z,oas3Selectors:Y,specActions:B,produces:$.producesOptionsFor([ae,ce]),producesValue:$.currentProducesFor([ae,ce]),specPath:s.push("responses"),path:ae,method:ce,displayRequestDuration:ye,fn:C}):null,lt&&Ye.size?Re.createElement(it,{extensions:Ye,getComponent:j}):null)))}}class OperationContainer extends Re.PureComponent{constructor(s,o){super(s,o);const{tryItOutEnabled:i}=s.getConfigs();this.state={tryItOutEnabled:i,executeInProgress:!1}}static defaultProps={showSummary:!0,response:null,allowTryItOut:!0,displayOperationId:!1,displayRequestDuration:!1};mapStateToProps(s,o){const{op:i,layoutSelectors:a,getConfigs:u}=o,{docExpansion:_,deepLinking:w,displayOperationId:x,displayRequestDuration:C,supportedSubmitMethods:j}=u(),L=a.showSummary(),B=i.getIn(["operation","__originalOperationId"])||i.getIn(["operation","operationId"])||opId(i.get("operation"),o.path,o.method)||i.get("id"),$=["operations",o.tag,B],U=j.indexOf(o.method)>=0&&(void 0===o.allowTryItOut?o.specSelectors.allowTryItOutFor(o.path,o.method):o.allowTryItOut),V=i.getIn(["operation","security"])||o.specSelectors.security();return{operationId:B,isDeepLinkingEnabled:w,showSummary:L,displayOperationId:x,displayRequestDuration:C,allowTryItOut:U,security:V,isAuthorized:o.authSelectors.isAuthorized(V),isShown:a.isShown($,"full"===_),jumpToKey:`paths.${o.path}.${o.method}`,response:o.specSelectors.responseFor(o.path,o.method),request:o.specSelectors.requestFor(o.path,o.method)}}componentDidMount(){const{isShown:s}=this.props,o=this.getResolvedSubtree();s&&void 0===o&&this.requestResolvedSubtree()}componentDidUpdate(s){const{response:o,isShown:i}=this.props,a=this.getResolvedSubtree();o!==s.response&&this.setState({executeInProgress:!1}),i&&void 0===a&&!s.isShown&&this.requestResolvedSubtree()}toggleShown=()=>{let{layoutActions:s,tag:o,operationId:i,isShown:a}=this.props;const u=this.getResolvedSubtree();a||void 0!==u||this.requestResolvedSubtree(),s.show(["operations",o,i],!a)};onCancelClick=()=>{this.setState({tryItOutEnabled:!this.state.tryItOutEnabled})};onTryoutClick=()=>{this.setState({tryItOutEnabled:!this.state.tryItOutEnabled})};onResetClick=s=>{const o=this.props.oas3Selectors.selectDefaultRequestBodyValue(...s),i=this.props.oas3Selectors.requestContentType(...s);if("application/x-www-form-urlencoded"===i||"multipart/form-data"===i){const i=JSON.parse(o);Object.entries(i).forEach((([s,o])=>{Array.isArray(o)?i[s]=i[s].map((s=>"object"==typeof s?JSON.stringify(s,null,2):s)):"object"==typeof o&&(i[s]=JSON.stringify(i[s],null,2))})),this.props.oas3Actions.setRequestBodyValue({value:(0,ze.fromJS)(i),pathMethod:s})}else this.props.oas3Actions.setRequestBodyValue({value:o,pathMethod:s})};onExecute=()=>{this.setState({executeInProgress:!0})};getResolvedSubtree=()=>{const{specSelectors:s,path:o,method:i,specPath:a}=this.props;return a?s.specResolvedSubtree(a.toJS()):s.specResolvedSubtree(["paths",o,i])};requestResolvedSubtree=()=>{const{specActions:s,path:o,method:i,specPath:a}=this.props;return a?s.requestResolvedSubtree(a.toJS()):s.requestResolvedSubtree(["paths",o,i])};render(){let{op:s,tag:o,path:i,method:a,security:u,isAuthorized:_,operationId:w,showSummary:x,isShown:C,jumpToKey:j,allowTryItOut:L,response:B,request:$,displayOperationId:U,displayRequestDuration:V,isDeepLinkingEnabled:z,specPath:Y,specSelectors:Z,specActions:ee,getComponent:ie,getConfigs:ae,layoutSelectors:ce,layoutActions:le,authActions:pe,authSelectors:de,oas3Actions:fe,oas3Selectors:ye,fn:be}=this.props;const _e=ie("operation"),Se=this.getResolvedSubtree()||(0,ze.Map)(),we=(0,ze.fromJS)({op:Se,tag:o,path:i,summary:s.getIn(["operation","summary"])||"",deprecated:Se.get("deprecated")||s.getIn(["operation","deprecated"])||!1,method:a,security:u,isAuthorized:_,operationId:w,originalOperationId:Se.getIn(["operation","__originalOperationId"]),showSummary:x,isShown:C,jumpToKey:j,allowTryItOut:L,request:$,displayOperationId:U,displayRequestDuration:V,isDeepLinkingEnabled:z,executeInProgress:this.state.executeInProgress,tryItOutEnabled:this.state.tryItOutEnabled});return Re.createElement(_e,{operation:we,response:B,request:$,isShown:C,toggleShown:this.toggleShown,onTryoutClick:this.onTryoutClick,onResetClick:this.onResetClick,onCancelClick:this.onCancelClick,onExecute:this.onExecute,specPath:Y,specActions:ee,specSelectors:Z,oas3Actions:fe,oas3Selectors:ye,layoutActions:le,layoutSelectors:ce,authActions:pe,authSelectors:de,getComponent:ie,getConfigs:ae,fn:be})}}var HO=__webpack_require__(13222),KO=__webpack_require__.n(HO);class OperationSummary extends Re.PureComponent{static defaultProps={operationProps:null,specPath:(0,ze.List)(),summary:""};render(){let{isShown:s,toggleShown:o,getComponent:i,authActions:a,authSelectors:u,operationProps:_,specPath:w}=this.props,{summary:x,isAuthorized:C,method:j,op:L,showSummary:B,path:$,operationId:U,originalOperationId:V,displayOperationId:z}=_.toJS(),{summary:Y}=L,Z=_.get("security");const ee=i("authorizeOperationBtn",!0),ie=i("OperationSummaryMethod"),ae=i("OperationSummaryPath"),ce=i("JumpToPath",!0),le=i("CopyToClipboardBtn",!0),pe=i("ArrowUpIcon"),de=i("ArrowDownIcon"),fe=Z&&!!Z.count(),ye=fe&&1===Z.size&&Z.first().isEmpty(),be=!fe||ye;return Re.createElement("div",{className:`opblock-summary opblock-summary-${j}`},Re.createElement("button",{"aria-expanded":s,className:"opblock-summary-control",onClick:o},Re.createElement(ie,{method:j}),Re.createElement("div",{className:"opblock-summary-path-description-wrapper"},Re.createElement(ae,{getComponent:i,operationProps:_,specPath:w}),B?Re.createElement("div",{className:"opblock-summary-description"},KO()(Y||x)):null),z&&(V||U)?Re.createElement("span",{className:"opblock-summary-operation-id"},V||U):null),Re.createElement(le,{textToCopy:`${w.get(1)}`}),be?null:Re.createElement(ee,{isAuthorized:C,onClick:()=>{const s=u.definitionsForRequirements(Z);a.showDefinitions(s)}}),Re.createElement(ce,{path:w}),Re.createElement("button",{"aria-label":`${j} ${$.replace(/\//g,"​/")}`,className:"opblock-control-arrow","aria-expanded":s,tabIndex:"-1",onClick:o},s?Re.createElement(pe,{className:"arrow"}):Re.createElement(de,{className:"arrow"})))}}class OperationSummaryMethod extends Re.PureComponent{static defaultProps={operationProps:null};render(){let{method:s}=this.props;return Re.createElement("span",{className:"opblock-summary-method"},s.toUpperCase())}}class OperationSummaryPath extends Re.PureComponent{render(){let{getComponent:s,operationProps:o}=this.props,{deprecated:i,isShown:a,path:u,tag:_,operationId:w,isDeepLinkingEnabled:x}=o.toJS();const C=u.split(/(?=\/)/g);for(let s=1;s<C.length;s+=2)C.splice(s,0,Re.createElement("wbr",{key:s}));const j=s("DeepLink");return Re.createElement("span",{className:i?"opblock-summary-path__deprecated":"opblock-summary-path","data-path":u},Re.createElement(j,{enabled:x,isShown:a,path:createDeepLinkPath(`${_}/${w}`),text:C}))}}const operation_extensions=({extensions:s,getComponent:o})=>{let i=o("OperationExtRow");return Re.createElement("div",{className:"opblock-section"},Re.createElement("div",{className:"opblock-section-header"},Re.createElement("h4",null,"Extensions")),Re.createElement("div",{className:"table-container"},Re.createElement("table",null,Re.createElement("thead",null,Re.createElement("tr",null,Re.createElement("td",{className:"col_header"},"Field"),Re.createElement("td",{className:"col_header"},"Value"))),Re.createElement("tbody",null,s.entrySeq().map((([s,o])=>Re.createElement(i,{key:`${s}-${o}`,xKey:s,xVal:o})))))))},operation_extension_row=({xKey:s,xVal:o})=>{const i=o?o.toJS?o.toJS():o:null;return Re.createElement("tr",null,Re.createElement("td",null,s),Re.createElement("td",null,JSON.stringify(i)))};function createHtmlReadyId(s,o="_"){return s.replace(/[^\w-]/g,o)}class responses_Responses extends Re.Component{static defaultProps={tryItOutResponse:null,produces:(0,ze.fromJS)(["application/json"]),displayRequestDuration:!1};onChangeProducesWrapper=s=>this.props.specActions.changeProducesValue([this.props.path,this.props.method],s);onResponseContentTypeChange=({controlsAcceptHeader:s,value:o})=>{const{oas3Actions:i,path:a,method:u}=this.props;s&&i.setResponseContentType({value:o,path:a,method:u})};render(){let{responses:s,tryItOutResponse:o,getComponent:i,getConfigs:a,specSelectors:u,fn:_,producesValue:w,displayRequestDuration:x,specPath:C,path:j,method:L,oas3Selectors:B,oas3Actions:$}=this.props,U=function defaultStatusCode(s){let o=s.keySeq();return o.contains(jt)?jt:o.filter((s=>"2"===(s+"")[0])).sort().first()}(s);const V=i("contentType"),z=i("liveResponse"),Y=i("response");let Z=this.props.produces&&this.props.produces.size?this.props.produces:responses_Responses.defaultProps.produces;const ee=u.isOAS3()?function getAcceptControllingResponse(s){if(!We().OrderedMap.isOrderedMap(s))return null;if(!s.size)return null;const o=s.find(((s,o)=>o.startsWith("2")&&Object.keys(s.get("content")||{}).length>0)),i=s.get("default")||We().OrderedMap(),a=(i.get("content")||We().OrderedMap()).keySeq().toJS().length?i:null;return o||a}(s):null,ie=s.filter(((s,o)=>!isExtension(o))),ae=createHtmlReadyId(`${L}${j}_responses`),ce=`${ae}_select`;return ie&&ie.size?Re.createElement("div",{className:"responses-wrapper"},Re.createElement("div",{className:"opblock-section-header"},Re.createElement("h4",null,"Responses"),u.isOAS3()?null:Re.createElement("label",{htmlFor:ce},Re.createElement("span",null,"Response content type"),Re.createElement(V,{value:w,ariaControls:ae,ariaLabel:"Response content type",className:"execute-content-type",contentTypes:Z,controlId:ce,onChange:this.onChangeProducesWrapper}))),Re.createElement("div",{className:"responses-inner"},o?Re.createElement("div",null,Re.createElement(z,{response:o,getComponent:i,getConfigs:a,specSelectors:u,path:this.props.path,method:this.props.method,displayRequestDuration:x}),Re.createElement("h4",null,"Responses")):null,Re.createElement("table",{"aria-live":"polite",className:"responses-table",id:ae,role:"region"},Re.createElement("thead",null,Re.createElement("tr",{className:"responses-header"},Re.createElement("td",{className:"col_header response-col_status"},"Code"),Re.createElement("td",{className:"col_header response-col_description"},"Description"),u.isOAS3()?Re.createElement("td",{className:"col col_header response-col_links"},"Links"):null)),Re.createElement("tbody",null,ie.entrySeq().map((([s,x])=>{let V=o&&o.get("status")==s?"response_current":"";return Re.createElement(Y,{key:s,path:j,method:L,specPath:C.push(s),isDefault:U===s,fn:_,className:V,code:s,response:x,specSelectors:u,controlsAcceptHeader:x===ee,onContentTypeChange:this.onResponseContentTypeChange,contentType:w,getConfigs:a,activeExamplesKey:B.activeExamplesMember(j,L,"responses",s),oas3Actions:$,getComponent:i})})).toArray())))):null}}function getKnownSyntaxHighlighterLanguage(s){const o=function canJsonParse(s){try{return!!JSON.parse(s)}catch(s){return null}}(s);return o?"json":null}class response_Response extends Re.Component{constructor(s,o){super(s,o),this.state={responseContentType:""}}static defaultProps={response:(0,ze.fromJS)({}),onContentTypeChange:()=>{}};_onContentTypeChange=s=>{const{onContentTypeChange:o,controlsAcceptHeader:i}=this.props;this.setState({responseContentType:s}),o({value:s,controlsAcceptHeader:i})};getTargetExamplesKey=()=>{const{response:s,contentType:o,activeExamplesKey:i}=this.props,a=this.state.responseContentType||o,u=s.getIn(["content",a],(0,ze.Map)({})).get("examples",null).keySeq().first();return i||u};render(){let{path:s,method:o,code:i,response:a,className:u,specPath:_,fn:w,getComponent:x,getConfigs:C,specSelectors:j,contentType:L,controlsAcceptHeader:B,oas3Actions:$}=this.props,{inferSchema:U,getSampleSchema:V}=w,z=j.isOAS3();const{showExtensions:Y}=C();let Z=Y?getExtensions(a):null,ee=a.get("headers"),ie=a.get("links");const ae=x("ResponseExtension"),ce=x("headers"),le=x("HighlightCode",!0),pe=x("modelExample"),de=x("Markdown",!0),fe=x("operationLink"),ye=x("contentType"),be=x("ExamplesSelect"),_e=x("Example");var Se,we;const xe=this.state.responseContentType||L,Pe=a.getIn(["content",xe],(0,ze.Map)({})),Te=Pe.get("examples",null);if(z){const s=Pe.get("schema");Se=s?U(s.toJS()):null,we=s?_.push("content",this.state.responseContentType,"schema"):_}else Se=a.get("schema"),we=a.has("schema")?_.push("schema"):_;let $e,qe,We=!1,He={includeReadOnly:!0};if(z)if(qe=Pe.get("schema")?.toJS(),ze.Map.isMap(Te)&&!Te.isEmpty()){const s=this.getTargetExamplesKey(),getMediaTypeExample=s=>ze.Map.isMap(s)?s.get("value"):void 0;$e=getMediaTypeExample(Te.get(s,(0,ze.Map)({}))),void 0===$e&&($e=getMediaTypeExample(Te.values().next().value)),We=!0}else void 0!==Pe.get("example")&&($e=Pe.get("example"),We=!0);else{qe=Se,He={...He,includeWriteOnly:!0};const s=a.getIn(["examples",xe]);s&&($e=s,We=!0)}const Ye=((s,o)=>{if(null==s)return null;const i=getKnownSyntaxHighlighterLanguage(s)?"json":null;return Re.createElement("div",null,Re.createElement(o,{className:"example",language:i},stringify(s)))})(V(qe,xe,He,We?$e:void 0),le);return Re.createElement("tr",{className:"response "+(u||""),"data-code":i},Re.createElement("td",{className:"response-col_status"},i),Re.createElement("td",{className:"response-col_description"},Re.createElement("div",{className:"response-col_description__inner"},Re.createElement(de,{source:a.get("description")})),Y&&Z.size?Z.entrySeq().map((([s,o])=>Re.createElement(ae,{key:`${s}-${o}`,xKey:s,xVal:o}))):null,z&&a.get("content")?Re.createElement("section",{className:"response-controls"},Re.createElement("div",{className:Jn()("response-control-media-type",{"response-control-media-type--accept-controller":B})},Re.createElement("small",{className:"response-control-media-type__title"},"Media type"),Re.createElement(ye,{value:this.state.responseContentType,contentTypes:a.get("content")?a.get("content").keySeq():(0,ze.Seq)(),onChange:this._onContentTypeChange,ariaLabel:"Media Type"}),B?Re.createElement("small",{className:"response-control-media-type__accept-message"},"Controls ",Re.createElement("code",null,"Accept")," header."):null),ze.Map.isMap(Te)&&!Te.isEmpty()?Re.createElement("div",{className:"response-control-examples"},Re.createElement("small",{className:"response-control-examples__title"},"Examples"),Re.createElement(be,{examples:Te,currentExampleKey:this.getTargetExamplesKey(),onSelect:a=>$.setActiveExamplesMember({name:a,pathMethod:[s,o],contextType:"responses",contextName:i}),showLabels:!1})):null):null,Ye||Se?Re.createElement(pe,{specPath:we,getComponent:x,getConfigs:C,specSelectors:j,schema:fromJSOrdered(Se),example:Ye,includeReadOnly:!0}):null,z&&Te?Re.createElement(_e,{example:Te.get(this.getTargetExamplesKey(),(0,ze.Map)({})),getComponent:x,getConfigs:C,omitValue:!0}):null,ee?Re.createElement(ce,{headers:ee,getComponent:x}):null),z?Re.createElement("td",{className:"response-col_links"},ie?ie.toSeq().entrySeq().map((([s,o])=>Re.createElement(fe,{key:s,name:s,link:o,getComponent:x}))):Re.createElement("i",null,"No links")):null)}}const response_extension=({xKey:s,xVal:o})=>Re.createElement("div",{className:"response__extension"},s,": ",String(o));var GO=__webpack_require__(26657),YO=__webpack_require__.n(GO),XO=__webpack_require__(80218),QO=__webpack_require__.n(XO);class ResponseBody extends Re.PureComponent{state={parsedContent:null};updateParsedContent=s=>{const{content:o}=this.props;if(s!==o)if(o&&o instanceof Blob){var i=new FileReader;i.onload=()=>{this.setState({parsedContent:i.result})},i.readAsText(o)}else this.setState({parsedContent:o.toString()})};componentDidMount(){this.updateParsedContent(null)}componentDidUpdate(s){this.updateParsedContent(s.content)}render(){let{content:s,contentType:o,url:i,headers:a={},getComponent:u}=this.props;const{parsedContent:_}=this.state,w=u("HighlightCode",!0),x="response_"+(new Date).getTime();let C,j;if(i=i||"",(/^application\/octet-stream/i.test(o)||a["Content-Disposition"]&&/attachment/i.test(a["Content-Disposition"])||a["content-disposition"]&&/attachment/i.test(a["content-disposition"])||a["Content-Description"]&&/File Transfer/i.test(a["Content-Description"])||a["content-description"]&&/File Transfer/i.test(a["content-description"]))&&(s.size>0||s.length>0))if("Blob"in window){let u=o||"text/html",_=s instanceof Blob?s:new Blob([s],{type:u}),w=window.URL.createObjectURL(_),x=[u,i.substr(i.lastIndexOf("/")+1),w].join(":"),C=a["content-disposition"]||a["Content-Disposition"];if(void 0!==C){let s=function extractFileNameFromContentDispositionHeader(s){let o;if([/filename\*=[^']+'\w*'"([^"]+)";?/i,/filename\*=[^']+'\w*'([^;]+);?/i,/filename="([^;]*);?"/i,/filename=([^;]*);?/i].some((i=>(o=i.exec(s),null!==o))),null!==o&&o.length>1)try{return decodeURIComponent(o[1])}catch(s){console.error(s)}return null}(C);null!==s&&(x=s)}j=lt.navigator&&lt.navigator.msSaveOrOpenBlob?Re.createElement("div",null,Re.createElement("a",{href:w,onClick:()=>lt.navigator.msSaveOrOpenBlob(_,x)},"Download file")):Re.createElement("div",null,Re.createElement("a",{href:w,download:x},"Download file"))}else j=Re.createElement("pre",{className:"microlight"},"Download headers detected but your browser does not support downloading binary via XHR (Blob).");else if(/json/i.test(o)){let o=null;getKnownSyntaxHighlighterLanguage(s)&&(o="json");try{C=JSON.stringify(JSON.parse(s),null,"  ")}catch(o){C="can't parse JSON.  Raw result:\n\n"+s}j=Re.createElement(w,{language:o,downloadable:!0,fileName:`${x}.json`,canCopy:!0},C)}else/xml/i.test(o)?(C=YO()(s,{textNodesOnSameLine:!0,indentor:"  "}),j=Re.createElement(w,{downloadable:!0,fileName:`${x}.xml`,canCopy:!0},C)):j="text/html"===QO()(o)||/text\/plain/.test(o)?Re.createElement(w,{downloadable:!0,fileName:`${x}.html`,canCopy:!0},s):"text/csv"===QO()(o)||/text\/csv/.test(o)?Re.createElement(w,{downloadable:!0,fileName:`${x}.csv`,canCopy:!0},s):/^image\//i.test(o)?o.includes("svg")?Re.createElement("div",null," ",s," "):Re.createElement("img",{src:window.URL.createObjectURL(s)}):/^audio\//i.test(o)?Re.createElement("pre",{className:"microlight"},Re.createElement("audio",{controls:!0,key:i},Re.createElement("source",{src:i,type:o}))):"string"==typeof s?Re.createElement(w,{downloadable:!0,fileName:`${x}.txt`,canCopy:!0},s):s.size>0?_?Re.createElement("div",null,Re.createElement("p",{className:"i"},"Unrecognized response type; displaying content as text."),Re.createElement(w,{downloadable:!0,fileName:`${x}.txt`,canCopy:!0},_)):Re.createElement("p",{className:"i"},"Unrecognized response type; unable to display."):null;return j?Re.createElement("div",null,Re.createElement("h5",null,"Response body"),j):null}}class Parameters extends Re.Component{constructor(s){super(s),this.state={callbackVisible:!1,parametersVisible:!0}}static defaultProps={onTryoutClick:Function.prototype,onCancelClick:Function.prototype,tryItOutEnabled:!1,allowTryItOut:!0,onChangeKey:[],specPath:[]};onChange=(s,o,i)=>{let{specActions:{changeParamByIdentity:a},onChangeKey:u}=this.props;a(u,s,o,i)};onChangeConsumesWrapper=s=>{let{specActions:{changeConsumesValue:o},onChangeKey:i}=this.props;o(i,s)};toggleTab=s=>"parameters"===s?this.setState({parametersVisible:!0,callbackVisible:!1}):"callbacks"===s?this.setState({callbackVisible:!0,parametersVisible:!1}):void 0;onChangeMediaType=({value:s,pathMethod:o})=>{let{specActions:i,oas3Selectors:a,oas3Actions:u}=this.props;const _=a.hasUserEditedBody(...o),w=a.shouldRetainRequestBodyValue(...o);u.setRequestContentType({value:s,pathMethod:o}),u.initRequestBodyValidateError({pathMethod:o}),_||(w||u.setRequestBodyValue({value:void 0,pathMethod:o}),i.clearResponse(...o),i.clearRequest(...o),i.clearValidateParams(o))};render(){let{onTryoutClick:s,onResetClick:o,parameters:i,allowTryItOut:a,tryItOutEnabled:u,specPath:_,fn:w,getComponent:x,getConfigs:C,specSelectors:j,specActions:L,pathMethod:B,oas3Actions:$,oas3Selectors:U,operation:V}=this.props;const z=x("parameterRow"),Y=x("TryItOutButton"),Z=x("contentType"),ee=x("Callbacks",!0),ie=x("RequestBody",!0),ae=u&&a,ce=j.isOAS3(),le=`${createHtmlReadyId(`${B[1]}${B[0]}_requests`)}_select`,pe=V.get("requestBody"),de=Object.values(i.reduce(((s,o)=>{if(ze.Map.isMap(o)){const i=o.get("in");s[i]??=[],s[i].push(o)}return s}),{})).reduce(((s,o)=>s.concat(o)),[]);return Re.createElement("div",{className:"opblock-section"},Re.createElement("div",{className:"opblock-section-header"},ce?Re.createElement("div",{className:"tab-header"},Re.createElement("div",{onClick:()=>this.toggleTab("parameters"),className:`tab-item ${this.state.parametersVisible&&"active"}`},Re.createElement("h4",{className:"opblock-title"},Re.createElement("span",null,"Parameters"))),V.get("callbacks")?Re.createElement("div",{onClick:()=>this.toggleTab("callbacks"),className:`tab-item ${this.state.callbackVisible&&"active"}`},Re.createElement("h4",{className:"opblock-title"},Re.createElement("span",null,"Callbacks"))):null):Re.createElement("div",{className:"tab-header"},Re.createElement("h4",{className:"opblock-title"},"Parameters")),a?Re.createElement(Y,{isOAS3:j.isOAS3(),hasUserEditedBody:U.hasUserEditedBody(...B),enabled:u,onCancelClick:this.props.onCancelClick,onTryoutClick:s,onResetClick:()=>o(B)}):null),this.state.parametersVisible?Re.createElement("div",{className:"parameters-container"},de.length?Re.createElement("div",{className:"table-container"},Re.createElement("table",{className:"parameters"},Re.createElement("thead",null,Re.createElement("tr",null,Re.createElement("th",{className:"col_header parameters-col_name"},"Name"),Re.createElement("th",{className:"col_header parameters-col_description"},"Description"))),Re.createElement("tbody",null,de.map(((s,o)=>Re.createElement(z,{fn:w,specPath:_.push(o.toString()),getComponent:x,getConfigs:C,rawParam:s,param:j.parameterWithMetaByIdentity(B,s),key:`${s.get("in")}.${s.get("name")}`,onChange:this.onChange,onChangeConsumes:this.onChangeConsumesWrapper,specSelectors:j,specActions:L,oas3Actions:$,oas3Selectors:U,pathMethod:B,isExecute:ae})))))):Re.createElement("div",{className:"opblock-description-wrapper"},Re.createElement("p",null,"No parameters"))):null,this.state.callbackVisible?Re.createElement("div",{className:"callbacks-container opblock-description-wrapper"},Re.createElement(ee,{callbacks:(0,ze.Map)(V.get("callbacks")),specPath:_.slice(0,-1).push("callbacks")})):null,ce&&pe&&this.state.parametersVisible&&Re.createElement("div",{className:"opblock-section opblock-section-request-body"},Re.createElement("div",{className:"opblock-section-header"},Re.createElement("h4",{className:`opblock-title parameter__name ${pe.get("required")&&"required"}`},"Request body"),Re.createElement("label",{id:le},Re.createElement(Z,{value:U.requestContentType(...B),contentTypes:pe.get("content",(0,ze.List)()).keySeq(),onChange:s=>{this.onChangeMediaType({value:s,pathMethod:B})},className:"body-param-content-type",ariaLabel:"Request content type",controlId:le}))),Re.createElement("div",{className:"opblock-description-wrapper"},Re.createElement(ie,{setRetainRequestBodyValueFlag:s=>$.setRetainRequestBodyValueFlag({value:s,pathMethod:B}),userHasEditedBody:U.hasUserEditedBody(...B),specPath:_.slice(0,-1).push("requestBody"),requestBody:pe,requestBodyValue:U.requestBodyValue(...B),requestBodyInclusionSetting:U.requestBodyInclusionSetting(...B),requestBodyErrors:U.requestBodyErrors(...B),isExecute:ae,getConfigs:C,activeExamplesKey:U.activeExamplesMember(...B,"requestBody","requestBody"),updateActiveExamplesKey:s=>{this.props.oas3Actions.setActiveExamplesMember({name:s,pathMethod:this.props.pathMethod,contextType:"requestBody",contextName:"requestBody"})},onChange:(s,o)=>{if(o){const i=U.requestBodyValue(...B),a=ze.Map.isMap(i)?i:(0,ze.Map)();return $.setRequestBodyValue({pathMethod:B,value:a.setIn(o,s)})}$.setRequestBodyValue({value:s,pathMethod:B})},onChangeIncludeEmpty:(s,o)=>{$.setRequestBodyInclusion({pathMethod:B,value:o,name:s})},contentType:U.requestContentType(...B)}))))}}const parameter_extension=({xKey:s,xVal:o})=>Re.createElement("div",{className:"parameter__extension"},s,": ",String(o)),ZO={onChange:()=>{},isIncludedOptions:{}};class ParameterIncludeEmpty extends Re.Component{static defaultProps=ZO;componentDidMount(){const{isIncludedOptions:s,onChange:o}=this.props,{shouldDispatchInit:i,defaultValue:a}=s;i&&o(a)}onCheckboxChange=s=>{const{onChange:o}=this.props;o(s.target.checked)};render(){let{isIncluded:s,isDisabled:o}=this.props;return Re.createElement("div",null,Re.createElement("label",{htmlFor:"include_empty_value",className:Jn()("parameter__empty_value_toggle",{disabled:o})},Re.createElement("input",{id:"include_empty_value",type:"checkbox",disabled:o,checked:!o&&s,onChange:this.onCheckboxChange}),"Send empty value"))}}class ParameterRow extends Re.Component{constructor(s,o){super(s,o),this.setDefaultValue()}UNSAFE_componentWillReceiveProps(s){let o,{specSelectors:i,pathMethod:a,rawParam:u}=s,_=i.isOAS3(),w=i.parameterWithMetaByIdentity(a,u)||new ze.Map;if(w=w.isEmpty()?u:w,_){let{schema:s}=getParameterSchema(w,{isOAS3:_});o=s?s.get("enum"):void 0}else o=w?w.get("enum"):void 0;let x,C=w?w.get("value"):void 0;void 0!==C?x=C:u.get("required")&&o&&o.size&&(x=o.first()),void 0!==x&&x!==C&&this.onChangeWrapper(function numberToString(s){return"number"==typeof s?s.toString():s}(x)),this.setDefaultValue()}onChangeWrapper=(s,o=!1)=>{let i,{onChange:a,rawParam:u}=this.props;return i=""===s||s&&0===s.size?null:s,a(u,i,o)};_onExampleSelect=s=>{this.props.oas3Actions.setActiveExamplesMember({name:s,pathMethod:this.props.pathMethod,contextType:"parameters",contextName:this.getParamKey()})};onChangeIncludeEmpty=s=>{let{specActions:o,param:i,pathMethod:a}=this.props;const u=i.get("name"),_=i.get("in");return o.updateEmptyParamInclusion(a,u,_,s)};setDefaultValue=()=>{let{specSelectors:s,pathMethod:o,rawParam:i,oas3Selectors:a,fn:u}=this.props;const _=s.parameterWithMetaByIdentity(o,i)||(0,ze.Map)();let{schema:w}=getParameterSchema(_,{isOAS3:s.isOAS3()});const x=_.get("content",(0,ze.Map)()).keySeq().first(),C=w?u.getSampleSchema(w.toJS(),x,{includeWriteOnly:!0}):null;if(_&&void 0===_.get("value")&&"body"!==_.get("in")){let i;if(s.isSwagger2())i=void 0!==_.get("x-example")?_.get("x-example"):void 0!==_.getIn(["schema","example"])?_.getIn(["schema","example"]):w&&w.getIn(["default"]);else if(s.isOAS3()){w=this.composeJsonSchema(w);const s=a.activeExamplesMember(...o,"parameters",this.getParamKey());i=void 0!==_.getIn(["examples",s,"value"])?_.getIn(["examples",s,"value"]):void 0!==_.getIn(["content",x,"example"])?_.getIn(["content",x,"example"]):void 0!==_.get("example")?_.get("example"):void 0!==(w&&w.get("example"))?w&&w.get("example"):void 0!==(w&&w.get("default"))?w&&w.get("default"):_.get("default")}void 0===i||ze.List.isList(i)||(i=stringify(i));const j=u.getSchemaObjectType(w),L=u.getSchemaObjectType(w?.get("items"));void 0!==i?this.onChangeWrapper(i):"object"===j&&C&&!_.get("examples")?this.onChangeWrapper(ze.List.isList(C)?C:stringify(C)):"array"===j&&"object"===L&&C&&!_.get("examples")&&this.onChangeWrapper(ze.List.isList(C)?C:(0,ze.List)(JSON.parse(C)))}};getParamKey(){const{param:s}=this.props;return s?`${s.get("name")}-${s.get("in")}`:null}composeJsonSchema(s){const{fn:o}=this.props,i=s.get("oneOf")?.get(0)?.toJS(),a=s.get("anyOf")?.get(0)?.toJS();return(0,ze.fromJS)(o.mergeJsonSchema(s.toJS(),i??a??{}))}render(){let{param:s,rawParam:o,getComponent:i,getConfigs:a,isExecute:u,fn:_,onChangeConsumes:w,specSelectors:x,pathMethod:C,specPath:j,oas3Selectors:L}=this.props,B=x.isOAS3();const{showExtensions:$,showCommonExtensions:U}=a();if(s||(s=o),!o)return null;const V=i("JsonSchemaForm"),z=i("ParamBody");let Y=s.get("in"),Z="body"!==Y?null:Re.createElement(z,{getComponent:i,getConfigs:a,fn:_,param:s,consumes:x.consumesOptionsFor(C),consumesValue:x.contentTypeValues(C).get("requestContentType"),onChange:this.onChangeWrapper,onChangeConsumes:w,isExecute:u,specSelectors:x,pathMethod:C});const ee=i("modelExample"),ie=i("Markdown",!0),ae=i("ParameterExt"),ce=i("ParameterIncludeEmpty"),le=i("ExamplesSelectValueRetainer"),pe=i("Example");let{schema:de}=getParameterSchema(s,{isOAS3:B}),fe=x.parameterWithMetaByIdentity(C,o)||(0,ze.Map)();const ye=fe.get("content",(0,ze.Map)()).keySeq().first();B&&(de=this.composeJsonSchema(de));let be=de?de.get("format"):null,_e="formData"===Y,Se="FormData"in lt,we=s.get("required");const xe=_.getSchemaObjectType(de),Pe=_.getSchemaObjectType(de?.get("items")),Te=_.getSchemaObjectTypeLabel(de),$e=!Z&&"object"===xe,qe=!Z&&"object"===Pe;let We,He,Ye,Xe,Qe=fe?fe.get("value"):"",et=U?getCommonExtensions(de):null,tt=$?getExtensions(s):null,rt=!1;void 0!==s&&de&&(We=de.get("items")),void 0!==We?(He=We.get("enum"),Ye=We.get("default")):de&&(He=de.get("enum")),He&&He.size&&He.size>0&&(rt=!0),void 0!==s&&(de&&(Ye=de.get("default")),void 0===Ye&&(Ye=s.get("default")),Xe=s.get("example"),void 0===Xe&&(Xe=s.get("x-example")));const nt=Z?null:Re.createElement(V,{fn:_,getComponent:i,value:Qe,required:we,disabled:!u,description:s.get("name"),onChange:this.onChangeWrapper,errors:fe.get("errors"),schema:de});return Re.createElement("tr",{"data-param-name":s.get("name"),"data-param-in":s.get("in")},Re.createElement("td",{className:"parameters-col_name"},Re.createElement("div",{className:we?"parameter__name required":"parameter__name"},s.get("name"),we?Re.createElement("span",null," *"):null),Re.createElement("div",{className:"parameter__type"},Te,be&&Re.createElement("span",{className:"prop-format"},"($",be,")")),Re.createElement("div",{className:"parameter__deprecated"},B&&s.get("deprecated")?"deprecated":null),Re.createElement("div",{className:"parameter__in"},"(",s.get("in"),")")),Re.createElement("td",{className:"parameters-col_description"},s.get("description")?Re.createElement(ie,{source:s.get("description")}):null,!Z&&u||!rt?null:Re.createElement(ie,{className:"parameter__enum",source:"<i>Available values</i> : "+He.map((function(s){return s})).toArray().map(String).join(", ")}),!Z&&u||void 0===Ye?null:Re.createElement(ie,{className:"parameter__default",source:"<i>Default value</i> : "+Ye}),!Z&&u||void 0===Xe?null:Re.createElement(ie,{source:"<i>Example</i> : "+Xe}),_e&&!Se&&Re.createElement("div",null,"Error: your browser does not support FormData"),B&&s.get("examples")?Re.createElement("section",{className:"parameter-controls"},Re.createElement(le,{examples:s.get("examples"),onSelect:this._onExampleSelect,updateValue:this.onChangeWrapper,getComponent:i,defaultToFirstExample:!0,currentKey:L.activeExamplesMember(...C,"parameters",this.getParamKey()),currentUserInputValue:Qe})):null,$e||qe?Re.createElement(ee,{getComponent:i,specPath:ye?j.push("content",ye,"schema"):j.push("schema"),getConfigs:a,isExecute:u,specSelectors:x,schema:de,example:nt}):nt,Z&&de?Re.createElement(ee,{getComponent:i,specPath:j.push("schema"),getConfigs:a,isExecute:u,specSelectors:x,schema:de,example:Z,includeWriteOnly:!0}):null,!Z&&u&&s.get("allowEmptyValue")?Re.createElement(ce,{onChange:this.onChangeIncludeEmpty,isIncluded:x.parameterInclusionSettingFor(C,s.get("name"),s.get("in")),isDisabled:!isEmptyValue(Qe)}):null,B&&s.get("examples")?Re.createElement(pe,{example:s.getIn(["examples",L.activeExamplesMember(...C,"parameters",this.getParamKey())]),getComponent:i,getConfigs:a}):null,U&&et.size?et.entrySeq().map((([s,o])=>Re.createElement(ae,{key:`${s}-${o}`,xKey:s,xVal:o}))):null,$&&tt.size?tt.entrySeq().map((([s,o])=>Re.createElement(ae,{key:`${s}-${o}`,xKey:s,xVal:o}))):null))}}class Execute extends Re.Component{handleValidateParameters=()=>{let{specSelectors:s,specActions:o,path:i,method:a}=this.props;return o.validateParams([i,a]),s.validateBeforeExecute([i,a])};handleValidateRequestBody=()=>{let{path:s,method:o,specSelectors:i,oas3Selectors:a,oas3Actions:u}=this.props,_={missingBodyValue:!1,missingRequiredKeys:[]};u.clearRequestBodyValidateError({path:s,method:o});let w=i.getOAS3RequiredRequestBodyContentType([s,o]),x=a.requestBodyValue(s,o),C=a.validateBeforeExecute([s,o]),j=a.requestContentType(s,o);if(!C)return _.missingBodyValue=!0,u.setRequestBodyValidateError({path:s,method:o,validationErrors:_}),!1;if(!w)return!0;let L=a.validateShallowRequired({oas3RequiredRequestBodyContentType:w,oas3RequestContentType:j,oas3RequestBodyValue:x});return!L||L.length<1||(L.forEach((s=>{_.missingRequiredKeys.push(s)})),u.setRequestBodyValidateError({path:s,method:o,validationErrors:_}),!1)};handleValidationResultPass=()=>{let{specActions:s,operation:o,path:i,method:a}=this.props;this.props.onExecute&&this.props.onExecute(),s.execute({operation:o,path:i,method:a})};handleValidationResultFail=()=>{let{specActions:s,path:o,method:i}=this.props;s.clearValidateParams([o,i]),setTimeout((()=>{s.validateParams([o,i])}),40)};handleValidationResult=s=>{s?this.handleValidationResultPass():this.handleValidationResultFail()};onClick=()=>{let s=this.handleValidateParameters(),o=this.handleValidateRequestBody(),i=s&&o;this.handleValidationResult(i)};onChangeProducesWrapper=s=>this.props.specActions.changeProducesValue([this.props.path,this.props.method],s);render(){const{disabled:s}=this.props;return Re.createElement("button",{className:"btn execute opblock-control__btn",onClick:this.onClick,disabled:s},"Execute")}}class headers_Headers extends Re.Component{render(){let{headers:s,getComponent:o}=this.props;const i=o("Property"),a=o("Markdown",!0);return s&&s.size?Re.createElement("div",{className:"headers-wrapper"},Re.createElement("h4",{className:"headers__title"},"Headers:"),Re.createElement("table",{className:"headers"},Re.createElement("thead",null,Re.createElement("tr",{className:"header-row"},Re.createElement("th",{className:"header-col"},"Name"),Re.createElement("th",{className:"header-col"},"Description"),Re.createElement("th",{className:"header-col"},"Type"))),Re.createElement("tbody",null,s.entrySeq().map((([s,o])=>{if(!We().Map.isMap(o))return null;const u=o.get("description"),_=o.getIn(["schema"])?o.getIn(["schema","type"]):o.getIn(["type"]),w=o.getIn(["schema","example"]);return Re.createElement("tr",{key:s},Re.createElement("td",{className:"header-col"},s),Re.createElement("td",{className:"header-col"},u?Re.createElement(a,{source:u}):null),Re.createElement("td",{className:"header-col"},_," ",w?Re.createElement(i,{propKey:"Example",propVal:w,propClass:"header-example"}):null))})).toArray()))):null}}class Errors extends Re.Component{render(){let{editorActions:s,errSelectors:o,layoutSelectors:i,layoutActions:a,getComponent:u}=this.props;const _=u("Collapse");if(s&&s.jumpToLine)var w=s.jumpToLine;let x=o.allErrors().filter((s=>"thrown"===s.get("type")||"error"===s.get("level")));if(!x||x.count()<1)return null;let C=i.isShown(["errorPane"],!0),j=x.sortBy((s=>s.get("line")));return Re.createElement("pre",{className:"errors-wrapper"},Re.createElement("hgroup",{className:"error"},Re.createElement("h4",{className:"errors__title"},"Errors"),Re.createElement("button",{className:"btn errors__clear-btn",onClick:()=>a.show(["errorPane"],!C)},C?"Hide":"Show")),Re.createElement(_,{isOpened:C,animated:!0},Re.createElement("div",{className:"errors"},j.map(((s,o)=>{let i=s.get("type");return"thrown"===i||"auth"===i?Re.createElement(ThrownErrorItem,{key:o,error:s.get("error")||s,jumpToLine:w}):"spec"===i?Re.createElement(SpecErrorItem,{key:o,error:s,jumpToLine:w}):void 0})))))}}const ThrownErrorItem=({error:s,jumpToLine:o})=>{if(!s)return null;let i=s.get("line");return Re.createElement("div",{className:"error-wrapper"},s?Re.createElement("div",null,Re.createElement("h4",null,s.get("source")&&s.get("level")?toTitleCase(s.get("source"))+" "+s.get("level"):"",s.get("path")?Re.createElement("small",null," at ",s.get("path")):null),Re.createElement("span",{className:"message thrown"},s.get("message")),Re.createElement("div",{className:"error-line"},i&&o?Re.createElement("a",{onClick:o.bind(null,i)},"Jump to line ",i):null)):null)},SpecErrorItem=({error:s,jumpToLine:o=null})=>{let i=null;return s.get("path")?i=ze.List.isList(s.get("path"))?Re.createElement("small",null,"at ",s.get("path").join(".")):Re.createElement("small",null,"at ",s.get("path")):s.get("line")&&!o&&(i=Re.createElement("small",null,"on line ",s.get("line"))),Re.createElement("div",{className:"error-wrapper"},s?Re.createElement("div",null,Re.createElement("h4",null,toTitleCase(s.get("source"))+" "+s.get("level")," ",i),Re.createElement("span",{className:"message"},s.get("message")),Re.createElement("div",{className:"error-line"},o?Re.createElement("a",{onClick:o.bind(null,s.get("line"))},"Jump to line ",s.get("line")):null)):null)};function toTitleCase(s){return(s||"").split(" ").map((s=>s[0].toUpperCase()+s.slice(1))).join(" ")}const content_type_noop=()=>{};class ContentType extends Re.Component{static defaultProps={onChange:content_type_noop,value:null,contentTypes:(0,ze.fromJS)(["application/json"])};componentDidMount(){const{contentTypes:s,onChange:o}=this.props;s&&s.size&&o(s.first())}componentDidUpdate(){const{contentTypes:s,value:o,onChange:i}=this.props;s&&s.size&&(s.includes(o)||i(s.first()))}onChangeWrapper=s=>this.props.onChange(s.target.value);render(){let{ariaControls:s,ariaLabel:o,className:i,contentTypes:a,controlId:u,value:_}=this.props;return a&&a.size?Re.createElement("div",{className:"content-type-wrapper "+(i||"")},Re.createElement("select",{"aria-controls":s,"aria-label":o,className:"content-type",id:u,onChange:this.onChangeWrapper,value:_||""},a.map((s=>Re.createElement("option",{key:s,value:s},s))).toArray())):null}}function xclass(...s){return s.filter((s=>!!s)).join(" ").trim()}class Container extends Re.Component{render(){let{fullscreen:s,full:o,...i}=this.props;if(s)return Re.createElement("section",i);let a="swagger-container"+(o?"-full":"");return Re.createElement("section",Mn()({},i,{className:xclass(i.className,a)}))}}const eA={mobile:"",tablet:"-tablet",desktop:"-desktop",large:"-hd"};class Col extends Re.Component{render(){const{hide:s,keepContents:o,mobile:i,tablet:a,desktop:u,large:_,...w}=this.props;if(s&&!o)return Re.createElement("span",null);let x=[];for(let s in eA){if(!Object.prototype.hasOwnProperty.call(eA,s))continue;let o=eA[s];if(s in this.props){let i=this.props[s];if(i<1){x.push("none"+o);continue}x.push("block"+o),x.push("col-"+i+o)}}s&&x.push("hidden");let C=xclass(w.className,...x);return Re.createElement("section",Mn()({},w,{className:C}))}}class Row extends Re.Component{render(){return Re.createElement("div",Mn()({},this.props,{className:xclass(this.props.className,"wrapper")}))}}class Button extends Re.Component{static defaultProps={className:""};render(){return Re.createElement("button",Mn()({},this.props,{className:xclass(this.props.className,"button")}))}}const TextArea=s=>Re.createElement("textarea",s),Input=s=>Re.createElement("input",s);class Select extends Re.Component{static defaultProps={multiple:!1,allowEmptyValue:!0};constructor(s,o){let i;super(s,o),i=s.value?s.value:s.multiple?[""]:"",this.state={value:i}}onChange=s=>{let o,{onChange:i,multiple:a}=this.props,u=[].slice.call(s.target.options);o=a?u.filter((function(s){return s.selected})).map((function(s){return s.value})):s.target.value,this.setState({value:o}),i&&i(o)};UNSAFE_componentWillReceiveProps(s){s.value!==this.props.value&&this.setState({value:s.value})}render(){let{allowedValues:s,multiple:o,allowEmptyValue:i,disabled:a}=this.props,u=this.state.value?.toJS?.()||this.state.value;return Re.createElement("select",{className:this.props.className,multiple:o,value:u,onChange:this.onChange,disabled:a},i?Re.createElement("option",{value:""},"--"):null,s.map((function(s,o){return Re.createElement("option",{key:o,value:String(s)},String(s))})))}}class layout_utils_Link extends Re.Component{render(){return Re.createElement("a",Mn()({},this.props,{rel:"noopener noreferrer",className:xclass(this.props.className,"link")}))}}const NoMargin=({children:s})=>Re.createElement("div",{className:"no-margin"}," ",s," ");class Collapse extends Re.Component{static defaultProps={isOpened:!1,animated:!1};renderNotAnimated(){return this.props.isOpened?Re.createElement(NoMargin,null,this.props.children):Re.createElement("noscript",null)}render(){let{animated:s,isOpened:o,children:i}=this.props;return s?(i=o?i:null,Re.createElement(NoMargin,null,i)):this.renderNotAnimated()}}class Overview extends Re.Component{constructor(...s){super(...s),this.setTagShown=this._setTagShown.bind(this)}_setTagShown(s,o){this.props.layoutActions.show(s,o)}showOp(s,o){let{layoutActions:i}=this.props;i.show(s,o)}render(){let{specSelectors:s,layoutSelectors:o,layoutActions:i,getComponent:a}=this.props,u=s.taggedOperations();const _=a("Collapse");return Re.createElement("div",null,Re.createElement("h4",{className:"overview-title"},"Overview"),u.map(((s,a)=>{let u=s.get("operations"),w=["overview-tags",a],x=o.isShown(w,!0);return Re.createElement("div",{key:"overview-"+a},Re.createElement("h4",{onClick:()=>i.show(w,!x),className:"link overview-tag"}," ",x?"-":"+",a),Re.createElement(_,{isOpened:x,animated:!0},u.map((s=>{let{path:a,method:u,id:_}=s.toObject(),w="operations",x=_,C=o.isShown([w,x]);return Re.createElement(OperationLink,{key:_,path:a,method:u,id:a+"-"+u,shown:C,showOpId:x,showOpIdPrefix:w,href:`#operation-${x}`,onClick:i.show})})).toArray()))})).toArray(),u.size<1&&Re.createElement("h3",null," No operations defined in spec! "))}}class OperationLink extends Re.Component{constructor(s){super(s),this.onClick=this._onClick.bind(this)}_onClick(){let{showOpId:s,showOpIdPrefix:o,onClick:i,shown:a}=this.props;i([o,s],!a)}render(){let{id:s,method:o,shown:i,href:a}=this.props;return Re.createElement(layout_utils_Link,{href:a,onClick:this.onClick,className:"block opblock-link "+(i?"shown":"")},Re.createElement("div",null,Re.createElement("small",{className:`bold-label-${o}`},o.toUpperCase()),Re.createElement("span",{className:"bold-label"},s)))}}class InitializedInput extends Re.Component{componentDidMount(){this.props.initialValue&&(this.inputRef.value=this.props.initialValue)}render(){const{value:s,defaultValue:o,initialValue:i,...a}=this.props;return Re.createElement("input",Mn()({},a,{ref:s=>this.inputRef=s}))}}class InfoBasePath extends Re.Component{render(){const{host:s,basePath:o}=this.props;return Re.createElement("pre",{className:"base-url"},"[ Base URL: ",s,o," ]")}}class InfoUrl extends Re.PureComponent{render(){const{url:s,getComponent:o}=this.props,i=o("Link");return Re.createElement(i,{target:"_blank",href:sanitizeUrl(s)},Re.createElement("span",{className:"url"}," ",s))}}class info_Info extends Re.Component{render(){const{info:s,url:o,host:i,basePath:a,getComponent:u,externalDocs:_,selectedServer:w,url:x}=this.props,C=s.get("version"),j=s.get("description"),L=s.get("title"),B=safeBuildUrl(s.get("termsOfService"),x,{selectedServer:w}),$=s.get("contact"),U=s.get("license"),V=safeBuildUrl(_&&_.get("url"),x,{selectedServer:w}),z=_&&_.get("description"),Y=u("Markdown",!0),Z=u("Link"),ee=u("VersionStamp"),ie=u("OpenAPIVersion"),ae=u("InfoUrl"),ce=u("InfoBasePath"),le=u("License"),pe=u("Contact");return Re.createElement("div",{className:"info"},Re.createElement("hgroup",{className:"main"},Re.createElement("h1",{className:"title"},L,Re.createElement("span",null,C&&Re.createElement(ee,{version:C}),Re.createElement(ie,{oasVersion:"2.0"}))),i||a?Re.createElement(ce,{host:i,basePath:a}):null,o&&Re.createElement(ae,{getComponent:u,url:o})),Re.createElement("div",{className:"description"},Re.createElement(Y,{source:j})),B&&Re.createElement("div",{className:"info__tos"},Re.createElement(Z,{target:"_blank",href:sanitizeUrl(B)},"Terms of service")),$?.size>0&&Re.createElement(pe,{getComponent:u,data:$,selectedServer:w,url:o}),U?.size>0&&Re.createElement(le,{getComponent:u,license:U,selectedServer:w,url:o}),V?Re.createElement(Z,{className:"info__extdocs",target:"_blank",href:sanitizeUrl(V)},z||V):null)}}const tA=info_Info;class InfoContainer extends Re.Component{render(){const{specSelectors:s,getComponent:o,oas3Selectors:i}=this.props,a=s.info(),u=s.url(),_=s.basePath(),w=s.host(),x=s.externalDocs(),C=i.selectedServer(),j=o("info");return Re.createElement("div",null,a&&a.count()?Re.createElement(j,{info:a,url:u,host:w,basePath:_,externalDocs:x,getComponent:o,selectedServer:C}):null)}}class contact_Contact extends Re.Component{render(){const{data:s,getComponent:o,selectedServer:i,url:a}=this.props,u=s.get("name","the developer"),_=safeBuildUrl(s.get("url"),a,{selectedServer:i}),w=s.get("email"),x=o("Link");return Re.createElement("div",{className:"info__contact"},_&&Re.createElement("div",null,Re.createElement(x,{href:sanitizeUrl(_),target:"_blank"},u," - Website")),w&&Re.createElement(x,{href:sanitizeUrl(`mailto:${w}`)},_?`Send email to ${u}`:`Contact ${u}`))}}const rA=contact_Contact;class license_License extends Re.Component{render(){const{license:s,getComponent:o,selectedServer:i,url:a}=this.props,u=s.get("name","License"),_=safeBuildUrl(s.get("url"),a,{selectedServer:i}),w=o("Link");return Re.createElement("div",{className:"info__license"},_?Re.createElement("div",{className:"info__license__url"},Re.createElement(w,{target:"_blank",href:sanitizeUrl(_)},u)):Re.createElement("span",null,u))}}const nA=license_License;class JumpToPath extends Re.Component{render(){return null}}class CopyToClipboardBtn extends Re.Component{render(){let{getComponent:s}=this.props;const o=s("CopyIcon");return Re.createElement("div",{className:"view-line-link copy-to-clipboard",title:"Copy to clipboard"},Re.createElement(Hn.CopyToClipboard,{text:this.props.textToCopy},Re.createElement(o,null)))}}class Footer extends Re.Component{render(){return Re.createElement("div",{className:"footer"})}}class FilterContainer extends Re.Component{onFilterChange=s=>{const{target:{value:o}}=s;this.props.layoutActions.updateFilter(o)};render(){const{specSelectors:s,layoutSelectors:o,getComponent:i}=this.props,a=i("Col"),u="loading"===s.loadingStatus(),_="failed"===s.loadingStatus(),w=o.currentFilter(),x=["operation-filter-input"];return _&&x.push("failed"),u&&x.push("loading"),Re.createElement("div",null,!1===w?null:Re.createElement("div",{className:"filter-container"},Re.createElement(a,{className:"filter wrapper",mobile:12},Re.createElement("input",{className:x.join(" "),placeholder:"Filter by tag",type:"text",onChange:this.onFilterChange,value:"string"==typeof w?w:"",disabled:u}))))}}const sA=Function.prototype;class ParamBody extends Re.PureComponent{static defaultProp={consumes:(0,ze.fromJS)(["application/json"]),param:(0,ze.fromJS)({}),onChange:sA,onChangeConsumes:sA};constructor(s,o){super(s,o),this.state={isEditBox:!1,value:""}}componentDidMount(){this.updateValues.call(this,this.props)}UNSAFE_componentWillReceiveProps(s){this.updateValues.call(this,s)}updateValues=s=>{let{param:o,isExecute:i,consumesValue:a=""}=s,u=/xml/i.test(a),_=/json/i.test(a),w=u?o.get("value_xml"):o.get("value");if(void 0!==w){let s=!w&&_?"{}":w;this.setState({value:s}),this.onChange(s,{isXml:u,isEditBox:i})}else u?this.onChange(this.sample("xml"),{isXml:u,isEditBox:i}):this.onChange(this.sample(),{isEditBox:i})};sample=s=>{let{param:o,fn:i}=this.props,a=i.inferSchema(o.toJS());return i.getSampleSchema(a,s,{includeWriteOnly:!0})};onChange=(s,{isEditBox:o,isXml:i})=>{this.setState({value:s,isEditBox:o}),this._onChange(s,i)};_onChange=(s,o)=>{(this.props.onChange||sA)(s,o)};handleOnChange=s=>{const{consumesValue:o}=this.props,i=/xml/i.test(o),a=s.target.value;this.onChange(a,{isXml:i,isEditBox:this.state.isEditBox})};toggleIsEditBox=()=>this.setState((s=>({isEditBox:!s.isEditBox})));render(){let{onChangeConsumes:s,param:o,isExecute:i,specSelectors:a,pathMethod:u,getComponent:_}=this.props;const w=_("Button"),x=_("TextArea"),C=_("HighlightCode",!0),j=_("contentType");let L=(a?a.parameterWithMetaByIdentity(u,o):o).get("errors",(0,ze.List)()),B=a.contentTypeValues(u).get("requestContentType"),$=this.props.consumes&&this.props.consumes.size?this.props.consumes:ParamBody.defaultProp.consumes,{value:U,isEditBox:V}=this.state,z=null;getKnownSyntaxHighlighterLanguage(U)&&(z="json");const Y=`${createHtmlReadyId(`${u[1]}${u[0]}_parameters`)}_select`;return Re.createElement("div",{className:"body-param","data-param-name":o.get("name"),"data-param-in":o.get("in")},V&&i?Re.createElement(x,{className:"body-param__text"+(L.count()?" invalid":""),value:U,onChange:this.handleOnChange}):U&&Re.createElement(C,{className:"body-param__example",language:z},U),Re.createElement("div",{className:"body-param-options"},i?Re.createElement("div",{className:"body-param-edit"},Re.createElement(w,{className:V?"btn cancel body-param__example-edit":"btn edit body-param__example-edit",onClick:this.toggleIsEditBox},V?"Cancel":"Edit")):null,Re.createElement("label",{htmlFor:Y},Re.createElement("span",null,"Parameter content type"),Re.createElement(j,{value:B,contentTypes:$,onChange:s,className:"body-param-content-type",ariaLabel:"Parameter content type",controlId:Y}))))}}class Curl extends Re.Component{render(){const{request:s,getComponent:o}=this.props,i=requestSnippetGenerator_curl_bash(s),a=o("SyntaxHighlighter",!0);return Re.createElement("div",{className:"curl-command"},Re.createElement("h4",null,"Curl"),Re.createElement("div",{className:"copy-to-clipboard"},Re.createElement(Hn.CopyToClipboard,{text:i},Re.createElement("button",null))),Re.createElement("div",null,Re.createElement(a,{language:"bash",className:"curl microlight",renderPlainText:({children:s,PlainTextViewer:o})=>Re.createElement(o,{className:"curl"},s)},i)))}}const property=({propKey:s,propVal:o,propClass:i})=>Re.createElement("span",{className:i},Re.createElement("br",null),s,": ",stringify(o));class TryItOutButton extends Re.Component{static defaultProps={onTryoutClick:Function.prototype,onCancelClick:Function.prototype,onResetClick:Function.prototype,enabled:!1,hasUserEditedBody:!1,isOAS3:!1};render(){const{onTryoutClick:s,onCancelClick:o,onResetClick:i,enabled:a,hasUserEditedBody:u,isOAS3:_}=this.props,w=_&&u;return Re.createElement("div",{className:w?"try-out btn-group":"try-out"},a?Re.createElement("button",{className:"btn try-out__btn cancel",onClick:o},"Cancel"):Re.createElement("button",{className:"btn try-out__btn",onClick:s},"Try it out "),w&&Re.createElement("button",{className:"btn try-out__btn reset",onClick:i},"Reset"))}}class VersionPragmaFilter extends Re.PureComponent{static defaultProps={alsoShow:null,children:null,bypass:!1};render(){const{bypass:s,isSwagger2:o,isOAS3:i,alsoShow:a}=this.props;return s?Re.createElement("div",null,this.props.children):o&&i?Re.createElement("div",{className:"version-pragma"},a,Re.createElement("div",{className:"version-pragma__message version-pragma__message--ambiguous"},Re.createElement("div",null,Re.createElement("h3",null,"Unable to render this definition"),Re.createElement("p",null,Re.createElement("code",null,"swagger")," and ",Re.createElement("code",null,"openapi")," fields cannot be present in the same Swagger or OpenAPI definition. Please remove one of the fields."),Re.createElement("p",null,"Supported version fields are ",Re.createElement("code",null,"swagger: ",'"2.0"')," and those that match ",Re.createElement("code",null,"openapi: 3.0.n")," (for example, ",Re.createElement("code",null,"openapi: 3.0.4"),").")))):o||i?Re.createElement("div",null,this.props.children):Re.createElement("div",{className:"version-pragma"},a,Re.createElement("div",{className:"version-pragma__message version-pragma__message--missing"},Re.createElement("div",null,Re.createElement("h3",null,"Unable to render this definition"),Re.createElement("p",null,"The provided definition does not specify a valid version field."),Re.createElement("p",null,"Please indicate a valid Swagger or OpenAPI version field. Supported version fields are ",Re.createElement("code",null,"swagger: ",'"2.0"')," and those that match ",Re.createElement("code",null,"openapi: 3.0.n")," (for example, ",Re.createElement("code",null,"openapi: 3.0.4"),")."))))}}const version_stamp=({version:s})=>Re.createElement("small",null,Re.createElement("pre",{className:"version"}," ",s," ")),openapi_version=({oasVersion:s})=>Re.createElement("small",{className:"version-stamp"},Re.createElement("pre",{className:"version"},"OAS ",s)),deep_link=({enabled:s,path:o,text:i})=>Re.createElement("a",{className:"nostyle",onClick:s?s=>s.preventDefault():null,href:s?`#/${o}`:null},Re.createElement("span",null,i)),svg_assets=()=>Re.createElement("div",null,Re.createElement("svg",{xmlns:"http://www.w3.org/2000/svg",xmlnsXlink:"http://www.w3.org/1999/xlink",className:"svg-assets"},Re.createElement("defs",null,Re.createElement("symbol",{viewBox:"0 0 20 20",id:"unlocked"},Re.createElement("path",{d:"M15.8 8H14V5.6C14 2.703 12.665 1 10 1 7.334 1 6 2.703 6 5.6V6h2v-.801C8 3.754 8.797 3 10 3c1.203 0 2 .754 2 2.199V8H4c-.553 0-1 .646-1 1.199V17c0 .549.428 1.139.951 1.307l1.197.387C5.672 18.861 6.55 19 7.1 19h5.8c.549 0 1.428-.139 1.951-.307l1.196-.387c.524-.167.953-.757.953-1.306V9.199C17 8.646 16.352 8 15.8 8z"})),Re.createElement("symbol",{viewBox:"0 0 20 20",id:"locked"},Re.createElement("path",{d:"M15.8 8H14V5.6C14 2.703 12.665 1 10 1 7.334 1 6 2.703 6 5.6V8H4c-.553 0-1 .646-1 1.199V17c0 .549.428 1.139.951 1.307l1.197.387C5.672 18.861 6.55 19 7.1 19h5.8c.549 0 1.428-.139 1.951-.307l1.196-.387c.524-.167.953-.757.953-1.306V9.199C17 8.646 16.352 8 15.8 8zM12 8H8V5.199C8 3.754 8.797 3 10 3c1.203 0 2 .754 2 2.199V8z"})),Re.createElement("symbol",{viewBox:"0 0 20 20",id:"close"},Re.createElement("path",{d:"M14.348 14.849c-.469.469-1.229.469-1.697 0L10 11.819l-2.651 3.029c-.469.469-1.229.469-1.697 0-.469-.469-.469-1.229 0-1.697l2.758-3.15-2.759-3.152c-.469-.469-.469-1.228 0-1.697.469-.469 1.228-.469 1.697 0L10 8.183l2.651-3.031c.469-.469 1.228-.469 1.697 0 .469.469.469 1.229 0 1.697l-2.758 3.152 2.758 3.15c.469.469.469 1.229 0 1.698z"})),Re.createElement("symbol",{viewBox:"0 0 20 20",id:"large-arrow"},Re.createElement("path",{d:"M13.25 10L6.109 2.58c-.268-.27-.268-.707 0-.979.268-.27.701-.27.969 0l7.83 7.908c.268.271.268.709 0 .979l-7.83 7.908c-.268.271-.701.27-.969 0-.268-.269-.268-.707 0-.979L13.25 10z"})),Re.createElement("symbol",{viewBox:"0 0 20 20",id:"large-arrow-down"},Re.createElement("path",{d:"M17.418 6.109c.272-.268.709-.268.979 0s.271.701 0 .969l-7.908 7.83c-.27.268-.707.268-.979 0l-7.908-7.83c-.27-.268-.27-.701 0-.969.271-.268.709-.268.979 0L10 13.25l7.418-7.141z"})),Re.createElement("symbol",{viewBox:"0 0 20 20",id:"large-arrow-up"},Re.createElement("path",{d:"M 17.418 14.908 C 17.69 15.176 18.127 15.176 18.397 14.908 C 18.667 14.64 18.668 14.207 18.397 13.939 L 10.489 6.109 C 10.219 5.841 9.782 5.841 9.51 6.109 L 1.602 13.939 C 1.332 14.207 1.332 14.64 1.602 14.908 C 1.873 15.176 2.311 15.176 2.581 14.908 L 10 7.767 L 17.418 14.908 Z"})),Re.createElement("symbol",{viewBox:"0 0 24 24",id:"jump-to"},Re.createElement("path",{d:"M19 7v4H5.83l3.58-3.59L8 6l-6 6 6 6 1.41-1.41L5.83 13H21V7z"})),Re.createElement("symbol",{viewBox:"0 0 24 24",id:"expand"},Re.createElement("path",{d:"M10 18h4v-2h-4v2zM3 6v2h18V6H3zm3 7h12v-2H6v2z"})),Re.createElement("symbol",{viewBox:"0 0 15 16",id:"copy"},Re.createElement("g",{transform:"translate(2, -1)"},Re.createElement("path",{fill:"#ffffff",fillRule:"evenodd",d:"M2 13h4v1H2v-1zm5-6H2v1h5V7zm2 3V8l-3 3 3 3v-2h5v-2H9zM4.5 9H2v1h2.5V9zM2 12h2.5v-1H2v1zm9 1h1v2c-.02.28-.11.52-.3.7-.19.18-.42.28-.7.3H1c-.55 0-1-.45-1-1V4c0-.55.45-1 1-1h3c0-1.11.89-2 2-2 1.11 0 2 .89 2 2h3c.55 0 1 .45 1 1v5h-1V6H1v9h10v-2zM2 5h8c0-.55-.45-1-1-1H8c-.55 0-1-.45-1-1s-.45-1-1-1-1 .45-1 1-.45 1-1 1H3c-.55 0-1 .45-1 1z"}))))));var oA;function decodeEntity(s){return(oA=oA||document.createElement("textarea")).innerHTML="&"+s+";",oA.value}var iA=Object.prototype.hasOwnProperty;function index_browser_has(s,o){return!!s&&iA.call(s,o)}function index_browser_assign(s){return[].slice.call(arguments,1).forEach((function(o){if(o){if("object"!=typeof o)throw new TypeError(o+"must be object");Object.keys(o).forEach((function(i){s[i]=o[i]}))}})),s}var aA=/\\([\\!"#$%&'()*+,.\/:;<=>?@[\]^_`{|}~-])/g;function unescapeMd(s){return s.indexOf("\\")<0?s:s.replace(aA,"$1")}function isValidEntityCode(s){return!(s>=55296&&s<=57343)&&(!(s>=64976&&s<=65007)&&(!!(65535&~s&&65534!=(65535&s))&&(!(s>=0&&s<=8)&&(11!==s&&(!(s>=14&&s<=31)&&(!(s>=127&&s<=159)&&!(s>1114111)))))))}function fromCodePoint(s){if(s>65535){var o=55296+((s-=65536)>>10),i=56320+(1023&s);return String.fromCharCode(o,i)}return String.fromCharCode(s)}var cA=/&([a-z#][a-z0-9]{1,31});/gi,lA=/^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))/i;function replaceEntityPattern(s,o){var i=0,a=decodeEntity(o);return o!==a?a:35===o.charCodeAt(0)&&lA.test(o)&&isValidEntityCode(i="x"===o[1].toLowerCase()?parseInt(o.slice(2),16):parseInt(o.slice(1),10))?fromCodePoint(i):s}function replaceEntities(s){return s.indexOf("&")<0?s:s.replace(cA,replaceEntityPattern)}var uA=/[&<>"]/,pA=/[&<>"]/g,hA={"&":"&amp;","<":"&lt;",">":"&gt;",'"':"&quot;"};function replaceUnsafeChar(s){return hA[s]}function escapeHtml(s){return uA.test(s)?s.replace(pA,replaceUnsafeChar):s}var dA={};function nextToken(s,o){return++o>=s.length-2?o:"paragraph_open"===s[o].type&&s[o].tight&&"inline"===s[o+1].type&&0===s[o+1].content.length&&"paragraph_close"===s[o+2].type&&s[o+2].tight?nextToken(s,o+2):o}dA.blockquote_open=function(){return"<blockquote>\n"},dA.blockquote_close=function(s,o){return"</blockquote>"+fA(s,o)},dA.code=function(s,o){return s[o].block?"<pre><code>"+escapeHtml(s[o].content)+"</code></pre>"+fA(s,o):"<code>"+escapeHtml(s[o].content)+"</code>"},dA.fence=function(s,o,i,a,u){var _,w,x=s[o],C="",j=i.langPrefix;if(x.params){if(w=(_=x.params.split(/\s+/g)).join(" "),index_browser_has(u.rules.fence_custom,_[0]))return u.rules.fence_custom[_[0]](s,o,i,a,u);C=' class="'+j+escapeHtml(replaceEntities(unescapeMd(w)))+'"'}return"<pre><code"+C+">"+(i.highlight&&i.highlight.apply(i.highlight,[x.content].concat(_))||escapeHtml(x.content))+"</code></pre>"+fA(s,o)},dA.fence_custom={},dA.heading_open=function(s,o){return"<h"+s[o].hLevel+">"},dA.heading_close=function(s,o){return"</h"+s[o].hLevel+">\n"},dA.hr=function(s,o,i){return(i.xhtmlOut?"<hr />":"<hr>")+fA(s,o)},dA.bullet_list_open=function(){return"<ul>\n"},dA.bullet_list_close=function(s,o){return"</ul>"+fA(s,o)},dA.list_item_open=function(){return"<li>"},dA.list_item_close=function(){return"</li>\n"},dA.ordered_list_open=function(s,o){var i=s[o];return"<ol"+(i.order>1?' start="'+i.order+'"':"")+">\n"},dA.ordered_list_close=function(s,o){return"</ol>"+fA(s,o)},dA.paragraph_open=function(s,o){return s[o].tight?"":"<p>"},dA.paragraph_close=function(s,o){var i=!(s[o].tight&&o&&"inline"===s[o-1].type&&!s[o-1].content);return(s[o].tight?"":"</p>")+(i?fA(s,o):"")},dA.link_open=function(s,o,i){var a=s[o].title?' title="'+escapeHtml(replaceEntities(s[o].title))+'"':"",u=i.linkTarget?' target="'+i.linkTarget+'"':"";return'<a href="'+escapeHtml(s[o].href)+'"'+a+u+">"},dA.link_close=function(){return"</a>"},dA.image=function(s,o,i){var a=' src="'+escapeHtml(s[o].src)+'"',u=s[o].title?' title="'+escapeHtml(replaceEntities(s[o].title))+'"':"";return"<img"+a+(' alt="'+(s[o].alt?escapeHtml(replaceEntities(unescapeMd(s[o].alt))):"")+'"')+u+(i.xhtmlOut?" /":"")+">"},dA.table_open=function(){return"<table>\n"},dA.table_close=function(){return"</table>\n"},dA.thead_open=function(){return"<thead>\n"},dA.thead_close=function(){return"</thead>\n"},dA.tbody_open=function(){return"<tbody>\n"},dA.tbody_close=function(){return"</tbody>\n"},dA.tr_open=function(){return"<tr>"},dA.tr_close=function(){return"</tr>\n"},dA.th_open=function(s,o){var i=s[o];return"<th"+(i.align?' style="text-align:'+i.align+'"':"")+">"},dA.th_close=function(){return"</th>"},dA.td_open=function(s,o){var i=s[o];return"<td"+(i.align?' style="text-align:'+i.align+'"':"")+">"},dA.td_close=function(){return"</td>"},dA.strong_open=function(){return"<strong>"},dA.strong_close=function(){return"</strong>"},dA.em_open=function(){return"<em>"},dA.em_close=function(){return"</em>"},dA.del_open=function(){return"<del>"},dA.del_close=function(){return"</del>"},dA.ins_open=function(){return"<ins>"},dA.ins_close=function(){return"</ins>"},dA.mark_open=function(){return"<mark>"},dA.mark_close=function(){return"</mark>"},dA.sub=function(s,o){return"<sub>"+escapeHtml(s[o].content)+"</sub>"},dA.sup=function(s,o){return"<sup>"+escapeHtml(s[o].content)+"</sup>"},dA.hardbreak=function(s,o,i){return i.xhtmlOut?"<br />\n":"<br>\n"},dA.softbreak=function(s,o,i){return i.breaks?i.xhtmlOut?"<br />\n":"<br>\n":"\n"},dA.text=function(s,o){return escapeHtml(s[o].content)},dA.htmlblock=function(s,o){return s[o].content},dA.htmltag=function(s,o){return s[o].content},dA.abbr_open=function(s,o){return'<abbr title="'+escapeHtml(replaceEntities(s[o].title))+'">'},dA.abbr_close=function(){return"</abbr>"},dA.footnote_ref=function(s,o){var i=Number(s[o].id+1).toString(),a="fnref"+i;return s[o].subId>0&&(a+=":"+s[o].subId),'<sup class="footnote-ref"><a href="#fn'+i+'" id="'+a+'">['+i+"]</a></sup>"},dA.footnote_block_open=function(s,o,i){return(i.xhtmlOut?'<hr class="footnotes-sep" />\n':'<hr class="footnotes-sep">\n')+'<section class="footnotes">\n<ol class="footnotes-list">\n'},dA.footnote_block_close=function(){return"</ol>\n</section>\n"},dA.footnote_open=function(s,o){return'<li id="fn'+Number(s[o].id+1).toString()+'"  class="footnote-item">'},dA.footnote_close=function(){return"</li>\n"},dA.footnote_anchor=function(s,o){var i="fnref"+Number(s[o].id+1).toString();return s[o].subId>0&&(i+=":"+s[o].subId),' <a href="#'+i+'" class="footnote-backref">↩</a>'},dA.dl_open=function(){return"<dl>\n"},dA.dt_open=function(){return"<dt>"},dA.dd_open=function(){return"<dd>"},dA.dl_close=function(){return"</dl>\n"},dA.dt_close=function(){return"</dt>\n"},dA.dd_close=function(){return"</dd>\n"};var fA=dA.getBreak=function getBreak(s,o){return(o=nextToken(s,o))<s.length&&"list_item_close"===s[o].type?"":"\n"};function Renderer(){this.rules=index_browser_assign({},dA),this.getBreak=dA.getBreak}function Ruler(){this.__rules__=[],this.__cache__=null}function StateInline(s,o,i,a,u){this.src=s,this.env=a,this.options=i,this.parser=o,this.tokens=u,this.pos=0,this.posMax=this.src.length,this.level=0,this.pending="",this.pendingLevel=0,this.cache=[],this.isInLabel=!1,this.linkLevel=0,this.linkContent="",this.labelUnmatchedScopes=0}function parseLinkLabel(s,o){var i,a,u,_=-1,w=s.posMax,x=s.pos,C=s.isInLabel;if(s.isInLabel)return-1;if(s.labelUnmatchedScopes)return s.labelUnmatchedScopes--,-1;for(s.pos=o+1,s.isInLabel=!0,i=1;s.pos<w;){if(91===(u=s.src.charCodeAt(s.pos)))i++;else if(93===u&&0===--i){a=!0;break}s.parser.skipToken(s)}return a?(_=s.pos,s.labelUnmatchedScopes=0):s.labelUnmatchedScopes=i-1,s.pos=x,s.isInLabel=C,_}function parseAbbr(s,o,i,a){var u,_,w,x,C,j;if(42!==s.charCodeAt(0))return-1;if(91!==s.charCodeAt(1))return-1;if(-1===s.indexOf("]:"))return-1;if((_=parseLinkLabel(u=new StateInline(s,o,i,a,[]),1))<0||58!==s.charCodeAt(_+1))return-1;for(x=u.posMax,w=_+2;w<x&&10!==u.src.charCodeAt(w);w++);return C=s.slice(2,_),0===(j=s.slice(_+2,w).trim()).length?-1:(a.abbreviations||(a.abbreviations={}),void 0===a.abbreviations[":"+C]&&(a.abbreviations[":"+C]=j),w)}function normalizeLink(s){var o=replaceEntities(s);try{o=decodeURI(o)}catch(s){}return encodeURI(o)}function parseLinkDestination(s,o){var i,a,u,_=o,w=s.posMax;if(60===s.src.charCodeAt(o)){for(o++;o<w;){if(10===(i=s.src.charCodeAt(o)))return!1;if(62===i)return u=normalizeLink(unescapeMd(s.src.slice(_+1,o))),!!s.parser.validateLink(u)&&(s.pos=o+1,s.linkContent=u,!0);92===i&&o+1<w?o+=2:o++}return!1}for(a=0;o<w&&32!==(i=s.src.charCodeAt(o))&&!(i<32||127===i);)if(92===i&&o+1<w)o+=2;else{if(40===i&&++a>1)break;if(41===i&&--a<0)break;o++}return _!==o&&(u=unescapeMd(s.src.slice(_,o)),!!s.parser.validateLink(u)&&(s.linkContent=u,s.pos=o,!0))}function parseLinkTitle(s,o){var i,a=o,u=s.posMax,_=s.src.charCodeAt(o);if(34!==_&&39!==_&&40!==_)return!1;for(o++,40===_&&(_=41);o<u;){if((i=s.src.charCodeAt(o))===_)return s.pos=o+1,s.linkContent=unescapeMd(s.src.slice(a+1,o)),!0;92===i&&o+1<u?o+=2:o++}return!1}function normalizeReference(s){return s.trim().replace(/\s+/g," ").toUpperCase()}function parseReference(s,o,i,a){var u,_,w,x,C,j,L,B,$;if(91!==s.charCodeAt(0))return-1;if(-1===s.indexOf("]:"))return-1;if((_=parseLinkLabel(u=new StateInline(s,o,i,a,[]),0))<0||58!==s.charCodeAt(_+1))return-1;for(x=u.posMax,w=_+2;w<x&&(32===(C=u.src.charCodeAt(w))||10===C);w++);if(!parseLinkDestination(u,w))return-1;for(L=u.linkContent,j=w=u.pos,w+=1;w<x&&(32===(C=u.src.charCodeAt(w))||10===C);w++);for(w<x&&j!==w&&parseLinkTitle(u,w)?(B=u.linkContent,w=u.pos):(B="",w=j);w<x&&32===u.src.charCodeAt(w);)w++;return w<x&&10!==u.src.charCodeAt(w)?-1:($=normalizeReference(s.slice(1,_)),void 0===a.references[$]&&(a.references[$]={title:B,href:L}),w)}Renderer.prototype.renderInline=function(s,o,i){for(var a=this.rules,u=s.length,_=0,w="";u--;)w+=a[s[_].type](s,_++,o,i,this);return w},Renderer.prototype.render=function(s,o,i){for(var a=this.rules,u=s.length,_=-1,w="";++_<u;)"inline"===s[_].type?w+=this.renderInline(s[_].children,o,i):w+=a[s[_].type](s,_,o,i,this);return w},Ruler.prototype.__find__=function(s){for(var o=this.__rules__.length,i=-1;o--;)if(this.__rules__[++i].name===s)return i;return-1},Ruler.prototype.__compile__=function(){var s=this,o=[""];s.__rules__.forEach((function(s){s.enabled&&s.alt.forEach((function(s){o.indexOf(s)<0&&o.push(s)}))})),s.__cache__={},o.forEach((function(o){s.__cache__[o]=[],s.__rules__.forEach((function(i){i.enabled&&(o&&i.alt.indexOf(o)<0||s.__cache__[o].push(i.fn))}))}))},Ruler.prototype.at=function(s,o,i){var a=this.__find__(s),u=i||{};if(-1===a)throw new Error("Parser rule not found: "+s);this.__rules__[a].fn=o,this.__rules__[a].alt=u.alt||[],this.__cache__=null},Ruler.prototype.before=function(s,o,i,a){var u=this.__find__(s),_=a||{};if(-1===u)throw new Error("Parser rule not found: "+s);this.__rules__.splice(u,0,{name:o,enabled:!0,fn:i,alt:_.alt||[]}),this.__cache__=null},Ruler.prototype.after=function(s,o,i,a){var u=this.__find__(s),_=a||{};if(-1===u)throw new Error("Parser rule not found: "+s);this.__rules__.splice(u+1,0,{name:o,enabled:!0,fn:i,alt:_.alt||[]}),this.__cache__=null},Ruler.prototype.push=function(s,o,i){var a=i||{};this.__rules__.push({name:s,enabled:!0,fn:o,alt:a.alt||[]}),this.__cache__=null},Ruler.prototype.enable=function(s,o){s=Array.isArray(s)?s:[s],o&&this.__rules__.forEach((function(s){s.enabled=!1})),s.forEach((function(s){var o=this.__find__(s);if(o<0)throw new Error("Rules manager: invalid rule name "+s);this.__rules__[o].enabled=!0}),this),this.__cache__=null},Ruler.prototype.disable=function(s){(s=Array.isArray(s)?s:[s]).forEach((function(s){var o=this.__find__(s);if(o<0)throw new Error("Rules manager: invalid rule name "+s);this.__rules__[o].enabled=!1}),this),this.__cache__=null},Ruler.prototype.getRules=function(s){return null===this.__cache__&&this.__compile__(),this.__cache__[s]||[]},StateInline.prototype.pushPending=function(){this.tokens.push({type:"text",content:this.pending,level:this.pendingLevel}),this.pending=""},StateInline.prototype.push=function(s){this.pending&&this.pushPending(),this.tokens.push(s),this.pendingLevel=this.level},StateInline.prototype.cacheSet=function(s,o){for(var i=this.cache.length;i<=s;i++)this.cache.push(0);this.cache[s]=o},StateInline.prototype.cacheGet=function(s){return s<this.cache.length?this.cache[s]:0};var mA=" \n()[]'\".,!?-";function regEscape(s){return s.replace(/([-()\[\]{}+?*.$\^|,:#<!\\])/g,"\\$1")}var gA=/\+-|\.\.|\?\?\?\?|!!!!|,,|--/,yA=/\((c|tm|r|p)\)/gi,vA={c:"©",r:"®",p:"§",tm:"™"};function replaceScopedAbbr(s){return s.indexOf("(")<0?s:s.replace(yA,(function(s,o){return vA[o.toLowerCase()]}))}var bA=/['"]/,_A=/['"]/g,SA=/[-\s()\[\]]/;function isLetter(s,o){return!(o<0||o>=s.length)&&!SA.test(s[o])}function replaceAt(s,o,i){return s.substr(0,o)+i+s.substr(o+1)}var EA=[["block",function block(s){s.inlineMode?s.tokens.push({type:"inline",content:s.src.replace(/\n/g," ").trim(),level:0,lines:[0,1],children:[]}):s.block.parse(s.src,s.options,s.env,s.tokens)}],["abbr",function abbr(s){var o,i,a,u,_=s.tokens;if(!s.inlineMode)for(o=1,i=_.length-1;o<i;o++)if("paragraph_open"===_[o-1].type&&"inline"===_[o].type&&"paragraph_close"===_[o+1].type){for(a=_[o].content;a.length&&!((u=parseAbbr(a,s.inline,s.options,s.env))<0);)a=a.slice(u).trim();_[o].content=a,a.length||(_[o-1].tight=!0,_[o+1].tight=!0)}}],["references",function references(s){var o,i,a,u,_=s.tokens;if(s.env.references=s.env.references||{},!s.inlineMode)for(o=1,i=_.length-1;o<i;o++)if("inline"===_[o].type&&"paragraph_open"===_[o-1].type&&"paragraph_close"===_[o+1].type){for(a=_[o].content;a.length&&!((u=parseReference(a,s.inline,s.options,s.env))<0);)a=a.slice(u).trim();_[o].content=a,a.length||(_[o-1].tight=!0,_[o+1].tight=!0)}}],["inline",function inline(s){var o,i,a,u=s.tokens;for(i=0,a=u.length;i<a;i++)"inline"===(o=u[i]).type&&s.inline.parse(o.content,s.options,s.env,o.children)}],["footnote_tail",function footnote_block(s){var o,i,a,u,_,w,x,C,j,L=0,B=!1,$={};if(s.env.footnotes&&(s.tokens=s.tokens.filter((function(s){return"footnote_reference_open"===s.type?(B=!0,C=[],j=s.label,!1):"footnote_reference_close"===s.type?(B=!1,$[":"+j]=C,!1):(B&&C.push(s),!B)})),s.env.footnotes.list)){for(w=s.env.footnotes.list,s.tokens.push({type:"footnote_block_open",level:L++}),o=0,i=w.length;o<i;o++){for(s.tokens.push({type:"footnote_open",id:o,level:L++}),w[o].tokens?((x=[]).push({type:"paragraph_open",tight:!1,level:L++}),x.push({type:"inline",content:"",level:L,children:w[o].tokens}),x.push({type:"paragraph_close",tight:!1,level:--L})):w[o].label&&(x=$[":"+w[o].label]),s.tokens=s.tokens.concat(x),_="paragraph_close"===s.tokens[s.tokens.length-1].type?s.tokens.pop():null,u=w[o].count>0?w[o].count:1,a=0;a<u;a++)s.tokens.push({type:"footnote_anchor",id:o,subId:a,level:L});_&&s.tokens.push(_),s.tokens.push({type:"footnote_close",level:--L})}s.tokens.push({type:"footnote_block_close",level:--L})}}],["abbr2",function abbr2(s){var o,i,a,u,_,w,x,C,j,L,B,$,U=s.tokens;if(s.env.abbreviations)for(s.env.abbrRegExp||($="(^|["+mA.split("").map(regEscape).join("")+"])("+Object.keys(s.env.abbreviations).map((function(s){return s.substr(1)})).sort((function(s,o){return o.length-s.length})).map(regEscape).join("|")+")($|["+mA.split("").map(regEscape).join("")+"])",s.env.abbrRegExp=new RegExp($,"g")),L=s.env.abbrRegExp,i=0,a=U.length;i<a;i++)if("inline"===U[i].type)for(o=(u=U[i].children).length-1;o>=0;o--)if("text"===(_=u[o]).type){for(C=0,w=_.content,L.lastIndex=0,j=_.level,x=[];B=L.exec(w);)L.lastIndex>C&&x.push({type:"text",content:w.slice(C,B.index+B[1].length),level:j}),x.push({type:"abbr_open",title:s.env.abbreviations[":"+B[2]],level:j++}),x.push({type:"text",content:B[2],level:j}),x.push({type:"abbr_close",level:--j}),C=L.lastIndex-B[3].length;x.length&&(C<w.length&&x.push({type:"text",content:w.slice(C),level:j}),U[i].children=u=[].concat(u.slice(0,o),x,u.slice(o+1)))}}],["replacements",function index_browser_replace(s){var o,i,a,u,_;if(s.options.typographer)for(_=s.tokens.length-1;_>=0;_--)if("inline"===s.tokens[_].type)for(o=(u=s.tokens[_].children).length-1;o>=0;o--)"text"===(i=u[o]).type&&(a=replaceScopedAbbr(a=i.content),gA.test(a)&&(a=a.replace(/\+-/g,"±").replace(/\.{2,}/g,"…").replace(/([?!])…/g,"$1..").replace(/([?!]){4,}/g,"$1$1$1").replace(/,{2,}/g,",").replace(/(^|[^-])---([^-]|$)/gm,"$1—$2").replace(/(^|\s)--(\s|$)/gm,"$1–$2").replace(/(^|[^-\s])--([^-\s]|$)/gm,"$1–$2")),i.content=a)}],["smartquotes",function smartquotes(s){var o,i,a,u,_,w,x,C,j,L,B,$,U,V,z,Y,Z;if(s.options.typographer)for(Z=[],z=s.tokens.length-1;z>=0;z--)if("inline"===s.tokens[z].type)for(Y=s.tokens[z].children,Z.length=0,o=0;o<Y.length;o++)if("text"===(i=Y[o]).type&&!bA.test(i.text)){for(x=Y[o].level,U=Z.length-1;U>=0&&!(Z[U].level<=x);U--);Z.length=U+1,_=0,w=(a=i.content).length;e:for(;_<w&&(_A.lastIndex=_,u=_A.exec(a));)if(C=!isLetter(a,u.index-1),_=u.index+1,V="'"===u[0],(j=!isLetter(a,_))||C){if(B=!j,$=!C)for(U=Z.length-1;U>=0&&(L=Z[U],!(Z[U].level<x));U--)if(L.single===V&&Z[U].level===x){L=Z[U],V?(Y[L.token].content=replaceAt(Y[L.token].content,L.pos,s.options.quotes[2]),i.content=replaceAt(i.content,u.index,s.options.quotes[3])):(Y[L.token].content=replaceAt(Y[L.token].content,L.pos,s.options.quotes[0]),i.content=replaceAt(i.content,u.index,s.options.quotes[1])),Z.length=U;continue e}B?Z.push({token:o,pos:u.index,single:V,level:x}):$&&V&&(i.content=replaceAt(i.content,u.index,"’"))}else V&&(i.content=replaceAt(i.content,u.index,"’"))}}]];function Core(){this.options={},this.ruler=new Ruler;for(var s=0;s<EA.length;s++)this.ruler.push(EA[s][0],EA[s][1])}function StateBlock(s,o,i,a,u){var _,w,x,C,j,L,B;for(this.src=s,this.parser=o,this.options=i,this.env=a,this.tokens=u,this.bMarks=[],this.eMarks=[],this.tShift=[],this.blkIndent=0,this.line=0,this.lineMax=0,this.tight=!1,this.parentType="root",this.ddIndent=-1,this.level=0,this.result="",L=0,B=!1,x=C=L=0,j=(w=this.src).length;C<j;C++){if(_=w.charCodeAt(C),!B){if(32===_){L++;continue}B=!0}10!==_&&C!==j-1||(10!==_&&C++,this.bMarks.push(x),this.eMarks.push(C),this.tShift.push(L),B=!1,L=0,x=C+1)}this.bMarks.push(w.length),this.eMarks.push(w.length),this.tShift.push(0),this.lineMax=this.bMarks.length-1}function skipBulletListMarker(s,o){var i,a,u;return(a=s.bMarks[o]+s.tShift[o])>=(u=s.eMarks[o])||42!==(i=s.src.charCodeAt(a++))&&45!==i&&43!==i||a<u&&32!==s.src.charCodeAt(a)?-1:a}function skipOrderedListMarker(s,o){var i,a=s.bMarks[o]+s.tShift[o],u=s.eMarks[o];if(a+1>=u)return-1;if((i=s.src.charCodeAt(a++))<48||i>57)return-1;for(;;){if(a>=u)return-1;if(!((i=s.src.charCodeAt(a++))>=48&&i<=57)){if(41===i||46===i)break;return-1}}return a<u&&32!==s.src.charCodeAt(a)?-1:a}Core.prototype.process=function(s){var o,i,a;for(o=0,i=(a=this.ruler.getRules("")).length;o<i;o++)a[o](s)},StateBlock.prototype.isEmpty=function isEmpty(s){return this.bMarks[s]+this.tShift[s]>=this.eMarks[s]},StateBlock.prototype.skipEmptyLines=function skipEmptyLines(s){for(var o=this.lineMax;s<o&&!(this.bMarks[s]+this.tShift[s]<this.eMarks[s]);s++);return s},StateBlock.prototype.skipSpaces=function skipSpaces(s){for(var o=this.src.length;s<o&&32===this.src.charCodeAt(s);s++);return s},StateBlock.prototype.skipChars=function skipChars(s,o){for(var i=this.src.length;s<i&&this.src.charCodeAt(s)===o;s++);return s},StateBlock.prototype.skipCharsBack=function skipCharsBack(s,o,i){if(s<=i)return s;for(;s>i;)if(o!==this.src.charCodeAt(--s))return s+1;return s},StateBlock.prototype.getLines=function getLines(s,o,i,a){var u,_,w,x,C,j=s;if(s>=o)return"";if(j+1===o)return _=this.bMarks[j]+Math.min(this.tShift[j],i),w=a?this.eMarks[j]+1:this.eMarks[j],this.src.slice(_,w);for(x=new Array(o-s),u=0;j<o;j++,u++)(C=this.tShift[j])>i&&(C=i),C<0&&(C=0),_=this.bMarks[j]+C,w=j+1<o||a?this.eMarks[j]+1:this.eMarks[j],x[u]=this.src.slice(_,w);return x.join("")};var wA={};["article","aside","button","blockquote","body","canvas","caption","col","colgroup","dd","div","dl","dt","embed","fieldset","figcaption","figure","footer","form","h1","h2","h3","h4","h5","h6","header","hgroup","hr","iframe","li","map","object","ol","output","p","pre","progress","script","section","style","table","tbody","td","textarea","tfoot","th","tr","thead","ul","video"].forEach((function(s){wA[s]=!0}));var xA=/^<([a-zA-Z]{1,15})[\s\/>]/,kA=/^<\/([a-zA-Z]{1,15})[\s>]/;function index_browser_getLine(s,o){var i=s.bMarks[o]+s.blkIndent,a=s.eMarks[o];return s.src.substr(i,a-i)}function skipMarker(s,o){var i,a,u=s.bMarks[o]+s.tShift[o],_=s.eMarks[o];return u>=_||126!==(a=s.src.charCodeAt(u++))&&58!==a||u===(i=s.skipSpaces(u))||i>=_?-1:i}var OA=[["code",function code(s,o,i){var a,u;if(s.tShift[o]-s.blkIndent<4)return!1;for(u=a=o+1;a<i;)if(s.isEmpty(a))a++;else{if(!(s.tShift[a]-s.blkIndent>=4))break;u=++a}return s.line=a,s.tokens.push({type:"code",content:s.getLines(o,u,4+s.blkIndent,!0),block:!0,lines:[o,s.line],level:s.level}),!0}],["fences",function fences(s,o,i,a){var u,_,w,x,C,j=!1,L=s.bMarks[o]+s.tShift[o],B=s.eMarks[o];if(L+3>B)return!1;if(126!==(u=s.src.charCodeAt(L))&&96!==u)return!1;if(C=L,(_=(L=s.skipChars(L,u))-C)<3)return!1;if((w=s.src.slice(L,B).trim()).indexOf("`")>=0)return!1;if(a)return!0;for(x=o;!(++x>=i)&&!((L=C=s.bMarks[x]+s.tShift[x])<(B=s.eMarks[x])&&s.tShift[x]<s.blkIndent);)if(s.src.charCodeAt(L)===u&&!(s.tShift[x]-s.blkIndent>=4||(L=s.skipChars(L,u))-C<_||(L=s.skipSpaces(L))<B)){j=!0;break}return _=s.tShift[o],s.line=x+(j?1:0),s.tokens.push({type:"fence",params:w,content:s.getLines(o+1,x,_,!0),lines:[o,s.line],level:s.level}),!0},["paragraph","blockquote","list"]],["blockquote",function blockquote(s,o,i,a){var u,_,w,x,C,j,L,B,$,U,V,z=s.bMarks[o]+s.tShift[o],Y=s.eMarks[o];if(z>Y)return!1;if(62!==s.src.charCodeAt(z++))return!1;if(s.level>=s.options.maxNesting)return!1;if(a)return!0;for(32===s.src.charCodeAt(z)&&z++,C=s.blkIndent,s.blkIndent=0,x=[s.bMarks[o]],s.bMarks[o]=z,_=(z=z<Y?s.skipSpaces(z):z)>=Y,w=[s.tShift[o]],s.tShift[o]=z-s.bMarks[o],B=s.parser.ruler.getRules("blockquote"),u=o+1;u<i&&!((z=s.bMarks[u]+s.tShift[u])>=(Y=s.eMarks[u]));u++)if(62!==s.src.charCodeAt(z++)){if(_)break;for(V=!1,$=0,U=B.length;$<U;$++)if(B[$](s,u,i,!0)){V=!0;break}if(V)break;x.push(s.bMarks[u]),w.push(s.tShift[u]),s.tShift[u]=-1337}else 32===s.src.charCodeAt(z)&&z++,x.push(s.bMarks[u]),s.bMarks[u]=z,_=(z=z<Y?s.skipSpaces(z):z)>=Y,w.push(s.tShift[u]),s.tShift[u]=z-s.bMarks[u];for(j=s.parentType,s.parentType="blockquote",s.tokens.push({type:"blockquote_open",lines:L=[o,0],level:s.level++}),s.parser.tokenize(s,o,u),s.tokens.push({type:"blockquote_close",level:--s.level}),s.parentType=j,L[1]=s.line,$=0;$<w.length;$++)s.bMarks[$+o]=x[$],s.tShift[$+o]=w[$];return s.blkIndent=C,!0},["paragraph","blockquote","list"]],["hr",function hr(s,o,i,a){var u,_,w,x=s.bMarks[o],C=s.eMarks[o];if((x+=s.tShift[o])>C)return!1;if(42!==(u=s.src.charCodeAt(x++))&&45!==u&&95!==u)return!1;for(_=1;x<C;){if((w=s.src.charCodeAt(x++))!==u&&32!==w)return!1;w===u&&_++}return!(_<3)&&(a||(s.line=o+1,s.tokens.push({type:"hr",lines:[o,s.line],level:s.level})),!0)},["paragraph","blockquote","list"]],["list",function index_browser_list(s,o,i,a){var u,_,w,x,C,j,L,B,$,U,V,z,Y,Z,ee,ie,ae,ce,le,pe,de,fe=!0;if((B=skipOrderedListMarker(s,o))>=0)z=!0;else{if(!((B=skipBulletListMarker(s,o))>=0))return!1;z=!1}if(s.level>=s.options.maxNesting)return!1;if(V=s.src.charCodeAt(B-1),a)return!0;for(Z=s.tokens.length,z?(L=s.bMarks[o]+s.tShift[o],U=Number(s.src.substr(L,B-L-1)),s.tokens.push({type:"ordered_list_open",order:U,lines:ie=[o,0],level:s.level++})):s.tokens.push({type:"bullet_list_open",lines:ie=[o,0],level:s.level++}),u=o,ee=!1,ce=s.parser.ruler.getRules("list");!(!(u<i)||(($=(Y=s.skipSpaces(B))>=s.eMarks[u]?1:Y-B)>4&&($=1),$<1&&($=1),_=B-s.bMarks[u]+$,s.tokens.push({type:"list_item_open",lines:ae=[o,0],level:s.level++}),x=s.blkIndent,C=s.tight,w=s.tShift[o],j=s.parentType,s.tShift[o]=Y-s.bMarks[o],s.blkIndent=_,s.tight=!0,s.parentType="list",s.parser.tokenize(s,o,i,!0),s.tight&&!ee||(fe=!1),ee=s.line-o>1&&s.isEmpty(s.line-1),s.blkIndent=x,s.tShift[o]=w,s.tight=C,s.parentType=j,s.tokens.push({type:"list_item_close",level:--s.level}),u=o=s.line,ae[1]=u,Y=s.bMarks[o],u>=i)||s.isEmpty(u)||s.tShift[u]<s.blkIndent);){for(de=!1,le=0,pe=ce.length;le<pe;le++)if(ce[le](s,u,i,!0)){de=!0;break}if(de)break;if(z){if((B=skipOrderedListMarker(s,u))<0)break}else if((B=skipBulletListMarker(s,u))<0)break;if(V!==s.src.charCodeAt(B-1))break}return s.tokens.push({type:z?"ordered_list_close":"bullet_list_close",level:--s.level}),ie[1]=u,s.line=u,fe&&function markTightParagraphs(s,o){var i,a,u=s.level+2;for(i=o+2,a=s.tokens.length-2;i<a;i++)s.tokens[i].level===u&&"paragraph_open"===s.tokens[i].type&&(s.tokens[i+2].tight=!0,s.tokens[i].tight=!0,i+=2)}(s,Z),!0},["paragraph","blockquote"]],["footnote",function footnote(s,o,i,a){var u,_,w,x,C,j=s.bMarks[o]+s.tShift[o],L=s.eMarks[o];if(j+4>L)return!1;if(91!==s.src.charCodeAt(j))return!1;if(94!==s.src.charCodeAt(j+1))return!1;if(s.level>=s.options.maxNesting)return!1;for(x=j+2;x<L;x++){if(32===s.src.charCodeAt(x))return!1;if(93===s.src.charCodeAt(x))break}return x!==j+2&&(!(x+1>=L||58!==s.src.charCodeAt(++x))&&(a||(x++,s.env.footnotes||(s.env.footnotes={}),s.env.footnotes.refs||(s.env.footnotes.refs={}),C=s.src.slice(j+2,x-2),s.env.footnotes.refs[":"+C]=-1,s.tokens.push({type:"footnote_reference_open",label:C,level:s.level++}),u=s.bMarks[o],_=s.tShift[o],w=s.parentType,s.tShift[o]=s.skipSpaces(x)-x,s.bMarks[o]=x,s.blkIndent+=4,s.parentType="footnote",s.tShift[o]<s.blkIndent&&(s.tShift[o]+=s.blkIndent,s.bMarks[o]-=s.blkIndent),s.parser.tokenize(s,o,i,!0),s.parentType=w,s.blkIndent-=4,s.tShift[o]=_,s.bMarks[o]=u,s.tokens.push({type:"footnote_reference_close",level:--s.level})),!0))},["paragraph"]],["heading",function heading(s,o,i,a){var u,_,w,x=s.bMarks[o]+s.tShift[o],C=s.eMarks[o];if(x>=C)return!1;if(35!==(u=s.src.charCodeAt(x))||x>=C)return!1;for(_=1,u=s.src.charCodeAt(++x);35===u&&x<C&&_<=6;)_++,u=s.src.charCodeAt(++x);return!(_>6||x<C&&32!==u)&&(a||(C=s.skipCharsBack(C,32,x),(w=s.skipCharsBack(C,35,x))>x&&32===s.src.charCodeAt(w-1)&&(C=w),s.line=o+1,s.tokens.push({type:"heading_open",hLevel:_,lines:[o,s.line],level:s.level}),x<C&&s.tokens.push({type:"inline",content:s.src.slice(x,C).trim(),level:s.level+1,lines:[o,s.line],children:[]}),s.tokens.push({type:"heading_close",hLevel:_,level:s.level})),!0)},["paragraph","blockquote"]],["lheading",function lheading(s,o,i){var a,u,_,w=o+1;return!(w>=i)&&(!(s.tShift[w]<s.blkIndent)&&(!(s.tShift[w]-s.blkIndent>3)&&(!((u=s.bMarks[w]+s.tShift[w])>=(_=s.eMarks[w]))&&((45===(a=s.src.charCodeAt(u))||61===a)&&(u=s.skipChars(u,a),!((u=s.skipSpaces(u))<_)&&(u=s.bMarks[o]+s.tShift[o],s.line=w+1,s.tokens.push({type:"heading_open",hLevel:61===a?1:2,lines:[o,s.line],level:s.level}),s.tokens.push({type:"inline",content:s.src.slice(u,s.eMarks[o]).trim(),level:s.level+1,lines:[o,s.line-1],children:[]}),s.tokens.push({type:"heading_close",hLevel:61===a?1:2,level:s.level}),!0))))))}],["htmlblock",function htmlblock(s,o,i,a){var u,_,w,x=s.bMarks[o],C=s.eMarks[o],j=s.tShift[o];if(x+=j,!s.options.html)return!1;if(j>3||x+2>=C)return!1;if(60!==s.src.charCodeAt(x))return!1;if(33===(u=s.src.charCodeAt(x+1))||63===u){if(a)return!0}else{if(47!==u&&!function isLetter$1(s){var o=32|s;return o>=97&&o<=122}(u))return!1;if(47===u){if(!(_=s.src.slice(x,C).match(kA)))return!1}else if(!(_=s.src.slice(x,C).match(xA)))return!1;if(!0!==wA[_[1].toLowerCase()])return!1;if(a)return!0}for(w=o+1;w<s.lineMax&&!s.isEmpty(w);)w++;return s.line=w,s.tokens.push({type:"htmlblock",level:s.level,lines:[o,s.line],content:s.getLines(o,w,0,!0)}),!0},["paragraph","blockquote"]],["table",function table(s,o,i,a){var u,_,w,x,C,j,L,B,$,U,V;if(o+2>i)return!1;if(C=o+1,s.tShift[C]<s.blkIndent)return!1;if((w=s.bMarks[C]+s.tShift[C])>=s.eMarks[C])return!1;if(124!==(u=s.src.charCodeAt(w))&&45!==u&&58!==u)return!1;if(_=index_browser_getLine(s,o+1),!/^[-:| ]+$/.test(_))return!1;if((j=_.split("|"))<=2)return!1;for(B=[],x=0;x<j.length;x++){if(!($=j[x].trim())){if(0===x||x===j.length-1)continue;return!1}if(!/^:?-+:?$/.test($))return!1;58===$.charCodeAt($.length-1)?B.push(58===$.charCodeAt(0)?"center":"right"):58===$.charCodeAt(0)?B.push("left"):B.push("")}if(-1===(_=index_browser_getLine(s,o).trim()).indexOf("|"))return!1;if(j=_.replace(/^\||\|$/g,"").split("|"),B.length!==j.length)return!1;if(a)return!0;for(s.tokens.push({type:"table_open",lines:U=[o,0],level:s.level++}),s.tokens.push({type:"thead_open",lines:[o,o+1],level:s.level++}),s.tokens.push({type:"tr_open",lines:[o,o+1],level:s.level++}),x=0;x<j.length;x++)s.tokens.push({type:"th_open",align:B[x],lines:[o,o+1],level:s.level++}),s.tokens.push({type:"inline",content:j[x].trim(),lines:[o,o+1],level:s.level,children:[]}),s.tokens.push({type:"th_close",level:--s.level});for(s.tokens.push({type:"tr_close",level:--s.level}),s.tokens.push({type:"thead_close",level:--s.level}),s.tokens.push({type:"tbody_open",lines:V=[o+2,0],level:s.level++}),C=o+2;C<i&&!(s.tShift[C]<s.blkIndent)&&-1!==(_=index_browser_getLine(s,C).trim()).indexOf("|");C++){for(j=_.replace(/^\||\|$/g,"").split("|"),s.tokens.push({type:"tr_open",level:s.level++}),x=0;x<j.length;x++)s.tokens.push({type:"td_open",align:B[x],level:s.level++}),L=j[x].substring(124===j[x].charCodeAt(0)?1:0,124===j[x].charCodeAt(j[x].length-1)?j[x].length-1:j[x].length).trim(),s.tokens.push({type:"inline",content:L,level:s.level,children:[]}),s.tokens.push({type:"td_close",level:--s.level});s.tokens.push({type:"tr_close",level:--s.level})}return s.tokens.push({type:"tbody_close",level:--s.level}),s.tokens.push({type:"table_close",level:--s.level}),U[1]=V[1]=C,s.line=C,!0},["paragraph"]],["deflist",function deflist(s,o,i,a){var u,_,w,x,C,j,L,B,$,U,V,z,Y,Z;if(a)return!(s.ddIndent<0)&&skipMarker(s,o)>=0;if(L=o+1,s.isEmpty(L)&&++L>i)return!1;if(s.tShift[L]<s.blkIndent)return!1;if((u=skipMarker(s,L))<0)return!1;if(s.level>=s.options.maxNesting)return!1;j=s.tokens.length,s.tokens.push({type:"dl_open",lines:C=[o,0],level:s.level++}),w=o,_=L;e:for(;;){for(Z=!0,Y=!1,s.tokens.push({type:"dt_open",lines:[w,w],level:s.level++}),s.tokens.push({type:"inline",content:s.getLines(w,w+1,s.blkIndent,!1).trim(),level:s.level+1,lines:[w,w],children:[]}),s.tokens.push({type:"dt_close",level:--s.level});;){if(s.tokens.push({type:"dd_open",lines:x=[L,0],level:s.level++}),z=s.tight,$=s.ddIndent,B=s.blkIndent,V=s.tShift[_],U=s.parentType,s.blkIndent=s.ddIndent=s.tShift[_]+2,s.tShift[_]=u-s.bMarks[_],s.tight=!0,s.parentType="deflist",s.parser.tokenize(s,_,i,!0),s.tight&&!Y||(Z=!1),Y=s.line-_>1&&s.isEmpty(s.line-1),s.tShift[_]=V,s.tight=z,s.parentType=U,s.blkIndent=B,s.ddIndent=$,s.tokens.push({type:"dd_close",level:--s.level}),x[1]=L=s.line,L>=i)break e;if(s.tShift[L]<s.blkIndent)break e;if((u=skipMarker(s,L))<0)break;_=L}if(L>=i)break;if(w=L,s.isEmpty(w))break;if(s.tShift[w]<s.blkIndent)break;if((_=w+1)>=i)break;if(s.isEmpty(_)&&_++,_>=i)break;if(s.tShift[_]<s.blkIndent)break;if((u=skipMarker(s,_))<0)break}return s.tokens.push({type:"dl_close",level:--s.level}),C[1]=L,s.line=L,Z&&function markTightParagraphs$1(s,o){var i,a,u=s.level+2;for(i=o+2,a=s.tokens.length-2;i<a;i++)s.tokens[i].level===u&&"paragraph_open"===s.tokens[i].type&&(s.tokens[i+2].tight=!0,s.tokens[i].tight=!0,i+=2)}(s,j),!0},["paragraph"]],["paragraph",function paragraph(s,o){var i,a,u,_,w,x,C=o+1;if(C<(i=s.lineMax)&&!s.isEmpty(C))for(x=s.parser.ruler.getRules("paragraph");C<i&&!s.isEmpty(C);C++)if(!(s.tShift[C]-s.blkIndent>3)){for(u=!1,_=0,w=x.length;_<w;_++)if(x[_](s,C,i,!0)){u=!0;break}if(u)break}return a=s.getLines(o,C,s.blkIndent,!1).trim(),s.line=C,a.length&&(s.tokens.push({type:"paragraph_open",tight:!1,lines:[o,s.line],level:s.level}),s.tokens.push({type:"inline",content:a,level:s.level+1,lines:[o,s.line],children:[]}),s.tokens.push({type:"paragraph_close",tight:!1,level:s.level})),!0}]];function ParserBlock(){this.ruler=new Ruler;for(var s=0;s<OA.length;s++)this.ruler.push(OA[s][0],OA[s][1],{alt:(OA[s][2]||[]).slice()})}ParserBlock.prototype.tokenize=function(s,o,i){for(var a,u=this.ruler.getRules(""),_=u.length,w=o,x=!1;w<i&&(s.line=w=s.skipEmptyLines(w),!(w>=i))&&!(s.tShift[w]<s.blkIndent);){for(a=0;a<_&&!u[a](s,w,i,!1);a++);if(s.tight=!x,s.isEmpty(s.line-1)&&(x=!0),(w=s.line)<i&&s.isEmpty(w)){if(x=!0,++w<i&&"list"===s.parentType&&s.isEmpty(w))break;s.line=w}}};var AA=/[\n\t]/g,CA=/\r[\n\u0085]|[\u2424\u2028\u0085]/g,jA=/\u00a0/g;function isTerminatorChar(s){switch(s){case 10:case 92:case 96:case 42:case 95:case 94:case 91:case 93:case 33:case 38:case 60:case 62:case 123:case 125:case 36:case 37:case 64:case 126:case 43:case 61:case 58:return!0;default:return!1}}ParserBlock.prototype.parse=function(s,o,i,a){var u,_=0,w=0;if(!s)return[];(s=(s=s.replace(jA," ")).replace(CA,"\n")).indexOf("\t")>=0&&(s=s.replace(AA,(function(o,i){var a;return 10===s.charCodeAt(i)?(_=i+1,w=0,o):(a="    ".slice((i-_-w)%4),w=i-_+1,a)}))),u=new StateBlock(s,this,o,i,a),this.tokenize(u,u.line,u.lineMax)};for(var PA=[],IA=0;IA<256;IA++)PA.push(0);function isAlphaNum(s){return s>=48&&s<=57||s>=65&&s<=90||s>=97&&s<=122}function scanDelims(s,o){var i,a,u,_=o,w=!0,x=!0,C=s.posMax,j=s.src.charCodeAt(o);for(i=o>0?s.src.charCodeAt(o-1):-1;_<C&&s.src.charCodeAt(_)===j;)_++;return _>=C&&(w=!1),(u=_-o)>=4?w=x=!1:(32!==(a=_<C?s.src.charCodeAt(_):-1)&&10!==a||(w=!1),32!==i&&10!==i||(x=!1),95===j&&(isAlphaNum(i)&&(w=!1),isAlphaNum(a)&&(x=!1))),{can_open:w,can_close:x,delims:u}}"\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-".split("").forEach((function(s){PA[s.charCodeAt(0)]=1}));var TA=/\\([ \\!"#$%&'()*+,.\/:;<=>?@[\]^_`{|}~-])/g;var NA=/\\([ \\!"#$%&'()*+,.\/:;<=>?@[\]^_`{|}~-])/g;var MA=["coap","doi","javascript","aaa","aaas","about","acap","cap","cid","crid","data","dav","dict","dns","file","ftp","geo","go","gopher","h323","http","https","iax","icap","im","imap","info","ipp","iris","iris.beep","iris.xpc","iris.xpcs","iris.lwz","ldap","mailto","mid","msrp","msrps","mtqp","mupdate","news","nfs","ni","nih","nntp","opaquelocktoken","pop","pres","rtsp","service","session","shttp","sieve","sip","sips","sms","snmp","soap.beep","soap.beeps","tag","tel","telnet","tftp","thismessage","tn3270","tip","tv","urn","vemmi","ws","wss","xcon","xcon-userid","xmlrpc.beep","xmlrpc.beeps","xmpp","z39.50r","z39.50s","adiumxtra","afp","afs","aim","apt","attachment","aw","beshare","bitcoin","bolo","callto","chrome","chrome-extension","com-eventbrite-attendee","content","cvs","dlna-playsingle","dlna-playcontainer","dtn","dvb","ed2k","facetime","feed","finger","fish","gg","git","gizmoproject","gtalk","hcp","icon","ipn","irc","irc6","ircs","itms","jar","jms","keyparc","lastfm","ldaps","magnet","maps","market","message","mms","ms-help","msnim","mumble","mvn","notes","oid","palm","paparazzi","platform","proxy","psyc","query","res","resource","rmi","rsync","rtmp","secondlife","sftp","sgn","skype","smb","soldat","spotify","ssh","steam","svn","teamspeak","things","udp","unreal","ut2004","ventrilo","view-source","webcal","wtai","wyciwyg","xfire","xri","ymsgr"],RA=/^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/,DA=/^<([a-zA-Z.\-]{1,25}):([^<>\x00-\x20]*)>/;function replace$1(s,o){return s=s.source,o=o||"",function self(i,a){return i?(a=a.source||a,s=s.replace(i,a),self):new RegExp(s,o)}}var LA=replace$1(/(?:unquoted|single_quoted|double_quoted)/)("unquoted",/[^"'=<>`\x00-\x20]+/)("single_quoted",/'[^']*'/)("double_quoted",/"[^"]*"/)(),FA=replace$1(/(?:\s+attr_name(?:\s*=\s*attr_value)?)/)("attr_name",/[a-zA-Z_:][a-zA-Z0-9:._-]*/)("attr_value",LA)(),BA=replace$1(/<[A-Za-z][A-Za-z0-9]*attribute*\s*\/?>/)("attribute",FA)(),$A=replace$1(/^(?:open_tag|close_tag|comment|processing|declaration|cdata)/)("open_tag",BA)("close_tag",/<\/[A-Za-z][A-Za-z0-9]*\s*>/)("comment",/<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->/)("processing",/<[?].*?[?]>/)("declaration",/<![A-Z]+\s+[^>]*>/)("cdata",/<!\[CDATA\[[\s\S]*?\]\]>/)();var qA=/^&#((?:x[a-f0-9]{1,8}|[0-9]{1,8}));/i,UA=/^&([a-z][a-z0-9]{1,31});/i;var VA=[["text",function index_browser_text(s,o){for(var i=s.pos;i<s.posMax&&!isTerminatorChar(s.src.charCodeAt(i));)i++;return i!==s.pos&&(o||(s.pending+=s.src.slice(s.pos,i)),s.pos=i,!0)}],["newline",function newline(s,o){var i,a,u=s.pos;if(10!==s.src.charCodeAt(u))return!1;if(i=s.pending.length-1,a=s.posMax,!o)if(i>=0&&32===s.pending.charCodeAt(i))if(i>=1&&32===s.pending.charCodeAt(i-1)){for(var _=i-2;_>=0;_--)if(32!==s.pending.charCodeAt(_)){s.pending=s.pending.substring(0,_+1);break}s.push({type:"hardbreak",level:s.level})}else s.pending=s.pending.slice(0,-1),s.push({type:"softbreak",level:s.level});else s.push({type:"softbreak",level:s.level});for(u++;u<a&&32===s.src.charCodeAt(u);)u++;return s.pos=u,!0}],["escape",function index_browser_escape(s,o){var i,a=s.pos,u=s.posMax;if(92!==s.src.charCodeAt(a))return!1;if(++a<u){if((i=s.src.charCodeAt(a))<256&&0!==PA[i])return o||(s.pending+=s.src[a]),s.pos+=2,!0;if(10===i){for(o||s.push({type:"hardbreak",level:s.level}),a++;a<u&&32===s.src.charCodeAt(a);)a++;return s.pos=a,!0}}return o||(s.pending+="\\"),s.pos++,!0}],["backticks",function backticks(s,o){var i,a,u,_,w,x=s.pos;if(96!==s.src.charCodeAt(x))return!1;for(i=x,x++,a=s.posMax;x<a&&96===s.src.charCodeAt(x);)x++;for(u=s.src.slice(i,x),_=w=x;-1!==(_=s.src.indexOf("`",w));){for(w=_+1;w<a&&96===s.src.charCodeAt(w);)w++;if(w-_===u.length)return o||s.push({type:"code",content:s.src.slice(x,_).replace(/[ \n]+/g," ").trim(),block:!1,level:s.level}),s.pos=w,!0}return o||(s.pending+=u),s.pos+=u.length,!0}],["del",function del(s,o){var i,a,u,_,w,x=s.posMax,C=s.pos;if(126!==s.src.charCodeAt(C))return!1;if(o)return!1;if(C+4>=x)return!1;if(126!==s.src.charCodeAt(C+1))return!1;if(s.level>=s.options.maxNesting)return!1;if(_=C>0?s.src.charCodeAt(C-1):-1,w=s.src.charCodeAt(C+2),126===_)return!1;if(126===w)return!1;if(32===w||10===w)return!1;for(a=C+2;a<x&&126===s.src.charCodeAt(a);)a++;if(a>C+3)return s.pos+=a-C,o||(s.pending+=s.src.slice(C,a)),!0;for(s.pos=C+2,u=1;s.pos+1<x;){if(126===s.src.charCodeAt(s.pos)&&126===s.src.charCodeAt(s.pos+1)&&(_=s.src.charCodeAt(s.pos-1),126!==(w=s.pos+2<x?s.src.charCodeAt(s.pos+2):-1)&&126!==_&&(32!==_&&10!==_?u--:32!==w&&10!==w&&u++,u<=0))){i=!0;break}s.parser.skipToken(s)}return i?(s.posMax=s.pos,s.pos=C+2,o||(s.push({type:"del_open",level:s.level++}),s.parser.tokenize(s),s.push({type:"del_close",level:--s.level})),s.pos=s.posMax+2,s.posMax=x,!0):(s.pos=C,!1)}],["ins",function ins(s,o){var i,a,u,_,w,x=s.posMax,C=s.pos;if(43!==s.src.charCodeAt(C))return!1;if(o)return!1;if(C+4>=x)return!1;if(43!==s.src.charCodeAt(C+1))return!1;if(s.level>=s.options.maxNesting)return!1;if(_=C>0?s.src.charCodeAt(C-1):-1,w=s.src.charCodeAt(C+2),43===_)return!1;if(43===w)return!1;if(32===w||10===w)return!1;for(a=C+2;a<x&&43===s.src.charCodeAt(a);)a++;if(a!==C+2)return s.pos+=a-C,o||(s.pending+=s.src.slice(C,a)),!0;for(s.pos=C+2,u=1;s.pos+1<x;){if(43===s.src.charCodeAt(s.pos)&&43===s.src.charCodeAt(s.pos+1)&&(_=s.src.charCodeAt(s.pos-1),43!==(w=s.pos+2<x?s.src.charCodeAt(s.pos+2):-1)&&43!==_&&(32!==_&&10!==_?u--:32!==w&&10!==w&&u++,u<=0))){i=!0;break}s.parser.skipToken(s)}return i?(s.posMax=s.pos,s.pos=C+2,o||(s.push({type:"ins_open",level:s.level++}),s.parser.tokenize(s),s.push({type:"ins_close",level:--s.level})),s.pos=s.posMax+2,s.posMax=x,!0):(s.pos=C,!1)}],["mark",function mark(s,o){var i,a,u,_,w,x=s.posMax,C=s.pos;if(61!==s.src.charCodeAt(C))return!1;if(o)return!1;if(C+4>=x)return!1;if(61!==s.src.charCodeAt(C+1))return!1;if(s.level>=s.options.maxNesting)return!1;if(_=C>0?s.src.charCodeAt(C-1):-1,w=s.src.charCodeAt(C+2),61===_)return!1;if(61===w)return!1;if(32===w||10===w)return!1;for(a=C+2;a<x&&61===s.src.charCodeAt(a);)a++;if(a!==C+2)return s.pos+=a-C,o||(s.pending+=s.src.slice(C,a)),!0;for(s.pos=C+2,u=1;s.pos+1<x;){if(61===s.src.charCodeAt(s.pos)&&61===s.src.charCodeAt(s.pos+1)&&(_=s.src.charCodeAt(s.pos-1),61!==(w=s.pos+2<x?s.src.charCodeAt(s.pos+2):-1)&&61!==_&&(32!==_&&10!==_?u--:32!==w&&10!==w&&u++,u<=0))){i=!0;break}s.parser.skipToken(s)}return i?(s.posMax=s.pos,s.pos=C+2,o||(s.push({type:"mark_open",level:s.level++}),s.parser.tokenize(s),s.push({type:"mark_close",level:--s.level})),s.pos=s.posMax+2,s.posMax=x,!0):(s.pos=C,!1)}],["emphasis",function emphasis(s,o){var i,a,u,_,w,x,C,j=s.posMax,L=s.pos,B=s.src.charCodeAt(L);if(95!==B&&42!==B)return!1;if(o)return!1;if(i=(C=scanDelims(s,L)).delims,!C.can_open)return s.pos+=i,o||(s.pending+=s.src.slice(L,s.pos)),!0;if(s.level>=s.options.maxNesting)return!1;for(s.pos=L+i,x=[i];s.pos<j;)if(s.src.charCodeAt(s.pos)!==B)s.parser.skipToken(s);else{if(a=(C=scanDelims(s,s.pos)).delims,C.can_close){for(_=x.pop(),w=a;_!==w;){if(w<_){x.push(_-w);break}if(w-=_,0===x.length)break;s.pos+=_,_=x.pop()}if(0===x.length){i=_,u=!0;break}s.pos+=a;continue}C.can_open&&x.push(a),s.pos+=a}return u?(s.posMax=s.pos,s.pos=L+i,o||(2!==i&&3!==i||s.push({type:"strong_open",level:s.level++}),1!==i&&3!==i||s.push({type:"em_open",level:s.level++}),s.parser.tokenize(s),1!==i&&3!==i||s.push({type:"em_close",level:--s.level}),2!==i&&3!==i||s.push({type:"strong_close",level:--s.level})),s.pos=s.posMax+i,s.posMax=j,!0):(s.pos=L,!1)}],["sub",function sub(s,o){var i,a,u=s.posMax,_=s.pos;if(126!==s.src.charCodeAt(_))return!1;if(o)return!1;if(_+2>=u)return!1;if(s.level>=s.options.maxNesting)return!1;for(s.pos=_+1;s.pos<u;){if(126===s.src.charCodeAt(s.pos)){i=!0;break}s.parser.skipToken(s)}return i&&_+1!==s.pos?(a=s.src.slice(_+1,s.pos)).match(/(^|[^\\])(\\\\)*\s/)?(s.pos=_,!1):(s.posMax=s.pos,s.pos=_+1,o||s.push({type:"sub",level:s.level,content:a.replace(TA,"$1")}),s.pos=s.posMax+1,s.posMax=u,!0):(s.pos=_,!1)}],["sup",function sup(s,o){var i,a,u=s.posMax,_=s.pos;if(94!==s.src.charCodeAt(_))return!1;if(o)return!1;if(_+2>=u)return!1;if(s.level>=s.options.maxNesting)return!1;for(s.pos=_+1;s.pos<u;){if(94===s.src.charCodeAt(s.pos)){i=!0;break}s.parser.skipToken(s)}return i&&_+1!==s.pos?(a=s.src.slice(_+1,s.pos)).match(/(^|[^\\])(\\\\)*\s/)?(s.pos=_,!1):(s.posMax=s.pos,s.pos=_+1,o||s.push({type:"sup",level:s.level,content:a.replace(NA,"$1")}),s.pos=s.posMax+1,s.posMax=u,!0):(s.pos=_,!1)}],["links",function links(s,o){var i,a,u,_,w,x,C,j,L=!1,B=s.pos,$=s.posMax,U=s.pos,V=s.src.charCodeAt(U);if(33===V&&(L=!0,V=s.src.charCodeAt(++U)),91!==V)return!1;if(s.level>=s.options.maxNesting)return!1;if(i=U+1,(a=parseLinkLabel(s,U))<0)return!1;if((x=a+1)<$&&40===s.src.charCodeAt(x)){for(x++;x<$&&(32===(j=s.src.charCodeAt(x))||10===j);x++);if(x>=$)return!1;for(U=x,parseLinkDestination(s,x)?(_=s.linkContent,x=s.pos):_="",U=x;x<$&&(32===(j=s.src.charCodeAt(x))||10===j);x++);if(x<$&&U!==x&&parseLinkTitle(s,x))for(w=s.linkContent,x=s.pos;x<$&&(32===(j=s.src.charCodeAt(x))||10===j);x++);else w="";if(x>=$||41!==s.src.charCodeAt(x))return s.pos=B,!1;x++}else{if(s.linkLevel>0)return!1;for(;x<$&&(32===(j=s.src.charCodeAt(x))||10===j);x++);if(x<$&&91===s.src.charCodeAt(x)&&(U=x+1,(x=parseLinkLabel(s,x))>=0?u=s.src.slice(U,x++):x=U-1),u||(void 0===u&&(x=a+1),u=s.src.slice(i,a)),!(C=s.env.references[normalizeReference(u)]))return s.pos=B,!1;_=C.href,w=C.title}return o||(s.pos=i,s.posMax=a,L?s.push({type:"image",src:_,title:w,alt:s.src.substr(i,a-i),level:s.level}):(s.push({type:"link_open",href:_,title:w,level:s.level++}),s.linkLevel++,s.parser.tokenize(s),s.linkLevel--,s.push({type:"link_close",level:--s.level}))),s.pos=x,s.posMax=$,!0}],["footnote_inline",function footnote_inline(s,o){var i,a,u,_,w=s.posMax,x=s.pos;return!(x+2>=w)&&(94===s.src.charCodeAt(x)&&(91===s.src.charCodeAt(x+1)&&(!(s.level>=s.options.maxNesting)&&(i=x+2,!((a=parseLinkLabel(s,x+1))<0)&&(o||(s.env.footnotes||(s.env.footnotes={}),s.env.footnotes.list||(s.env.footnotes.list=[]),u=s.env.footnotes.list.length,s.pos=i,s.posMax=a,s.push({type:"footnote_ref",id:u,level:s.level}),s.linkLevel++,_=s.tokens.length,s.parser.tokenize(s),s.env.footnotes.list[u]={tokens:s.tokens.splice(_)},s.linkLevel--),s.pos=a+1,s.posMax=w,!0)))))}],["footnote_ref",function footnote_ref(s,o){var i,a,u,_,w=s.posMax,x=s.pos;if(x+3>w)return!1;if(!s.env.footnotes||!s.env.footnotes.refs)return!1;if(91!==s.src.charCodeAt(x))return!1;if(94!==s.src.charCodeAt(x+1))return!1;if(s.level>=s.options.maxNesting)return!1;for(a=x+2;a<w;a++){if(32===s.src.charCodeAt(a))return!1;if(10===s.src.charCodeAt(a))return!1;if(93===s.src.charCodeAt(a))break}return a!==x+2&&(!(a>=w)&&(a++,i=s.src.slice(x+2,a-1),void 0!==s.env.footnotes.refs[":"+i]&&(o||(s.env.footnotes.list||(s.env.footnotes.list=[]),s.env.footnotes.refs[":"+i]<0?(u=s.env.footnotes.list.length,s.env.footnotes.list[u]={label:i,count:0},s.env.footnotes.refs[":"+i]=u):u=s.env.footnotes.refs[":"+i],_=s.env.footnotes.list[u].count,s.env.footnotes.list[u].count++,s.push({type:"footnote_ref",id:u,subId:_,level:s.level})),s.pos=a,s.posMax=w,!0)))}],["autolink",function autolink(s,o){var i,a,u,_,w,x=s.pos;return 60===s.src.charCodeAt(x)&&(!((i=s.src.slice(x)).indexOf(">")<0)&&((a=i.match(DA))?!(MA.indexOf(a[1].toLowerCase())<0)&&(w=normalizeLink(_=a[0].slice(1,-1)),!!s.parser.validateLink(_)&&(o||(s.push({type:"link_open",href:w,level:s.level}),s.push({type:"text",content:_,level:s.level+1}),s.push({type:"link_close",level:s.level})),s.pos+=a[0].length,!0)):!!(u=i.match(RA))&&(w=normalizeLink("mailto:"+(_=u[0].slice(1,-1))),!!s.parser.validateLink(w)&&(o||(s.push({type:"link_open",href:w,level:s.level}),s.push({type:"text",content:_,level:s.level+1}),s.push({type:"link_close",level:s.level})),s.pos+=u[0].length,!0))))}],["htmltag",function htmltag(s,o){var i,a,u,_=s.pos;return!!s.options.html&&(u=s.posMax,!(60!==s.src.charCodeAt(_)||_+2>=u)&&(!(33!==(i=s.src.charCodeAt(_+1))&&63!==i&&47!==i&&!function isLetter$2(s){var o=32|s;return o>=97&&o<=122}(i))&&(!!(a=s.src.slice(_).match($A))&&(o||s.push({type:"htmltag",content:s.src.slice(_,_+a[0].length),level:s.level}),s.pos+=a[0].length,!0))))}],["entity",function entity(s,o){var i,a,u=s.pos,_=s.posMax;if(38!==s.src.charCodeAt(u))return!1;if(u+1<_)if(35===s.src.charCodeAt(u+1)){if(a=s.src.slice(u).match(qA))return o||(i="x"===a[1][0].toLowerCase()?parseInt(a[1].slice(1),16):parseInt(a[1],10),s.pending+=isValidEntityCode(i)?fromCodePoint(i):fromCodePoint(65533)),s.pos+=a[0].length,!0}else if(a=s.src.slice(u).match(UA)){var w=decodeEntity(a[1]);if(a[1]!==w)return o||(s.pending+=w),s.pos+=a[0].length,!0}return o||(s.pending+="&"),s.pos++,!0}]];function ParserInline(){this.ruler=new Ruler;for(var s=0;s<VA.length;s++)this.ruler.push(VA[s][0],VA[s][1]);this.validateLink=validateLink}function validateLink(s){var o=s.trim().toLowerCase();return-1===(o=replaceEntities(o)).indexOf(":")||-1===["vbscript","javascript","file","data"].indexOf(o.split(":")[0])}ParserInline.prototype.skipToken=function(s){var o,i,a=this.ruler.getRules(""),u=a.length,_=s.pos;if((i=s.cacheGet(_))>0)s.pos=i;else{for(o=0;o<u;o++)if(a[o](s,!0))return void s.cacheSet(_,s.pos);s.pos++,s.cacheSet(_,s.pos)}},ParserInline.prototype.tokenize=function(s){for(var o,i,a=this.ruler.getRules(""),u=a.length,_=s.posMax;s.pos<_;){for(i=0;i<u&&!(o=a[i](s,!1));i++);if(o){if(s.pos>=_)break}else s.pending+=s.src[s.pos++]}s.pending&&s.pushPending()},ParserInline.prototype.parse=function(s,o,i,a){var u=new StateInline(s,this,o,i,a);this.tokenize(u)};var zA={default:{options:{html:!1,xhtmlOut:!1,breaks:!1,langPrefix:"language-",linkTarget:"",typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:20},components:{core:{rules:["block","inline","references","replacements","smartquotes","references","abbr2","footnote_tail"]},block:{rules:["blockquote","code","fences","footnote","heading","hr","htmlblock","lheading","list","paragraph","table"]},inline:{rules:["autolink","backticks","del","emphasis","entity","escape","footnote_ref","htmltag","links","newline","text"]}}},full:{options:{html:!1,xhtmlOut:!1,breaks:!1,langPrefix:"language-",linkTarget:"",typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:20},components:{core:{},block:{},inline:{}}},commonmark:{options:{html:!0,xhtmlOut:!0,breaks:!1,langPrefix:"language-",linkTarget:"",typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:20},components:{core:{rules:["block","inline","references","abbr2"]},block:{rules:["blockquote","code","fences","heading","hr","htmlblock","lheading","list","paragraph"]},inline:{rules:["autolink","backticks","emphasis","entity","escape","htmltag","links","newline","text"]}}}};function StateCore(s,o,i){this.src=o,this.env=i,this.options=s.options,this.tokens=[],this.inlineMode=!1,this.inline=s.inline,this.block=s.block,this.renderer=s.renderer,this.typographer=s.typographer}function Remarkable(s,o){"string"!=typeof s&&(o=s,s="default"),o&&null!=o.linkify&&console.warn("linkify option is removed. Use linkify plugin instead:\n\nimport Remarkable from 'remarkable';\nimport linkify from 'remarkable/linkify';\nnew Remarkable().use(linkify)\n"),this.inline=new ParserInline,this.block=new ParserBlock,this.core=new Core,this.renderer=new Renderer,this.ruler=new Ruler,this.options={},this.configure(zA[s]),this.set(o||{})}Remarkable.prototype.set=function(s){index_browser_assign(this.options,s)},Remarkable.prototype.configure=function(s){var o=this;if(!s)throw new Error("Wrong `remarkable` preset, check name/content");s.options&&o.set(s.options),s.components&&Object.keys(s.components).forEach((function(i){s.components[i].rules&&o[i].ruler.enable(s.components[i].rules,!0)}))},Remarkable.prototype.use=function(s,o){return s(this,o),this},Remarkable.prototype.parse=function(s,o){var i=new StateCore(this,s,o);return this.core.process(i),i.tokens},Remarkable.prototype.render=function(s,o){return o=o||{},this.renderer.render(this.parse(s,o),this.options,o)},Remarkable.prototype.parseInline=function(s,o){var i=new StateCore(this,s,o);return i.inlineMode=!0,this.core.process(i),i.tokens},Remarkable.prototype.renderInline=function(s,o){return o=o||{},this.renderer.render(this.parseInline(s,o),this.options,o)};function indexOf(s,o){if(Array.prototype.indexOf)return s.indexOf(o);for(var i=0,a=s.length;i<a;i++)if(s[i]===o)return i;return-1}function utils_remove(s,o){for(var i=s.length-1;i>=0;i--)!0===o(s[i])&&s.splice(i,1)}function throwUnhandledCaseError(s){throw new Error("Unhandled case for value: '".concat(s,"'"))}var WA=function(){function HtmlTag(s){void 0===s&&(s={}),this.tagName="",this.attrs={},this.innerHTML="",this.whitespaceRegex=/\s+/,this.tagName=s.tagName||"",this.attrs=s.attrs||{},this.innerHTML=s.innerHtml||s.innerHTML||""}return HtmlTag.prototype.setTagName=function(s){return this.tagName=s,this},HtmlTag.prototype.getTagName=function(){return this.tagName||""},HtmlTag.prototype.setAttr=function(s,o){return this.getAttrs()[s]=o,this},HtmlTag.prototype.getAttr=function(s){return this.getAttrs()[s]},HtmlTag.prototype.setAttrs=function(s){return Object.assign(this.getAttrs(),s),this},HtmlTag.prototype.getAttrs=function(){return this.attrs||(this.attrs={})},HtmlTag.prototype.setClass=function(s){return this.setAttr("class",s)},HtmlTag.prototype.addClass=function(s){for(var o,i=this.getClass(),a=this.whitespaceRegex,u=i?i.split(a):[],_=s.split(a);o=_.shift();)-1===indexOf(u,o)&&u.push(o);return this.getAttrs().class=u.join(" "),this},HtmlTag.prototype.removeClass=function(s){for(var o,i=this.getClass(),a=this.whitespaceRegex,u=i?i.split(a):[],_=s.split(a);u.length&&(o=_.shift());){var w=indexOf(u,o);-1!==w&&u.splice(w,1)}return this.getAttrs().class=u.join(" "),this},HtmlTag.prototype.getClass=function(){return this.getAttrs().class||""},HtmlTag.prototype.hasClass=function(s){return-1!==(" "+this.getClass()+" ").indexOf(" "+s+" ")},HtmlTag.prototype.setInnerHTML=function(s){return this.innerHTML=s,this},HtmlTag.prototype.setInnerHtml=function(s){return this.setInnerHTML(s)},HtmlTag.prototype.getInnerHTML=function(){return this.innerHTML||""},HtmlTag.prototype.getInnerHtml=function(){return this.getInnerHTML()},HtmlTag.prototype.toAnchorString=function(){var s=this.getTagName(),o=this.buildAttrsStr();return["<",s,o=o?" "+o:"",">",this.getInnerHtml(),"</",s,">"].join("")},HtmlTag.prototype.buildAttrsStr=function(){if(!this.attrs)return"";var s=this.getAttrs(),o=[];for(var i in s)s.hasOwnProperty(i)&&o.push(i+'="'+s[i]+'"');return o.join(" ")},HtmlTag}();var JA=function(){function AnchorTagBuilder(s){void 0===s&&(s={}),this.newWindow=!1,this.truncate={},this.className="",this.newWindow=s.newWindow||!1,this.truncate=s.truncate||{},this.className=s.className||""}return AnchorTagBuilder.prototype.build=function(s){return new WA({tagName:"a",attrs:this.createAttrs(s),innerHtml:this.processAnchorText(s.getAnchorText())})},AnchorTagBuilder.prototype.createAttrs=function(s){var o={href:s.getAnchorHref()},i=this.createCssClass(s);return i&&(o.class=i),this.newWindow&&(o.target="_blank",o.rel="noopener noreferrer"),this.truncate&&this.truncate.length&&this.truncate.length<s.getAnchorText().length&&(o.title=s.getAnchorHref()),o},AnchorTagBuilder.prototype.createCssClass=function(s){var o=this.className;if(o){for(var i=[o],a=s.getCssClassSuffixes(),u=0,_=a.length;u<_;u++)i.push(o+"-"+a[u]);return i.join(" ")}return""},AnchorTagBuilder.prototype.processAnchorText=function(s){return s=this.doTruncate(s)},AnchorTagBuilder.prototype.doTruncate=function(s){var o=this.truncate;if(!o||!o.length)return s;var i=o.length,a=o.location;return"smart"===a?function truncateSmart(s,o,i){var a,u;null==i?(i="&hellip;",u=3,a=8):(u=i.length,a=i.length);var buildUrl=function(s){var o="";return s.scheme&&s.host&&(o+=s.scheme+"://"),s.host&&(o+=s.host),s.path&&(o+="/"+s.path),s.query&&(o+="?"+s.query),s.fragment&&(o+="#"+s.fragment),o},buildSegment=function(s,o){var a=o/2,u=Math.ceil(a),_=-1*Math.floor(a),w="";return _<0&&(w=s.substr(_)),s.substr(0,u)+i+w};if(s.length<=o)return s;var _=o-u,w=function(s){var o={},i=s,a=i.match(/^([a-z]+):\/\//i);return a&&(o.scheme=a[1],i=i.substr(a[0].length)),(a=i.match(/^(.*?)(?=(\?|#|\/|$))/i))&&(o.host=a[1],i=i.substr(a[0].length)),(a=i.match(/^\/(.*?)(?=(\?|#|$))/i))&&(o.path=a[1],i=i.substr(a[0].length)),(a=i.match(/^\?(.*?)(?=(#|$))/i))&&(o.query=a[1],i=i.substr(a[0].length)),(a=i.match(/^#(.*?)$/i))&&(o.fragment=a[1]),o}(s);if(w.query){var x=w.query.match(/^(.*?)(?=(\?|\#))(.*?)$/i);x&&(w.query=w.query.substr(0,x[1].length),s=buildUrl(w))}if(s.length<=o)return s;if(w.host&&(w.host=w.host.replace(/^www\./,""),s=buildUrl(w)),s.length<=o)return s;var C="";if(w.host&&(C+=w.host),C.length>=_)return w.host.length==o?(w.host.substr(0,o-u)+i).substr(0,_+a):buildSegment(C,_).substr(0,_+a);var j="";if(w.path&&(j+="/"+w.path),w.query&&(j+="?"+w.query),j){if((C+j).length>=_)return(C+j).length==o?(C+j).substr(0,o):(C+buildSegment(j,_-C.length)).substr(0,_+a);C+=j}if(w.fragment){var L="#"+w.fragment;if((C+L).length>=_)return(C+L).length==o?(C+L).substr(0,o):(C+buildSegment(L,_-C.length)).substr(0,_+a);C+=L}if(w.scheme&&w.host){var B=w.scheme+"://";if((C+B).length<_)return(B+C).substr(0,o)}if(C.length<=o)return C;var $="";return _>0&&($=C.substr(-1*Math.floor(_/2))),(C.substr(0,Math.ceil(_/2))+i+$).substr(0,_+a)}(s,i):"middle"===a?function truncateMiddle(s,o,i){if(s.length<=o)return s;var a,u;null==i?(i="&hellip;",a=8,u=3):(a=i.length,u=i.length);var _=o-u,w="";return _>0&&(w=s.substr(-1*Math.floor(_/2))),(s.substr(0,Math.ceil(_/2))+i+w).substr(0,_+a)}(s,i):function truncateEnd(s,o,i){return function ellipsis(s,o,i){var a;return s.length>o&&(null==i?(i="&hellip;",a=3):a=i.length,s=s.substring(0,o-a)+i),s}(s,o,i)}(s,i)},AnchorTagBuilder}(),HA=function(){function Match(s){this.__jsduckDummyDocProp=null,this.matchedText="",this.offset=0,this.tagBuilder=s.tagBuilder,this.matchedText=s.matchedText,this.offset=s.offset}return Match.prototype.getMatchedText=function(){return this.matchedText},Match.prototype.setOffset=function(s){this.offset=s},Match.prototype.getOffset=function(){return this.offset},Match.prototype.getCssClassSuffixes=function(){return[this.getType()]},Match.prototype.buildTag=function(){return this.tagBuilder.build(this)},Match}(),extendStatics=function(s,o){return extendStatics=Object.setPrototypeOf||{__proto__:[]}instanceof Array&&function(s,o){s.__proto__=o}||function(s,o){for(var i in o)Object.prototype.hasOwnProperty.call(o,i)&&(s[i]=o[i])},extendStatics(s,o)};function tslib_es6_extends(s,o){if("function"!=typeof o&&null!==o)throw new TypeError("Class extends value "+String(o)+" is not a constructor or null");function __(){this.constructor=s}extendStatics(s,o),s.prototype=null===o?Object.create(o):(__.prototype=o.prototype,new __)}var __assign=function(){return __assign=Object.assign||function __assign(s){for(var o,i=1,a=arguments.length;i<a;i++)for(var u in o=arguments[i])Object.prototype.hasOwnProperty.call(o,u)&&(s[u]=o[u]);return s},__assign.apply(this,arguments)};Object.create;Object.create;"function"==typeof SuppressedError&&SuppressedError;var KA,GA=function(s){function EmailMatch(o){var i=s.call(this,o)||this;return i.email="",i.email=o.email,i}return tslib_es6_extends(EmailMatch,s),EmailMatch.prototype.getType=function(){return"email"},EmailMatch.prototype.getEmail=function(){return this.email},EmailMatch.prototype.getAnchorHref=function(){return"mailto:"+this.email},EmailMatch.prototype.getAnchorText=function(){return this.email},EmailMatch}(HA),YA=function(s){function HashtagMatch(o){var i=s.call(this,o)||this;return i.serviceName="",i.hashtag="",i.serviceName=o.serviceName,i.hashtag=o.hashtag,i}return tslib_es6_extends(HashtagMatch,s),HashtagMatch.prototype.getType=function(){return"hashtag"},HashtagMatch.prototype.getServiceName=function(){return this.serviceName},HashtagMatch.prototype.getHashtag=function(){return this.hashtag},HashtagMatch.prototype.getAnchorHref=function(){var s=this.serviceName,o=this.hashtag;switch(s){case"twitter":return"https://twitter.com/hashtag/"+o;case"facebook":return"https://www.facebook.com/hashtag/"+o;case"instagram":return"https://instagram.com/explore/tags/"+o;case"tiktok":return"https://www.tiktok.com/tag/"+o;default:throw new Error("Unknown service name to point hashtag to: "+s)}},HashtagMatch.prototype.getAnchorText=function(){return"#"+this.hashtag},HashtagMatch}(HA),XA=function(s){function MentionMatch(o){var i=s.call(this,o)||this;return i.serviceName="twitter",i.mention="",i.mention=o.mention,i.serviceName=o.serviceName,i}return tslib_es6_extends(MentionMatch,s),MentionMatch.prototype.getType=function(){return"mention"},MentionMatch.prototype.getMention=function(){return this.mention},MentionMatch.prototype.getServiceName=function(){return this.serviceName},MentionMatch.prototype.getAnchorHref=function(){switch(this.serviceName){case"twitter":return"https://twitter.com/"+this.mention;case"instagram":return"https://instagram.com/"+this.mention;case"soundcloud":return"https://soundcloud.com/"+this.mention;case"tiktok":return"https://www.tiktok.com/@"+this.mention;default:throw new Error("Unknown service name to point mention to: "+this.serviceName)}},MentionMatch.prototype.getAnchorText=function(){return"@"+this.mention},MentionMatch.prototype.getCssClassSuffixes=function(){var o=s.prototype.getCssClassSuffixes.call(this),i=this.getServiceName();return i&&o.push(i),o},MentionMatch}(HA),QA=function(s){function PhoneMatch(o){var i=s.call(this,o)||this;return i.number="",i.plusSign=!1,i.number=o.number,i.plusSign=o.plusSign,i}return tslib_es6_extends(PhoneMatch,s),PhoneMatch.prototype.getType=function(){return"phone"},PhoneMatch.prototype.getPhoneNumber=function(){return this.number},PhoneMatch.prototype.getNumber=function(){return this.getPhoneNumber()},PhoneMatch.prototype.getAnchorHref=function(){return"tel:"+(this.plusSign?"+":"")+this.number},PhoneMatch.prototype.getAnchorText=function(){return this.matchedText},PhoneMatch}(HA),ZA=function(s){function UrlMatch(o){var i=s.call(this,o)||this;return i.url="",i.urlMatchType="scheme",i.protocolUrlMatch=!1,i.protocolRelativeMatch=!1,i.stripPrefix={scheme:!0,www:!0},i.stripTrailingSlash=!0,i.decodePercentEncoding=!0,i.schemePrefixRegex=/^(https?:\/\/)?/i,i.wwwPrefixRegex=/^(https?:\/\/)?(www\.)?/i,i.protocolRelativeRegex=/^\/\//,i.protocolPrepended=!1,i.urlMatchType=o.urlMatchType,i.url=o.url,i.protocolUrlMatch=o.protocolUrlMatch,i.protocolRelativeMatch=o.protocolRelativeMatch,i.stripPrefix=o.stripPrefix,i.stripTrailingSlash=o.stripTrailingSlash,i.decodePercentEncoding=o.decodePercentEncoding,i}return tslib_es6_extends(UrlMatch,s),UrlMatch.prototype.getType=function(){return"url"},UrlMatch.prototype.getUrlMatchType=function(){return this.urlMatchType},UrlMatch.prototype.getUrl=function(){var s=this.url;return this.protocolRelativeMatch||this.protocolUrlMatch||this.protocolPrepended||(s=this.url="http://"+s,this.protocolPrepended=!0),s},UrlMatch.prototype.getAnchorHref=function(){return this.getUrl().replace(/&amp;/g,"&")},UrlMatch.prototype.getAnchorText=function(){var s=this.getMatchedText();return this.protocolRelativeMatch&&(s=this.stripProtocolRelativePrefix(s)),this.stripPrefix.scheme&&(s=this.stripSchemePrefix(s)),this.stripPrefix.www&&(s=this.stripWwwPrefix(s)),this.stripTrailingSlash&&(s=this.removeTrailingSlash(s)),this.decodePercentEncoding&&(s=this.removePercentEncoding(s)),s},UrlMatch.prototype.stripSchemePrefix=function(s){return s.replace(this.schemePrefixRegex,"")},UrlMatch.prototype.stripWwwPrefix=function(s){return s.replace(this.wwwPrefixRegex,"$1")},UrlMatch.prototype.stripProtocolRelativePrefix=function(s){return s.replace(this.protocolRelativeRegex,"")},UrlMatch.prototype.removeTrailingSlash=function(s){return"/"===s.charAt(s.length-1)&&(s=s.slice(0,-1)),s},UrlMatch.prototype.removePercentEncoding=function(s){var o=s.replace(/%22/gi,"&quot;").replace(/%26/gi,"&amp;").replace(/%27/gi,"&#39;").replace(/%3C/gi,"&lt;").replace(/%3E/gi,"&gt;");try{return decodeURIComponent(o)}catch(s){return o}},UrlMatch}(HA),eC=function eC(s){this.__jsduckDummyDocProp=null,this.tagBuilder=s.tagBuilder},tC=/[A-Za-z]/,rC=/[\d]/,nC=/[\D]/,sC=/\s/,oC=/['"]/,iC=/[\x00-\x1F\x7F]/,aC=/A-Za-z\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u052F\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u08A0-\u08B4\u08B6-\u08BD\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0980\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0AF9\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60\u0C61\u0C80\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D54-\u0D56\u0D5F-\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F5\u13F8-\u13FD\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16F1-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u1820-\u1877\u1880-\u1884\u1887-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191E\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u1A00-\u1A16\u1A20-\u1A54\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1C80-\u1C88\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5\u1CF6\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2183\u2184\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005\u3006\u3031-\u3035\u303B\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\u31F0-\u31FF\u3400-\u4DB5\u4E00-\u9FD5\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A\uA62B\uA640-\uA66E\uA67F-\uA69D\uA6A0-\uA6E5\uA717-\uA71F\uA722-\uA788\uA78B-\uA7AE\uA7B0-\uA7B7\uA7F7-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA8FD\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF\uA9E0-\uA9E4\uA9E6-\uA9EF\uA9FA-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uAB30-\uAB5A\uAB5C-\uAB65\uAB70-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC/.source,cC=aC+/\u2700-\u27bf\udde6-\uddff\ud800-\udbff\udc00-\udfff\ufe0e\ufe0f\u0300-\u036f\ufe20-\ufe23\u20d0-\u20f0\ud83c\udffb-\udfff\u200d\u3299\u3297\u303d\u3030\u24c2\ud83c\udd70-\udd71\udd7e-\udd7f\udd8e\udd91-\udd9a\udde6-\uddff\ude01-\ude02\ude1a\ude2f\ude32-\ude3a\ude50-\ude51\u203c\u2049\u25aa-\u25ab\u25b6\u25c0\u25fb-\u25fe\u00a9\u00ae\u2122\u2139\udc04\u2600-\u26FF\u2b05\u2b06\u2b07\u2b1b\u2b1c\u2b50\u2b55\u231a\u231b\u2328\u23cf\u23e9-\u23f3\u23f8-\u23fa\udccf\u2935\u2934\u2190-\u21ff/.source+/\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u08D4-\u08E1\u08E3-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C00-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C81-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0D01-\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u1885\u1886\u18A9\u1920-\u192B\u1930-\u193B\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1AB0-\u1ABE\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF2-\u1CF4\u1CF8\u1CF9\u1DC0-\u1DF5\u1DFB-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C5\uA8E0-\uA8F1\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uA9E5\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B-\uAA7D\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE2F/.source,lC=/0-9\u0660-\u0669\u06F0-\u06F9\u07C0-\u07C9\u0966-\u096F\u09E6-\u09EF\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE6-\u0BEF\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0DE6-\u0DEF\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29\u1040-\u1049\u1090-\u1099\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9\u1A80-\u1A89\u1A90-\u1A99\u1B50-\u1B59\u1BB0-\u1BB9\u1C40-\u1C49\u1C50-\u1C59\uA620-\uA629\uA8D0-\uA8D9\uA900-\uA909\uA9D0-\uA9D9\uA9F0-\uA9F9\uAA50-\uAA59\uABF0-\uABF9\uFF10-\uFF19/.source,uC=cC+lC,pC=cC+lC,hC=new RegExp("[".concat(pC,"]")),dC="(?:["+lC+"]{1,3}\\.){3}["+lC+"]{1,3}",fC="["+pC+"](?:["+pC+"\\-_]{0,61}["+pC+"])?",getDomainLabelStr=function(s){return"(?=("+fC+"))\\"+s},getDomainNameStr=function(s){return"(?:"+getDomainLabelStr(s)+"(?:\\."+getDomainLabelStr(s+1)+"){0,126}|"+dC+")"},mC=(new RegExp("["+pC+".\\-]*["+pC+"\\-]"),hC),gC=/(?:xn--vermgensberatung-pwb|xn--vermgensberater-ctb|xn--clchc0ea0b2g2a9gcd|xn--w4r85el8fhu5dnra|northwesternmutual|travelersinsurance|vermögensberatung|xn--5su34j936bgsg|xn--bck1b9a5dre4c|xn--mgbah1a3hjkrd|xn--mgbai9azgqp6j|xn--mgberp4a5d4ar|xn--xkc2dl3a5ee0h|vermögensberater|xn--fzys8d69uvgm|xn--mgba7c0bbn0a|xn--mgbcpq6gpa1a|xn--xkc2al3hye2a|americanexpress|kerryproperties|sandvikcoromant|xn--i1b6b1a6a2e|xn--kcrx77d1x4a|xn--lgbbat1ad8j|xn--mgba3a4f16a|xn--mgbaakc7dvf|xn--mgbc0a9azcg|xn--nqv7fs00ema|americanfamily|bananarepublic|cancerresearch|cookingchannel|kerrylogistics|weatherchannel|xn--54b7fta0cc|xn--6qq986b3xl|xn--80aqecdr1a|xn--b4w605ferd|xn--fiq228c5hs|xn--h2breg3eve|xn--jlq480n2rg|xn--jlq61u9w7b|xn--mgba3a3ejt|xn--mgbaam7a8h|xn--mgbayh7gpa|xn--mgbbh1a71e|xn--mgbca7dzdo|xn--mgbi4ecexp|xn--mgbx4cd0ab|xn--rvc1e0am3e|international|lifeinsurance|travelchannel|wolterskluwer|xn--cckwcxetd|xn--eckvdtc9d|xn--fpcrj9c3d|xn--fzc2c9e2c|xn--h2brj9c8c|xn--tiq49xqyj|xn--yfro4i67o|xn--ygbi2ammx|construction|lplfinancial|scholarships|versicherung|xn--3e0b707e|xn--45br5cyl|xn--4dbrk0ce|xn--80adxhks|xn--80asehdb|xn--8y0a063a|xn--gckr3f0f|xn--mgb9awbf|xn--mgbab2bd|xn--mgbgu82a|xn--mgbpl2fh|xn--mgbt3dhd|xn--mk1bu44c|xn--ngbc5azd|xn--ngbe9e0a|xn--ogbpf8fl|xn--qcka1pmc|accountants|barclaycard|blackfriday|blockbuster|bridgestone|calvinklein|contractors|creditunion|engineering|enterprises|foodnetwork|investments|kerryhotels|lamborghini|motorcycles|olayangroup|photography|playstation|productions|progressive|redumbrella|williamhill|xn--11b4c3d|xn--1ck2e1b|xn--1qqw23a|xn--2scrj9c|xn--3bst00m|xn--3ds443g|xn--3hcrj9c|xn--42c2d9a|xn--45brj9c|xn--55qw42g|xn--6frz82g|xn--80ao21a|xn--9krt00a|xn--cck2b3b|xn--czr694b|xn--d1acj3b|xn--efvy88h|xn--fct429k|xn--fjq720a|xn--flw351e|xn--g2xx48c|xn--gecrj9c|xn--gk3at1e|xn--h2brj9c|xn--hxt814e|xn--imr513n|xn--j6w193g|xn--jvr189m|xn--kprw13d|xn--kpry57d|xn--mgbbh1a|xn--mgbtx2b|xn--mix891f|xn--nyqy26a|xn--otu796d|xn--pgbs0dh|xn--q9jyb4c|xn--rhqv96g|xn--rovu88b|xn--s9brj9c|xn--ses554g|xn--t60b56a|xn--vuq861b|xn--w4rs40l|xn--xhq521b|xn--zfr164b|சிங்கப்பூர்|accountant|apartments|associates|basketball|bnpparibas|boehringer|capitalone|consulting|creditcard|cuisinella|eurovision|extraspace|foundation|healthcare|immobilien|industries|management|mitsubishi|nextdirect|properties|protection|prudential|realestate|republican|restaurant|schaeffler|tatamotors|technology|university|vlaanderen|volkswagen|xn--30rr7y|xn--3pxu8k|xn--45q11c|xn--4gbrim|xn--55qx5d|xn--5tzm5g|xn--80aswg|xn--90a3ac|xn--9dbq2a|xn--9et52u|xn--c2br7g|xn--cg4bki|xn--czrs0t|xn--czru2d|xn--fiq64b|xn--fiqs8s|xn--fiqz9s|xn--io0a7i|xn--kput3i|xn--mxtq1m|xn--o3cw4h|xn--pssy2u|xn--q7ce6a|xn--unup4y|xn--wgbh1c|xn--wgbl6a|xn--y9a3aq|accenture|alfaromeo|allfinanz|amsterdam|analytics|aquarelle|barcelona|bloomberg|christmas|community|directory|education|equipment|fairwinds|financial|firestone|fresenius|frontdoor|furniture|goldpoint|hisamitsu|homedepot|homegoods|homesense|institute|insurance|kuokgroup|lancaster|landrover|lifestyle|marketing|marshalls|melbourne|microsoft|panasonic|passagens|pramerica|richardli|shangrila|solutions|statebank|statefarm|stockholm|travelers|vacations|xn--90ais|xn--c1avg|xn--d1alf|xn--e1a4c|xn--fhbei|xn--j1aef|xn--j1amh|xn--l1acc|xn--ngbrx|xn--nqv7f|xn--p1acf|xn--qxa6a|xn--tckwe|xn--vhquv|yodobashi|موريتانيا|abudhabi|airforce|allstate|attorney|barclays|barefoot|bargains|baseball|boutique|bradesco|broadway|brussels|builders|business|capetown|catering|catholic|cipriani|cityeats|cleaning|clinique|clothing|commbank|computer|delivery|deloitte|democrat|diamonds|discount|discover|download|engineer|ericsson|etisalat|exchange|feedback|fidelity|firmdale|football|frontier|goodyear|grainger|graphics|guardian|hdfcbank|helsinki|holdings|hospital|infiniti|ipiranga|istanbul|jpmorgan|lighting|lundbeck|marriott|maserati|mckinsey|memorial|merckmsd|mortgage|observer|partners|pharmacy|pictures|plumbing|property|redstone|reliance|saarland|samsclub|security|services|shopping|showtime|softbank|software|stcgroup|supplies|training|vanguard|ventures|verisign|woodside|xn--90ae|xn--node|xn--p1ai|xn--qxam|yokohama|السعودية|abogado|academy|agakhan|alibaba|android|athleta|auction|audible|auspost|avianca|banamex|bauhaus|bentley|bestbuy|booking|brother|bugatti|capital|caravan|careers|channel|charity|chintai|citadel|clubmed|college|cologne|comcast|company|compare|contact|cooking|corsica|country|coupons|courses|cricket|cruises|dentist|digital|domains|exposed|express|farmers|fashion|ferrari|ferrero|finance|fishing|fitness|flights|florist|flowers|forsale|frogans|fujitsu|gallery|genting|godaddy|grocery|guitars|hamburg|hangout|hitachi|holiday|hosting|hoteles|hotmail|hyundai|ismaili|jewelry|juniper|kitchen|komatsu|lacaixa|lanxess|lasalle|latrobe|leclerc|limited|lincoln|markets|monster|netbank|netflix|network|neustar|okinawa|oldnavy|organic|origins|philips|pioneer|politie|realtor|recipes|rentals|reviews|rexroth|samsung|sandvik|schmidt|schwarz|science|shiksha|singles|staples|storage|support|surgery|systems|temasek|theater|theatre|tickets|tiffany|toshiba|trading|walmart|wanggou|watches|weather|website|wedding|whoswho|windows|winners|xfinity|yamaxun|youtube|zuerich|католик|اتصالات|البحرين|الجزائر|العليان|پاکستان|كاثوليك|இந்தியா|abarth|abbott|abbvie|africa|agency|airbus|airtel|alipay|alsace|alstom|amazon|anquan|aramco|author|bayern|beauty|berlin|bharti|bostik|boston|broker|camera|career|casino|center|chanel|chrome|church|circle|claims|clinic|coffee|comsec|condos|coupon|credit|cruise|dating|datsun|dealer|degree|dental|design|direct|doctor|dunlop|dupont|durban|emerck|energy|estate|events|expert|family|flickr|futbol|gallup|garden|george|giving|global|google|gratis|health|hermes|hiphop|hockey|hotels|hughes|imamat|insure|intuit|jaguar|joburg|juegos|kaufen|kinder|kindle|kosher|lancia|latino|lawyer|lefrak|living|locker|london|luxury|madrid|maison|makeup|market|mattel|mobile|monash|mormon|moscow|museum|mutual|nagoya|natura|nissan|nissay|norton|nowruz|office|olayan|online|oracle|orange|otsuka|pfizer|photos|physio|pictet|quebec|racing|realty|reisen|repair|report|review|rocher|rogers|ryukyu|safety|sakura|sanofi|school|schule|search|secure|select|shouji|soccer|social|stream|studio|supply|suzuki|swatch|sydney|taipei|taobao|target|tattoo|tennis|tienda|tjmaxx|tkmaxx|toyota|travel|unicom|viajes|viking|villas|virgin|vision|voting|voyage|vuelos|walter|webcam|xihuan|yachts|yandex|zappos|москва|онлайн|ابوظبي|ارامكو|الاردن|المغرب|امارات|فلسطين|مليسيا|भारतम्|இலங்கை|ファッション|actor|adult|aetna|amfam|amica|apple|archi|audio|autos|azure|baidu|beats|bible|bingo|black|boats|bosch|build|canon|cards|chase|cheap|cisco|citic|click|cloud|coach|codes|crown|cymru|dabur|dance|deals|delta|drive|dubai|earth|edeka|email|epson|faith|fedex|final|forex|forum|gallo|games|gifts|gives|glass|globo|gmail|green|gripe|group|gucci|guide|homes|honda|horse|house|hyatt|ikano|irish|jetzt|koeln|kyoto|lamer|lease|legal|lexus|lilly|linde|lipsy|loans|locus|lotte|lotto|macys|mango|media|miami|money|movie|music|nexus|nikon|ninja|nokia|nowtv|omega|osaka|paris|parts|party|phone|photo|pizza|place|poker|praxi|press|prime|promo|quest|radio|rehab|reise|ricoh|rocks|rodeo|rugby|salon|sener|seven|sharp|shell|shoes|skype|sling|smart|smile|solar|space|sport|stada|store|study|style|sucks|swiss|tatar|tires|tirol|tmall|today|tokyo|tools|toray|total|tours|trade|trust|tunes|tushu|ubank|vegas|video|vodka|volvo|wales|watch|weber|weibo|works|world|xerox|yahoo|ישראל|ایران|بازار|بھارت|سودان|سورية|همراه|भारोत|संगठन|বাংলা|భారత్|ഭാരതം|嘉里大酒店|aarp|able|adac|aero|akdn|ally|amex|arab|army|arpa|arte|asda|asia|audi|auto|baby|band|bank|bbva|beer|best|bike|bing|blog|blue|bofa|bond|book|buzz|cafe|call|camp|care|cars|casa|case|cash|cbre|cern|chat|citi|city|club|cool|coop|cyou|data|date|dclk|deal|dell|desi|diet|dish|docs|dvag|erni|fage|fail|fans|farm|fast|fiat|fido|film|fire|fish|flir|food|ford|free|fund|game|gbiz|gent|ggee|gift|gmbh|gold|golf|goog|guge|guru|hair|haus|hdfc|help|here|hgtv|host|hsbc|icbc|ieee|imdb|immo|info|itau|java|jeep|jobs|jprs|kddi|kids|kiwi|kpmg|kred|land|lego|lgbt|lidl|life|like|limo|link|live|loan|loft|love|ltda|luxe|maif|meet|meme|menu|mini|mint|mobi|moda|moto|name|navy|news|next|nico|nike|ollo|open|page|pars|pccw|pics|ping|pink|play|plus|pohl|porn|post|prod|prof|qpon|read|reit|rent|rest|rich|room|rsvp|ruhr|safe|sale|sarl|save|saxo|scot|seat|seek|sexy|shaw|shia|shop|show|silk|sina|site|skin|sncf|sohu|song|sony|spot|star|surf|talk|taxi|team|tech|teva|tiaa|tips|town|toys|tube|vana|visa|viva|vivo|vote|voto|wang|weir|wien|wiki|wine|work|xbox|yoga|zara|zero|zone|дети|сайт|بارت|بيتك|ڀارت|تونس|شبكة|عراق|عمان|موقع|भारत|ভারত|ভাৰত|ਭਾਰਤ|ભારત|ଭାରତ|ಭಾರತ|ලංකා|アマゾン|グーグル|クラウド|ポイント|组织机构|電訊盈科|香格里拉|aaa|abb|abc|aco|ads|aeg|afl|aig|anz|aol|app|art|aws|axa|bar|bbc|bbt|bcg|bcn|bet|bid|bio|biz|bms|bmw|bom|boo|bot|box|buy|bzh|cab|cal|cam|car|cat|cba|cbn|cbs|ceo|cfa|cfd|com|cpa|crs|dad|day|dds|dev|dhl|diy|dnp|dog|dot|dtv|dvr|eat|eco|edu|esq|eus|fan|fit|fly|foo|fox|frl|ftr|fun|fyi|gal|gap|gay|gdn|gea|gle|gmo|gmx|goo|gop|got|gov|hbo|hiv|hkt|hot|how|ibm|ice|icu|ifm|inc|ing|ink|int|ist|itv|jcb|jio|jll|jmp|jnj|jot|joy|kfh|kia|kim|kpn|krd|lat|law|lds|llc|llp|lol|lpl|ltd|man|map|mba|med|men|mil|mit|mlb|mls|mma|moe|moi|mom|mov|msd|mtn|mtr|nab|nba|nec|net|new|nfl|ngo|nhk|now|nra|nrw|ntt|nyc|obi|one|ong|onl|ooo|org|ott|ovh|pay|pet|phd|pid|pin|pnc|pro|pru|pub|pwc|red|ren|ril|rio|rip|run|rwe|sap|sas|sbi|sbs|sca|scb|ses|sew|sex|sfr|ski|sky|soy|spa|srl|stc|tab|tax|tci|tdk|tel|thd|tjx|top|trv|tui|tvs|ubs|uno|uol|ups|vet|vig|vin|vip|wed|win|wme|wow|wtc|wtf|xin|xxx|xyz|you|yun|zip|бел|ком|қаз|мкд|мон|орг|рус|срб|укр|հայ|קום|عرب|قطر|كوم|مصر|कॉम|नेट|คอม|ไทย|ລາວ|ストア|セール|みんな|中文网|亚马逊|天主教|我爱你|新加坡|淡马锡|诺基亚|飞利浦|ac|ad|ae|af|ag|ai|al|am|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|ελ|ευ|бг|ею|рф|გე|닷넷|닷컴|삼성|한국|コム|世界|中信|中国|中國|企业|佛山|信息|健康|八卦|公司|公益|台湾|台灣|商城|商店|商标|嘉里|在线|大拿|娱乐|家電|广东|微博|慈善|手机|招聘|政务|政府|新闻|时尚|書籍|机构|游戏|澳門|点看|移动|网址|网店|网站|网络|联通|谷歌|购物|通販|集团|食品|餐厅|香港)/,yC=new RegExp("[".concat(pC,"!#$%&'*+/=?^_`{|}~-]")),vC=new RegExp("^".concat(gC.source,"$")),bC=function(s){function EmailMatcher(){var o=null!==s&&s.apply(this,arguments)||this;return o.localPartCharRegex=yC,o.strictTldRegex=vC,o}return tslib_es6_extends(EmailMatcher,s),EmailMatcher.prototype.parseMatches=function(s){for(var o=this.tagBuilder,i=this.localPartCharRegex,a=this.strictTldRegex,u=[],_=s.length,w=new _C,x={m:"a",a:"i",i:"l",l:"t",t:"o",o:":"},C=0,j=0,L=w;C<_;){var B=s.charAt(C);switch(j){case 0:stateNonEmailAddress(B);break;case 1:stateMailTo(s.charAt(C-1),B);break;case 2:stateLocalPart(B);break;case 3:stateLocalPartDot(B);break;case 4:stateAtSign(B);break;case 5:stateDomainChar(B);break;case 6:stateDomainHyphen(B);break;case 7:stateDomainDot(B);break;default:throwUnhandledCaseError(j)}C++}return captureMatchIfValidAndReset(),u;function stateNonEmailAddress(s){"m"===s?beginEmailMatch(1):i.test(s)&&beginEmailMatch()}function stateMailTo(s,o){":"===s?i.test(o)?(j=2,L=new _C(__assign(__assign({},L),{hasMailtoPrefix:!0}))):resetToNonEmailMatchState():x[s]===o||(i.test(o)?j=2:"."===o?j=3:"@"===o?j=4:resetToNonEmailMatchState())}function stateLocalPart(s){"."===s?j=3:"@"===s?j=4:i.test(s)||resetToNonEmailMatchState()}function stateLocalPartDot(s){"."===s||"@"===s?resetToNonEmailMatchState():i.test(s)?j=2:resetToNonEmailMatchState()}function stateAtSign(s){mC.test(s)?j=5:resetToNonEmailMatchState()}function stateDomainChar(s){"."===s?j=7:"-"===s?j=6:mC.test(s)||captureMatchIfValidAndReset()}function stateDomainHyphen(s){"-"===s||"."===s?captureMatchIfValidAndReset():mC.test(s)?j=5:captureMatchIfValidAndReset()}function stateDomainDot(s){"."===s||"-"===s?captureMatchIfValidAndReset():mC.test(s)?(j=5,L=new _C(__assign(__assign({},L),{hasDomainDot:!0}))):captureMatchIfValidAndReset()}function beginEmailMatch(s){void 0===s&&(s=2),j=s,L=new _C({idx:C})}function resetToNonEmailMatchState(){j=0,L=w}function captureMatchIfValidAndReset(){if(L.hasDomainDot){var i=s.slice(L.idx,C);/[-.]$/.test(i)&&(i=i.slice(0,-1));var _=L.hasMailtoPrefix?i.slice(7):i;(function doesEmailHaveValidTld(s){var o=s.split(".").pop()||"",i=o.toLowerCase();return a.test(i)})(_)&&u.push(new GA({tagBuilder:o,matchedText:i,offset:L.idx,email:_}))}resetToNonEmailMatchState()}},EmailMatcher}(eC),_C=function _C(s){void 0===s&&(s={}),this.idx=void 0!==s.idx?s.idx:-1,this.hasMailtoPrefix=!!s.hasMailtoPrefix,this.hasDomainDot=!!s.hasDomainDot},SC=function(){function UrlMatchValidator(){}return UrlMatchValidator.isValid=function(s,o){return!(o&&!this.isValidUriScheme(o)||this.urlMatchDoesNotHaveProtocolOrDot(s,o)||this.urlMatchDoesNotHaveAtLeastOneWordChar(s,o)&&!this.isValidIpAddress(s)||this.containsMultipleDots(s))},UrlMatchValidator.isValidIpAddress=function(s){var o=new RegExp(this.hasFullProtocolRegex.source+this.ipRegex.source);return null!==s.match(o)},UrlMatchValidator.containsMultipleDots=function(s){var o=s;return this.hasFullProtocolRegex.test(s)&&(o=s.split("://")[1]),o.split("/")[0].indexOf("..")>-1},UrlMatchValidator.isValidUriScheme=function(s){var o=s.match(this.uriSchemeRegex),i=o&&o[0].toLowerCase();return"javascript:"!==i&&"vbscript:"!==i},UrlMatchValidator.urlMatchDoesNotHaveProtocolOrDot=function(s,o){return!(!s||o&&this.hasFullProtocolRegex.test(o)||-1!==s.indexOf("."))},UrlMatchValidator.urlMatchDoesNotHaveAtLeastOneWordChar=function(s,o){return!(!s||!o)&&(!this.hasFullProtocolRegex.test(o)&&!this.hasWordCharAfterProtocolRegex.test(s))},UrlMatchValidator.hasFullProtocolRegex=/^[A-Za-z][-.+A-Za-z0-9]*:\/\//,UrlMatchValidator.uriSchemeRegex=/^[A-Za-z][-.+A-Za-z0-9]*:/,UrlMatchValidator.hasWordCharAfterProtocolRegex=new RegExp(":[^\\s]*?["+aC+"]"),UrlMatchValidator.ipRegex=/[0-9][0-9]?[0-9]?\.[0-9][0-9]?[0-9]?\.[0-9][0-9]?[0-9]?\.[0-9][0-9]?[0-9]?(:[0-9]*)?\/?$/,UrlMatchValidator}(),EC=(KA=new RegExp("[/?#](?:["+pC+"\\-+&@#/%=~_()|'$*\\[\\]{}?!:,.;^✓]*["+pC+"\\-+&@#/%=~_()|'$*\\[\\]{}✓])?"),new RegExp(["(?:","(",/(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/.source,getDomainNameStr(2),")","|","(","(//)?",/(?:www\.)/.source,getDomainNameStr(6),")","|","(","(//)?",getDomainNameStr(10)+"\\.",gC.source,"(?![-"+uC+"])",")",")","(?::[0-9]+)?","(?:"+KA.source+")?"].join(""),"gi")),wC=new RegExp("["+pC+"]"),xC=function(s){function UrlMatcher(o){var i=s.call(this,o)||this;return i.stripPrefix={scheme:!0,www:!0},i.stripTrailingSlash=!0,i.decodePercentEncoding=!0,i.matcherRegex=EC,i.wordCharRegExp=wC,i.stripPrefix=o.stripPrefix,i.stripTrailingSlash=o.stripTrailingSlash,i.decodePercentEncoding=o.decodePercentEncoding,i}return tslib_es6_extends(UrlMatcher,s),UrlMatcher.prototype.parseMatches=function(s){for(var o,i=this.matcherRegex,a=this.stripPrefix,u=this.stripTrailingSlash,_=this.decodePercentEncoding,w=this.tagBuilder,x=[],_loop_1=function(){var i=o[0],j=o[1],L=o[4],B=o[5],$=o[9],U=o.index,V=B||$,z=s.charAt(U-1);if(!SC.isValid(i,j))return"continue";if(U>0&&"@"===z)return"continue";if(U>0&&V&&C.wordCharRegExp.test(z))return"continue";if(/\?$/.test(i)&&(i=i.substr(0,i.length-1)),C.matchHasUnbalancedClosingParen(i))i=i.substr(0,i.length-1);else{var Y=C.matchHasInvalidCharAfterTld(i,j);Y>-1&&(i=i.substr(0,Y))}var Z=["http://","https://"].find((function(s){return!!j&&-1!==j.indexOf(s)}));if(Z){var ee=i.indexOf(Z);i=i.substr(ee),j=j.substr(ee),U+=ee}var ie=j?"scheme":L?"www":"tld",ae=!!j;x.push(new ZA({tagBuilder:w,matchedText:i,offset:U,urlMatchType:ie,url:i,protocolUrlMatch:ae,protocolRelativeMatch:!!V,stripPrefix:a,stripTrailingSlash:u,decodePercentEncoding:_}))},C=this;null!==(o=i.exec(s));)_loop_1();return x},UrlMatcher.prototype.matchHasUnbalancedClosingParen=function(s){var o,i=s.charAt(s.length-1);if(")"===i)o="(";else if("]"===i)o="[";else{if("}"!==i)return!1;o="{"}for(var a=0,u=0,_=s.length-1;u<_;u++){var w=s.charAt(u);w===o?a++:w===i&&(a=Math.max(a-1,0))}return 0===a},UrlMatcher.prototype.matchHasInvalidCharAfterTld=function(s,o){if(!s)return-1;var i=0;o&&(i=s.indexOf(":"),s=s.slice(i));var a=new RegExp("^((.?//)?[-."+pC+"]*[-"+pC+"]\\.[-"+pC+"]+)").exec(s);return null===a?-1:(i+=a[1].length,s=s.slice(a[1].length),/^[^-.A-Za-z0-9:\/?#]/.test(s)?i:-1)},UrlMatcher}(eC),kC=new RegExp("[_".concat(pC,"]")),OC=function(s){function HashtagMatcher(o){var i=s.call(this,o)||this;return i.serviceName="twitter",i.serviceName=o.serviceName,i}return tslib_es6_extends(HashtagMatcher,s),HashtagMatcher.prototype.parseMatches=function(s){for(var o=this.tagBuilder,i=this.serviceName,a=[],u=s.length,_=0,w=-1,x=0;_<u;){var C=s.charAt(_);switch(x){case 0:stateNone(C);break;case 1:stateNonHashtagWordChar(C);break;case 2:stateHashtagHashChar(C);break;case 3:stateHashtagTextChar(C);break;default:throwUnhandledCaseError(x)}_++}return captureMatchIfValid(),a;function stateNone(s){"#"===s?(x=2,w=_):hC.test(s)&&(x=1)}function stateNonHashtagWordChar(s){hC.test(s)||(x=0)}function stateHashtagHashChar(s){x=kC.test(s)?3:hC.test(s)?1:0}function stateHashtagTextChar(s){kC.test(s)||(captureMatchIfValid(),w=-1,x=hC.test(s)?1:0)}function captureMatchIfValid(){if(w>-1&&_-w<=140){var u=s.slice(w,_),x=new YA({tagBuilder:o,matchedText:u,offset:w,serviceName:i,hashtag:u.slice(1)});a.push(x)}}},HashtagMatcher}(eC),AC=["twitter","facebook","instagram","tiktok"],CC=new RegExp("".concat(/(?:(?:(?:(\+)?\d{1,3}[-\040.]?)?\(?\d{3}\)?[-\040.]?\d{3}[-\040.]?\d{4})|(?:(\+)(?:9[976]\d|8[987530]\d|6[987]\d|5[90]\d|42\d|3[875]\d|2[98654321]\d|9[8543210]|8[6421]|6[6543210]|5[87654321]|4[987654310]|3[9643210]|2[70]|7|1)[-\040.]?(?:\d[-\040.]?){6,12}\d+))([,;]+[0-9]+#?)*/.source,"|").concat(/(0([1-9]{1}-?[1-9]\d{3}|[1-9]{2}-?\d{3}|[1-9]{2}\d{1}-?\d{2}|[1-9]{2}\d{2}-?\d{1})-?\d{4}|0[789]0-?\d{4}-?\d{4}|050-?\d{4}-?\d{4})/.source),"g"),jC=function(s){function PhoneMatcher(){var o=null!==s&&s.apply(this,arguments)||this;return o.matcherRegex=CC,o}return tslib_es6_extends(PhoneMatcher,s),PhoneMatcher.prototype.parseMatches=function(s){for(var o,i=this.matcherRegex,a=this.tagBuilder,u=[];null!==(o=i.exec(s));){var _=o[0],w=_.replace(/[^0-9,;#]/g,""),x=!(!o[1]&&!o[2]),C=0==o.index?"":s.substr(o.index-1,1),j=s.substr(o.index+_.length,1),L=!C.match(/\d/)&&!j.match(/\d/);this.testMatch(o[3])&&this.testMatch(_)&&L&&u.push(new QA({tagBuilder:a,matchedText:_,offset:o.index,number:w,plusSign:x}))}return u},PhoneMatcher.prototype.testMatch=function(s){return nC.test(s)},PhoneMatcher}(eC),PC=new RegExp("@[_".concat(pC,"]{1,50}(?![_").concat(pC,"])"),"g"),IC=new RegExp("@[_.".concat(pC,"]{1,30}(?![_").concat(pC,"])"),"g"),TC=new RegExp("@[-_.".concat(pC,"]{1,50}(?![-_").concat(pC,"])"),"g"),NC=new RegExp("@[_.".concat(pC,"]{1,23}[_").concat(pC,"](?![_").concat(pC,"])"),"g"),MC=new RegExp("[^"+pC+"]"),RC=function(s){function MentionMatcher(o){var i=s.call(this,o)||this;return i.serviceName="twitter",i.matcherRegexes={twitter:PC,instagram:IC,soundcloud:TC,tiktok:NC},i.nonWordCharRegex=MC,i.serviceName=o.serviceName,i}return tslib_es6_extends(MentionMatcher,s),MentionMatcher.prototype.parseMatches=function(s){var o,i=this.serviceName,a=this.matcherRegexes[this.serviceName],u=this.nonWordCharRegex,_=this.tagBuilder,w=[];if(!a)return w;for(;null!==(o=a.exec(s));){var x=o.index,C=s.charAt(x-1);if(0===x||u.test(C)){var j=o[0].replace(/\.+$/g,""),L=j.slice(1);w.push(new XA({tagBuilder:_,matchedText:j,offset:x,serviceName:i,mention:L}))}}return w},MentionMatcher}(eC);function parseHtml(s,o){for(var i=o.onOpenTag,a=o.onCloseTag,u=o.onText,_=o.onComment,w=o.onDoctype,x=new DC,C=0,j=s.length,L=0,B=0,$=x;C<j;){var U=s.charAt(C);switch(L){case 0:stateData(U);break;case 1:stateTagOpen(U);break;case 2:stateEndTagOpen(U);break;case 3:stateTagName(U);break;case 4:stateBeforeAttributeName(U);break;case 5:stateAttributeName(U);break;case 6:stateAfterAttributeName(U);break;case 7:stateBeforeAttributeValue(U);break;case 8:stateAttributeValueDoubleQuoted(U);break;case 9:stateAttributeValueSingleQuoted(U);break;case 10:stateAttributeValueUnquoted(U);break;case 11:stateAfterAttributeValueQuoted(U);break;case 12:stateSelfClosingStartTag(U);break;case 13:stateMarkupDeclarationOpen(U);break;case 14:stateCommentStart(U);break;case 15:stateCommentStartDash(U);break;case 16:stateComment(U);break;case 17:stateCommentEndDash(U);break;case 18:stateCommentEnd(U);break;case 19:stateCommentEndBang(U);break;case 20:stateDoctype(U);break;default:throwUnhandledCaseError(L)}C++}function stateData(s){"<"===s&&startNewTag()}function stateTagOpen(s){"!"===s?L=13:"/"===s?(L=2,$=new DC(__assign(__assign({},$),{isClosing:!0}))):"<"===s?startNewTag():tC.test(s)?(L=3,$=new DC(__assign(__assign({},$),{isOpening:!0}))):(L=0,$=x)}function stateTagName(s){sC.test(s)?($=new DC(__assign(__assign({},$),{name:captureTagName()})),L=4):"<"===s?startNewTag():"/"===s?($=new DC(__assign(__assign({},$),{name:captureTagName()})),L=12):">"===s?($=new DC(__assign(__assign({},$),{name:captureTagName()})),emitTagAndPreviousTextNode()):tC.test(s)||rC.test(s)||":"===s||resetToDataState()}function stateEndTagOpen(s){">"===s?resetToDataState():tC.test(s)?L=3:resetToDataState()}function stateBeforeAttributeName(s){sC.test(s)||("/"===s?L=12:">"===s?emitTagAndPreviousTextNode():"<"===s?startNewTag():"="===s||oC.test(s)||iC.test(s)?resetToDataState():L=5)}function stateAttributeName(s){sC.test(s)?L=6:"/"===s?L=12:"="===s?L=7:">"===s?emitTagAndPreviousTextNode():"<"===s?startNewTag():oC.test(s)&&resetToDataState()}function stateAfterAttributeName(s){sC.test(s)||("/"===s?L=12:"="===s?L=7:">"===s?emitTagAndPreviousTextNode():"<"===s?startNewTag():oC.test(s)?resetToDataState():L=5)}function stateBeforeAttributeValue(s){sC.test(s)||('"'===s?L=8:"'"===s?L=9:/[>=`]/.test(s)?resetToDataState():"<"===s?startNewTag():L=10)}function stateAttributeValueDoubleQuoted(s){'"'===s&&(L=11)}function stateAttributeValueSingleQuoted(s){"'"===s&&(L=11)}function stateAttributeValueUnquoted(s){sC.test(s)?L=4:">"===s?emitTagAndPreviousTextNode():"<"===s&&startNewTag()}function stateAfterAttributeValueQuoted(s){sC.test(s)?L=4:"/"===s?L=12:">"===s?emitTagAndPreviousTextNode():"<"===s?startNewTag():(L=4,function reconsumeCurrentCharacter(){C--}())}function stateSelfClosingStartTag(s){">"===s?($=new DC(__assign(__assign({},$),{isClosing:!0})),emitTagAndPreviousTextNode()):L=4}function stateMarkupDeclarationOpen(o){"--"===s.substr(C,2)?(C+=2,$=new DC(__assign(__assign({},$),{type:"comment"})),L=14):"DOCTYPE"===s.substr(C,7).toUpperCase()?(C+=7,$=new DC(__assign(__assign({},$),{type:"doctype"})),L=20):resetToDataState()}function stateCommentStart(s){"-"===s?L=15:">"===s?resetToDataState():L=16}function stateCommentStartDash(s){"-"===s?L=18:">"===s?resetToDataState():L=16}function stateComment(s){"-"===s&&(L=17)}function stateCommentEndDash(s){L="-"===s?18:16}function stateCommentEnd(s){">"===s?emitTagAndPreviousTextNode():"!"===s?L=19:"-"===s||(L=16)}function stateCommentEndBang(s){"-"===s?L=17:">"===s?emitTagAndPreviousTextNode():L=16}function stateDoctype(s){">"===s?emitTagAndPreviousTextNode():"<"===s&&startNewTag()}function resetToDataState(){L=0,$=x}function startNewTag(){L=1,$=new DC({idx:C})}function emitTagAndPreviousTextNode(){var o=s.slice(B,$.idx);o&&u(o,B),"comment"===$.type?_($.idx):"doctype"===$.type?w($.idx):($.isOpening&&i($.name,$.idx),$.isClosing&&a($.name,$.idx)),resetToDataState(),B=C+1}function captureTagName(){var o=$.idx+($.isClosing?2:1);return s.slice(o,C).toLowerCase()}B<C&&function emitText(){var o=s.slice(B,C);u(o,B),B=C+1}()}var DC=function DC(s){void 0===s&&(s={}),this.idx=void 0!==s.idx?s.idx:-1,this.type=s.type||"tag",this.name=s.name||"",this.isOpening=!!s.isOpening,this.isClosing=!!s.isClosing},LC=function(){function Autolinker(s){void 0===s&&(s={}),this.version=Autolinker.version,this.urls={},this.email=!0,this.phone=!0,this.hashtag=!1,this.mention=!1,this.newWindow=!0,this.stripPrefix={scheme:!0,www:!0},this.stripTrailingSlash=!0,this.decodePercentEncoding=!0,this.truncate={length:0,location:"end"},this.className="",this.replaceFn=null,this.context=void 0,this.sanitizeHtml=!1,this.matchers=null,this.tagBuilder=null,this.urls=this.normalizeUrlsCfg(s.urls),this.email="boolean"==typeof s.email?s.email:this.email,this.phone="boolean"==typeof s.phone?s.phone:this.phone,this.hashtag=s.hashtag||this.hashtag,this.mention=s.mention||this.mention,this.newWindow="boolean"==typeof s.newWindow?s.newWindow:this.newWindow,this.stripPrefix=this.normalizeStripPrefixCfg(s.stripPrefix),this.stripTrailingSlash="boolean"==typeof s.stripTrailingSlash?s.stripTrailingSlash:this.stripTrailingSlash,this.decodePercentEncoding="boolean"==typeof s.decodePercentEncoding?s.decodePercentEncoding:this.decodePercentEncoding,this.sanitizeHtml=s.sanitizeHtml||!1;var o=this.mention;if(!1!==o&&-1===["twitter","instagram","soundcloud","tiktok"].indexOf(o))throw new Error("invalid `mention` cfg '".concat(o,"' - see docs"));var i=this.hashtag;if(!1!==i&&-1===AC.indexOf(i))throw new Error("invalid `hashtag` cfg '".concat(i,"' - see docs"));this.truncate=this.normalizeTruncateCfg(s.truncate),this.className=s.className||this.className,this.replaceFn=s.replaceFn||this.replaceFn,this.context=s.context||this}return Autolinker.link=function(s,o){return new Autolinker(o).link(s)},Autolinker.parse=function(s,o){return new Autolinker(o).parse(s)},Autolinker.prototype.normalizeUrlsCfg=function(s){return null==s&&(s=!0),"boolean"==typeof s?{schemeMatches:s,wwwMatches:s,tldMatches:s}:{schemeMatches:"boolean"!=typeof s.schemeMatches||s.schemeMatches,wwwMatches:"boolean"!=typeof s.wwwMatches||s.wwwMatches,tldMatches:"boolean"!=typeof s.tldMatches||s.tldMatches}},Autolinker.prototype.normalizeStripPrefixCfg=function(s){return null==s&&(s=!0),"boolean"==typeof s?{scheme:s,www:s}:{scheme:"boolean"!=typeof s.scheme||s.scheme,www:"boolean"!=typeof s.www||s.www}},Autolinker.prototype.normalizeTruncateCfg=function(s){return"number"==typeof s?{length:s,location:"end"}:function defaults(s,o){for(var i in o)o.hasOwnProperty(i)&&void 0===s[i]&&(s[i]=o[i]);return s}(s||{},{length:Number.POSITIVE_INFINITY,location:"end"})},Autolinker.prototype.parse=function(s){var o=this,i=["a","style","script"],a=0,u=[];return parseHtml(s,{onOpenTag:function(s){i.indexOf(s)>=0&&a++},onText:function(s,i){if(0===a){var _=function splitAndCapture(s,o){if(!o.global)throw new Error("`splitRegex` must have the 'g' flag set");for(var i,a=[],u=0;i=o.exec(s);)a.push(s.substring(u,i.index)),a.push(i[0]),u=i.index+i[0].length;return a.push(s.substring(u)),a}(s,/(&nbsp;|&#160;|&lt;|&#60;|&gt;|&#62;|&quot;|&#34;|&#39;)/gi),w=i;_.forEach((function(s,i){if(i%2==0){var a=o.parseText(s,w);u.push.apply(u,a)}w+=s.length}))}},onCloseTag:function(s){i.indexOf(s)>=0&&(a=Math.max(a-1,0))},onComment:function(s){},onDoctype:function(s){}}),u=this.compactMatches(u),u=this.removeUnwantedMatches(u)},Autolinker.prototype.compactMatches=function(s){s.sort((function(s,o){return s.getOffset()-o.getOffset()}));for(var o=0;o<s.length-1;){var i=s[o],a=i.getOffset(),u=i.getMatchedText().length,_=a+u;if(o+1<s.length){if(s[o+1].getOffset()===a){var w=s[o+1].getMatchedText().length>u?o:o+1;s.splice(w,1);continue}if(s[o+1].getOffset()<_){s.splice(o+1,1);continue}}o++}return s},Autolinker.prototype.removeUnwantedMatches=function(s){return this.hashtag||utils_remove(s,(function(s){return"hashtag"===s.getType()})),this.email||utils_remove(s,(function(s){return"email"===s.getType()})),this.phone||utils_remove(s,(function(s){return"phone"===s.getType()})),this.mention||utils_remove(s,(function(s){return"mention"===s.getType()})),this.urls.schemeMatches||utils_remove(s,(function(s){return"url"===s.getType()&&"scheme"===s.getUrlMatchType()})),this.urls.wwwMatches||utils_remove(s,(function(s){return"url"===s.getType()&&"www"===s.getUrlMatchType()})),this.urls.tldMatches||utils_remove(s,(function(s){return"url"===s.getType()&&"tld"===s.getUrlMatchType()})),s},Autolinker.prototype.parseText=function(s,o){void 0===o&&(o=0),o=o||0;for(var i=this.getMatchers(),a=[],u=0,_=i.length;u<_;u++){for(var w=i[u].parseMatches(s),x=0,C=w.length;x<C;x++)w[x].setOffset(o+w[x].getOffset());a.push.apply(a,w)}return a},Autolinker.prototype.link=function(s){if(!s)return"";this.sanitizeHtml&&(s=s.replace(/</g,"&lt;").replace(/>/g,"&gt;"));for(var o=this.parse(s),i=[],a=0,u=0,_=o.length;u<_;u++){var w=o[u];i.push(s.substring(a,w.getOffset())),i.push(this.createMatchReturnVal(w)),a=w.getOffset()+w.getMatchedText().length}return i.push(s.substring(a)),i.join("")},Autolinker.prototype.createMatchReturnVal=function(s){var o;return this.replaceFn&&(o=this.replaceFn.call(this.context,s)),"string"==typeof o?o:!1===o?s.getMatchedText():o instanceof WA?o.toAnchorString():s.buildTag().toAnchorString()},Autolinker.prototype.getMatchers=function(){if(this.matchers)return this.matchers;var s=this.getTagBuilder(),o=[new OC({tagBuilder:s,serviceName:this.hashtag}),new bC({tagBuilder:s}),new jC({tagBuilder:s}),new RC({tagBuilder:s,serviceName:this.mention}),new xC({tagBuilder:s,stripPrefix:this.stripPrefix,stripTrailingSlash:this.stripTrailingSlash,decodePercentEncoding:this.decodePercentEncoding})];return this.matchers=o},Autolinker.prototype.getTagBuilder=function(){var s=this.tagBuilder;return s||(s=this.tagBuilder=new JA({newWindow:this.newWindow,truncate:this.truncate,className:this.className})),s},Autolinker.version="3.16.2",Autolinker.AnchorTagBuilder=JA,Autolinker.HtmlTag=WA,Autolinker.matcher={Email:bC,Hashtag:OC,Matcher:eC,Mention:RC,Phone:jC,Url:xC},Autolinker.match={Email:GA,Hashtag:YA,Match:HA,Mention:XA,Phone:QA,Url:ZA},Autolinker}();const FC=LC;var BC=/www|@|\:\/\//;function isLinkOpen(s){return/^<a[>\s]/i.test(s)}function isLinkClose(s){return/^<\/a\s*>/i.test(s)}function createLinkifier(){var s=[],o=new FC({stripPrefix:!1,url:!0,email:!0,replaceFn:function(o){switch(o.getType()){case"url":s.push({text:o.matchedText,url:o.getUrl()});break;case"email":s.push({text:o.matchedText,url:"mailto:"+o.getEmail().replace(/^mailto:/i,"")})}return!1}});return{links:s,autolinker:o}}function parseTokens(s){var o,i,a,u,_,w,x,C,j,L,B,$,U,V=s.tokens,z=null;for(i=0,a=V.length;i<a;i++)if("inline"===V[i].type)for(B=0,o=(u=V[i].children).length-1;o>=0;o--)if("link_close"!==(_=u[o]).type){if("htmltag"===_.type&&(isLinkOpen(_.content)&&B>0&&B--,isLinkClose(_.content)&&B++),!(B>0)&&"text"===_.type&&BC.test(_.content)){if(z||($=(z=createLinkifier()).links,U=z.autolinker),w=_.content,$.length=0,U.link(w),!$.length)continue;for(x=[],L=_.level,C=0;C<$.length;C++)s.inline.validateLink($[C].url)&&((j=w.indexOf($[C].text))&&x.push({type:"text",content:w.slice(0,j),level:L}),x.push({type:"link_open",href:$[C].url,title:"",level:L++}),x.push({type:"text",content:$[C].text,level:L}),x.push({type:"link_close",level:--L}),w=w.slice(j+$[C].text.length));w.length&&x.push({type:"text",content:w,level:L}),V[i].children=u=[].concat(u.slice(0,o),x,u.slice(o+1))}}else for(o--;u[o].level!==_.level&&"link_open"!==u[o].type;)o--}function linkify(s){s.core.ruler.push("linkify",parseTokens)}const{entries:$C,setPrototypeOf:qC,isFrozen:UC,getPrototypeOf:VC,getOwnPropertyDescriptor:zC}=Object;let{freeze:WC,seal:JC,create:HC}=Object,{apply:KC,construct:GC}="undefined"!=typeof Reflect&&Reflect;WC||(WC=function freeze(s){return s}),JC||(JC=function seal(s){return s}),KC||(KC=function apply(s,o,i){return s.apply(o,i)}),GC||(GC=function construct(s,o){return new s(...o)});const YC=unapply(Array.prototype.forEach),XC=unapply(Array.prototype.lastIndexOf),QC=unapply(Array.prototype.pop),ZC=unapply(Array.prototype.push),ej=unapply(Array.prototype.splice),fj=unapply(String.prototype.toLowerCase),mj=unapply(String.prototype.toString),_j=unapply(String.prototype.match),Aj=unapply(String.prototype.replace),Cj=unapply(String.prototype.indexOf),Nj=unapply(String.prototype.trim),Bj=unapply(Object.prototype.hasOwnProperty),$j=unapply(RegExp.prototype.test),zj=function unconstruct(s){return function(){for(var o=arguments.length,i=new Array(o),a=0;a<o;a++)i[a]=arguments[a];return GC(s,i)}}(TypeError);function unapply(s){return function(o){o instanceof RegExp&&(o.lastIndex=0);for(var i=arguments.length,a=new Array(i>1?i-1:0),u=1;u<i;u++)a[u-1]=arguments[u];return KC(s,o,a)}}function addToSet(s,o){let i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:fj;qC&&qC(s,null);let a=o.length;for(;a--;){let u=o[a];if("string"==typeof u){const s=i(u);s!==u&&(UC(o)||(o[a]=s),u=s)}s[u]=!0}return s}function purify_es_cleanArray(s){for(let o=0;o<s.length;o++){Bj(s,o)||(s[o]=null)}return s}function clone(s){const o=HC(null);for(const[i,a]of $C(s)){Bj(s,i)&&(Array.isArray(a)?o[i]=purify_es_cleanArray(a):a&&"object"==typeof a&&a.constructor===Object?o[i]=clone(a):o[i]=a)}return o}function lookupGetter(s,o){for(;null!==s;){const i=zC(s,o);if(i){if(i.get)return unapply(i.get);if("function"==typeof i.value)return unapply(i.value)}s=VC(s)}return function fallbackValue(){return null}}const Jj=WC(["a","abbr","acronym","address","area","article","aside","audio","b","bdi","bdo","big","blink","blockquote","body","br","button","canvas","caption","center","cite","code","col","colgroup","content","data","datalist","dd","decorator","del","details","dfn","dialog","dir","div","dl","dt","element","em","fieldset","figcaption","figure","font","footer","form","h1","h2","h3","h4","h5","h6","head","header","hgroup","hr","html","i","img","input","ins","kbd","label","legend","li","main","map","mark","marquee","menu","menuitem","meter","nav","nobr","ol","optgroup","option","output","p","picture","pre","progress","q","rp","rt","ruby","s","samp","section","select","shadow","small","source","spacer","span","strike","strong","style","sub","summary","sup","table","tbody","td","template","textarea","tfoot","th","thead","time","tr","track","tt","u","ul","var","video","wbr"]),Kj=WC(["svg","a","altglyph","altglyphdef","altglyphitem","animatecolor","animatemotion","animatetransform","circle","clippath","defs","desc","ellipse","filter","font","g","glyph","glyphref","hkern","image","line","lineargradient","marker","mask","metadata","mpath","path","pattern","polygon","polyline","radialgradient","rect","stop","style","switch","symbol","text","textpath","title","tref","tspan","view","vkern"]),Gj=WC(["feBlend","feColorMatrix","feComponentTransfer","feComposite","feConvolveMatrix","feDiffuseLighting","feDisplacementMap","feDistantLight","feDropShadow","feFlood","feFuncA","feFuncB","feFuncG","feFuncR","feGaussianBlur","feImage","feMerge","feMergeNode","feMorphology","feOffset","fePointLight","feSpecularLighting","feSpotLight","feTile","feTurbulence"]),Xj=WC(["animate","color-profile","cursor","discard","font-face","font-face-format","font-face-name","font-face-src","font-face-uri","foreignobject","hatch","hatchpath","mesh","meshgradient","meshpatch","meshrow","missing-glyph","script","set","solidcolor","unknown","use"]),eP=WC(["math","menclose","merror","mfenced","mfrac","mglyph","mi","mlabeledtr","mmultiscripts","mn","mo","mover","mpadded","mphantom","mroot","mrow","ms","mspace","msqrt","mstyle","msub","msup","msubsup","mtable","mtd","mtext","mtr","munder","munderover","mprescripts"]),tP=WC(["maction","maligngroup","malignmark","mlongdiv","mscarries","mscarry","msgroup","mstack","msline","msrow","semantics","annotation","annotation-xml","mprescripts","none"]),rP=WC(["#text"]),nP=WC(["accept","action","align","alt","autocapitalize","autocomplete","autopictureinpicture","autoplay","background","bgcolor","border","capture","cellpadding","cellspacing","checked","cite","class","clear","color","cols","colspan","controls","controlslist","coords","crossorigin","datetime","decoding","default","dir","disabled","disablepictureinpicture","disableremoteplayback","download","draggable","enctype","enterkeyhint","face","for","headers","height","hidden","high","href","hreflang","id","inputmode","integrity","ismap","kind","label","lang","list","loading","loop","low","max","maxlength","media","method","min","minlength","multiple","muted","name","nonce","noshade","novalidate","nowrap","open","optimum","pattern","placeholder","playsinline","popover","popovertarget","popovertargetaction","poster","preload","pubdate","radiogroup","readonly","rel","required","rev","reversed","role","rows","rowspan","spellcheck","scope","selected","shape","size","sizes","span","srclang","start","src","srcset","step","style","summary","tabindex","title","translate","type","usemap","valign","value","width","wrap","xmlns","slot"]),sP=WC(["accent-height","accumulate","additive","alignment-baseline","amplitude","ascent","attributename","attributetype","azimuth","basefrequency","baseline-shift","begin","bias","by","class","clip","clippathunits","clip-path","clip-rule","color","color-interpolation","color-interpolation-filters","color-profile","color-rendering","cx","cy","d","dx","dy","diffuseconstant","direction","display","divisor","dur","edgemode","elevation","end","exponent","fill","fill-opacity","fill-rule","filter","filterunits","flood-color","flood-opacity","font-family","font-size","font-size-adjust","font-stretch","font-style","font-variant","font-weight","fx","fy","g1","g2","glyph-name","glyphref","gradientunits","gradienttransform","height","href","id","image-rendering","in","in2","intercept","k","k1","k2","k3","k4","kerning","keypoints","keysplines","keytimes","lang","lengthadjust","letter-spacing","kernelmatrix","kernelunitlength","lighting-color","local","marker-end","marker-mid","marker-start","markerheight","markerunits","markerwidth","maskcontentunits","maskunits","max","mask","media","method","mode","min","name","numoctaves","offset","operator","opacity","order","orient","orientation","origin","overflow","paint-order","path","pathlength","patterncontentunits","patterntransform","patternunits","points","preservealpha","preserveaspectratio","primitiveunits","r","rx","ry","radius","refx","refy","repeatcount","repeatdur","restart","result","rotate","scale","seed","shape-rendering","slope","specularconstant","specularexponent","spreadmethod","startoffset","stddeviation","stitchtiles","stop-color","stop-opacity","stroke-dasharray","stroke-dashoffset","stroke-linecap","stroke-linejoin","stroke-miterlimit","stroke-opacity","stroke","stroke-width","style","surfacescale","systemlanguage","tabindex","tablevalues","targetx","targety","transform","transform-origin","text-anchor","text-decoration","text-rendering","textlength","type","u1","u2","unicode","values","viewbox","visibility","version","vert-adv-y","vert-origin-x","vert-origin-y","width","word-spacing","wrap","writing-mode","xchannelselector","ychannelselector","x","x1","x2","xmlns","y","y1","y2","z","zoomandpan"]),oP=WC(["accent","accentunder","align","bevelled","close","columnsalign","columnlines","columnspan","denomalign","depth","dir","display","displaystyle","encoding","fence","frame","height","href","id","largeop","length","linethickness","lspace","lquote","mathbackground","mathcolor","mathsize","mathvariant","maxsize","minsize","movablelimits","notation","numalign","open","rowalign","rowlines","rowspacing","rowspan","rspace","rquote","scriptlevel","scriptminsize","scriptsizemultiplier","selection","separator","separators","stretchy","subscriptshift","supscriptshift","symmetric","voffset","width","xmlns"]),iP=WC(["xlink:href","xml:id","xlink:title","xml:space","xmlns:xlink"]),aP=JC(/\{\{[\w\W]*|[\w\W]*\}\}/gm),cP=JC(/<%[\w\W]*|[\w\W]*%>/gm),lP=JC(/\$\{[\w\W]*/gm),uP=JC(/^data-[\-\w.\u00B7-\uFFFF]+$/),pP=JC(/^aria-[\-\w]+$/),hP=JC(/^(?:(?:(?:f|ht)tps?|mailto|tel|callto|sms|cid|xmpp|matrix):|[^a-z]|[a-z+.\-]+(?:[^a-z+.\-:]|$))/i),dP=JC(/^(?:\w+script|data):/i),fP=JC(/[\u0000-\u0020\u00A0\u1680\u180E\u2000-\u2029\u205F\u3000]/g),mP=JC(/^html$/i),gP=JC(/^[a-z][.\w]*(-[.\w]+)+$/i);var yP=Object.freeze({__proto__:null,ARIA_ATTR:pP,ATTR_WHITESPACE:fP,CUSTOM_ELEMENT:gP,DATA_ATTR:uP,DOCTYPE_NAME:mP,ERB_EXPR:cP,IS_ALLOWED_URI:hP,IS_SCRIPT_OR_DATA:dP,MUSTACHE_EXPR:aP,TMPLIT_EXPR:lP});const vP=1,bP=3,_P=7,SP=8,EP=9,wP=function getGlobal(){return"undefined"==typeof window?null:window};var xP=function createDOMPurify(){let s=arguments.length>0&&void 0!==arguments[0]?arguments[0]:wP();const DOMPurify=s=>createDOMPurify(s);if(DOMPurify.version="3.2.6",DOMPurify.removed=[],!s||!s.document||s.document.nodeType!==EP||!s.Element)return DOMPurify.isSupported=!1,DOMPurify;let{document:o}=s;const i=o,a=i.currentScript,{DocumentFragment:u,HTMLTemplateElement:_,Node:w,Element:x,NodeFilter:C,NamedNodeMap:j=s.NamedNodeMap||s.MozNamedAttrMap,HTMLFormElement:L,DOMParser:B,trustedTypes:$}=s,U=x.prototype,V=lookupGetter(U,"cloneNode"),z=lookupGetter(U,"remove"),Y=lookupGetter(U,"nextSibling"),Z=lookupGetter(U,"childNodes"),ee=lookupGetter(U,"parentNode");if("function"==typeof _){const s=o.createElement("template");s.content&&s.content.ownerDocument&&(o=s.content.ownerDocument)}let ie,ae="";const{implementation:ce,createNodeIterator:le,createDocumentFragment:pe,getElementsByTagName:de}=o,{importNode:fe}=i;let ye={afterSanitizeAttributes:[],afterSanitizeElements:[],afterSanitizeShadowDOM:[],beforeSanitizeAttributes:[],beforeSanitizeElements:[],beforeSanitizeShadowDOM:[],uponSanitizeAttribute:[],uponSanitizeElement:[],uponSanitizeShadowNode:[]};DOMPurify.isSupported="function"==typeof $C&&"function"==typeof ee&&ce&&void 0!==ce.createHTMLDocument;const{MUSTACHE_EXPR:be,ERB_EXPR:_e,TMPLIT_EXPR:Se,DATA_ATTR:we,ARIA_ATTR:xe,IS_SCRIPT_OR_DATA:Pe,ATTR_WHITESPACE:Te,CUSTOM_ELEMENT:Re}=yP;let{IS_ALLOWED_URI:$e}=yP,qe=null;const ze=addToSet({},[...Jj,...Kj,...Gj,...eP,...rP]);let We=null;const He=addToSet({},[...nP,...sP,...oP,...iP]);let Ye=Object.seal(HC(null,{tagNameCheck:{writable:!0,configurable:!1,enumerable:!0,value:null},attributeNameCheck:{writable:!0,configurable:!1,enumerable:!0,value:null},allowCustomizedBuiltInElements:{writable:!0,configurable:!1,enumerable:!0,value:!1}})),Xe=null,Qe=null,et=!0,tt=!0,rt=!1,nt=!0,st=!1,ot=!0,it=!1,at=!1,ct=!1,lt=!1,ut=!1,pt=!1,ht=!0,dt=!1,mt=!0,gt=!1,yt={},vt=null;const bt=addToSet({},["annotation-xml","audio","colgroup","desc","foreignobject","head","iframe","math","mi","mn","mo","ms","mtext","noembed","noframes","noscript","plaintext","script","style","svg","template","thead","title","video","xmp"]);let _t=null;const St=addToSet({},["audio","video","img","source","image","track"]);let Et=null;const wt=addToSet({},["alt","class","for","id","label","name","pattern","placeholder","role","summary","title","value","style","xmlns"]),xt="http://www.w3.org/1998/Math/MathML",kt="http://www.w3.org/2000/svg",Ot="http://www.w3.org/1999/xhtml";let At=Ot,Ct=!1,jt=null;const Pt=addToSet({},[xt,kt,Ot],mj);let It=addToSet({},["mi","mo","mn","ms","mtext"]),Tt=addToSet({},["annotation-xml"]);const Nt=addToSet({},["title","style","font","a","script"]);let Mt=null;const Rt=["application/xhtml+xml","text/html"];let Dt=null,Lt=null;const Ft=o.createElement("form"),Bt=function isRegexOrFunction(s){return s instanceof RegExp||s instanceof Function},$t=function _parseConfig(){let s=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{};if(!Lt||Lt!==s){if(s&&"object"==typeof s||(s={}),s=clone(s),Mt=-1===Rt.indexOf(s.PARSER_MEDIA_TYPE)?"text/html":s.PARSER_MEDIA_TYPE,Dt="application/xhtml+xml"===Mt?mj:fj,qe=Bj(s,"ALLOWED_TAGS")?addToSet({},s.ALLOWED_TAGS,Dt):ze,We=Bj(s,"ALLOWED_ATTR")?addToSet({},s.ALLOWED_ATTR,Dt):He,jt=Bj(s,"ALLOWED_NAMESPACES")?addToSet({},s.ALLOWED_NAMESPACES,mj):Pt,Et=Bj(s,"ADD_URI_SAFE_ATTR")?addToSet(clone(wt),s.ADD_URI_SAFE_ATTR,Dt):wt,_t=Bj(s,"ADD_DATA_URI_TAGS")?addToSet(clone(St),s.ADD_DATA_URI_TAGS,Dt):St,vt=Bj(s,"FORBID_CONTENTS")?addToSet({},s.FORBID_CONTENTS,Dt):bt,Xe=Bj(s,"FORBID_TAGS")?addToSet({},s.FORBID_TAGS,Dt):clone({}),Qe=Bj(s,"FORBID_ATTR")?addToSet({},s.FORBID_ATTR,Dt):clone({}),yt=!!Bj(s,"USE_PROFILES")&&s.USE_PROFILES,et=!1!==s.ALLOW_ARIA_ATTR,tt=!1!==s.ALLOW_DATA_ATTR,rt=s.ALLOW_UNKNOWN_PROTOCOLS||!1,nt=!1!==s.ALLOW_SELF_CLOSE_IN_ATTR,st=s.SAFE_FOR_TEMPLATES||!1,ot=!1!==s.SAFE_FOR_XML,it=s.WHOLE_DOCUMENT||!1,lt=s.RETURN_DOM||!1,ut=s.RETURN_DOM_FRAGMENT||!1,pt=s.RETURN_TRUSTED_TYPE||!1,ct=s.FORCE_BODY||!1,ht=!1!==s.SANITIZE_DOM,dt=s.SANITIZE_NAMED_PROPS||!1,mt=!1!==s.KEEP_CONTENT,gt=s.IN_PLACE||!1,$e=s.ALLOWED_URI_REGEXP||hP,At=s.NAMESPACE||Ot,It=s.MATHML_TEXT_INTEGRATION_POINTS||It,Tt=s.HTML_INTEGRATION_POINTS||Tt,Ye=s.CUSTOM_ELEMENT_HANDLING||{},s.CUSTOM_ELEMENT_HANDLING&&Bt(s.CUSTOM_ELEMENT_HANDLING.tagNameCheck)&&(Ye.tagNameCheck=s.CUSTOM_ELEMENT_HANDLING.tagNameCheck),s.CUSTOM_ELEMENT_HANDLING&&Bt(s.CUSTOM_ELEMENT_HANDLING.attributeNameCheck)&&(Ye.attributeNameCheck=s.CUSTOM_ELEMENT_HANDLING.attributeNameCheck),s.CUSTOM_ELEMENT_HANDLING&&"boolean"==typeof s.CUSTOM_ELEMENT_HANDLING.allowCustomizedBuiltInElements&&(Ye.allowCustomizedBuiltInElements=s.CUSTOM_ELEMENT_HANDLING.allowCustomizedBuiltInElements),st&&(tt=!1),ut&&(lt=!0),yt&&(qe=addToSet({},rP),We=[],!0===yt.html&&(addToSet(qe,Jj),addToSet(We,nP)),!0===yt.svg&&(addToSet(qe,Kj),addToSet(We,sP),addToSet(We,iP)),!0===yt.svgFilters&&(addToSet(qe,Gj),addToSet(We,sP),addToSet(We,iP)),!0===yt.mathMl&&(addToSet(qe,eP),addToSet(We,oP),addToSet(We,iP))),s.ADD_TAGS&&(qe===ze&&(qe=clone(qe)),addToSet(qe,s.ADD_TAGS,Dt)),s.ADD_ATTR&&(We===He&&(We=clone(We)),addToSet(We,s.ADD_ATTR,Dt)),s.ADD_URI_SAFE_ATTR&&addToSet(Et,s.ADD_URI_SAFE_ATTR,Dt),s.FORBID_CONTENTS&&(vt===bt&&(vt=clone(vt)),addToSet(vt,s.FORBID_CONTENTS,Dt)),mt&&(qe["#text"]=!0),it&&addToSet(qe,["html","head","body"]),qe.table&&(addToSet(qe,["tbody"]),delete Xe.tbody),s.TRUSTED_TYPES_POLICY){if("function"!=typeof s.TRUSTED_TYPES_POLICY.createHTML)throw zj('TRUSTED_TYPES_POLICY configuration option must provide a "createHTML" hook.');if("function"!=typeof s.TRUSTED_TYPES_POLICY.createScriptURL)throw zj('TRUSTED_TYPES_POLICY configuration option must provide a "createScriptURL" hook.');ie=s.TRUSTED_TYPES_POLICY,ae=ie.createHTML("")}else void 0===ie&&(ie=function _createTrustedTypesPolicy(s,o){if("object"!=typeof s||"function"!=typeof s.createPolicy)return null;let i=null;const a="data-tt-policy-suffix";o&&o.hasAttribute(a)&&(i=o.getAttribute(a));const u="dompurify"+(i?"#"+i:"");try{return s.createPolicy(u,{createHTML:s=>s,createScriptURL:s=>s})}catch(s){return console.warn("TrustedTypes policy "+u+" could not be created."),null}}($,a)),null!==ie&&"string"==typeof ae&&(ae=ie.createHTML(""));WC&&WC(s),Lt=s}},qt=addToSet({},[...Kj,...Gj,...Xj]),Ut=addToSet({},[...eP,...tP]),Vt=function _forceRemove(s){ZC(DOMPurify.removed,{element:s});try{ee(s).removeChild(s)}catch(o){z(s)}},zt=function _removeAttribute(s,o){try{ZC(DOMPurify.removed,{attribute:o.getAttributeNode(s),from:o})}catch(s){ZC(DOMPurify.removed,{attribute:null,from:o})}if(o.removeAttribute(s),"is"===s)if(lt||ut)try{Vt(o)}catch(s){}else try{o.setAttribute(s,"")}catch(s){}},Wt=function _initDocument(s){let i=null,a=null;if(ct)s="<remove></remove>"+s;else{const o=_j(s,/^[\r\n\t ]+/);a=o&&o[0]}"application/xhtml+xml"===Mt&&At===Ot&&(s='<html xmlns="http://www.w3.org/1999/xhtml"><head></head><body>'+s+"</body></html>");const u=ie?ie.createHTML(s):s;if(At===Ot)try{i=(new B).parseFromString(u,Mt)}catch(s){}if(!i||!i.documentElement){i=ce.createDocument(At,"template",null);try{i.documentElement.innerHTML=Ct?ae:u}catch(s){}}const _=i.body||i.documentElement;return s&&a&&_.insertBefore(o.createTextNode(a),_.childNodes[0]||null),At===Ot?de.call(i,it?"html":"body")[0]:it?i.documentElement:_},Jt=function _createNodeIterator(s){return le.call(s.ownerDocument||s,s,C.SHOW_ELEMENT|C.SHOW_COMMENT|C.SHOW_TEXT|C.SHOW_PROCESSING_INSTRUCTION|C.SHOW_CDATA_SECTION,null)},Ht=function _isClobbered(s){return s instanceof L&&("string"!=typeof s.nodeName||"string"!=typeof s.textContent||"function"!=typeof s.removeChild||!(s.attributes instanceof j)||"function"!=typeof s.removeAttribute||"function"!=typeof s.setAttribute||"string"!=typeof s.namespaceURI||"function"!=typeof s.insertBefore||"function"!=typeof s.hasChildNodes)},Kt=function _isNode(s){return"function"==typeof w&&s instanceof w};function _executeHooks(s,o,i){YC(s,(s=>{s.call(DOMPurify,o,i,Lt)}))}const Gt=function _sanitizeElements(s){let o=null;if(_executeHooks(ye.beforeSanitizeElements,s,null),Ht(s))return Vt(s),!0;const i=Dt(s.nodeName);if(_executeHooks(ye.uponSanitizeElement,s,{tagName:i,allowedTags:qe}),ot&&s.hasChildNodes()&&!Kt(s.firstElementChild)&&$j(/<[/\w!]/g,s.innerHTML)&&$j(/<[/\w!]/g,s.textContent))return Vt(s),!0;if(s.nodeType===_P)return Vt(s),!0;if(ot&&s.nodeType===SP&&$j(/<[/\w]/g,s.data))return Vt(s),!0;if(!qe[i]||Xe[i]){if(!Xe[i]&&Xt(i)){if(Ye.tagNameCheck instanceof RegExp&&$j(Ye.tagNameCheck,i))return!1;if(Ye.tagNameCheck instanceof Function&&Ye.tagNameCheck(i))return!1}if(mt&&!vt[i]){const o=ee(s)||s.parentNode,i=Z(s)||s.childNodes;if(i&&o){for(let a=i.length-1;a>=0;--a){const u=V(i[a],!0);u.__removalCount=(s.__removalCount||0)+1,o.insertBefore(u,Y(s))}}}return Vt(s),!0}return s instanceof x&&!function _checkValidNamespace(s){let o=ee(s);o&&o.tagName||(o={namespaceURI:At,tagName:"template"});const i=fj(s.tagName),a=fj(o.tagName);return!!jt[s.namespaceURI]&&(s.namespaceURI===kt?o.namespaceURI===Ot?"svg"===i:o.namespaceURI===xt?"svg"===i&&("annotation-xml"===a||It[a]):Boolean(qt[i]):s.namespaceURI===xt?o.namespaceURI===Ot?"math"===i:o.namespaceURI===kt?"math"===i&&Tt[a]:Boolean(Ut[i]):s.namespaceURI===Ot?!(o.namespaceURI===kt&&!Tt[a])&&!(o.namespaceURI===xt&&!It[a])&&!Ut[i]&&(Nt[i]||!qt[i]):!("application/xhtml+xml"!==Mt||!jt[s.namespaceURI]))}(s)?(Vt(s),!0):"noscript"!==i&&"noembed"!==i&&"noframes"!==i||!$j(/<\/no(script|embed|frames)/i,s.innerHTML)?(st&&s.nodeType===bP&&(o=s.textContent,YC([be,_e,Se],(s=>{o=Aj(o,s," ")})),s.textContent!==o&&(ZC(DOMPurify.removed,{element:s.cloneNode()}),s.textContent=o)),_executeHooks(ye.afterSanitizeElements,s,null),!1):(Vt(s),!0)},Yt=function _isValidAttribute(s,i,a){if(ht&&("id"===i||"name"===i)&&(a in o||a in Ft))return!1;if(tt&&!Qe[i]&&$j(we,i));else if(et&&$j(xe,i));else if(!We[i]||Qe[i]){if(!(Xt(s)&&(Ye.tagNameCheck instanceof RegExp&&$j(Ye.tagNameCheck,s)||Ye.tagNameCheck instanceof Function&&Ye.tagNameCheck(s))&&(Ye.attributeNameCheck instanceof RegExp&&$j(Ye.attributeNameCheck,i)||Ye.attributeNameCheck instanceof Function&&Ye.attributeNameCheck(i))||"is"===i&&Ye.allowCustomizedBuiltInElements&&(Ye.tagNameCheck instanceof RegExp&&$j(Ye.tagNameCheck,a)||Ye.tagNameCheck instanceof Function&&Ye.tagNameCheck(a))))return!1}else if(Et[i]);else if($j($e,Aj(a,Te,"")));else if("src"!==i&&"xlink:href"!==i&&"href"!==i||"script"===s||0!==Cj(a,"data:")||!_t[s]){if(rt&&!$j(Pe,Aj(a,Te,"")));else if(a)return!1}else;return!0},Xt=function _isBasicCustomElement(s){return"annotation-xml"!==s&&_j(s,Re)},Qt=function _sanitizeAttributes(s){_executeHooks(ye.beforeSanitizeAttributes,s,null);const{attributes:o}=s;if(!o||Ht(s))return;const i={attrName:"",attrValue:"",keepAttr:!0,allowedAttributes:We,forceKeepAttr:void 0};let a=o.length;for(;a--;){const u=o[a],{name:_,namespaceURI:w,value:x}=u,C=Dt(_),j=x;let L="value"===_?j:Nj(j);if(i.attrName=C,i.attrValue=L,i.keepAttr=!0,i.forceKeepAttr=void 0,_executeHooks(ye.uponSanitizeAttribute,s,i),L=i.attrValue,!dt||"id"!==C&&"name"!==C||(zt(_,s),L="user-content-"+L),ot&&$j(/((--!?|])>)|<\/(style|title)/i,L)){zt(_,s);continue}if(i.forceKeepAttr)continue;if(!i.keepAttr){zt(_,s);continue}if(!nt&&$j(/\/>/i,L)){zt(_,s);continue}st&&YC([be,_e,Se],(s=>{L=Aj(L,s," ")}));const B=Dt(s.nodeName);if(Yt(B,C,L)){if(ie&&"object"==typeof $&&"function"==typeof $.getAttributeType)if(w);else switch($.getAttributeType(B,C)){case"TrustedHTML":L=ie.createHTML(L);break;case"TrustedScriptURL":L=ie.createScriptURL(L)}if(L!==j)try{w?s.setAttributeNS(w,_,L):s.setAttribute(_,L),Ht(s)?Vt(s):QC(DOMPurify.removed)}catch(o){zt(_,s)}}else zt(_,s)}_executeHooks(ye.afterSanitizeAttributes,s,null)},Zt=function _sanitizeShadowDOM(s){let o=null;const i=Jt(s);for(_executeHooks(ye.beforeSanitizeShadowDOM,s,null);o=i.nextNode();)_executeHooks(ye.uponSanitizeShadowNode,o,null),Gt(o),Qt(o),o.content instanceof u&&_sanitizeShadowDOM(o.content);_executeHooks(ye.afterSanitizeShadowDOM,s,null)};return DOMPurify.sanitize=function(s){let o=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{},a=null,_=null,x=null,C=null;if(Ct=!s,Ct&&(s="\x3c!--\x3e"),"string"!=typeof s&&!Kt(s)){if("function"!=typeof s.toString)throw zj("toString is not a function");if("string"!=typeof(s=s.toString()))throw zj("dirty is not a string, aborting")}if(!DOMPurify.isSupported)return s;if(at||$t(o),DOMPurify.removed=[],"string"==typeof s&&(gt=!1),gt){if(s.nodeName){const o=Dt(s.nodeName);if(!qe[o]||Xe[o])throw zj("root node is forbidden and cannot be sanitized in-place")}}else if(s instanceof w)a=Wt("\x3c!----\x3e"),_=a.ownerDocument.importNode(s,!0),_.nodeType===vP&&"BODY"===_.nodeName||"HTML"===_.nodeName?a=_:a.appendChild(_);else{if(!lt&&!st&&!it&&-1===s.indexOf("<"))return ie&&pt?ie.createHTML(s):s;if(a=Wt(s),!a)return lt?null:pt?ae:""}a&&ct&&Vt(a.firstChild);const j=Jt(gt?s:a);for(;x=j.nextNode();)Gt(x),Qt(x),x.content instanceof u&&Zt(x.content);if(gt)return s;if(lt){if(ut)for(C=pe.call(a.ownerDocument);a.firstChild;)C.appendChild(a.firstChild);else C=a;return(We.shadowroot||We.shadowrootmode)&&(C=fe.call(i,C,!0)),C}let L=it?a.outerHTML:a.innerHTML;return it&&qe["!doctype"]&&a.ownerDocument&&a.ownerDocument.doctype&&a.ownerDocument.doctype.name&&$j(mP,a.ownerDocument.doctype.name)&&(L="<!DOCTYPE "+a.ownerDocument.doctype.name+">\n"+L),st&&YC([be,_e,Se],(s=>{L=Aj(L,s," ")})),ie&&pt?ie.createHTML(L):L},DOMPurify.setConfig=function(){$t(arguments.length>0&&void 0!==arguments[0]?arguments[0]:{}),at=!0},DOMPurify.clearConfig=function(){Lt=null,at=!1},DOMPurify.isValidAttribute=function(s,o,i){Lt||$t({});const a=Dt(s),u=Dt(o);return Yt(a,u,i)},DOMPurify.addHook=function(s,o){"function"==typeof o&&ZC(ye[s],o)},DOMPurify.removeHook=function(s,o){if(void 0!==o){const i=XC(ye[s],o);return-1===i?void 0:ej(ye[s],i,1)[0]}return QC(ye[s])},DOMPurify.removeHooks=function(s){ye[s]=[]},DOMPurify.removeAllHooks=function(){ye={afterSanitizeAttributes:[],afterSanitizeElements:[],afterSanitizeShadowDOM:[],beforeSanitizeAttributes:[],beforeSanitizeElements:[],beforeSanitizeShadowDOM:[],uponSanitizeAttribute:[],uponSanitizeElement:[],uponSanitizeShadowNode:[]}},DOMPurify}();xP.addHook&&xP.addHook("beforeSanitizeElements",(function(s){return s.href&&s.setAttribute("rel","noopener noreferrer"),s}));const kP=function Markdown({source:s,className:o="",getConfigs:i=()=>({useUnsafeMarkdown:!1})}){if("string"!=typeof s)return null;const a=new Remarkable({html:!0,typographer:!0,breaks:!0,linkTarget:"_blank"}).use(linkify);a.core.ruler.disable(["replacements","smartquotes"]);const{useUnsafeMarkdown:u}=i(),_=a.render(s),w=sanitizer(_,{useUnsafeMarkdown:u});return s&&_&&w?Re.createElement("div",{className:Jn()(o,"markdown"),dangerouslySetInnerHTML:{__html:w}}):null};function sanitizer(s,{useUnsafeMarkdown:o=!1}={}){const i=o,a=o?[]:["style","class"];return o&&!sanitizer.hasWarnedAboutDeprecation&&(console.warn("useUnsafeMarkdown display configuration parameter is deprecated since >3.26.0 and will be removed in v4.0.0."),sanitizer.hasWarnedAboutDeprecation=!0),xP.sanitize(s,{ADD_ATTR:["target"],FORBID_TAGS:["style","form"],ALLOW_DATA_ATTR:i,FORBID_ATTR:a})}sanitizer.hasWarnedAboutDeprecation=!1;class BaseLayout extends Re.Component{render(){const{errSelectors:s,specSelectors:o,getComponent:i}=this.props,a=i("SvgAssets"),u=i("InfoContainer",!0),_=i("VersionPragmaFilter"),w=i("operations",!0),x=i("Models",!0),C=i("Webhooks",!0),j=i("Row"),L=i("Col"),B=i("errors",!0),$=i("ServersContainer",!0),U=i("SchemesContainer",!0),V=i("AuthorizeBtnContainer",!0),z=i("FilterContainer",!0),Y=o.isSwagger2(),Z=o.isOAS3(),ee=o.isOAS31(),ie=!o.specStr(),ae=o.loadingStatus();let ce=null;if("loading"===ae&&(ce=Re.createElement("div",{className:"info"},Re.createElement("div",{className:"loading-container"},Re.createElement("div",{className:"loading"})))),"failed"===ae&&(ce=Re.createElement("div",{className:"info"},Re.createElement("div",{className:"loading-container"},Re.createElement("h4",{className:"title"},"Failed to load API definition."),Re.createElement(B,null)))),"failedConfig"===ae){const o=s.lastError(),i=o?o.get("message"):"";ce=Re.createElement("div",{className:"info failed-config"},Re.createElement("div",{className:"loading-container"},Re.createElement("h4",{className:"title"},"Failed to load remote configuration."),Re.createElement("p",null,i)))}if(!ce&&ie&&(ce=Re.createElement("h4",null,"No API definition provided.")),ce)return Re.createElement("div",{className:"swagger-ui"},Re.createElement("div",{className:"loading-container"},ce));const le=o.servers(),pe=o.schemes(),de=le&&le.size,fe=pe&&pe.size,ye=!!o.securityDefinitions();return Re.createElement("div",{className:"swagger-ui"},Re.createElement(a,null),Re.createElement(_,{isSwagger2:Y,isOAS3:Z,alsoShow:Re.createElement(B,null)},Re.createElement(B,null),Re.createElement(j,{className:"information-container"},Re.createElement(L,{mobile:12},Re.createElement(u,null))),de||fe||ye?Re.createElement("div",{className:"scheme-container"},Re.createElement(L,{className:"schemes wrapper",mobile:12},de||fe?Re.createElement("div",{className:"schemes-server-container"},de?Re.createElement($,null):null,fe?Re.createElement(U,null):null):null,ye?Re.createElement(V,null):null)):null,Re.createElement(z,null),Re.createElement(j,null,Re.createElement(L,{mobile:12,desktop:12},Re.createElement(w,null))),ee&&Re.createElement(j,{className:"webhooks-container"},Re.createElement(L,{mobile:12,desktop:12},Re.createElement(C,null))),Re.createElement(j,null,Re.createElement(L,{mobile:12,desktop:12},Re.createElement(x,null)))))}}const core_components=()=>({components:{App:JO,authorizationPopup:AuthorizationPopup,authorizeBtn:AuthorizeBtn,AuthorizeBtnContainer,authorizeOperationBtn:AuthorizeOperationBtn,auths:Auths,AuthItem:auth_item_Auths,authError:AuthError,oauth2:Oauth2,apiKeyAuth:ApiKeyAuth,basicAuth:BasicAuth,clear:Clear,liveResponse:LiveResponse,InitializedInput,info:tA,InfoContainer,InfoUrl,InfoBasePath,Contact:rA,License:nA,JumpToPath,CopyToClipboardBtn,onlineValidatorBadge:OnlineValidatorBadge,operations:Operations,operation:operation_Operation,OperationSummary,OperationSummaryMethod,OperationSummaryPath,responses:responses_Responses,response:response_Response,ResponseExtension:response_extension,responseBody:ResponseBody,parameters:Parameters,parameterRow:ParameterRow,execute:Execute,headers:headers_Headers,errors:Errors,contentType:ContentType,overview:Overview,footer:Footer,FilterContainer,ParamBody,curl:Curl,Property:property,TryItOutButton,Markdown:kP,BaseLayout,VersionPragmaFilter,VersionStamp:version_stamp,OperationExt:operation_extensions,OperationExtRow:operation_extension_row,ParameterExt:parameter_extension,ParameterIncludeEmpty,OperationTag,OperationContainer,OpenAPIVersion:openapi_version,DeepLink:deep_link,SvgAssets:svg_assets,Example:example_Example,ExamplesSelect,ExamplesSelectValueRetainer}}),form_components=()=>({components:{..._e}}),base=()=>[configsPlugin,util,logs,view,view_legacy,plugins_spec,err,icons,plugins_layout,json_schema_5,json_schema_5_samples,core_components,form_components,swagger_client,auth,downloadUrlPlugin,deep_linking,filter,on_complete,plugins_request_snippets,syntax_highlighting,versions,safe_render()],OP=(0,ze.Map)();function onlyOAS3(s){return(o,i)=>(...a)=>{if(i.getSystem().specSelectors.isOAS3()){const o=s(...a);return"function"==typeof o?o(i):o}return o(...a)}}const AP=onlyOAS3(xs()(null)),CP=onlyOAS3(((s,o)=>s=>s.getSystem().specSelectors.findSchema(o))),jP=onlyOAS3((()=>s=>{const o=s.getSystem().specSelectors.specJson().getIn(["components","schemas"]);return ze.Map.isMap(o)?o:OP})),PP=onlyOAS3((()=>s=>s.getSystem().specSelectors.specJson().hasIn(["servers",0]))),IP=onlyOAS3(Ut(Ns,(s=>s.getIn(["components","securitySchemes"])||null))),wrap_selectors_validOperationMethods=(s,o)=>(i,...a)=>o.specSelectors.isOAS3()?o.oas3Selectors.validOperationMethods():s(...a),TP=AP,NP=AP,MP=AP,RP=AP,DP=AP;const LP=function wrap_selectors_onlyOAS3(s){return(o,i)=>(...a)=>{if(i.getSystem().specSelectors.isOAS3()){let o=i.getState().getIn(["spec","resolvedSubtrees","components","securitySchemes"]);return s(i,o,...a)}return o(...a)}}(Ut((s=>s),(({specSelectors:s})=>s.securityDefinitions()),((s,o)=>{let i=(0,ze.List)();return o?(o.entrySeq().forEach((([s,o])=>{const a=o?.get("type");if("oauth2"===a&&o.get("flows").entrySeq().forEach((([a,u])=>{let _=(0,ze.fromJS)({flow:a,authorizationUrl:u.get("authorizationUrl"),tokenUrl:u.get("tokenUrl"),scopes:u.get("scopes"),type:o.get("type"),description:o.get("description")});i=i.push(new ze.Map({[s]:_.filter((s=>void 0!==s))}))})),"http"!==a&&"apiKey"!==a||(i=i.push(new ze.Map({[s]:o}))),"openIdConnect"===a&&o.get("openIdConnectData")){let a=o.get("openIdConnectData");(a.get("grant_types_supported")||["authorization_code","implicit"]).forEach((u=>{let _=a.get("scopes_supported")&&a.get("scopes_supported").reduce(((s,o)=>s.set(o,"")),new ze.Map),w=(0,ze.fromJS)({flow:u,authorizationUrl:a.get("authorization_endpoint"),tokenUrl:a.get("token_endpoint"),scopes:_,type:"oauth2",openIdConnectUrl:o.get("openIdConnectUrl")});i=i.push(new ze.Map({[s]:w.filter((s=>void 0!==s))}))}))}})),i):i})));function OAS3ComponentWrapFactory(s){return(o,i)=>a=>"function"==typeof i.specSelectors?.isOAS3?i.specSelectors.isOAS3()?Re.createElement(s,Mn()({},a,i,{Ori:o})):Re.createElement(o,a):(console.warn("OAS3 wrapper: couldn't get spec"),null)}const FP=(0,ze.Map)(),selectors_isSwagger2=()=>s=>function isSwagger2(s){const o=s.get("swagger");return"string"==typeof o&&"2.0"===o}(s.getSystem().specSelectors.specJson()),selectors_isOAS30=()=>s=>function isOAS30(s){const o=s.get("openapi");return"string"==typeof o&&/^3\.0\.(?:[1-9]\d*|0)$/.test(o)}(s.getSystem().specSelectors.specJson()),selectors_isOAS3=()=>s=>s.getSystem().specSelectors.isOAS30();function selectors_onlyOAS3(s){return(o,...i)=>a=>{if(a.specSelectors.isOAS3()){const u=s(o,...i);return"function"==typeof u?u(a):u}return null}}const BP=selectors_onlyOAS3((()=>s=>s.specSelectors.specJson().get("servers",FP))),findSchema=(s,o)=>{const i=s.getIn(["resolvedSubtrees","components","schemas",o],null),a=s.getIn(["json","components","schemas",o],null);return i||a||null},$P=selectors_onlyOAS3(((s,{callbacks:o,specPath:i})=>s=>{const a=s.specSelectors.validOperationMethods();return ze.Map.isMap(o)?o.reduce(((s,o,u)=>{if(!ze.Map.isMap(o))return s;const _=o.reduce(((s,o,_)=>{if(!ze.Map.isMap(o))return s;const w=o.entrySeq().filter((([s])=>a.includes(s))).map((([s,o])=>({operation:(0,ze.Map)({operation:o}),method:s,path:_,callbackName:u,specPath:i.concat([u,_,s])})));return s.concat(w)}),(0,ze.List)());return s.concat(_)}),(0,ze.List)()).groupBy((s=>s.callbackName)).map((s=>s.toArray())).toObject():{}})),callbacks=({callbacks:s,specPath:o,specSelectors:i,getComponent:a})=>{const u=i.callbacksOperations({callbacks:s,specPath:o}),_=Object.keys(u),w=a("OperationContainer",!0);return 0===_.length?Re.createElement("span",null,"No callbacks"):Re.createElement("div",null,_.map((s=>Re.createElement("div",{key:`${s}`},Re.createElement("h2",null,s),u[s].map((o=>Re.createElement(w,{key:`${s}-${o.path}-${o.method}`,op:o.operation,tag:"callbacks",method:o.method,path:o.path,specPath:o.specPath,allowTryItOut:!1})))))))},getDefaultRequestBodyValue=(s,o,i,a)=>{const u=s.getIn(["content",o])??(0,ze.OrderedMap)(),_=u.get("schema",(0,ze.OrderedMap)()).toJS(),w=void 0!==u.get("examples"),x=u.get("example"),C=w?u.getIn(["examples",i,"value"]):x;return stringify(a.getSampleSchema(_,o,{includeWriteOnly:!0},C))},components_request_body=({userHasEditedBody:s,requestBody:o,requestBodyValue:i,requestBodyInclusionSetting:a,requestBodyErrors:u,getComponent:_,getConfigs:w,specSelectors:x,fn:C,contentType:j,isExecute:L,specPath:B,onChange:$,onChangeIncludeEmpty:U,activeExamplesKey:V,updateActiveExamplesKey:z,setRetainRequestBodyValueFlag:Y})=>{const handleFile=s=>{$(s.target.files[0])},setIsIncludedOptions=s=>{let o={key:s,shouldDispatchInit:!1,defaultValue:!0};return"no value"===a.get(s,"no value")&&(o.shouldDispatchInit=!0),o},Z=_("Markdown",!0),ee=_("modelExample"),ie=_("RequestBodyEditor"),ae=_("HighlightCode",!0),ce=_("ExamplesSelectValueRetainer"),le=_("Example"),pe=_("ParameterIncludeEmpty"),{showCommonExtensions:de}=w(),fe=o?.get("description")??null,ye=o?.get("content")??new ze.OrderedMap;j=j||ye.keySeq().first()||"";const be=ye.get(j)??(0,ze.OrderedMap)(),_e=be.get("schema",(0,ze.OrderedMap)()),Se=be.get("examples",null),we=Se?.map(((s,i)=>{const a=s?.get("value",null);return a&&(s=s.set("value",getDefaultRequestBodyValue(o,j,i,C),a)),s}));u=ze.List.isList(u)?u:(0,ze.List)();if(C.isFileUploadIntended(be?.get("schema"),j)){const s=_("Input");return L?Re.createElement(s,{type:"file",onChange:handleFile}):Re.createElement("i",null,"Example values are not available for ",Re.createElement("code",null,j)," media types.")}if(!be.size)return null;if(C.hasSchemaType(be.get("schema"),"object")&&("application/x-www-form-urlencoded"===j||0===j.indexOf("multipart/"))&&_e.get("properties",(0,ze.OrderedMap)()).size>0){const s=_("JsonSchemaForm"),o=_("ParameterExt"),j=_e.get("properties",(0,ze.OrderedMap)());return i=ze.Map.isMap(i)?i:(0,ze.OrderedMap)(),Re.createElement("div",{className:"table-container"},fe&&Re.createElement(Z,{source:fe}),Re.createElement("table",null,Re.createElement("tbody",null,ze.Map.isMap(j)&&j.entrySeq().map((([j,V])=>{if(V.get("readOnly"))return;const z=V.get("oneOf")?.get(0)?.toJS(),Y=V.get("anyOf")?.get(0)?.toJS();V=(0,ze.fromJS)(C.mergeJsonSchema(V.toJS(),z??Y??{}));let ie=de?getCommonExtensions(V):null;const ae=_e.get("required",(0,ze.List)()).includes(j),ce=C.getSchemaObjectType(V),le=C.getSchemaObjectTypeLabel(V),fe=C.getSchemaObjectType(V?.get("items")),ye=V.get("format"),be=V.get("description"),Se=i.getIn([j,"value"]),we=i.getIn([j,"errors"])||u,xe=a.get(j)||!1;let Pe=C.getSampleSchema(V,!1,{includeWriteOnly:!0});!1===Pe&&(Pe="false"),0===Pe&&(Pe="0"),"string"!=typeof Pe&&"object"===ce&&(Pe=stringify(Pe)),"string"==typeof Pe&&"array"===ce&&(Pe=JSON.parse(Pe));const Te=C.isFileUploadIntended(V),$e=Re.createElement(s,{fn:C,dispatchInitialValue:!Te,schema:V,description:j,getComponent:_,value:void 0===Se?Pe:Se,required:ae,errors:we,onChange:s=>{$(s,[j])}});return Re.createElement("tr",{key:j,className:"parameters","data-property-name":j},Re.createElement("td",{className:"parameters-col_name"},Re.createElement("div",{className:ae?"parameter__name required":"parameter__name"},j,ae?Re.createElement("span",null," *"):null),Re.createElement("div",{className:"parameter__type"},le,ye&&Re.createElement("span",{className:"prop-format"},"($",ye,")"),de&&ie.size?ie.entrySeq().map((([s,i])=>Re.createElement(o,{key:`${s}-${i}`,xKey:s,xVal:i}))):null),Re.createElement("div",{className:"parameter__deprecated"},V.get("deprecated")?"deprecated":null)),Re.createElement("td",{className:"parameters-col_description"},Re.createElement(Z,{source:be}),L?Re.createElement("div",null,"object"===ce||"object"===fe?Re.createElement(ee,{getComponent:_,specPath:B.push("schema"),getConfigs:w,isExecute:L,specSelectors:x,schema:V,example:$e}):$e,ae?null:Re.createElement(pe,{onChange:s=>U(j,s),isIncluded:xe,isIncludedOptions:setIsIncludedOptions(j),isDisabled:Array.isArray(Se)?0!==Se.length:!isEmptyValue(Se)})):null))})))))}const xe=getDefaultRequestBodyValue(o,j,V,C);let Pe=null;getKnownSyntaxHighlighterLanguage(xe)&&(Pe="json");const Te=L?Re.createElement(ie,{value:i,errors:u,defaultValue:xe,onChange:$,getComponent:_}):Re.createElement(ae,{className:"body-param__example",language:Pe},stringify(i)||xe);return Re.createElement("div",null,fe&&Re.createElement(Z,{source:fe}),we?Re.createElement(ce,{userHasEditedBody:s,examples:we,currentKey:V,currentUserInputValue:i,onSelect:s=>{z(s)},updateValue:$,defaultToFirstExample:!0,getComponent:_,setRetainRequestBodyValueFlag:Y}):null,Re.createElement(ee,{getComponent:_,getConfigs:w,specSelectors:x,expandDepth:1,isExecute:L,schema:be.get("schema"),specPath:B.push("content",j,"schema"),example:Te,includeWriteOnly:!0}),we?Re.createElement(le,{example:we.get(V),getComponent:_,getConfigs:w}):null)};class operation_link_OperationLink extends Re.Component{render(){const{link:s,name:o,getComponent:i}=this.props,a=i("Markdown",!0);let u=s.get("operationId")||s.get("operationRef"),_=s.get("parameters")&&s.get("parameters").toJS(),w=s.get("description");return Re.createElement("div",{className:"operation-link"},Re.createElement("div",{className:"description"},Re.createElement("b",null,Re.createElement("code",null,o)),w?Re.createElement(a,{source:w}):null),Re.createElement("pre",null,"Operation `",u,"`",Re.createElement("br",null),Re.createElement("br",null),"Parameters ",function padString(s,o){if("string"!=typeof o)return"";return o.split("\n").map(((o,i)=>i>0?Array(s+1).join(" ")+o:o)).join("\n")}(0,JSON.stringify(_,null,2))||"{}",Re.createElement("br",null)))}}const qP=operation_link_OperationLink,components_servers=({servers:s,currentServer:o,setSelectedServer:i,setServerVariableValue:a,getServerVariable:u,getEffectiveServerValue:_})=>{const w=(s.find((s=>s.get("url")===o))||(0,ze.OrderedMap)()).get("variables")||(0,ze.OrderedMap)(),x=0!==w.size;(0,Re.useEffect)((()=>{o||i(s.first()?.get("url"))}),[]),(0,Re.useEffect)((()=>{const u=s.find((s=>s.get("url")===o));if(!u)return void i(s.first().get("url"));(u.get("variables")||(0,ze.OrderedMap)()).map(((s,i)=>{a({server:o,key:i,val:s.get("default")||""})}))}),[o,s]);const C=(0,Re.useCallback)((s=>{i(s.target.value)}),[i]),j=(0,Re.useCallback)((s=>{const i=s.target.getAttribute("data-variable"),u=s.target.value;a({server:o,key:i,val:u})}),[a,o]);return Re.createElement("div",{className:"servers"},Re.createElement("label",{htmlFor:"servers"},Re.createElement("select",{onChange:C,value:o,id:"servers"},s.valueSeq().map((s=>Re.createElement("option",{value:s.get("url"),key:s.get("url")},s.get("url"),s.get("description")&&` - ${s.get("description")}`))).toArray())),x&&Re.createElement("div",null,Re.createElement("div",{className:"computed-url"},"Computed URL:",Re.createElement("code",null,_(o))),Re.createElement("h4",null,"Server variables"),Re.createElement("table",null,Re.createElement("tbody",null,w.entrySeq().map((([s,i])=>Re.createElement("tr",{key:s},Re.createElement("td",null,s),Re.createElement("td",null,i.get("enum")?Re.createElement("select",{"data-variable":s,onChange:j},i.get("enum").map((i=>Re.createElement("option",{selected:i===u(o,s),key:i,value:i},i)))):Re.createElement("input",{type:"text",value:u(o,s)||"",onChange:j,"data-variable":s})))))))))};class ServersContainer extends Re.Component{render(){const{specSelectors:s,oas3Selectors:o,oas3Actions:i,getComponent:a}=this.props,u=s.servers(),_=a("Servers");return u&&u.size?Re.createElement("div",null,Re.createElement("span",{className:"servers-title"},"Servers"),Re.createElement(_,{servers:u,currentServer:o.selectedServer(),setSelectedServer:i.setSelectedServer,setServerVariableValue:i.setServerVariableValue,getServerVariable:o.serverVariableValue,getEffectiveServerValue:o.serverEffectiveValue})):null}}const UP=Function.prototype;class RequestBodyEditor extends Re.PureComponent{static defaultProps={onChange:UP,userHasEditedBody:!1};constructor(s,o){super(s,o),this.state={value:stringify(s.value)||s.defaultValue},s.onChange(s.value)}applyDefaultValue=s=>{const{onChange:o,defaultValue:i}=s||this.props;return this.setState({value:i}),o(i)};onChange=s=>{this.props.onChange(stringify(s))};onDomChange=s=>{const o=s.target.value;this.setState({value:o},(()=>this.onChange(o)))};UNSAFE_componentWillReceiveProps(s){this.props.value!==s.value&&s.value!==this.state.value&&this.setState({value:stringify(s.value)}),!s.value&&s.defaultValue&&this.state.value&&this.applyDefaultValue(s)}render(){let{getComponent:s,errors:o}=this.props,{value:i}=this.state,a=o.size>0;const u=s("TextArea");return Re.createElement("div",{className:"body-param"},Re.createElement(u,{className:Jn()("body-param__text",{invalid:a}),title:o.size?o.join(", "):"",value:i,onChange:this.onDomChange}))}}class HttpAuth extends Re.Component{constructor(s,o){super(s,o);let{name:i,schema:a}=this.props,u=this.getValue();this.state={name:i,schema:a,value:u}}getValue(){let{name:s,authorized:o}=this.props;return o&&o.getIn([s,"value"])}onChange=s=>{let{onChange:o}=this.props,{value:i,name:a}=s.target,u=Object.assign({},this.state.value);a?u[a]=i:u=i,this.setState({value:u},(()=>o(this.state)))};render(){let{schema:s,getComponent:o,errSelectors:i,name:a,authSelectors:u}=this.props;const _=o("Input"),w=o("Row"),x=o("Col"),C=o("authError"),j=o("Markdown",!0),L=o("JumpToPath",!0),B=(s.get("scheme")||"").toLowerCase(),$=u.selectAuthPath(a);let U=this.getValue(),V=i.allErrors().filter((s=>s.get("authId")===a));if("basic"===B){let o=U?U.get("username"):null;return Re.createElement("div",null,Re.createElement("h4",null,Re.createElement("code",null,a),"  (http, Basic)",Re.createElement(L,{path:$})),o&&Re.createElement("h6",null,"Authorized"),Re.createElement(w,null,Re.createElement(j,{source:s.get("description")})),Re.createElement(w,null,Re.createElement("label",{htmlFor:"auth-basic-username"},"Username:"),o?Re.createElement("code",null," ",o," "):Re.createElement(x,null,Re.createElement(_,{id:"auth-basic-username",type:"text",required:"required",name:"username","aria-label":"auth-basic-username",onChange:this.onChange,autoFocus:!0}))),Re.createElement(w,null,Re.createElement("label",{htmlFor:"auth-basic-password"},"Password:"),o?Re.createElement("code",null," ****** "):Re.createElement(x,null,Re.createElement(_,{id:"auth-basic-password",autoComplete:"new-password",name:"password",type:"password","aria-label":"auth-basic-password",onChange:this.onChange}))),V.valueSeq().map(((s,o)=>Re.createElement(C,{error:s,key:o}))))}return"bearer"===B?Re.createElement("div",null,Re.createElement("h4",null,Re.createElement("code",null,a),"  (http, Bearer)",Re.createElement(L,{path:$})),U&&Re.createElement("h6",null,"Authorized"),Re.createElement(w,null,Re.createElement(j,{source:s.get("description")})),Re.createElement(w,null,Re.createElement("label",{htmlFor:"auth-bearer-value"},"Value:"),U?Re.createElement("code",null," ****** "):Re.createElement(x,null,Re.createElement(_,{id:"auth-bearer-value",type:"text","aria-label":"auth-bearer-value",onChange:this.onChange,autoFocus:!0}))),V.valueSeq().map(((s,o)=>Re.createElement(C,{error:s,key:o})))):Re.createElement("div",null,Re.createElement("em",null,Re.createElement("b",null,a)," HTTP authentication: unsupported scheme ",`'${B}'`))}}class operation_servers_OperationServers extends Re.Component{setSelectedServer=s=>{const{path:o,method:i}=this.props;return this.forceUpdate(),this.props.setSelectedServer(s,`${o}:${i}`)};setServerVariableValue=s=>{const{path:o,method:i}=this.props;return this.forceUpdate(),this.props.setServerVariableValue({...s,namespace:`${o}:${i}`})};getSelectedServer=()=>{const{path:s,method:o}=this.props;return this.props.getSelectedServer(`${s}:${o}`)};getServerVariable=(s,o)=>{const{path:i,method:a}=this.props;return this.props.getServerVariable({namespace:`${i}:${a}`,server:s},o)};getEffectiveServerValue=s=>{const{path:o,method:i}=this.props;return this.props.getEffectiveServerValue({server:s,namespace:`${o}:${i}`})};render(){const{operationServers:s,pathServers:o,getComponent:i}=this.props;if(!s&&!o)return null;const a=i("Servers"),u=s||o,_=s?"operation":"path";return Re.createElement("div",{className:"opblock-section operation-servers"},Re.createElement("div",{className:"opblock-section-header"},Re.createElement("div",{className:"tab-header"},Re.createElement("h4",{className:"opblock-title"},"Servers"))),Re.createElement("div",{className:"opblock-description-wrapper"},Re.createElement("h4",{className:"message"},"These ",_,"-level options override the global server options."),Re.createElement(a,{servers:u,currentServer:this.getSelectedServer(),setSelectedServer:this.setSelectedServer,setServerVariableValue:this.setServerVariableValue,getServerVariable:this.getServerVariable,getEffectiveServerValue:this.getEffectiveServerValue})))}}const VP={Callbacks:callbacks,HttpAuth,RequestBody:components_request_body,Servers:components_servers,ServersContainer,RequestBodyEditor,OperationServers:operation_servers_OperationServers,operationLink:qP},zP=new Remarkable("commonmark");zP.block.ruler.enable(["table"]),zP.set({linkTarget:"_blank"});const WP=OAS3ComponentWrapFactory((({source:s,className:o="",getConfigs:i=()=>({useUnsafeMarkdown:!1})})=>{if("string"!=typeof s)return null;if(s){const{useUnsafeMarkdown:a}=i(),u=sanitizer(zP.render(s),{useUnsafeMarkdown:a});let _;return"string"==typeof u&&(_=u.trim()),Re.createElement("div",{dangerouslySetInnerHTML:{__html:_},className:Jn()(o,"renderedMarkdown")})}return null})),JP=OAS3ComponentWrapFactory((({Ori:s,...o})=>{const{schema:i,getComponent:a,errSelectors:u,authorized:_,onAuthChange:w,name:x,authSelectors:C}=o,j=a("HttpAuth");return"http"===i.get("type")?Re.createElement(j,{key:x,schema:i,name:x,errSelectors:u,authorized:_,getComponent:a,onChange:w,authSelectors:C}):Re.createElement(s,o)})),HP=OAS3ComponentWrapFactory(OnlineValidatorBadge);class ModelComponent extends Re.Component{render(){let{getConfigs:s,schema:o,Ori:i}=this.props,a=["model-box"],u=null;return!0===o.get("deprecated")&&(a.push("deprecated"),u=Re.createElement("span",{className:"model-deprecated-warning"},"Deprecated:")),Re.createElement("div",{className:a.join(" ")},u,Re.createElement(i,Mn()({},this.props,{getConfigs:s,depth:1,expandDepth:this.props.expandDepth||0})))}}const KP=OAS3ComponentWrapFactory(ModelComponent),GP=OAS3ComponentWrapFactory((({Ori:s,...o})=>{const{schema:i,getComponent:a,errors:u,onChange:_,fn:w}=o,x=w.isFileUploadIntended(i),C=a("Input");return x?Re.createElement(C,{type:"file",className:u.length?"invalid":"",title:u.length?u:"",onChange:s=>{_(s.target.files[0])},disabled:s.isDisabled}):Re.createElement(s,o)})),YP={Markdown:WP,AuthItem:JP,OpenAPIVersion:function OAS30ComponentWrapFactory(s){return(o,i)=>a=>"function"==typeof i.specSelectors?.isOAS30?i.specSelectors.isOAS30()?Re.createElement(s,Mn()({},a,i,{Ori:o})):Re.createElement(o,a):(console.warn("OAS30 wrapper: couldn't get spec"),null)}((s=>{const{Ori:o}=s;return Re.createElement(o,{oasVersion:"3.0"})})),JsonSchema_string:GP,model:KP,onlineValidatorBadge:HP},XP="oas3_set_servers",QP="oas3_set_request_body_value",ZP="oas3_set_request_body_retain_flag",eI="oas3_set_request_body_inclusion",tI="oas3_set_active_examples_member",rI="oas3_set_request_content_type",nI="oas3_set_response_content_type",sI="oas3_set_server_variable_value",oI="oas3_set_request_body_validate_error",iI="oas3_clear_request_body_validate_error",aI="oas3_clear_request_body_value";function setSelectedServer(s,o){return{type:XP,payload:{selectedServerUrl:s,namespace:o}}}function setRequestBodyValue({value:s,pathMethod:o}){return{type:QP,payload:{value:s,pathMethod:o}}}const setRetainRequestBodyValueFlag=({value:s,pathMethod:o})=>({type:ZP,payload:{value:s,pathMethod:o}});function setRequestBodyInclusion({value:s,pathMethod:o,name:i}){return{type:eI,payload:{value:s,pathMethod:o,name:i}}}function setActiveExamplesMember({name:s,pathMethod:o,contextType:i,contextName:a}){return{type:tI,payload:{name:s,pathMethod:o,contextType:i,contextName:a}}}function setRequestContentType({value:s,pathMethod:o}){return{type:rI,payload:{value:s,pathMethod:o}}}function setResponseContentType({value:s,path:o,method:i}){return{type:nI,payload:{value:s,path:o,method:i}}}function setServerVariableValue({server:s,namespace:o,key:i,val:a}){return{type:sI,payload:{server:s,namespace:o,key:i,val:a}}}const setRequestBodyValidateError=({path:s,method:o,validationErrors:i})=>({type:oI,payload:{path:s,method:o,validationErrors:i}}),clearRequestBodyValidateError=({path:s,method:o})=>({type:iI,payload:{path:s,method:o}}),initRequestBodyValidateError=({pathMethod:s})=>({type:iI,payload:{path:s[0],method:s[1]}}),clearRequestBodyValue=({pathMethod:s})=>({type:aI,payload:{pathMethod:s}});var cI=__webpack_require__(60680),lI=__webpack_require__.n(cI);const oas3_selectors_onlyOAS3=s=>(o,...i)=>a=>{if(a.getSystem().specSelectors.isOAS3()){const u=s(o,...i);return"function"==typeof u?u(a):u}return null};const uI=oas3_selectors_onlyOAS3(((s,o)=>{const i=o?[o,"selectedServer"]:["selectedServer"];return s.getIn(i)||""})),pI=oas3_selectors_onlyOAS3(((s,o,i)=>s.getIn(["requestData",o,i,"bodyValue"])||null)),hI=oas3_selectors_onlyOAS3(((s,o,i)=>s.getIn(["requestData",o,i,"retainBodyValue"])||!1)),selectDefaultRequestBodyValue=(s,o,i)=>s=>{const{oas3Selectors:a,specSelectors:u,fn:_}=s.getSystem();if(u.isOAS3()){const s=a.requestContentType(o,i);if(s)return getDefaultRequestBodyValue(u.specResolvedSubtree(["paths",o,i,"requestBody"]),s,a.activeExamplesMember(o,i,"requestBody","requestBody"),_)}return null},dI=oas3_selectors_onlyOAS3(((s,o,i)=>s=>{const{oas3Selectors:a,specSelectors:u,fn:_}=s;let w=!1;const x=a.requestContentType(o,i);let C=a.requestBodyValue(o,i);const j=u.specResolvedSubtree(["paths",o,i,"requestBody"]);if(!j)return!1;if(ze.Map.isMap(C)&&(C=stringify(C.mapEntries((s=>ze.Map.isMap(s[1])?[s[0],s[1].get("value")]:s)).toJS())),ze.List.isList(C)&&(C=stringify(C)),x){const s=getDefaultRequestBodyValue(j,x,a.activeExamplesMember(o,i,"requestBody","requestBody"),_);w=!!C&&C!==s}return w})),fI=oas3_selectors_onlyOAS3(((s,o,i)=>s.getIn(["requestData",o,i,"bodyInclusion"])||(0,ze.Map)())),mI=oas3_selectors_onlyOAS3(((s,o,i)=>s.getIn(["requestData",o,i,"errors"])||null)),gI=oas3_selectors_onlyOAS3(((s,o,i,a,u)=>s.getIn(["examples",o,i,a,u,"activeExample"])||null)),yI=oas3_selectors_onlyOAS3(((s,o,i)=>s.getIn(["requestData",o,i,"requestContentType"])||null)),vI=oas3_selectors_onlyOAS3(((s,o,i)=>s.getIn(["requestData",o,i,"responseContentType"])||null)),bI=oas3_selectors_onlyOAS3(((s,o,i)=>{let a;if("string"!=typeof o){const{server:s,namespace:u}=o;a=u?[u,"serverVariableValues",s,i]:["serverVariableValues",s,i]}else{a=["serverVariableValues",o,i]}return s.getIn(a)||null})),_I=oas3_selectors_onlyOAS3(((s,o)=>{let i;if("string"!=typeof o){const{server:s,namespace:a}=o;i=a?[a,"serverVariableValues",s]:["serverVariableValues",s]}else{i=["serverVariableValues",o]}return s.getIn(i)||(0,ze.OrderedMap)()})),SI=oas3_selectors_onlyOAS3(((s,o)=>{var i,a;if("string"!=typeof o){const{server:u,namespace:_}=o;a=u,i=_?s.getIn([_,"serverVariableValues",a]):s.getIn(["serverVariableValues",a])}else a=o,i=s.getIn(["serverVariableValues",a]);i=i||(0,ze.OrderedMap)();let u=a;return i.map(((s,o)=>{u=u.replace(new RegExp(`{${lI()(o)}}`,"g"),s)})),u})),EI=function validateRequestBodyIsRequired(s){return(...o)=>i=>{const a=i.getSystem().specSelectors.specJson();let u=[...o][1]||[];return!a.getIn(["paths",...u,"requestBody","required"])||s(...o)}}(((s,o)=>((s,o)=>(o=o||[],!!s.getIn(["requestData",...o,"bodyValue"])))(s,o))),validateShallowRequired=(s,{oas3RequiredRequestBodyContentType:o,oas3RequestContentType:i,oas3RequestBodyValue:a})=>{let u=[];if(!ze.Map.isMap(a))return u;let _=[];return Object.keys(o.requestContentType).forEach((s=>{if(s===i){o.requestContentType[s].forEach((s=>{_.indexOf(s)<0&&_.push(s)}))}})),_.forEach((s=>{a.getIn([s,"value"])||u.push(s)})),u},wI=xs()(["get","put","post","delete","options","head","patch","trace"]),xI={[XP]:(s,{payload:{selectedServerUrl:o,namespace:i}})=>{const a=i?[i,"selectedServer"]:["selectedServer"];return s.setIn(a,o)},[QP]:(s,{payload:{value:o,pathMethod:i}})=>{let[a,u]=i;if(!ze.Map.isMap(o))return s.setIn(["requestData",a,u,"bodyValue"],o);let _=s.getIn(["requestData",a,u,"bodyValue"])||(0,ze.Map)();ze.Map.isMap(_)||(_=(0,ze.Map)());let w=_;const[...x]=o.keys();return x.forEach((s=>{let i=o.getIn([s]);w.has(s)&&ze.Map.isMap(i)||(w=w.setIn([s,"value"],i))})),s.setIn(["requestData",a,u,"bodyValue"],w)},[ZP]:(s,{payload:{value:o,pathMethod:i}})=>{let[a,u]=i;return s.setIn(["requestData",a,u,"retainBodyValue"],o)},[eI]:(s,{payload:{value:o,pathMethod:i,name:a}})=>{let[u,_]=i;return s.setIn(["requestData",u,_,"bodyInclusion",a],o)},[tI]:(s,{payload:{name:o,pathMethod:i,contextType:a,contextName:u}})=>{let[_,w]=i;return s.setIn(["examples",_,w,a,u,"activeExample"],o)},[rI]:(s,{payload:{value:o,pathMethod:i}})=>{let[a,u]=i;return s.setIn(["requestData",a,u,"requestContentType"],o)},[nI]:(s,{payload:{value:o,path:i,method:a}})=>s.setIn(["requestData",i,a,"responseContentType"],o),[sI]:(s,{payload:{server:o,namespace:i,key:a,val:u}})=>{const _=i?[i,"serverVariableValues",o,a]:["serverVariableValues",o,a];return s.setIn(_,u)},[oI]:(s,{payload:{path:o,method:i,validationErrors:a}})=>{let u=[];if(u.push("Required field is not provided"),a.missingBodyValue)return s.setIn(["requestData",o,i,"errors"],(0,ze.fromJS)(u));if(a.missingRequiredKeys&&a.missingRequiredKeys.length>0){const{missingRequiredKeys:_}=a;return s.updateIn(["requestData",o,i,"bodyValue"],(0,ze.fromJS)({}),(s=>_.reduce(((s,o)=>s.setIn([o,"errors"],(0,ze.fromJS)(u))),s)))}return console.warn("unexpected result: SET_REQUEST_BODY_VALIDATE_ERROR"),s},[iI]:(s,{payload:{path:o,method:i}})=>{const a=s.getIn(["requestData",o,i,"bodyValue"]);if(!ze.Map.isMap(a))return s.setIn(["requestData",o,i,"errors"],(0,ze.fromJS)([]));const[...u]=a.keys();return u?s.updateIn(["requestData",o,i,"bodyValue"],(0,ze.fromJS)({}),(s=>u.reduce(((s,o)=>s.setIn([o,"errors"],(0,ze.fromJS)([]))),s))):s},[aI]:(s,{payload:{pathMethod:o}})=>{let[i,a]=o;const u=s.getIn(["requestData",i,a,"bodyValue"]);return u?ze.Map.isMap(u)?s.setIn(["requestData",i,a,"bodyValue"],(0,ze.Map)()):s.setIn(["requestData",i,a,"bodyValue"],""):s}};function oas3({getSystem:s}){const o=(s=>(o,i=null)=>{const{getConfigs:a,fn:u}=s(),{fileUploadMediaTypes:_}=a();if("string"==typeof i&&_.some((s=>i.startsWith(s))))return!0;const w=ze.Map.isMap(o);if(!w&&!as()(o))return!1;const x=w?o.get("format"):o.format;return u.hasSchemaType(o,"string")&&["binary","byte"].includes(x)})(s);return{components:VP,wrapComponents:YP,statePlugins:{spec:{wrapSelectors:Se,selectors:xe},auth:{wrapSelectors:we},oas3:{actions:{...Pe},reducers:xI,selectors:{...Te}}},fn:{isFileUploadIntended:o,isFileUploadIntendedOAS30:o}}}const webhooks=({specSelectors:s,getComponent:o})=>{const i=s.selectWebhooksOperations();if(!i)return null;const a=Object.keys(i),u=o("OperationContainer",!0);return 0===a.length?null:Re.createElement("div",{className:"webhooks"},Re.createElement("h2",null,"Webhooks"),a.map((s=>Re.createElement("div",{key:`${s}-webhook`},i[s].map((o=>Re.createElement(u,{key:`${s}-${o.method}-webhook`,op:o.operation,tag:"webhooks",method:o.method,path:s,specPath:(0,ze.List)(o.specPath),allowTryItOut:!1})))))))},oas31_components_license=({getComponent:s,specSelectors:o})=>{const i=o.selectLicenseNameField(),a=o.selectLicenseUrl(),u=s("Link");return Re.createElement("div",{className:"info__license"},a?Re.createElement("div",{className:"info__license__url"},Re.createElement(u,{target:"_blank",href:sanitizeUrl(a)},i)):Re.createElement("span",null,i))},oas31_components_contact=({getComponent:s,specSelectors:o})=>{const i=o.selectContactNameField(),a=o.selectContactUrl(),u=o.selectContactEmailField(),_=s("Link");return Re.createElement("div",{className:"info__contact"},a&&Re.createElement("div",null,Re.createElement(_,{href:sanitizeUrl(a),target:"_blank"},i," - Website")),u&&Re.createElement(_,{href:sanitizeUrl(`mailto:${u}`)},a?`Send email to ${i}`:`Contact ${i}`))},oas31_components_info=({getComponent:s,specSelectors:o})=>{const i=o.version(),a=o.url(),u=o.basePath(),_=o.host(),w=o.selectInfoSummaryField(),x=o.selectInfoDescriptionField(),C=o.selectInfoTitleField(),j=o.selectInfoTermsOfServiceUrl(),L=o.selectExternalDocsUrl(),B=o.selectExternalDocsDescriptionField(),$=o.contact(),U=o.license(),V=s("Markdown",!0),z=s("Link"),Y=s("VersionStamp"),Z=s("OpenAPIVersion"),ee=s("InfoUrl"),ie=s("InfoBasePath"),ae=s("License",!0),ce=s("Contact",!0),le=s("JsonSchemaDialect",!0);return Re.createElement("div",{className:"info"},Re.createElement("hgroup",{className:"main"},Re.createElement("h1",{className:"title"},C,Re.createElement("span",null,i&&Re.createElement(Y,{version:i}),Re.createElement(Z,{oasVersion:"3.1"}))),(_||u)&&Re.createElement(ie,{host:_,basePath:u}),a&&Re.createElement(ee,{getComponent:s,url:a})),w&&Re.createElement("p",{className:"info__summary"},w),Re.createElement("div",{className:"info__description description"},Re.createElement(V,{source:x})),j&&Re.createElement("div",{className:"info__tos"},Re.createElement(z,{target:"_blank",href:sanitizeUrl(j)},"Terms of service")),$.size>0&&Re.createElement(ce,null),U.size>0&&Re.createElement(ae,null),L&&Re.createElement(z,{className:"info__extdocs",target:"_blank",href:sanitizeUrl(L)},B||L),Re.createElement(le,null))},json_schema_dialect=({getComponent:s,specSelectors:o})=>{const i=o.selectJsonSchemaDialectField(),a=o.selectJsonSchemaDialectDefault(),u=s("Link");return Re.createElement(Re.Fragment,null,i&&i===a&&Re.createElement("p",{className:"info__jsonschemadialect"},"JSON Schema dialect:"," ",Re.createElement(u,{target:"_blank",href:sanitizeUrl(i)},i)),i&&i!==a&&Re.createElement("div",{className:"error-wrapper"},Re.createElement("div",{className:"no-margin"},Re.createElement("div",{className:"errors"},Re.createElement("div",{className:"errors-wrapper"},Re.createElement("h4",{className:"center"},"Warning"),Re.createElement("p",{className:"message"},Re.createElement("strong",null,"OpenAPI.jsonSchemaDialect")," field contains a value different from the default value of"," ",Re.createElement(u,{target:"_blank",href:a},a),". Values different from the default one are currently not supported. Please either omit the field or provide it with the default value."))))))},version_pragma_filter=({bypass:s,isSwagger2:o,isOAS3:i,isOAS31:a,alsoShow:u,children:_})=>s?Re.createElement("div",null,_):o&&(i||a)?Re.createElement("div",{className:"version-pragma"},u,Re.createElement("div",{className:"version-pragma__message version-pragma__message--ambiguous"},Re.createElement("div",null,Re.createElement("h3",null,"Unable to render this definition"),Re.createElement("p",null,Re.createElement("code",null,"swagger")," and ",Re.createElement("code",null,"openapi")," fields cannot be present in the same Swagger or OpenAPI definition. Please remove one of the fields."),Re.createElement("p",null,"Supported version fields are ",Re.createElement("code",null,'swagger: "2.0"')," and those that match ",Re.createElement("code",null,"openapi: 3.x.y")," (for example,"," ",Re.createElement("code",null,"openapi: 3.1.0"),").")))):o||i||a?Re.createElement("div",null,_):Re.createElement("div",{className:"version-pragma"},u,Re.createElement("div",{className:"version-pragma__message version-pragma__message--missing"},Re.createElement("div",null,Re.createElement("h3",null,"Unable to render this definition"),Re.createElement("p",null,"The provided definition does not specify a valid version field."),Re.createElement("p",null,"Please indicate a valid Swagger or OpenAPI version field. Supported version fields are ",Re.createElement("code",null,'swagger: "2.0"')," and those that match ",Re.createElement("code",null,"openapi: 3.x.y")," (for example,"," ",Re.createElement("code",null,"openapi: 3.1.0"),").")))),getModelName=s=>"string"==typeof s&&s.includes("#/components/schemas/")?(s=>{const o=s.replace(/~1/g,"/").replace(/~0/g,"~");try{return decodeURIComponent(o)}catch{return o}})(s.replace(/^.*#\/components\/schemas\//,"")):null,kI=(0,Re.forwardRef)((({schema:s,getComponent:o,onToggle:i=()=>{},specPath:a},u)=>{const _=o("JSONSchema202012"),w=getModelName(s.get("$$ref")),x=(0,Re.useCallback)(((s,o)=>{i(w,o)}),[w,i]);return Re.createElement(_,{name:w,schema:s.toJS(),ref:u,onExpand:x,identifier:a.toJS().join("_")})})),OI=kI,models=({specActions:s,specSelectors:o,layoutSelectors:i,layoutActions:a,getComponent:u,getConfigs:_,fn:w})=>{const x=o.selectSchemas(),C=Object.keys(x).length>0,j=["components","schemas"],{docExpansion:L,defaultModelsExpandDepth:B}=_(),$=B>0&&"none"!==L,U=i.isShown(j,$),V=u("Collapse"),z=u("JSONSchema202012"),Y=u("ArrowUpIcon"),Z=u("ArrowDownIcon"),{getTitle:ee}=w.jsonSchema202012.useFn();(0,Re.useEffect)((()=>{const a=Object.entries(x).some((([s])=>i.isShown([...j,s],!1))),u=U&&(B>1||a),_=null!=o.specResolvedSubtree(j);u&&!_&&s.requestResolvedSubtree(j)}),[U,B]);const ie=(0,Re.useCallback)((()=>{a.show(j,!U)}),[U]),ae=(0,Re.useCallback)((s=>{null!==s&&a.readyToScroll(j,s)}),[]),handleJSONSchema202012Ref=s=>o=>{null!==o&&a.readyToScroll([...j,s],o)},handleJSONSchema202012Expand=i=>(u,_)=>{const w=[...j,i];if(_){null!=o.specResolvedSubtree(w)||s.requestResolvedSubtree([...j,i]),a.show(w,!0)}else a.show(w,!1)};return!C||B<0?null:Re.createElement("section",{className:Jn()("models",{"is-open":U}),ref:ae},Re.createElement("h4",null,Re.createElement("button",{"aria-expanded":U,className:"models-control",onClick:ie},Re.createElement("span",null,"Schemas"),U?Re.createElement(Y,null):Re.createElement(Z,null))),Re.createElement(V,{isOpened:U},Object.entries(x).map((([s,o])=>{const i=ee(o,{lookup:"basic"})||s;return Re.createElement(z,{key:s,ref:handleJSONSchema202012Ref(s),schema:o,name:i,onExpand:handleJSONSchema202012Expand(s)})}))))},mutual_tls_auth=({schema:s,getComponent:o,name:i,authSelectors:a})=>{const u=o("JumpToPath",!0),_=a.selectAuthPath(i);return Re.createElement("div",null,Re.createElement("h4",null,i," (mutualTLS) ",Re.createElement(u,{path:_})),Re.createElement("p",null,"Mutual TLS is required by this API/Operation. Certificates are managed via your Operating System and/or your browser."),Re.createElement("p",null,s.get("description")))};class auths_Auths extends Re.Component{constructor(s,o){super(s,o),this.state={}}onAuthChange=s=>{let{name:o}=s;this.setState({[o]:s})};submitAuth=s=>{s.preventDefault();let{authActions:o}=this.props;o.authorizeWithPersistOption(this.state)};logoutClick=s=>{s.preventDefault();let{authActions:o,definitions:i}=this.props,a=i.map(((s,o)=>o)).toArray();this.setState(a.reduce(((s,o)=>(s[o]="",s)),{})),o.logoutWithPersistOption(a)};close=s=>{s.preventDefault();let{authActions:o}=this.props;o.showDefinitions(!1)};render(){let{definitions:s,getComponent:o,authSelectors:i,errSelectors:a}=this.props;const u=o("AuthItem"),_=o("oauth2",!0),w=o("Button"),x=i.authorized(),C=s.filter(((s,o)=>!!x.get(o))),j=s.filter((s=>"oauth2"!==s.get("type")&&"mutualTLS"!==s.get("type"))),L=s.filter((s=>"oauth2"===s.get("type"))),B=s.filter((s=>"mutualTLS"===s.get("type")));return Re.createElement("div",{className:"auth-container"},j.size>0&&Re.createElement("form",{onSubmit:this.submitAuth},j.map(((s,_)=>Re.createElement(u,{key:_,schema:s,name:_,getComponent:o,onAuthChange:this.onAuthChange,authorized:x,errSelectors:a,authSelectors:i}))).toArray(),Re.createElement("div",{className:"auth-btn-wrapper"},j.size===C.size?Re.createElement(w,{className:"btn modal-btn auth",onClick:this.logoutClick,"aria-label":"Remove authorization"},"Logout"):Re.createElement(w,{type:"submit",className:"btn modal-btn auth authorize","aria-label":"Apply credentials"},"Authorize"),Re.createElement(w,{className:"btn modal-btn auth btn-done",onClick:this.close},"Close"))),L.size>0?Re.createElement("div",null,Re.createElement("div",{className:"scope-def"},Re.createElement("p",null,"Scopes are used to grant an application different levels of access to data on behalf of the end user. Each API may declare one or more scopes."),Re.createElement("p",null,"API requires the following scopes. Select which ones you want to grant to Swagger UI.")),s.filter((s=>"oauth2"===s.get("type"))).map(((s,o)=>Re.createElement("div",{key:o},Re.createElement(_,{authorized:x,schema:s,name:o})))).toArray()):null,B.size>0&&Re.createElement("div",null,B.map(((s,_)=>Re.createElement(u,{key:_,schema:s,name:_,getComponent:o,onAuthChange:this.onAuthChange,authorized:x,errSelectors:a,authSelectors:i}))).toArray()))}}const AI=auths_Auths,isOAS31=s=>{const o=s.get("openapi");return"string"==typeof o&&/^3\.1\.(?:[1-9]\d*|0)$/.test(o)},fn_createOnlyOAS31Selector=s=>(o,...i)=>a=>{if(a.getSystem().specSelectors.isOAS31()){const u=s(o,...i);return"function"==typeof u?u(a):u}return null},createOnlyOAS31SelectorWrapper=s=>(o,i)=>(a,...u)=>{if(i.getSystem().specSelectors.isOAS31()){const _=s(a,...u);return"function"==typeof _?_(o,i):_}return o(...u)},fn_createSystemSelector=s=>(o,...i)=>a=>{const u=s(o,a,...i);return"function"==typeof u?u(a):u},createOnlyOAS31ComponentWrapper=s=>(o,i)=>a=>i.specSelectors.isOAS31()?Re.createElement(s,Mn()({},a,{originalComponent:o,getSystem:i.getSystem})):Re.createElement(o,a),wrapOAS31Fn=(s,o)=>{const{fn:i,specSelectors:a}=o;return Object.fromEntries(Object.entries(s).map((([s,o])=>{const u=i[s];return[s,(...s)=>a.isOAS31()?o(...s):"function"==typeof u?u(...s):void 0]})))},CI=createOnlyOAS31ComponentWrapper((({getSystem:s})=>{const o=s().getComponent("OAS31License",!0);return Re.createElement(o,null)})),jI=createOnlyOAS31ComponentWrapper((({getSystem:s})=>{const o=s().getComponent("OAS31Contact",!0);return Re.createElement(o,null)})),PI=createOnlyOAS31ComponentWrapper((({getSystem:s})=>{const o=s().getComponent("OAS31Info",!0);return Re.createElement(o,null)})),getProperties=(s,{includeReadOnly:o,includeWriteOnly:i})=>{if(!s?.properties)return{};const a=Object.entries(s.properties).filter((([,s])=>(!(!0===s?.readOnly)||o)&&(!(!0===s?.writeOnly)||i)));return Object.fromEntries(a)},makeGetSchemaKeywords=s=>{if("function"!=typeof s)return null;const o=s();return()=>[...o,"discriminator","xml","externalDocs","example","$$ref"]},II=createOnlyOAS31ComponentWrapper((({getSystem:s,...o})=>{const i=s(),{getComponent:a,fn:u,getConfigs:_}=i,w=_(),x=a("OAS31Model"),C=a("withJSONSchema202012SystemContext");return II.ModelWithJSONSchemaContext??=C(x,{config:{default$schema:"https://spec.openapis.org/oas/3.1/dialect/base",defaultExpandedLevels:w.defaultModelExpandDepth,includeReadOnly:o.includeReadOnly,includeWriteOnly:o.includeWriteOnly},fn:{getProperties:u.jsonSchema202012.getProperties,isExpandable:u.jsonSchema202012.isExpandable,getSchemaKeywords:makeGetSchemaKeywords(u.jsonSchema202012.getSchemaKeywords)}}),Re.createElement(II.ModelWithJSONSchemaContext,o)})),TI=II,NI=createOnlyOAS31ComponentWrapper((({getSystem:s})=>{const{getComponent:o,fn:i,getConfigs:a}=s(),u=a();if(NI.ModelsWithJSONSchemaContext)return Re.createElement(NI.ModelsWithJSONSchemaContext,null);const _=o("OAS31Models",!0),w=o("withJSONSchema202012SystemContext");return NI.ModelsWithJSONSchemaContext??=w(_,{config:{default$schema:"https://spec.openapis.org/oas/3.1/dialect/base",defaultExpandedLevels:u.defaultModelsExpandDepth-1,includeReadOnly:!0,includeWriteOnly:!0},fn:{getProperties:i.jsonSchema202012.getProperties,isExpandable:i.jsonSchema202012.isExpandable,getSchemaKeywords:makeGetSchemaKeywords(i.jsonSchema202012.getSchemaKeywords)}}),Re.createElement(NI.ModelsWithJSONSchemaContext,null)}));NI.ModelsWithJSONSchemaContext=null;const MI=NI,wrap_components_version_pragma_filter=(s,o)=>s=>{const i=o.specSelectors.isOAS31(),a=o.getComponent("OAS31VersionPragmaFilter");return Re.createElement(a,Mn()({isOAS31:i},s))},RI=createOnlyOAS31ComponentWrapper((({originalComponent:s,...o})=>{const{getComponent:i,schema:a,name:u}=o,_=i("MutualTLSAuth",!0);return"mutualTLS"===a.get("type")?Re.createElement(_,{schema:a,name:u}):Re.createElement(s,o)})),DI=RI,LI=createOnlyOAS31ComponentWrapper((({getSystem:s,...o})=>{const i=s().getComponent("OAS31Auths",!0);return Re.createElement(i,o)})),FI=(0,ze.Map)(),BI=Ut(((s,o)=>o.specSelectors.specJson()),isOAS31),selectors_webhooks=()=>s=>{const o=s.specSelectors.specJson().get("webhooks");return ze.Map.isMap(o)?o:FI},$I=Ut([(s,o)=>o.specSelectors.webhooks(),(s,o)=>o.specSelectors.validOperationMethods(),(s,o)=>o.specSelectors.specResolvedSubtree(["webhooks"])],((s,o)=>s.reduce(((s,i,a)=>{if(!ze.Map.isMap(i))return s;const u=i.entrySeq().filter((([s])=>o.includes(s))).map((([s,o])=>({operation:(0,ze.Map)({operation:o}),method:s,path:a,specPath:["webhooks",a,s]})));return s.concat(u)}),(0,ze.List)()).groupBy((s=>s.path)).map((s=>s.toArray())).toObject())),selectors_license=()=>s=>{const o=s.specSelectors.info().get("license");return ze.Map.isMap(o)?o:FI},selectLicenseNameField=()=>s=>s.specSelectors.license().get("name","License"),selectLicenseUrlField=()=>s=>s.specSelectors.license().get("url"),qI=Ut([(s,o)=>o.specSelectors.url(),(s,o)=>o.oas3Selectors.selectedServer(),(s,o)=>o.specSelectors.selectLicenseUrlField()],((s,o,i)=>{if(i)return safeBuildUrl(i,s,{selectedServer:o})})),selectLicenseIdentifierField=()=>s=>s.specSelectors.license().get("identifier"),selectors_contact=()=>s=>{const o=s.specSelectors.info().get("contact");return ze.Map.isMap(o)?o:FI},selectContactNameField=()=>s=>s.specSelectors.contact().get("name","the developer"),selectContactEmailField=()=>s=>s.specSelectors.contact().get("email"),selectContactUrlField=()=>s=>s.specSelectors.contact().get("url"),UI=Ut([(s,o)=>o.specSelectors.url(),(s,o)=>o.oas3Selectors.selectedServer(),(s,o)=>o.specSelectors.selectContactUrlField()],((s,o,i)=>{if(i)return safeBuildUrl(i,s,{selectedServer:o})})),selectInfoTitleField=()=>s=>s.specSelectors.info().get("title"),selectInfoSummaryField=()=>s=>s.specSelectors.info().get("summary"),selectInfoDescriptionField=()=>s=>s.specSelectors.info().get("description"),selectInfoTermsOfServiceField=()=>s=>s.specSelectors.info().get("termsOfService"),VI=Ut([(s,o)=>o.specSelectors.url(),(s,o)=>o.oas3Selectors.selectedServer(),(s,o)=>o.specSelectors.selectInfoTermsOfServiceField()],((s,o,i)=>{if(i)return safeBuildUrl(i,s,{selectedServer:o})})),selectExternalDocsDescriptionField=()=>s=>s.specSelectors.externalDocs().get("description"),selectExternalDocsUrlField=()=>s=>s.specSelectors.externalDocs().get("url"),zI=Ut([(s,o)=>o.specSelectors.url(),(s,o)=>o.oas3Selectors.selectedServer(),(s,o)=>o.specSelectors.selectExternalDocsUrlField()],((s,o,i)=>{if(i)return safeBuildUrl(i,s,{selectedServer:o})})),selectJsonSchemaDialectField=()=>s=>s.specSelectors.specJson().get("jsonSchemaDialect"),selectJsonSchemaDialectDefault=()=>"https://spec.openapis.org/oas/3.1/dialect/base",WI=Ut(((s,o)=>o.specSelectors.definitions()),((s,o)=>o.specSelectors.specResolvedSubtree(["components","schemas"])),((s,o)=>ze.Map.isMap(s)?ze.Map.isMap(o)?Object.entries(s.toJS()).reduce(((s,[i,a])=>{const u=o.get(i);return s[i]=u?.toJS()||a,s}),{}):s.toJS():{})),wrap_selectors_isOAS3=(s,o)=>(i,...a)=>o.specSelectors.isOAS31()||s(...a),JI=createOnlyOAS31SelectorWrapper((()=>(s,o)=>o.oas31Selectors.selectLicenseUrl())),HI=createOnlyOAS31SelectorWrapper((()=>(s,o)=>{const i=o.specSelectors.securityDefinitions();let a=s();return i?(i.entrySeq().forEach((([s,o])=>{const i=o?.get("type");"mutualTLS"===i&&(a=a.push(new ze.Map({[s]:o})))})),a):a})),KI=Ut([(s,o)=>o.specSelectors.url(),(s,o)=>o.oas3Selectors.selectedServer(),(s,o)=>o.specSelectors.selectLicenseUrlField(),(s,o)=>o.specSelectors.selectLicenseIdentifierField()],((s,o,i,a)=>i?safeBuildUrl(i,s,{selectedServer:o}):a?`https://spdx.org/licenses/${a}.html`:void 0)),keywords_Example=({schema:s,getSystem:o})=>{const{fn:i,getComponent:a}=o(),{hasKeyword:u}=i.jsonSchema202012.useFn(),_=a("JSONSchema202012JSONViewer");return u(s,"example")?Re.createElement(_,{name:"Example",value:s.example,className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--example"}):null},keywords_Xml=({schema:s,getSystem:o})=>{const i=s?.xml||{},{fn:a,getComponent:u,getConfigs:_}=o(),{showExtensions:w}=_(),{useComponent:x,useIsExpanded:C,usePath:j,useLevel:L}=a.jsonSchema202012,{path:B}=j("xml"),{isExpanded:$,setExpanded:U,setCollapsed:V}=C("xml"),[z,Y]=L(),Z=w?getExtensions(i):[],ee=!!(i.name||i.namespace||i.prefix||Z.length>0),ie=x("Accordion"),ae=x("ExpandDeepButton"),ce=u("OpenAPI31Extensions"),le=u("JSONSchema202012PathContext")(),pe=u("JSONSchema202012LevelContext")(),de=(0,Re.useCallback)((()=>{$?V():U()}),[$,U,V]),fe=(0,Re.useCallback)(((s,o)=>{o?U({deep:!0}):V({deep:!0})}),[U,V]);return 0===Object.keys(i).length?null:Re.createElement(le.Provider,{value:B},Re.createElement(pe.Provider,{value:Y},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--xml","data-json-schema-level":z},ee?Re.createElement(Re.Fragment,null,Re.createElement(ie,{expanded:$,onChange:de},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"XML")),Re.createElement(ae,{expanded:$,onClick:fe})):Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"XML"),!0===i.attribute&&Re.createElement("span",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--muted"},"attribute"),!0===i.wrapped&&Re.createElement("span",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--muted"},"wrapped"),Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},"object"),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!$})},$&&Re.createElement(Re.Fragment,null,i.name&&Re.createElement("li",{className:"json-schema-2020-12-property"},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"name"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},i.name))),i.namespace&&Re.createElement("li",{className:"json-schema-2020-12-property"},Re.createElement("div",{className:"json-schema-2020-12-keyword"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"namespace"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},i.namespace))),i.prefix&&Re.createElement("li",{className:"json-schema-2020-12-property"},Re.createElement("div",{className:"json-schema-2020-12-keyword"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"prefix"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},i.prefix)))),Z.length>0&&Re.createElement(ce,{openAPISpecObj:i,openAPIExtensions:Z,getSystem:o})))))},Discriminator_DiscriminatorMapping=({discriminator:s})=>{const o=s?.mapping||{};return 0===Object.keys(o).length?null:Object.entries(o).map((([s,o])=>Re.createElement("div",{key:`${s}-${o}`,className:"json-schema-2020-12-keyword"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},s),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},o))))},keywords_Discriminator_Discriminator=({schema:s,getSystem:o})=>{const i=s?.discriminator||{},{fn:a,getComponent:u,getConfigs:_}=o(),{showExtensions:w}=_(),{useComponent:x,useIsExpanded:C,usePath:j,useLevel:L}=a.jsonSchema202012,B="discriminator",{path:$}=j(B),{isExpanded:U,setExpanded:V,setCollapsed:z}=C(B),[Y,Z]=L(),ee=w?getExtensions(i):[],ie=!!(i.mapping||ee.length>0),ae=x("Accordion"),ce=x("ExpandDeepButton"),le=u("OpenAPI31Extensions"),pe=u("JSONSchema202012PathContext")(),de=u("JSONSchema202012LevelContext")(),fe=(0,Re.useCallback)((()=>{U?z():V()}),[U,V,z]),ye=(0,Re.useCallback)(((s,o)=>{o?V({deep:!0}):z({deep:!0})}),[V,z]);return 0===Object.keys(i).length?null:Re.createElement(pe.Provider,{value:$},Re.createElement(de.Provider,{value:Z},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--discriminator","data-json-schema-level":Y},ie?Re.createElement(Re.Fragment,null,Re.createElement(ae,{expanded:U,onChange:fe},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"Discriminator")),Re.createElement(ce,{expanded:U,onClick:ye})):Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"Discriminator"),i.propertyName&&Re.createElement("span",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--muted"},i.propertyName),Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},"object"),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!U})},U&&Re.createElement("li",{className:"json-schema-2020-12-property"},Re.createElement(Discriminator_DiscriminatorMapping,{discriminator:i})),ee.length>0&&Re.createElement(le,{openAPISpecObj:i,openAPIExtensions:ee,getSystem:o})))))},keywords_OpenAPIExtensions=({openAPISpecObj:s,getSystem:o,openAPIExtensions:i})=>{const{fn:a}=o(),{useComponent:u}=a.jsonSchema202012,_=u("JSONViewer");return i.map((o=>Re.createElement(_,{key:o,name:o,value:s[o],className:"json-schema-2020-12-json-viewer-extension-keyword"})))},keywords_ExternalDocs=({schema:s,getSystem:o})=>{const i=s?.externalDocs||{},{fn:a,getComponent:u,getConfigs:_}=o(),{showExtensions:w}=_(),{useComponent:x,useIsExpanded:C,usePath:j,useLevel:L}=a.jsonSchema202012,B="externalDocs",{path:$}=j(B),{isExpanded:U,setExpanded:V,setCollapsed:z}=C(B),[Y,Z]=L(),ee=w?getExtensions(i):[],ie=!!(i.description||i.url||ee.length>0),ae=x("Accordion"),ce=x("ExpandDeepButton"),le=u("JSONSchema202012KeywordDescription"),pe=u("Link"),de=u("OpenAPI31Extensions"),fe=u("JSONSchema202012PathContext")(),ye=u("JSONSchema202012LevelContext")(),be=(0,Re.useCallback)((()=>{U?z():V()}),[U,V,z]),_e=(0,Re.useCallback)(((s,o)=>{o?V({deep:!0}):z({deep:!0})}),[V,z]);return 0===Object.keys(i).length?null:Re.createElement(fe.Provider,{value:$},Re.createElement(ye.Provider,{value:Z},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--externalDocs","data-json-schema-level":Y},ie?Re.createElement(Re.Fragment,null,Re.createElement(ae,{expanded:U,onChange:be},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"External documentation")),Re.createElement(ce,{expanded:U,onClick:_e})):Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"External documentation"),Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},"object"),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!U})},U&&Re.createElement(Re.Fragment,null,i.description&&Re.createElement("li",{className:"json-schema-2020-12-property"},Re.createElement(le,{schema:i,getSystem:o})),i.url&&Re.createElement("li",{className:"json-schema-2020-12-property"},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"url"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},Re.createElement(pe,{target:"_blank",href:sanitizeUrl(i.url)},i.url))))),ee.length>0&&Re.createElement(de,{openAPISpecObj:i,openAPIExtensions:ee,getSystem:o})))))},keywords_Description=({schema:s,getSystem:o})=>{if(!s?.description)return null;const{getComponent:i}=o(),a=i("Markdown");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--description"},Re.createElement("div",{className:"json-schema-2020-12-core-keyword__value json-schema-2020-12-core-keyword__value--secondary"},Re.createElement(a,{source:s.description})))},GI=createOnlyOAS31ComponentWrapper(keywords_Description),YI=createOnlyOAS31ComponentWrapper((({schema:s,getSystem:o,originalComponent:i})=>{const{getComponent:a}=o(),u=a("JSONSchema202012KeywordDiscriminator"),_=a("JSONSchema202012KeywordXml"),w=a("JSONSchema202012KeywordExample"),x=a("JSONSchema202012KeywordExternalDocs");return Re.createElement(Re.Fragment,null,Re.createElement(i,{schema:s}),Re.createElement(u,{schema:s,getSystem:o}),Re.createElement(_,{schema:s,getSystem:o}),Re.createElement(x,{schema:s,getSystem:o}),Re.createElement(w,{schema:s,getSystem:o}))})),XI=YI,keywords_Properties=({schema:s,getSystem:o})=>{const{fn:i,getComponent:a}=o(),{useComponent:u,usePath:_}=i.jsonSchema202012,{getDependentRequired:w,getProperties:x}=i.jsonSchema202012.useFn(),C=i.jsonSchema202012.useConfig(),j=Array.isArray(s?.required)?s.required:[],{path:L}=_("properties"),B=u("JSONSchema"),$=a("JSONSchema202012PathContext")(),U=x(s,C);return 0===Object.keys(U).length?null:Re.createElement($.Provider,{value:L},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--properties"},Re.createElement("ul",null,Object.entries(U).map((([o,i])=>{const a=j.includes(o),u=w(o,s);return Re.createElement("li",{key:o,className:Jn()("json-schema-2020-12-property",{"json-schema-2020-12-property--required":a})},Re.createElement(B,{name:o,schema:i,dependentRequired:u}))})))))},QI=createOnlyOAS31ComponentWrapper(keywords_Properties);const ZI=function oas31_after_load_afterLoad({fn:s,getSystem:o}){if(s.jsonSchema202012){const i=((s,o)=>{const{fn:i}=o();if("function"!=typeof s)return null;const{hasKeyword:a}=i.jsonSchema202012;return o=>s(o)||a(o,"example")||o?.xml||o?.discriminator||o?.externalDocs})(s.jsonSchema202012.isExpandable,o);Object.assign(this.fn.jsonSchema202012,{isExpandable:i,getProperties})}if("function"==typeof s.sampleFromSchema&&s.jsonSchema202012){const i=wrapOAS31Fn({sampleFromSchema:s.jsonSchema202012.sampleFromSchema,sampleFromSchemaGeneric:s.jsonSchema202012.sampleFromSchemaGeneric,createXMLExample:s.jsonSchema202012.createXMLExample,memoizedSampleFromSchema:s.jsonSchema202012.memoizedSampleFromSchema,memoizedCreateXMLExample:s.jsonSchema202012.memoizedCreateXMLExample,getJsonSampleSchema:s.jsonSchema202012.getJsonSampleSchema,getYamlSampleSchema:s.jsonSchema202012.getYamlSampleSchema,getXmlSampleSchema:s.jsonSchema202012.getXmlSampleSchema,getSampleSchema:s.jsonSchema202012.getSampleSchema,mergeJsonSchema:s.jsonSchema202012.mergeJsonSchema,getSchemaObjectTypeLabel:o=>s.jsonSchema202012.getType(immutableToJS(o)),getSchemaObjectType:o=>s.jsonSchema202012.foldType(immutableToJS(o)?.type)},o());Object.assign(this.fn,i)}const i=(s=>(o,i=null)=>{const{fn:a}=s();if(a.isFileUploadIntendedOAS30(o,i))return!0;const u=ze.Map.isMap(o);if(!u&&!as()(o))return!1;const _=u?o.get("contentMediaType"):o.contentMediaType,w=u?o.get("contentEncoding"):o.contentEncoding;return"string"==typeof _&&""!==_||"string"==typeof w&&""!==w})(o),{isFileUploadIntended:a}=wrapOAS31Fn({isFileUploadIntended:i},o());if(this.fn.isFileUploadIntended=a,this.fn.isFileUploadIntendedOAS31=i,s.jsonSchema202012){const{hasSchemaType:i}=wrapOAS31Fn({hasSchemaType:s.jsonSchema202012.hasSchemaType},o());this.fn.hasSchemaType=i}},oas31=({fn:s})=>{const o=s.createSystemSelector||fn_createSystemSelector,i=s.createOnlyOAS31Selector||fn_createOnlyOAS31Selector;return{afterLoad:ZI,fn:{isOAS31,createSystemSelector:fn_createSystemSelector,createOnlyOAS31Selector:fn_createOnlyOAS31Selector},components:{Webhooks:webhooks,JsonSchemaDialect:json_schema_dialect,MutualTLSAuth:mutual_tls_auth,OAS31Info:oas31_components_info,OAS31License:oas31_components_license,OAS31Contact:oas31_components_contact,OAS31VersionPragmaFilter:version_pragma_filter,OAS31Model:OI,OAS31Models:models,OAS31Auths:AI,JSONSchema202012KeywordExample:keywords_Example,JSONSchema202012KeywordXml:keywords_Xml,JSONSchema202012KeywordDiscriminator:keywords_Discriminator_Discriminator,JSONSchema202012KeywordExternalDocs:keywords_ExternalDocs,OpenAPI31Extensions:keywords_OpenAPIExtensions},wrapComponents:{InfoContainer:PI,License:CI,Contact:jI,VersionPragmaFilter:wrap_components_version_pragma_filter,Model:TI,Models:MI,AuthItem:DI,auths:LI,JSONSchema202012KeywordDescription:GI,JSONSchema202012KeywordExamples:XI,JSONSchema202012KeywordProperties:QI},statePlugins:{auth:{wrapSelectors:{definitionsToAuthorize:HI}},spec:{selectors:{isOAS31:o(BI),license:selectors_license,selectLicenseNameField,selectLicenseUrlField,selectLicenseIdentifierField:i(selectLicenseIdentifierField),selectLicenseUrl:o(qI),contact:selectors_contact,selectContactNameField,selectContactEmailField,selectContactUrlField,selectContactUrl:o(UI),selectInfoTitleField,selectInfoSummaryField:i(selectInfoSummaryField),selectInfoDescriptionField,selectInfoTermsOfServiceField,selectInfoTermsOfServiceUrl:o(VI),selectExternalDocsDescriptionField,selectExternalDocsUrlField,selectExternalDocsUrl:o(zI),webhooks:i(selectors_webhooks),selectWebhooksOperations:i(o($I)),selectJsonSchemaDialectField,selectJsonSchemaDialectDefault,selectSchemas:o(WI)},wrapSelectors:{isOAS3:wrap_selectors_isOAS3,selectLicenseUrl:JI}},oas31:{selectors:{selectLicenseUrl:i(o(KI))}}}}},eT=es().object,tT=es().bool,rT=(es().oneOfType([eT,tT]),(0,Re.createContext)(null));rT.displayName="JSONSchemaContext";const nT=(0,Re.createContext)(0);nT.displayName="JSONSchemaLevelContext";const sT=(0,Re.createContext)(new Set),oT=(0,Re.createContext)([]);class JSONSchemaIsExpandedState{static Collapsed="collapsed";static Expanded="expanded";static DeeplyExpanded="deeply-expanded"}const useConfig=()=>{const{config:s}=(0,Re.useContext)(rT);return s},useComponent=s=>{const{components:o}=(0,Re.useContext)(rT);return o[s]||null},useFn=(s=void 0)=>{const{fn:o}=(0,Re.useContext)(rT);return void 0!==s?o[s]:o},useJSONSchemaContextState=()=>{const[,s]=(0,Re.useState)(null),{state:o}=(0,Re.useContext)(rT);return{state:o,setState:i=>{i(o),s({})}}},useLevel=()=>{const s=(0,Re.useContext)(nT);return[s,s+1]},usePath=s=>{const o=(0,Re.useContext)(oT),{setState:i}=useJSONSchemaContextState(),a="string"==typeof s?[...o,s]:o;return{path:a,pathMutator:(s,o={deep:!1})=>{const u=a.toString(),updateFn=o=>{o.paths[u]=s,s===JSONSchemaIsExpandedState.Collapsed&&Object.keys(o.paths).forEach((s=>{s.startsWith(u)&&o.paths[s]===JSONSchemaIsExpandedState.DeeplyExpanded&&(o.paths[s]=JSONSchemaIsExpandedState.Expanded)}))},updateDeepFn=o=>{Object.keys(o.paths).forEach((i=>{i.startsWith(u)&&(o.paths[i]=s)}))};o.deep?i(updateDeepFn):i(updateFn)}}},useIsExpanded=s=>{const[o]=useLevel(),{defaultExpandedLevels:i}=useConfig(),{path:a,pathMutator:u}=usePath(s),{path:_}=usePath(),{state:w}=useJSONSchemaContextState(),x=w.paths[a.toString()],C=w.paths[_.toString()]??w.paths[_.slice(0,-1).toString()],j=x??(i-o>0?JSONSchemaIsExpandedState.Expanded:JSONSchemaIsExpandedState.Collapsed),L=j!==JSONSchemaIsExpandedState.Collapsed;(0,Re.useEffect)((()=>{u(C===JSONSchemaIsExpandedState.DeeplyExpanded?JSONSchemaIsExpandedState.DeeplyExpanded:j)}),[C]);return{isExpanded:L,setExpanded:(0,Re.useCallback)(((s={deep:!1})=>{u(s.deep?JSONSchemaIsExpandedState.DeeplyExpanded:JSONSchemaIsExpandedState.Expanded)}),[]),setCollapsed:(0,Re.useCallback)(((s={deep:!1})=>{u(JSONSchemaIsExpandedState.Collapsed,s)}),[])}},useRenderedSchemas=(s=void 0)=>{if(void 0===s)return(0,Re.useContext)(sT);const o=(0,Re.useContext)(sT);return new Set([...o,s])},iT=(0,Re.forwardRef)((({schema:s,name:o="",dependentRequired:i=[],onExpand:a=()=>{},identifier:u=""},_)=>{const w=useFn(),x=u||s?.$id||o,{path:C}=usePath(x),{isExpanded:j,setExpanded:L,setCollapsed:B}=useIsExpanded(x),[$,U]=useLevel(),V=(()=>{const[s]=useLevel();return s>0})(),z=w.isExpandable(s)||i.length>0,Y=(s=>useRenderedSchemas().has(s))(s),Z=useRenderedSchemas(s),ee=w.stringifyConstraints(s),ie=useComponent("Accordion"),ae=useComponent("Keyword$schema"),ce=useComponent("Keyword$vocabulary"),le=useComponent("Keyword$id"),pe=useComponent("Keyword$anchor"),de=useComponent("Keyword$dynamicAnchor"),fe=useComponent("Keyword$ref"),ye=useComponent("Keyword$dynamicRef"),be=useComponent("Keyword$defs"),_e=useComponent("Keyword$comment"),Se=useComponent("KeywordAllOf"),we=useComponent("KeywordAnyOf"),xe=useComponent("KeywordOneOf"),Pe=useComponent("KeywordNot"),Te=useComponent("KeywordIf"),$e=useComponent("KeywordThen"),qe=useComponent("KeywordElse"),ze=useComponent("KeywordDependentSchemas"),We=useComponent("KeywordPrefixItems"),He=useComponent("KeywordItems"),Ye=useComponent("KeywordContains"),Xe=useComponent("KeywordProperties"),Qe=useComponent("KeywordPatternProperties"),et=useComponent("KeywordAdditionalProperties"),tt=useComponent("KeywordPropertyNames"),rt=useComponent("KeywordUnevaluatedItems"),nt=useComponent("KeywordUnevaluatedProperties"),st=useComponent("KeywordType"),ot=useComponent("KeywordEnum"),it=useComponent("KeywordConst"),at=useComponent("KeywordConstraint"),ct=useComponent("KeywordDependentRequired"),lt=useComponent("KeywordContentSchema"),ut=useComponent("KeywordTitle"),pt=useComponent("KeywordDescription"),ht=useComponent("KeywordDefault"),dt=useComponent("KeywordDeprecated"),mt=useComponent("KeywordReadOnly"),gt=useComponent("KeywordWriteOnly"),yt=useComponent("KeywordExamples"),vt=useComponent("ExtensionKeywords"),bt=useComponent("ExpandDeepButton"),_t=(0,Re.useCallback)(((s,o)=>{o?L():B(),a(s,o,!1)}),[a,L,B]),St=(0,Re.useCallback)(((s,o)=>{o?L({deep:!0}):B({deep:!0}),a(s,o,!0)}),[a,L,B]);return Re.createElement(oT.Provider,{value:C},Re.createElement(nT.Provider,{value:U},Re.createElement(sT.Provider,{value:Z},Re.createElement("article",{ref:_,"data-json-schema-level":$,className:Jn()("json-schema-2020-12",{"json-schema-2020-12--embedded":V,"json-schema-2020-12--circular":Y})},Re.createElement("div",{className:"json-schema-2020-12-head"},z&&!Y?Re.createElement(Re.Fragment,null,Re.createElement(ie,{expanded:j,onChange:_t},Re.createElement(ut,{title:o,schema:s})),Re.createElement(bt,{expanded:j,onClick:St})):Re.createElement(ut,{title:o,schema:s}),Re.createElement(dt,{schema:s}),Re.createElement(mt,{schema:s}),Re.createElement(gt,{schema:s}),Re.createElement(st,{schema:s,isCircular:Y}),ee.length>0&&ee.map((s=>Re.createElement(at,{key:`${s.scope}-${s.value}`,constraint:s})))),Re.createElement("div",{className:Jn()("json-schema-2020-12-body",{"json-schema-2020-12-body--collapsed":!j})},j&&Re.createElement(Re.Fragment,null,Re.createElement(pt,{schema:s}),!Y&&z&&Re.createElement(Re.Fragment,null,Re.createElement(Xe,{schema:s}),Re.createElement(Qe,{schema:s}),Re.createElement(et,{schema:s}),Re.createElement(nt,{schema:s}),Re.createElement(tt,{schema:s}),Re.createElement(Se,{schema:s}),Re.createElement(we,{schema:s}),Re.createElement(xe,{schema:s}),Re.createElement(Pe,{schema:s}),Re.createElement(Te,{schema:s}),Re.createElement($e,{schema:s}),Re.createElement(qe,{schema:s}),Re.createElement(ze,{schema:s}),Re.createElement(We,{schema:s}),Re.createElement(He,{schema:s}),Re.createElement(rt,{schema:s}),Re.createElement(Ye,{schema:s}),Re.createElement(lt,{schema:s})),Re.createElement(ot,{schema:s}),Re.createElement(it,{schema:s}),Re.createElement(ct,{schema:s,dependentRequired:i}),Re.createElement(ht,{schema:s}),Re.createElement(yt,{schema:s}),Re.createElement(ae,{schema:s}),Re.createElement(ce,{schema:s}),Re.createElement(le,{schema:s}),Re.createElement(pe,{schema:s}),Re.createElement(de,{schema:s}),Re.createElement(fe,{schema:s}),!Y&&z&&Re.createElement(be,{schema:s}),Re.createElement(ye,{schema:s}),Re.createElement(_e,{schema:s}),Re.createElement(vt,{schema:s})))))))})),aT=iT,keywords_$schema=({schema:s})=>s?.$schema?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$schema"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$schema"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},s.$schema)):null,$vocabulary_$vocabulary=({schema:s})=>{const o="$vocabulary",{path:i}=usePath(o),{isExpanded:a,setExpanded:u,setCollapsed:_}=useIsExpanded(o),w=useComponent("Accordion"),x=(0,Re.useCallback)((()=>{a?_():u()}),[a,u,_]);return s?.$vocabulary?"object"!=typeof s.$vocabulary?null:Re.createElement(oT.Provider,{value:i},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$vocabulary"},Re.createElement(w,{expanded:a,onChange:x},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$vocabulary")),Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},"object"),Re.createElement("ul",null,a&&Object.entries(s.$vocabulary).map((([s,o])=>Re.createElement("li",{key:s,className:Jn()("json-schema-2020-12-$vocabulary-uri",{"json-schema-2020-12-$vocabulary-uri--disabled":!o})},Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},s))))))):null},keywords_$id=({schema:s})=>s?.$id?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$id"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$id"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},s.$id)):null,keywords_$anchor=({schema:s})=>s?.$anchor?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$anchor"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$anchor"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},s.$anchor)):null,keywords_$dynamicAnchor=({schema:s})=>s?.$dynamicAnchor?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$dynamicAnchor"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$dynamicAnchor"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},s.$dynamicAnchor)):null,keywords_$ref=({schema:s})=>s?.$ref?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$ref"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$ref"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},s.$ref)):null,keywords_$dynamicRef=({schema:s})=>s?.$dynamicRef?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$dynamicRef"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$dynamicRef"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},s.$dynamicRef)):null,keywords_$defs=({schema:s})=>{const o=s?.$defs||{},i="$defs",{path:a}=usePath(i),{isExpanded:u,setExpanded:_,setCollapsed:w}=useIsExpanded(i),[x,C]=useLevel(),j=useComponent("Accordion"),L=useComponent("ExpandDeepButton"),B=useComponent("JSONSchema"),$=(0,Re.useCallback)((()=>{u?w():_()}),[u,_,w]),U=(0,Re.useCallback)(((s,o)=>{o?_({deep:!0}):w({deep:!0})}),[_,w]);return 0===Object.keys(o).length?null:Re.createElement(oT.Provider,{value:a},Re.createElement(nT.Provider,{value:C},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$defs","data-json-schema-level":x},Re.createElement(j,{expanded:u,onChange:$},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$defs")),Re.createElement(L,{expanded:u,onClick:U}),Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},"object"),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!u})},u&&Re.createElement(Re.Fragment,null,Object.entries(o).map((([s,o])=>Re.createElement("li",{key:s,className:"json-schema-2020-12-property"},Re.createElement(B,{name:s,schema:o})))))))))},keywords_$comment=({schema:s})=>s?.$comment?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--$comment"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--secondary"},"$comment"),Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--secondary"},s.$comment)):null,keywords_AllOf=({schema:s})=>{const o=s?.allOf||[],i=useFn(),a="allOf",{path:u}=usePath(a),{isExpanded:_,setExpanded:w,setCollapsed:x}=useIsExpanded(a),[C,j]=useLevel(),L=useComponent("Accordion"),B=useComponent("ExpandDeepButton"),$=useComponent("JSONSchema"),U=useComponent("KeywordType"),V=(0,Re.useCallback)((()=>{_?x():w()}),[_,w,x]),z=(0,Re.useCallback)(((s,o)=>{o?w({deep:!0}):x({deep:!0})}),[w,x]);return Array.isArray(o)&&0!==o.length?Re.createElement(oT.Provider,{value:u},Re.createElement(nT.Provider,{value:j},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--allOf","data-json-schema-level":C},Re.createElement(L,{expanded:_,onChange:V},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"All of")),Re.createElement(B,{expanded:_,onClick:z}),Re.createElement(U,{schema:{allOf:o}}),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!_})},_&&Re.createElement(Re.Fragment,null,o.map(((s,o)=>Re.createElement("li",{key:`#${o}`,className:"json-schema-2020-12-property"},Re.createElement($,{name:`#${o} ${i.getTitle(s)}`,schema:s}))))))))):null},keywords_AnyOf=({schema:s})=>{const o=s?.anyOf||[],i=useFn(),a="anyOf",{path:u}=usePath(a),{isExpanded:_,setExpanded:w,setCollapsed:x}=useIsExpanded(a),[C,j]=useLevel(),L=useComponent("Accordion"),B=useComponent("ExpandDeepButton"),$=useComponent("JSONSchema"),U=useComponent("KeywordType"),V=(0,Re.useCallback)((()=>{_?x():w()}),[_,w,x]),z=(0,Re.useCallback)(((s,o)=>{o?w({deep:!0}):x({deep:!0})}),[w,x]);return Array.isArray(o)&&0!==o.length?Re.createElement(oT.Provider,{value:u},Re.createElement(nT.Provider,{value:j},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--anyOf","data-json-schema-level":C},Re.createElement(L,{expanded:_,onChange:V},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Any of")),Re.createElement(B,{expanded:_,onClick:z}),Re.createElement(U,{schema:{anyOf:o}}),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!_})},_&&Re.createElement(Re.Fragment,null,o.map(((s,o)=>Re.createElement("li",{key:`#${o}`,className:"json-schema-2020-12-property"},Re.createElement($,{name:`#${o} ${i.getTitle(s)}`,schema:s}))))))))):null},keywords_OneOf=({schema:s})=>{const o=s?.oneOf||[],i=useFn(),a="oneOf",{path:u}=usePath(a),{isExpanded:_,setExpanded:w,setCollapsed:x}=useIsExpanded(a),[C,j]=useLevel(),L=useComponent("Accordion"),B=useComponent("ExpandDeepButton"),$=useComponent("JSONSchema"),U=useComponent("KeywordType"),V=(0,Re.useCallback)((()=>{_?x():w()}),[_,w,x]),z=(0,Re.useCallback)(((s,o)=>{o?w({deep:!0}):x({deep:!0})}),[w,x]);return Array.isArray(o)&&0!==o.length?Re.createElement(oT.Provider,{value:u},Re.createElement(nT.Provider,{value:j},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--oneOf","data-json-schema-level":C},Re.createElement(L,{expanded:_,onChange:V},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"One of")),Re.createElement(B,{expanded:_,onClick:z}),Re.createElement(U,{schema:{oneOf:o}}),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!_})},_&&Re.createElement(Re.Fragment,null,o.map(((s,o)=>Re.createElement("li",{key:`#${o}`,className:"json-schema-2020-12-property"},Re.createElement($,{name:`#${o} ${i.getTitle(s)}`,schema:s}))))))))):null},keywords_Not=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"not"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Not");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--not"},Re.createElement(i,{name:a,schema:s.not,identifier:"not"}))},keywords_If=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"if"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"If");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--if"},Re.createElement(i,{name:a,schema:s.if,identifier:"if"}))},keywords_Then=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"then"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Then");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--then"},Re.createElement(i,{name:a,schema:s.then,identifier:"then"}))},keywords_Else=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"else"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Else");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--if"},Re.createElement(i,{name:a,schema:s.else,identifier:"else"}))},keywords_DependentSchemas=({schema:s})=>{const o=s?.dependentSchemas||[],i="dependentSchemas",{path:a}=usePath(i),{isExpanded:u,setExpanded:_,setCollapsed:w}=useIsExpanded(i),[x,C]=useLevel(),j=useComponent("Accordion"),L=useComponent("ExpandDeepButton"),B=useComponent("JSONSchema"),$=(0,Re.useCallback)((()=>{u?w():_()}),[u,_,w]),U=(0,Re.useCallback)(((s,o)=>{o?_({deep:!0}):w({deep:!0})}),[_,w]);return"object"!=typeof o||0===Object.keys(o).length?null:Re.createElement(oT.Provider,{value:a},Re.createElement(nT.Provider,{value:C},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--dependentSchemas","data-json-schema-level":x},Re.createElement(j,{expanded:u,onChange:$},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Dependent schemas")),Re.createElement(L,{expanded:u,onClick:U}),Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},"object"),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!u})},u&&Re.createElement(Re.Fragment,null,Object.entries(o).map((([s,o])=>Re.createElement("li",{key:s,className:"json-schema-2020-12-property"},Re.createElement(B,{name:s,schema:o})))))))))},keywords_PrefixItems=({schema:s})=>{const o=s?.prefixItems||[],i=useFn(),a="prefixItems",{path:u}=usePath(a),{isExpanded:_,setExpanded:w,setCollapsed:x}=useIsExpanded(a),[C,j]=useLevel(),L=useComponent("Accordion"),B=useComponent("ExpandDeepButton"),$=useComponent("JSONSchema"),U=useComponent("KeywordType"),V=(0,Re.useCallback)((()=>{_?x():w()}),[_,w,x]),z=(0,Re.useCallback)(((s,o)=>{o?w({deep:!0}):x({deep:!0})}),[w,x]);return Array.isArray(o)&&0!==o.length?Re.createElement(oT.Provider,{value:u},Re.createElement(nT.Provider,{value:j},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--prefixItems","data-json-schema-level":C},Re.createElement(L,{expanded:_,onChange:V},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Prefix items")),Re.createElement(B,{expanded:_,onClick:z}),Re.createElement(U,{schema:{prefixItems:o}}),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!_})},_&&Re.createElement(Re.Fragment,null,o.map(((s,o)=>Re.createElement("li",{key:`#${o}`,className:"json-schema-2020-12-property"},Re.createElement($,{name:`#${o} ${i.getTitle(s)}`,schema:s}))))))))):null},keywords_Items=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"items"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Items");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--items"},Re.createElement(i,{name:a,schema:s.items,identifier:"items"}))},keywords_Contains=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"contains"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Contains");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--contains"},Re.createElement(i,{name:a,schema:s.contains,identifier:"contains"}))},keywords_Properties_Properties=({schema:s})=>{const o=useFn(),i=s?.properties||{},a=Array.isArray(s?.required)?s.required:[],u=useComponent("JSONSchema"),{path:_}=usePath("properties");return 0===Object.keys(i).length?null:Re.createElement(oT.Provider,{value:_},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--properties"},Re.createElement("ul",null,Object.entries(i).map((([i,_])=>{const w=a.includes(i),x=o.getDependentRequired(i,s);return Re.createElement("li",{key:i,className:Jn()("json-schema-2020-12-property",{"json-schema-2020-12-property--required":w})},Re.createElement(u,{name:i,schema:_,dependentRequired:x}))})))))},PatternProperties_PatternProperties=({schema:s})=>{const o=s?.patternProperties||{},i=useComponent("JSONSchema"),{path:a}=usePath("patternProperties");return 0===Object.keys(o).length?null:Re.createElement(oT.Provider,{value:a},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--patternProperties"},Re.createElement("ul",null,Object.entries(o).map((([s,o])=>Re.createElement("li",{key:s,className:"json-schema-2020-12-property"},Re.createElement(i,{name:s,schema:o})))))))},keywords_AdditionalProperties=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"additionalProperties"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Additional properties");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--additionalProperties"},!0===s.additionalProperties?Re.createElement(Re.Fragment,null,a,Re.createElement("span",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},"allowed")):!1===s.additionalProperties?Re.createElement(Re.Fragment,null,a,Re.createElement("span",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},"forbidden")):Re.createElement(i,{name:a,schema:s.additionalProperties,identifier:"additionalProperties"}))},keywords_PropertyNames=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema"),a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Property names");return o.hasKeyword(s,"propertyNames")?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--propertyNames"},Re.createElement(i,{name:a,schema:s.propertyNames,identifier:"propertyNames"})):null},keywords_UnevaluatedItems=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"unevaluatedItems"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Unevaluated items");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--unevaluatedItems"},Re.createElement(i,{name:a,schema:s.unevaluatedItems,identifier:"unevaluatedItems"}))},keywords_UnevaluatedProperties=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"unevaluatedProperties"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Unevaluated properties");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--unevaluatedProperties"},Re.createElement(i,{name:a,schema:s.unevaluatedProperties,identifier:"unevaluatedProperties"}))},keywords_Type=({schema:s,isCircular:o=!1})=>{const i=useFn().getType(s),a=o?" [circular]":"";return Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},`${i}${a}`)},Enum_Enum=({schema:s})=>{const o=useComponent("JSONViewer");return Array.isArray(s?.enum)?Re.createElement(o,{name:"Enum",value:s.enum,className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--enum"}):null},Const_Const=({schema:s})=>{const o=useFn(),i=useComponent("JSONViewer");return o.hasKeyword(s,"const")?Re.createElement(i,{name:"Const",value:s.const,className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--const"}):null},fn_upperFirst=s=>"string"==typeof s?`${s.charAt(0).toUpperCase()}${s.slice(1)}`:s,makeGetTitle=s=>(o,{lookup:i="extended"}={})=>{const a=s();if(null!=o?.title)return a.upperFirst(String(o.title));if("extended"===i){if(null!=o?.$anchor)return a.upperFirst(String(o.$anchor));if(null!=o?.$id)return String(o.$id)}return""},makeGetType=s=>{const getType=(o,i=new WeakSet)=>{const a=s();if(null==o)return"any";if(a.isBooleanJSONSchema(o))return o?"any":"never";if("object"!=typeof o)return"any";if(i.has(o))return"any";i.add(o);const{type:u,prefixItems:_,items:w}=o,getArrayType=()=>{if(Array.isArray(_)){const s=_.map((s=>getType(s,i))),o=w?getType(w,i):"any";return`array<[${s.join(", ")}], ${o}>`}if(w){return`array<${getType(w,i)}>`}return"array<any>"};if(o.not&&"any"===getType(o.not))return"never";const handleCombiningKeywords=(s,a)=>{if(Array.isArray(o[s])){return`(${o[s].map((s=>getType(s,i))).join(a)})`}return null},x=[Array.isArray(u)?u.map((s=>"array"===s?getArrayType():s)).join(" | "):"array"===u?getArrayType():["null","boolean","object","array","number","integer","string"].includes(u)?u:(()=>{if(Object.hasOwn(o,"prefixItems")||Object.hasOwn(o,"items")||Object.hasOwn(o,"contains"))return getArrayType();if(Object.hasOwn(o,"properties")||Object.hasOwn(o,"additionalProperties")||Object.hasOwn(o,"patternProperties"))return"object";if(["int32","int64"].includes(o.format))return"integer";if(["float","double"].includes(o.format))return"number";if(Object.hasOwn(o,"minimum")||Object.hasOwn(o,"maximum")||Object.hasOwn(o,"exclusiveMinimum")||Object.hasOwn(o,"exclusiveMaximum")||Object.hasOwn(o,"multipleOf"))return"number | integer";if(Object.hasOwn(o,"pattern")||Object.hasOwn(o,"format")||Object.hasOwn(o,"minLength")||Object.hasOwn(o,"maxLength")||Object.hasOwn(o,"contentEncoding")||Object.hasOwn(o,"contentMediaType"))return"string";if(void 0!==o.const){if(null===o.const)return"null";if("boolean"==typeof o.const)return"boolean";if("number"==typeof o.const)return Number.isInteger(o.const)?"integer":"number";if("string"==typeof o.const)return"string";if(Array.isArray(o.const))return"array<any>";if("object"==typeof o.const)return"object"}return null})(),handleCombiningKeywords("oneOf"," | "),handleCombiningKeywords("anyOf"," | "),handleCombiningKeywords("allOf"," & ")].filter(Boolean).join(" | ");return i.delete(o),x||"any"};return getType},isBooleanJSONSchema=s=>"boolean"==typeof s,hasKeyword=(s,o)=>null!==s&&"object"==typeof s&&Object.hasOwn(s,o),fn_makeIsExpandable=s=>o=>{const i=s();return o?.$schema||o?.$vocabulary||o?.$id||o?.$anchor||o?.$dynamicAnchor||o?.$ref||o?.$dynamicRef||o?.$defs||o?.$comment||o?.allOf||o?.anyOf||o?.oneOf||i.hasKeyword(o,"not")||i.hasKeyword(o,"if")||i.hasKeyword(o,"then")||i.hasKeyword(o,"else")||o?.dependentSchemas||o?.prefixItems||i.hasKeyword(o,"items")||i.hasKeyword(o,"contains")||o?.properties||o?.patternProperties||i.hasKeyword(o,"additionalProperties")||i.hasKeyword(o,"propertyNames")||i.hasKeyword(o,"unevaluatedItems")||i.hasKeyword(o,"unevaluatedProperties")||o?.description||o?.enum||i.hasKeyword(o,"const")||i.hasKeyword(o,"contentSchema")||i.hasKeyword(o,"default")||o?.examples||i.getExtensionKeywords(o).length>0},fn_stringify=s=>null===s||["number","bigint","boolean"].includes(typeof s)?String(s):Array.isArray(s)?`[${s.map(fn_stringify).join(", ")}]`:JSON.stringify(s),stringifyConstraintRange=(s,o,i)=>{const a="number"==typeof o,u="number"==typeof i;return a&&u?o===i?`${o} ${s}`:`[${o}, ${i}] ${s}`:a?`≥ ${o} ${s}`:u?`≤ ${i} ${s}`:null},stringifyConstraints=s=>{const o=[],i=(s=>{if("number"!=typeof s?.multipleOf)return null;if(s.multipleOf<=0)return null;if(1===s.multipleOf)return null;const{multipleOf:o}=s;if(Number.isInteger(o))return`multiple of ${o}`;const i=10**o.toString().split(".")[1].length;return`multiple of ${o*i}/${i}`})(s);null!==i&&o.push({scope:"number",value:i});const a=(s=>{const o=s?.minimum,i=s?.maximum,a=s?.exclusiveMinimum,u=s?.exclusiveMaximum,_="number"==typeof o,w="number"==typeof i,x="number"==typeof a,C="number"==typeof u,j=x&&(!_||o<a),L=C&&(!w||i>u);if((_||x)&&(w||C))return`${j?"(":"["}${j?a:o}, ${L?u:i}${L?")":"]"}`;if(_||x)return`${j?">":"≥"} ${j?a:o}`;if(w||C)return`${L?"<":"≤"} ${L?u:i}`;return null})(s);null!==a&&o.push({scope:"number",value:a}),s?.format&&o.push({scope:"string",value:s.format});const u=stringifyConstraintRange("characters",s?.minLength,s?.maxLength);null!==u&&o.push({scope:"string",value:u}),s?.pattern&&o.push({scope:"string",value:`matches ${s?.pattern}`}),s?.contentMediaType&&o.push({scope:"string",value:`media type: ${s.contentMediaType}`}),s?.contentEncoding&&o.push({scope:"string",value:`encoding: ${s.contentEncoding}`});const _=stringifyConstraintRange(s?.uniqueItems?"unique items":"items",s?.minItems,s?.maxItems);null!==_&&o.push({scope:"array",value:_}),s?.uniqueItems&&!_&&o.push({scope:"array",value:"unique"});const w=stringifyConstraintRange("contained items",s?.minContains,s?.maxContains);null!==w&&o.push({scope:"array",value:w});const x=stringifyConstraintRange("properties",s?.minProperties,s?.maxProperties);return null!==x&&o.push({scope:"object",value:x}),o},getDependentRequired=(s,o)=>o?.dependentRequired?Array.from(Object.entries(o.dependentRequired).reduce(((o,[i,a])=>Array.isArray(a)&&a.includes(s)?(o.add(i),o):o),new Set)):[],fn_isPlainObject=s=>"object"==typeof s&&null!==s&&!Array.isArray(s)&&(null===Object.getPrototypeOf(s)||Object.getPrototypeOf(s)===Object.prototype),getSchemaKeywords=()=>["$schema","$vocabulary","$id","$anchor","$dynamicAnchor","$dynamicRef","$ref","$defs","$comment","allOf","anyOf","oneOf","not","if","then","else","dependentSchemas","prefixItems","items","contains","properties","patternProperties","additionalProperties","propertyNames","unevaluatedItems","unevaluatedProperties","type","enum","const","multipleOf","maximum","exclusiveMaximum","minimum","exclusiveMinimum","maxLength","minLength","pattern","maxItems","minItems","uniqueItems","maxContains","minContains","maxProperties","minProperties","required","dependentRequired","title","description","default","deprecated","readOnly","writeOnly","examples","format","contentEncoding","contentMediaType","contentSchema"],makeGetExtensionKeywords=s=>o=>{const i=s().getSchemaKeywords();return fn_isPlainObject(o)?((s,o)=>{const i=new Set(o);return s.filter((s=>!i.has(s)))})(Object.keys(o),i):[]},fn_hasSchemaType=(s,o)=>{const i=ze.Map.isMap(s);if(!i&&!fn_isPlainObject(s))return!1;const hasType=s=>o===s||Array.isArray(o)&&o.includes(s),a=i?s.get("type"):s.type;return ze.List.isList(a)||Array.isArray(a)?a.some((s=>hasType(s))):hasType(a)},Constraint=({constraint:s})=>fn_isPlainObject(s)&&"string"==typeof s.scope&&"string"==typeof s.value?Re.createElement("span",{className:`json-schema-2020-12__constraint json-schema-2020-12__constraint--${s.scope}`},s.value):null,cT=Re.memo(Constraint),DependentRequired_DependentRequired=({dependentRequired:s})=>Array.isArray(s)&&0!==s.length?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--dependentRequired"},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Required when defined"),Re.createElement("ul",null,s.map((s=>Re.createElement("li",{key:s},Re.createElement("span",{className:"json-schema-2020-12-keyword__value json-schema-2020-12-keyword__value--warning"},s)))))):null,keywords_ContentSchema=({schema:s})=>{const o=useFn(),i=useComponent("JSONSchema");if(!o.hasKeyword(s,"contentSchema"))return null;const a=Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--primary"},"Content schema");return Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--contentSchema"},Re.createElement(i,{name:a,schema:s.contentSchema,identifier:"contentSchema"}))},Title_Title=({title:s="",schema:o})=>{const i=useFn(),a=s||i.getTitle(o);return a?Re.createElement("div",{className:"json-schema-2020-12__title"},a):null},keywords_Description_Description=({schema:s})=>s?.description?Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--description"},Re.createElement("div",{className:"json-schema-2020-12-core-keyword__value json-schema-2020-12-core-keyword__value--secondary"},s.description)):null,Default_Default=({schema:s})=>{const o=useFn(),i=useComponent("JSONViewer");return o.hasKeyword(s,"default")?Re.createElement(i,{name:"Default",value:s.default,className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--default"}):null},keywords_Deprecated=({schema:s})=>!0!==s?.deprecated?null:Re.createElement("span",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--warning"},"deprecated"),keywords_ReadOnly=({schema:s})=>!0!==s?.readOnly?null:Re.createElement("span",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--muted"},"read-only"),keywords_WriteOnly=({schema:s})=>!0!==s?.writeOnly?null:Re.createElement("span",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--muted"},"write-only"),keywords_Examples_Examples=({schema:s})=>{const o=s?.examples||[],i=useComponent("JSONViewer");return Array.isArray(o)&&0!==o.length?Re.createElement(i,{name:"Examples",value:s.examples,className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--examples"}):null},ExtensionKeywords_ExtensionKeywords=({schema:s})=>{const o=useFn(),i="ExtensionKeywords",{path:a}=usePath(i),{isExpanded:u,setExpanded:_,setCollapsed:w}=useIsExpanded(i),[x,C]=useLevel(),j=useComponent("Accordion"),L=useComponent("ExpandDeepButton"),B=useComponent("JSONViewer"),{showExtensionKeywords:$}=useConfig(),U=o.getExtensionKeywords(s),V=(0,Re.useCallback)((()=>{u?w():_()}),[u,_,w]),z=(0,Re.useCallback)(((s,o)=>{o?_({deep:!0}):w({deep:!0})}),[_,w]);return $&&0!==U.length?Re.createElement(oT.Provider,{value:a},Re.createElement(nT.Provider,{value:C},Re.createElement("div",{className:"json-schema-2020-12-keyword json-schema-2020-12-keyword--extension-keywords","data-json-schema-level":x},Re.createElement(j,{expanded:u,onChange:V},Re.createElement("span",{className:"json-schema-2020-12-keyword__name json-schema-2020-12-keyword__name--extension"},"Extension Keywords")),Re.createElement(L,{expanded:u,onClick:z}),Re.createElement("ul",{className:Jn()("json-schema-2020-12-keyword__children",{"json-schema-2020-12-keyword__children--collapsed":!u})},u&&Re.createElement(Re.Fragment,null,U.map((o=>Re.createElement(B,{key:o,name:o,value:s[o],className:"json-schema-2020-12-json-viewer-extension-keyword"})))))))):null},JSONViewer=({name:s,value:o,className:i})=>{const a=useFn(),{path:u}=usePath(s),{isExpanded:_,setExpanded:w,setCollapsed:x}=useIsExpanded(s),[C,j]=useLevel(),L=useComponent("Accordion"),B=useComponent("ExpandDeepButton"),$="string"==typeof o||"number"==typeof o||"bigint"==typeof o||"boolean"==typeof o||"symbol"==typeof o||null==o,U=(s=>fn_isPlainObject(s)&&0===Object.keys(s).length)(o)||(s=>Array.isArray(s)&&0===s.length)(o),V=(0,Re.useCallback)((()=>{_?x():w()}),[_,w,x]),z=(0,Re.useCallback)(((s,o)=>{o?w({deep:!0}):x({deep:!0})}),[w,x]);return $?Re.createElement("div",{className:Jn()("json-schema-2020-12-json-viewer",i)},Re.createElement("span",{className:"json-schema-2020-12-json-viewer__name json-schema-2020-12-json-viewer__name--secondary"},s),Re.createElement("span",{className:"json-schema-2020-12-json-viewer__value json-schema-2020-12-json-viewer__value--secondary"},a.stringify(o))):U?Re.createElement("div",{className:Jn()("json-schema-2020-12-json-viewer",i)},Re.createElement("span",{className:"json-schema-2020-12-json-viewer__name json-schema-2020-12-json-viewer__name--secondary"},s),Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},Array.isArray(o)?"empty array":"empty object")):Re.createElement(oT.Provider,{value:u},Re.createElement(nT.Provider,{value:j},Re.createElement("div",{className:Jn()("json-schema-2020-12-json-viewer",i),"data-json-schema-level":C},Re.createElement(L,{expanded:_,onChange:V},Re.createElement("span",{className:"json-schema-2020-12-json-viewer__name json-schema-2020-12-json-viewer__name--secondary"},s)),Re.createElement(B,{expanded:_,onClick:z}),Re.createElement("strong",{className:"json-schema-2020-12__attribute json-schema-2020-12__attribute--primary"},Array.isArray(o)?"array":"object"),Re.createElement("ul",{className:Jn()("json-schema-2020-12-json-viewer__children",{"json-schema-2020-12-json-viewer__children--collapsed":!_})},_&&Re.createElement(Re.Fragment,null,Array.isArray(o)?o.map(((s,o)=>Re.createElement("li",{key:`#${o}`,className:"json-schema-2020-12-property"},Re.createElement(JSONViewer,{name:`#${o}`,value:s,className:i})))):Object.entries(o).map((([s,o])=>Re.createElement("li",{key:s,className:"json-schema-2020-12-property"},Re.createElement(JSONViewer,{name:s,value:o,className:i})))))))))},lT=JSONViewer,Accordion_Accordion=({expanded:s=!1,children:o,onChange:i})=>{const a=useComponent("ChevronRightIcon"),u=(0,Re.useCallback)((o=>{i(o,!s)}),[s,i]);return Re.createElement("button",{type:"button",className:"json-schema-2020-12-accordion",onClick:u},Re.createElement("div",{className:"json-schema-2020-12-accordion__children"},o),Re.createElement("span",{className:Jn()("json-schema-2020-12-accordion__icon",{"json-schema-2020-12-accordion__icon--expanded":s,"json-schema-2020-12-accordion__icon--collapsed":!s})},Re.createElement(a,null)))},ExpandDeepButton_ExpandDeepButton=({expanded:s,onClick:o})=>{const i=(0,Re.useCallback)((i=>{o(i,!s)}),[s,o]);return Re.createElement("button",{type:"button",className:"json-schema-2020-12-expand-deep-button",onClick:i},s?"Collapse all":"Expand all")},icons_ChevronRight=()=>Re.createElement("svg",{xmlns:"http://www.w3.org/2000/svg",width:"24",height:"24",viewBox:"0 0 24 24"},Re.createElement("path",{d:"M10 6L8.59 7.41 13.17 12l-4.58 4.59L10 18l6-6z"})),withJSONSchemaContext=(s,o={})=>{const i={components:{JSONSchema:aT,Keyword$schema:keywords_$schema,Keyword$vocabulary:$vocabulary_$vocabulary,Keyword$id:keywords_$id,Keyword$anchor:keywords_$anchor,Keyword$dynamicAnchor:keywords_$dynamicAnchor,Keyword$ref:keywords_$ref,Keyword$dynamicRef:keywords_$dynamicRef,Keyword$defs:keywords_$defs,Keyword$comment:keywords_$comment,KeywordAllOf:keywords_AllOf,KeywordAnyOf:keywords_AnyOf,KeywordOneOf:keywords_OneOf,KeywordNot:keywords_Not,KeywordIf:keywords_If,KeywordThen:keywords_Then,KeywordElse:keywords_Else,KeywordDependentSchemas:keywords_DependentSchemas,KeywordPrefixItems:keywords_PrefixItems,KeywordItems:keywords_Items,KeywordContains:keywords_Contains,KeywordProperties:keywords_Properties_Properties,KeywordPatternProperties:PatternProperties_PatternProperties,KeywordAdditionalProperties:keywords_AdditionalProperties,KeywordPropertyNames:keywords_PropertyNames,KeywordUnevaluatedItems:keywords_UnevaluatedItems,KeywordUnevaluatedProperties:keywords_UnevaluatedProperties,KeywordType:keywords_Type,KeywordEnum:Enum_Enum,KeywordConst:Const_Const,KeywordConstraint:cT,KeywordDependentRequired:DependentRequired_DependentRequired,KeywordContentSchema:keywords_ContentSchema,KeywordTitle:Title_Title,KeywordDescription:keywords_Description_Description,KeywordDefault:Default_Default,KeywordDeprecated:keywords_Deprecated,KeywordReadOnly:keywords_ReadOnly,KeywordWriteOnly:keywords_WriteOnly,KeywordExamples:keywords_Examples_Examples,ExtensionKeywords:ExtensionKeywords_ExtensionKeywords,JSONViewer:lT,Accordion:Accordion_Accordion,ExpandDeepButton:ExpandDeepButton_ExpandDeepButton,ChevronRightIcon:icons_ChevronRight,...o.components},config:{default$schema:"https://json-schema.org/draft/2020-12/schema",defaultExpandedLevels:0,showExtensionKeywords:!0,...o.config},fn:{upperFirst:fn_upperFirst,getTitle:makeGetTitle(useFn),getType:makeGetType(useFn),isBooleanJSONSchema,hasKeyword,isExpandable:fn_makeIsExpandable(useFn),stringify:fn_stringify,stringifyConstraints,getDependentRequired,getSchemaKeywords,getExtensionKeywords:makeGetExtensionKeywords(useFn),...o.fn},state:{paths:{}}},HOC=o=>Re.createElement(rT.Provider,{value:i},Re.createElement(s,o));return HOC.contexts={JSONSchemaContext:rT},HOC.displayName=s.displayName,HOC},makeWithJSONSchemaSystemContext=({getSystem:s})=>(o,i={})=>{const{getComponent:a,getConfigs:u}=s(),_=u(),w=a("JSONSchema202012"),x=a("JSONSchema202012Keyword$schema"),C=a("JSONSchema202012Keyword$vocabulary"),j=a("JSONSchema202012Keyword$id"),L=a("JSONSchema202012Keyword$anchor"),B=a("JSONSchema202012Keyword$dynamicAnchor"),$=a("JSONSchema202012Keyword$ref"),U=a("JSONSchema202012Keyword$dynamicRef"),V=a("JSONSchema202012Keyword$defs"),z=a("JSONSchema202012Keyword$comment"),Y=a("JSONSchema202012KeywordAllOf"),Z=a("JSONSchema202012KeywordAnyOf"),ee=a("JSONSchema202012KeywordOneOf"),ie=a("JSONSchema202012KeywordNot"),ae=a("JSONSchema202012KeywordIf"),ce=a("JSONSchema202012KeywordThen"),le=a("JSONSchema202012KeywordElse"),pe=a("JSONSchema202012KeywordDependentSchemas"),de=a("JSONSchema202012KeywordPrefixItems"),fe=a("JSONSchema202012KeywordItems"),ye=a("JSONSchema202012KeywordContains"),be=a("JSONSchema202012KeywordProperties"),_e=a("JSONSchema202012KeywordPatternProperties"),Se=a("JSONSchema202012KeywordAdditionalProperties"),we=a("JSONSchema202012KeywordPropertyNames"),xe=a("JSONSchema202012KeywordUnevaluatedItems"),Pe=a("JSONSchema202012KeywordUnevaluatedProperties"),Te=a("JSONSchema202012KeywordType"),Re=a("JSONSchema202012KeywordEnum"),$e=a("JSONSchema202012KeywordConst"),qe=a("JSONSchema202012KeywordConstraint"),ze=a("JSONSchema202012KeywordDependentRequired"),We=a("JSONSchema202012KeywordContentSchema"),He=a("JSONSchema202012KeywordTitle"),Ye=a("JSONSchema202012KeywordDescription"),Xe=a("JSONSchema202012KeywordDefault"),Qe=a("JSONSchema202012KeywordDeprecated"),et=a("JSONSchema202012KeywordReadOnly"),tt=a("JSONSchema202012KeywordWriteOnly"),rt=a("JSONSchema202012KeywordExamples"),nt=a("JSONSchema202012ExtensionKeywords"),st=a("JSONSchema202012JSONViewer"),ot=a("JSONSchema202012Accordion"),it=a("JSONSchema202012ExpandDeepButton"),at=a("JSONSchema202012ChevronRightIcon");return withJSONSchemaContext(o,{components:{JSONSchema:w,Keyword$schema:x,Keyword$vocabulary:C,Keyword$id:j,Keyword$anchor:L,Keyword$dynamicAnchor:B,Keyword$ref:$,Keyword$dynamicRef:U,Keyword$defs:V,Keyword$comment:z,KeywordAllOf:Y,KeywordAnyOf:Z,KeywordOneOf:ee,KeywordNot:ie,KeywordIf:ae,KeywordThen:ce,KeywordElse:le,KeywordDependentSchemas:pe,KeywordPrefixItems:de,KeywordItems:fe,KeywordContains:ye,KeywordProperties:be,KeywordPatternProperties:_e,KeywordAdditionalProperties:Se,KeywordPropertyNames:we,KeywordUnevaluatedItems:xe,KeywordUnevaluatedProperties:Pe,KeywordType:Te,KeywordEnum:Re,KeywordConst:$e,KeywordConstraint:qe,KeywordDependentRequired:ze,KeywordContentSchema:We,KeywordTitle:He,KeywordDescription:Ye,KeywordDefault:Xe,KeywordDeprecated:Qe,KeywordReadOnly:et,KeywordWriteOnly:tt,KeywordExamples:rt,ExtensionKeywords:nt,JSONViewer:st,Accordion:ot,ExpandDeepButton:it,ChevronRightIcon:at,...i.components},config:{showExtensionKeywords:_.showExtensions,...i.config},fn:{...i.fn}})},json_schema_2020_12=({getSystem:s,fn:o})=>{const fnAccessor=()=>({upperFirst:o.upperFirst,...o.jsonSchema202012});return{components:{JSONSchema202012:aT,JSONSchema202012Keyword$schema:keywords_$schema,JSONSchema202012Keyword$vocabulary:$vocabulary_$vocabulary,JSONSchema202012Keyword$id:keywords_$id,JSONSchema202012Keyword$anchor:keywords_$anchor,JSONSchema202012Keyword$dynamicAnchor:keywords_$dynamicAnchor,JSONSchema202012Keyword$ref:keywords_$ref,JSONSchema202012Keyword$dynamicRef:keywords_$dynamicRef,JSONSchema202012Keyword$defs:keywords_$defs,JSONSchema202012Keyword$comment:keywords_$comment,JSONSchema202012KeywordAllOf:keywords_AllOf,JSONSchema202012KeywordAnyOf:keywords_AnyOf,JSONSchema202012KeywordOneOf:keywords_OneOf,JSONSchema202012KeywordNot:keywords_Not,JSONSchema202012KeywordIf:keywords_If,JSONSchema202012KeywordThen:keywords_Then,JSONSchema202012KeywordElse:keywords_Else,JSONSchema202012KeywordDependentSchemas:keywords_DependentSchemas,JSONSchema202012KeywordPrefixItems:keywords_PrefixItems,JSONSchema202012KeywordItems:keywords_Items,JSONSchema202012KeywordContains:keywords_Contains,JSONSchema202012KeywordProperties:keywords_Properties_Properties,JSONSchema202012KeywordPatternProperties:PatternProperties_PatternProperties,JSONSchema202012KeywordAdditionalProperties:keywords_AdditionalProperties,JSONSchema202012KeywordPropertyNames:keywords_PropertyNames,JSONSchema202012KeywordUnevaluatedItems:keywords_UnevaluatedItems,JSONSchema202012KeywordUnevaluatedProperties:keywords_UnevaluatedProperties,JSONSchema202012KeywordType:keywords_Type,JSONSchema202012KeywordEnum:Enum_Enum,JSONSchema202012KeywordConst:Const_Const,JSONSchema202012KeywordConstraint:cT,JSONSchema202012KeywordDependentRequired:DependentRequired_DependentRequired,JSONSchema202012KeywordContentSchema:keywords_ContentSchema,JSONSchema202012KeywordTitle:Title_Title,JSONSchema202012KeywordDescription:keywords_Description_Description,JSONSchema202012KeywordDefault:Default_Default,JSONSchema202012KeywordDeprecated:keywords_Deprecated,JSONSchema202012KeywordReadOnly:keywords_ReadOnly,JSONSchema202012KeywordWriteOnly:keywords_WriteOnly,JSONSchema202012KeywordExamples:keywords_Examples_Examples,JSONSchema202012ExtensionKeywords:ExtensionKeywords_ExtensionKeywords,JSONSchema202012JSONViewer:lT,JSONSchema202012Accordion:Accordion_Accordion,JSONSchema202012ExpandDeepButton:ExpandDeepButton_ExpandDeepButton,JSONSchema202012ChevronRightIcon:icons_ChevronRight,withJSONSchema202012Context:withJSONSchemaContext,withJSONSchema202012SystemContext:makeWithJSONSchemaSystemContext(s()),JSONSchema202012PathContext:()=>oT,JSONSchema202012LevelContext:()=>nT},fn:{upperFirst:fn_upperFirst,jsonSchema202012:{getTitle:makeGetTitle(fnAccessor),getType:makeGetType(fnAccessor),isExpandable:fn_makeIsExpandable(fnAccessor),isBooleanJSONSchema,hasKeyword,useFn,useConfig,useComponent,useIsExpanded,usePath,useLevel,getSchemaKeywords,getExtensionKeywords:makeGetExtensionKeywords(fnAccessor),hasSchemaType:fn_hasSchemaType}}}},array=(s,{sample:o=[]}={})=>((s,o={})=>{const{minItems:i,maxItems:a,uniqueItems:u}=o,{contains:_,minContains:w,maxContains:x}=o;let C=[...s];if(null!=_&&"object"==typeof _){if(Number.isInteger(w)&&w>1){const s=C.at(0);for(let o=1;o<w;o+=1)C.unshift(s)}Number.isInteger(x)}if(Number.isInteger(a)&&a>0&&(C=s.slice(0,a)),Number.isInteger(i)&&i>0)for(let s=0;C.length<i;s+=1)C.push(C[s%C.length]);return!0===u&&(C=Array.from(new Set(C))),C})(o,s),object=()=>{throw new Error("Not implemented")},bytes=s=>xt()(s),random_pick=s=>s.at(0),predicates_isBooleanJSONSchema=s=>"boolean"==typeof s,isJSONSchemaObject=s=>as()(s),isJSONSchema=s=>predicates_isBooleanJSONSchema(s)||isJSONSchemaObject(s);const uT=class Registry{data={};register(s,o){this.data[s]=o}unregister(s){void 0===s?this.data={}:delete this.data[s]}get(s){return this.data[s]}},int32=()=>0,int64=()=>0,generators_float=()=>.1,generators_double=()=>.1,email=()=>"user@example.com",idn_email=()=>"실례@example.com",hostname=()=>"example.com",idn_hostname=()=>"실례.com",ipv4=()=>"198.51.100.42",ipv6=()=>"2001:0db8:5b96:0000:0000:426f:8e17:642a",uri=()=>"https://example.com/",uri_reference=()=>"path/index.html",iri=()=>"https://실례.com/",iri_reference=()=>"path/실례.html",uuid=()=>"3fa85f64-5717-4562-b3fc-2c963f66afa6",uri_template=()=>"https://example.com/dictionary/{term:1}/{term}",generators_json_pointer=()=>"/a/b/c",relative_json_pointer=()=>"1/0",date_time=()=>(new Date).toISOString(),date=()=>(new Date).toISOString().substring(0,10),time=()=>(new Date).toISOString().substring(11),duration=()=>"P3D",generators_password=()=>"********",regex=()=>"^[a-z]+$";const pT=new class FormatRegistry extends uT{#s={int32,int64,float:generators_float,double:generators_double,email,"idn-email":idn_email,hostname,"idn-hostname":idn_hostname,ipv4,ipv6,uri,"uri-reference":uri_reference,iri,"iri-reference":iri_reference,uuid,"uri-template":uri_template,"json-pointer":generators_json_pointer,"relative-json-pointer":relative_json_pointer,"date-time":date_time,date,time,duration,password:generators_password,regex};data={...this.#s};get defaults(){return{...this.#s}}},formatAPI=(s,o)=>"function"==typeof o?pT.register(s,o):null===o?pT.unregister(s):pT.get(s);formatAPI.getDefaults=()=>pT.defaults;const hT=formatAPI;var dT=__webpack_require__(48287).Buffer;const _7bit=s=>dT.from(s).toString("ascii");var fT=__webpack_require__(48287).Buffer;const _8bit=s=>fT.from(s).toString("utf8");var mT=__webpack_require__(48287).Buffer;const encoders_binary=s=>mT.from(s).toString("binary"),quoted_printable=s=>{let o="";for(let i=0;i<s.length;i++){const a=s.charCodeAt(i);if(61===a)o+="=3D";else if(a>=33&&a<=60||a>=62&&a<=126||9===a||32===a)o+=s.charAt(i);else if(13===a||10===a)o+="\r\n";else if(a>126){const a=unescape(encodeURIComponent(s.charAt(i)));for(let s=0;s<a.length;s++)o+="="+("0"+a.charCodeAt(s).toString(16)).slice(-2).toUpperCase()}else o+="="+("0"+a.toString(16)).slice(-2).toUpperCase()}return o};var gT=__webpack_require__(48287).Buffer;const base16=s=>gT.from(s).toString("hex");var yT=__webpack_require__(48287).Buffer;const base32=s=>{const o=yT.from(s).toString("utf8"),i="ABCDEFGHIJKLMNOPQRSTUVWXYZ234567";let a=0,u="",_=0,w=0;for(let s=0;s<o.length;s++)for(_=_<<8|o.charCodeAt(s),w+=8;w>=5;)u+=i.charAt(_>>>w-5&31),w-=5;w>0&&(u+=i.charAt(_<<5-w&31),a=(8-8*o.length%5)%5);for(let s=0;s<a;s++)u+="=";return u};var vT=__webpack_require__(48287).Buffer;const base64=s=>vT.from(s).toString("base64");var bT=__webpack_require__(48287).Buffer;const base64url=s=>bT.from(s).toString("base64url");const _T=new class EncoderRegistry extends uT{#s={"7bit":_7bit,"8bit":_8bit,binary:encoders_binary,"quoted-printable":quoted_printable,base16,base32,base64,base64url};data={...this.#s};get defaults(){return{...this.#s}}},encoderAPI=(s,o)=>"function"==typeof o?_T.register(s,o):null===o?_T.unregister(s):_T.get(s);encoderAPI.getDefaults=()=>_T.defaults;const ST=encoderAPI,ET={"text/plain":()=>"string","text/css":()=>".selector { border: 1px solid red }","text/csv":()=>"value1,value2,value3","text/html":()=>"<p>content</p>","text/calendar":()=>"BEGIN:VCALENDAR","text/javascript":()=>"console.dir('Hello world!');","text/xml":()=>'<person age="30">John Doe</person>',"text/*":()=>"string"},wT={"image/*":()=>bytes(25).toString("binary")},xT={"audio/*":()=>bytes(25).toString("binary")},kT={"video/*":()=>bytes(25).toString("binary")},OT={"application/json":()=>'{"key":"value"}',"application/ld+json":()=>'{"name": "John Doe"}',"application/x-httpd-php":()=>"<?php echo '<p>Hello World!</p>'; ?>","application/rtf":()=>String.raw`{\rtf1\adeflang1025\ansi\ansicpg1252\uc1`,"application/x-sh":()=>'echo "Hello World!"',"application/xhtml+xml":()=>"<p>content</p>","application/*":()=>bytes(25).toString("binary")};const AT=new class MediaTypeRegistry extends uT{#s={...ET,...wT,...xT,...kT,...OT};data={...this.#s};get defaults(){return{...this.#s}}},mediaTypeAPI=(s,o)=>{if("function"==typeof o)return AT.register(s,o);if(null===o)return AT.unregister(s);const i=s.split(";").at(0),a=`${i.split("/").at(0)}/*`;return AT.get(s)||AT.get(i)||AT.get(a)};mediaTypeAPI.getDefaults=()=>AT.defaults;const CT=mediaTypeAPI,applyStringConstraints=(s,o={})=>{const{maxLength:i,minLength:a}=o;let u=s;if(Number.isInteger(i)&&i>0&&(u=u.slice(0,i)),Number.isInteger(a)&&a>0){let s=0;for(;u.length<a;)u+=u[s++%u.length]}return u},types_string=(s,{sample:o}={})=>{const{contentEncoding:i,contentMediaType:a,contentSchema:u}=s,{pattern:_,format:w}=s,x=ST(i)||gO();let C;return C="string"==typeof _?applyStringConstraints((s=>{try{const o=/(?<=(?<!\\)\{)(\d{3,})(?=\})|(?<=(?<!\\)\{\d*,)(\d{3,})(?=\})|(?<=(?<!\\)\{)(\d{3,})(?=,\d*\})/g,i=s.replace(o,"100"),a=new(ps())(i);return a.max=100,a.gen()}catch{return"string"}})(_),s):"string"==typeof w?(s=>{const{format:o}=s,i=hT(o);return"function"==typeof i?i(s):"string"})(s):isJSONSchema(u)&&"string"==typeof a&&void 0!==o?Array.isArray(o)||"object"==typeof o?JSON.stringify(o):applyStringConstraints(String(o),s):"string"==typeof a?(s=>{const{contentMediaType:o}=s,i=CT(o);return"function"==typeof i?i(s):"string"})(s):applyStringConstraints("string",s),x(C)},applyNumberConstraints=(s,o={})=>{const{minimum:i,maximum:a,exclusiveMinimum:u,exclusiveMaximum:_}=o,{multipleOf:w}=o,x=Number.isInteger(s)?1:Number.EPSILON;let C="number"==typeof i?i:null,j="number"==typeof a?a:null,L=s;if("number"==typeof u&&(C=null!==C?Math.max(C,u+x):u+x),"number"==typeof _&&(j=null!==j?Math.min(j,_-x):_-x),L=C>j&&s||C||j||L,"number"==typeof w&&w>0){const s=L%w;L=0===s?L:L+w-s}return L},types_number=s=>{const{format:o}=s;let i;return i="string"==typeof o?(s=>{const{format:o}=s,i=hT(o);return"function"==typeof i?i(s):0})(s):0,applyNumberConstraints(i,s)},types_integer=s=>{const{format:o}=s;let i;return i="string"==typeof o?(s=>{const{format:o}=s,i=hT(o);if("function"==typeof i)return i(s);switch(o){case"int32":return int32();case"int64":return int64()}return 0})(s):0,applyNumberConstraints(i,s)},types_boolean=s=>"boolean"!=typeof s.default||s.default,jT=new Proxy({array,object,string:types_string,number:types_number,integer:types_integer,boolean:types_boolean,null:()=>null},{get:(s,o)=>"string"==typeof o&&Object.hasOwn(s,o)?s[o]:()=>`Unknown Type: ${o}`}),PT=["array","object","number","integer","string","boolean","null"],hasExample=s=>{if(!isJSONSchemaObject(s))return!1;const{examples:o,example:i,default:a}=s;return!!(Array.isArray(o)&&o.length>=1)||(void 0!==a||void 0!==i)},extractExample=s=>{if(!isJSONSchemaObject(s))return null;const{examples:o,example:i,default:a}=s;return Array.isArray(o)&&o.length>=1?o.at(0):void 0!==a?a:void 0!==i?i:void 0},IT={array:["items","prefixItems","contains","maxContains","minContains","maxItems","minItems","uniqueItems","unevaluatedItems"],object:["properties","additionalProperties","patternProperties","propertyNames","minProperties","maxProperties","required","dependentSchemas","dependentRequired","unevaluatedProperties"],string:["pattern","format","minLength","maxLength","contentEncoding","contentMediaType","contentSchema"],integer:["minimum","maximum","exclusiveMinimum","exclusiveMaximum","multipleOf"]};IT.number=IT.integer;const TT="string",inferTypeFromValue=s=>void 0===s?null:null===s?"null":Array.isArray(s)?"array":Number.isInteger(s)?"integer":typeof s,foldType=s=>{if(Array.isArray(s)&&s.length>=1){if(s.includes("array"))return"array";if(s.includes("object"))return"object";{const o=s.filter((s=>"null"!==s)),i=random_pick(o.length>0?o:s);if(PT.includes(i))return i}}return PT.includes(s)?s:null},inferType=(s,o=new WeakSet)=>{if(!isJSONSchemaObject(s))return TT;if(o.has(s))return TT;o.add(s);let{type:i,const:a}=s;if(i=foldType(i),"string"!=typeof i){const o=Object.keys(IT);e:for(let a=0;a<o.length;a+=1){const u=o[a],_=IT[u];for(let o=0;o<_.length;o+=1){const a=_[o];if(Object.hasOwn(s,a)){i=u;break e}}}}if("string"!=typeof i&&void 0!==a){const s=inferTypeFromValue(a);i="string"==typeof s?s:i}if("string"!=typeof i){const combineTypes=i=>{if(Array.isArray(s[i])){const a=s[i].map((s=>inferType(s,o)));return foldType(a)}return null},a=combineTypes("allOf"),u=combineTypes("anyOf"),_=combineTypes("oneOf"),w=s.not?inferType(s.not,o):null;(a||u||_||w)&&(i=foldType([a,u,_,w].filter(Boolean)))}if("string"!=typeof i&&hasExample(s)){const o=extractExample(s),a=inferTypeFromValue(o);i="string"==typeof a?a:i}return o.delete(s),i||TT},type_getType=s=>inferType(s),typeCast=s=>predicates_isBooleanJSONSchema(s)?(s=>!1===s?{not:{}}:{})(s):isJSONSchemaObject(s)?s:{},merge_merge=(s,o,i={})=>{if(predicates_isBooleanJSONSchema(s)&&!0===s)return!0;if(predicates_isBooleanJSONSchema(s)&&!1===s)return!1;if(predicates_isBooleanJSONSchema(o)&&!0===o)return!0;if(predicates_isBooleanJSONSchema(o)&&!1===o)return!1;if(!isJSONSchema(s))return o;if(!isJSONSchema(o))return s;const a={...o,...s};if(o.type&&s.type&&Array.isArray(o.type)&&"string"==typeof o.type){const i=normalizeArray(o.type).concat(s.type);a.type=Array.from(new Set(i))}if(Array.isArray(o.required)&&Array.isArray(s.required)&&(a.required=[...new Set([...s.required,...o.required])]),o.properties&&s.properties){const u=new Set([...Object.keys(o.properties),...Object.keys(s.properties)]);a.properties={};for(const _ of u){const u=o.properties[_]||{},w=s.properties[_]||{};u.readOnly&&!i.includeReadOnly||u.writeOnly&&!i.includeWriteOnly?a.required=(a.required||[]).filter((s=>s!==_)):a.properties[_]=merge_merge(w,u,i)}}return isJSONSchema(o.items)&&isJSONSchema(s.items)&&(a.items=merge_merge(s.items,o.items,i)),isJSONSchema(o.contains)&&isJSONSchema(s.contains)&&(a.contains=merge_merge(s.contains,o.contains,i)),isJSONSchema(o.contentSchema)&&isJSONSchema(s.contentSchema)&&(a.contentSchema=merge_merge(s.contentSchema,o.contentSchema,i)),a},NT=merge_merge,main_sampleFromSchemaGeneric=(s,o={},i=void 0,a=!1)=>{if(null==s&&void 0===i)return;"function"==typeof s?.toJS&&(s=s.toJS()),s=typeCast(s);let u=void 0!==i||hasExample(s);const _=!u&&Array.isArray(s.oneOf)&&s.oneOf.length>0,w=!u&&Array.isArray(s.anyOf)&&s.anyOf.length>0;if(!u&&(_||w)){const i=typeCast(random_pick(_?s.oneOf:s.anyOf));!(s=NT(s,i,o)).xml&&i.xml&&(s.xml=i.xml),hasExample(s)&&hasExample(i)&&(u=!0)}const x={};let{xml:C,properties:j,additionalProperties:L,items:B,contains:$}=s||{},U=type_getType(s),{includeReadOnly:V,includeWriteOnly:z}=o;C=C||{};let Y,{name:Z,prefix:ee,namespace:ie}=C,ae={};if(Object.hasOwn(s,"type")||(s.type=U),a&&(Z=Z||"notagname",Y=(ee?`${ee}:`:"")+Z,ie)){x[ee?`xmlns:${ee}`:"xmlns"]=ie}a&&(ae[Y]=[]);const ce=objectify(j);let le,pe=0;const hasExceededMaxProperties=()=>Number.isInteger(s.maxProperties)&&s.maxProperties>0&&pe>=s.maxProperties,canAddProperty=o=>!(Number.isInteger(s.maxProperties)&&s.maxProperties>0)||!hasExceededMaxProperties()&&(!(o=>!Array.isArray(s.required)||0===s.required.length||!s.required.includes(o))(o)||s.maxProperties-pe-(()=>{if(!Array.isArray(s.required)||0===s.required.length)return 0;let o=0;return a?s.required.forEach((s=>o+=void 0===ae[s]?0:1)):s.required.forEach((s=>{o+=void 0===ae[Y]?.find((o=>void 0!==o[s]))?0:1})),s.required.length-o})()>0);if(le=a?(i,u=void 0)=>{if(s&&ce[i]){if(ce[i].xml=ce[i].xml||{},ce[i].xml.attribute){const s=Array.isArray(ce[i].enum)?random_pick(ce[i].enum):void 0;if(hasExample(ce[i]))x[ce[i].xml.name||i]=extractExample(ce[i]);else if(void 0!==s)x[ce[i].xml.name||i]=s;else{const s=typeCast(ce[i]),a=type_getType(s),_=ce[i].xml.name||i;if("array"===a){const s=main_sampleFromSchemaGeneric(ce[i],o,u,!1);x[_]=s.map((s=>as()(s)?"UnknownTypeObject":Array.isArray(s)?"UnknownTypeArray":s)).join(" ")}else x[_]="object"===a?"UnknownTypeObject":jT[a](s)}return}ce[i].xml.name=ce[i].xml.name||i}else ce[i]||!1===L||(ce[i]={xml:{name:i}});let _=main_sampleFromSchemaGeneric(ce[i],o,u,a);canAddProperty(i)&&(pe++,Array.isArray(_)?ae[Y]=ae[Y].concat(_):ae[Y].push(_))}:(i,u)=>{if(canAddProperty(i)){if(as()(s.discriminator?.mapping)&&s.discriminator.propertyName===i&&"string"==typeof s.$$ref){for(const o in s.discriminator.mapping)if(-1!==s.$$ref.search(s.discriminator.mapping[o])){ae[i]=o;break}}else ae[i]=main_sampleFromSchemaGeneric(ce[i],o,u,a);pe++}},u){let u;if(u=void 0!==i?i:extractExample(s),!a){if("number"==typeof u&&"string"===U)return`${u}`;if("string"!=typeof u||"string"===U)return u;try{return JSON.parse(u)}catch{return u}}if("array"===U){if(!Array.isArray(u)){if("string"==typeof u)return u;u=[u]}let i=[];return isJSONSchemaObject(B)&&(B.xml=B.xml||C||{},B.xml.name=B.xml.name||C.name,i=u.map((s=>main_sampleFromSchemaGeneric(B,o,s,a)))),isJSONSchemaObject($)&&($.xml=$.xml||C||{},$.xml.name=$.xml.name||C.name,i=[main_sampleFromSchemaGeneric($,o,void 0,a),...i]),i=jT.array(s,{sample:i}),C.wrapped?(ae[Y]=i,ds()(x)||ae[Y].push({_attr:x})):ae=i,ae}if("object"===U){if("string"==typeof u)return u;for(const s in u)Object.hasOwn(u,s)&&(ce[s]?.readOnly&&!V||ce[s]?.writeOnly&&!z||(ce[s]?.xml?.attribute?x[ce[s].xml.name||s]=u[s]:le(s,u[s])));return ds()(x)||ae[Y].push({_attr:x}),ae}return ae[Y]=ds()(x)?u:[{_attr:x},u],ae}if("array"===U){let i=[];if(isJSONSchemaObject($))if(a&&($.xml=$.xml||s.xml||{},$.xml.name=$.xml.name||C.name),Array.isArray($.anyOf)){const{anyOf:s,...u}=B;i.push(...$.anyOf.map((s=>main_sampleFromSchemaGeneric(NT(s,u,o),o,void 0,a))))}else if(Array.isArray($.oneOf)){const{oneOf:s,...u}=B;i.push(...$.oneOf.map((s=>main_sampleFromSchemaGeneric(NT(s,u,o),o,void 0,a))))}else{if(!(!a||a&&C.wrapped))return main_sampleFromSchemaGeneric($,o,void 0,a);i.push(main_sampleFromSchemaGeneric($,o,void 0,a))}if(isJSONSchemaObject(B))if(a&&(B.xml=B.xml||s.xml||{},B.xml.name=B.xml.name||C.name),Array.isArray(B.anyOf)){const{anyOf:s,...u}=B;i.push(...B.anyOf.map((s=>main_sampleFromSchemaGeneric(NT(s,u,o),o,void 0,a))))}else if(Array.isArray(B.oneOf)){const{oneOf:s,...u}=B;i.push(...B.oneOf.map((s=>main_sampleFromSchemaGeneric(NT(s,u,o),o,void 0,a))))}else{if(!(!a||a&&C.wrapped))return main_sampleFromSchemaGeneric(B,o,void 0,a);i.push(main_sampleFromSchemaGeneric(B,o,void 0,a))}return i=jT.array(s,{sample:i}),a&&C.wrapped?(ae[Y]=i,ds()(x)||ae[Y].push({_attr:x}),ae):i}if("object"===U){for(let s in ce)Object.hasOwn(ce,s)&&(ce[s]?.deprecated||ce[s]?.readOnly&&!V||ce[s]?.writeOnly&&!z||le(s));if(a&&x&&ae[Y].push({_attr:x}),hasExceededMaxProperties())return ae;if(predicates_isBooleanJSONSchema(L)&&L)a?ae[Y].push({additionalProp:"Anything can be here"}):ae.additionalProp1={},pe++;else if(isJSONSchemaObject(L)){const i=L,u=main_sampleFromSchemaGeneric(i,o,void 0,a);if(a&&"string"==typeof i?.xml?.name&&"notagname"!==i?.xml?.name)ae[Y].push(u);else{const o=i?.["x-additionalPropertiesName"]||"additionalProp",_=Number.isInteger(s.minProperties)&&s.minProperties>0&&pe<s.minProperties?s.minProperties-pe:3;for(let s=1;s<=_;s++){if(hasExceededMaxProperties())return ae;if(a){const i={};i[o+s]=u.notagname,ae[Y].push(i)}else ae[o+s]=u;pe++}}}return ae}let de;if(void 0!==s.const)de=s.const;else if(s&&Array.isArray(s.enum))de=random_pick(normalizeArray(s.enum));else{const i=isJSONSchemaObject(s.contentSchema)?main_sampleFromSchemaGeneric(s.contentSchema,o,void 0,a):void 0;de=jT[U](s,{sample:i})}return a?(ae[Y]=ds()(x)?de:[{_attr:x},de],ae):de},main_createXMLExample=(s,o,i)=>{const a=main_sampleFromSchemaGeneric(s,o,i,!0);if(a)return"string"==typeof a?a:ls()(a,{declaration:!0,indent:"\t"})},main_sampleFromSchema=(s,o,i)=>main_sampleFromSchemaGeneric(s,o,i,!1),main_resolver=(s,o,i)=>[s,JSON.stringify(o),JSON.stringify(i)],MT=utils_memoizeN(main_createXMLExample,main_resolver),RT=utils_memoizeN(main_sampleFromSchema,main_resolver);const DT=new class OptionRegistry extends uT{#s={};data={...this.#s};get defaults(){return{...this.#s}}},api_optionAPI=(s,o)=>(void 0!==o&&DT.register(s,o),DT.get(s)),LT=[{when:/json/,shouldStringifyTypes:["string"]}],FT=["object"],fn_get_json_sample_schema=s=>(o,i,a,u)=>{const{fn:_}=s(),w=_.jsonSchema202012.memoizedSampleFromSchema(o,i,u),x=typeof w,C=LT.reduce(((s,o)=>o.when.test(a)?[...s,...o.shouldStringifyTypes]:s),FT);return gt()(C,(s=>s===x))?JSON.stringify(w,null,2):w},fn_get_yaml_sample_schema=s=>(o,i,a,u)=>{const{fn:_}=s(),w=_.jsonSchema202012.getJsonSampleSchema(o,i,a,u);let x;try{x=fn.dump(fn.load(w),{lineWidth:-1},{schema:rn}),"\n"===x[x.length-1]&&(x=x.slice(0,x.length-1))}catch(s){return console.error(s),"error: could not generate yaml example"}return x.replace(/\t/g,"  ")},fn_get_xml_sample_schema=s=>(o,i,a)=>{const{fn:u}=s();if(o&&!o.xml&&(o.xml={}),o&&!o.xml.name){if(!o.$$ref&&(o.type||o.items||o.properties||o.additionalProperties))return'<?xml version="1.0" encoding="UTF-8"?>\n\x3c!-- XML example cannot be generated; root element name is undefined --\x3e';if(o.$$ref){let s=o.$$ref.match(/\S*\/(\S+)$/);o.xml.name=s[1]}}return u.jsonSchema202012.memoizedCreateXMLExample(o,i,a)},fn_get_sample_schema=s=>(o,i="",a={},u=void 0)=>{const{fn:_}=s();return"function"==typeof o?.toJS&&(o=o.toJS()),"function"==typeof u?.toJS&&(u=u.toJS()),/xml/.test(i)?_.jsonSchema202012.getXmlSampleSchema(o,a,u):/(yaml|yml)/.test(i)?_.jsonSchema202012.getYamlSampleSchema(o,a,i,u):_.jsonSchema202012.getJsonSampleSchema(o,a,i,u)},json_schema_2020_12_samples=({getSystem:s})=>{const o=fn_get_json_sample_schema(s),i=fn_get_yaml_sample_schema(s),a=fn_get_xml_sample_schema(s),u=fn_get_sample_schema(s);return{fn:{jsonSchema202012:{sampleFromSchema:main_sampleFromSchema,sampleFromSchemaGeneric:main_sampleFromSchemaGeneric,sampleOptionAPI:api_optionAPI,sampleEncoderAPI:ST,sampleFormatAPI:hT,sampleMediaTypeAPI:CT,createXMLExample:main_createXMLExample,memoizedSampleFromSchema:RT,memoizedCreateXMLExample:MT,getJsonSampleSchema:o,getYamlSampleSchema:i,getXmlSampleSchema:a,getSampleSchema:u,mergeJsonSchema:NT,foldType}}}};function PresetApis(){return[base,oas3,json_schema_2020_12,json_schema_2020_12_samples,oas31]}const inline_plugin=s=>()=>({fn:s.fn,components:s.components}),factorization_system=s=>{const o=Ye()({layout:{layout:s.layout,filter:s.filter},spec:{spec:"",url:s.url},requestSnippets:s.requestSnippets},s.initialState);if(s.initialState)for(const[i,a]of Object.entries(s.initialState))void 0===a&&delete o[i];return{system:{configs:s.configs},plugins:s.presets,state:o}},sources_query=()=>s=>{const o=s.queryConfigEnabled?(()=>{const s=new URLSearchParams(lt.location.search);return Object.fromEntries(s)})():{};return Object.entries(o).reduce(((s,[o,i])=>("config"===o?s.configUrl=i:"urls.primaryName"===o?s[o]=i:s=co()(s,o,i),s)),{})},sources_url=({url:s,system:o})=>async i=>{if(!s)return{};if("function"!=typeof o.configsActions?.getConfigByUrl)return{};const a=(()=>{const s={};return s.promise=new Promise(((o,i)=>{s.resolve=o,s.reject=i})),s})();return o.configsActions.getConfigByUrl({url:s,loadRemoteConfig:!0,requestInterceptor:i.requestInterceptor,responseInterceptor:i.responseInterceptor},(s=>{a.resolve(s)})),a.promise},runtime=()=>()=>{const s={};return globalThis.location&&(s.oauth2RedirectUrl=`${globalThis.location.protocol}//${globalThis.location.host}${globalThis.location.pathname.substring(0,globalThis.location.pathname.lastIndexOf("/"))}/oauth2-redirect.html`),s},BT=Object.freeze({dom_id:null,domNode:null,spec:{},url:"",urls:null,configUrl:null,layout:"BaseLayout",docExpansion:"list",maxDisplayedTags:-1,filter:!1,validatorUrl:"https://validator.swagger.io/validator",oauth2RedirectUrl:void 0,persistAuthorization:!1,configs:{},displayOperationId:!1,displayRequestDuration:!1,deepLinking:!1,tryItOutEnabled:!1,requestInterceptor:s=>(s.curlOptions=[],s),responseInterceptor:s=>s,showMutatedRequest:!0,defaultModelRendering:"example",defaultModelExpandDepth:1,defaultModelsExpandDepth:1,showExtensions:!1,showCommonExtensions:!1,withCredentials:!1,requestSnippetsEnabled:!1,requestSnippets:{generators:{curl_bash:{title:"cURL (bash)",syntax:"bash"},curl_powershell:{title:"cURL (PowerShell)",syntax:"powershell"},curl_cmd:{title:"cURL (CMD)",syntax:"bash"}},defaultExpanded:!0,languages:null},supportedSubmitMethods:["get","put","post","delete","options","head","patch","trace"],queryConfigEnabled:!1,presets:[PresetApis],plugins:[],initialState:{},fn:{},components:{},syntaxHighlight:{activated:!0,theme:"agate"},operationsSorter:null,tagsSorter:null,onComplete:null,modelPropertyMacro:null,parameterMacro:null,fileUploadMediaTypes:["application/octet-stream","image/","audio/","video/"],uncaughtExceptionHandler:null});var $T=__webpack_require__(61448),qT=__webpack_require__.n($T),UT=__webpack_require__(77731),VT=__webpack_require__.n(UT);const type_casters_array=(s,o=[])=>Array.isArray(s)?s:o,type_casters_boolean=(s,o=!1)=>!0===s||"true"===s||1===s||"1"===s||!1!==s&&"false"!==s&&0!==s&&"0"!==s&&o,dom_node=s=>null===s||"null"===s?null:s,type_casters_filter=s=>{const o=String(s);return type_casters_boolean(s,o)},type_casters_function=(s,o)=>"function"==typeof s?s:o,nullable_array=s=>Array.isArray(s)?s:null,nullable_function=s=>"function"==typeof s?s:null,nullable_string=s=>null===s||"null"===s?null:String(s),type_casters_number=(s,o=-1)=>{const i=parseInt(s,10);return Number.isNaN(i)?o:i},type_casters_object=(s,o={})=>as()(s)?s:o,sorter=s=>"function"==typeof s||"string"==typeof s?s:null,type_casters_string=s=>String(s),syntax_highlight=(s,o)=>as()(s)?s:!1===s||"false"===s||0===s||"0"===s?{activated:!1}:o,undefined_string=s=>void 0===s||"undefined"===s?void 0:String(s),zT={components:{typeCaster:type_casters_object},configs:{typeCaster:type_casters_object},configUrl:{typeCaster:nullable_string},deepLinking:{typeCaster:type_casters_boolean,defaultValue:BT.deepLinking},defaultModelExpandDepth:{typeCaster:type_casters_number,defaultValue:BT.defaultModelExpandDepth},defaultModelRendering:{typeCaster:type_casters_string},defaultModelsExpandDepth:{typeCaster:type_casters_number,defaultValue:BT.defaultModelsExpandDepth},displayOperationId:{typeCaster:type_casters_boolean,defaultValue:BT.displayOperationId},displayRequestDuration:{typeCaster:type_casters_boolean,defaultValue:BT.displayRequestDuration},docExpansion:{typeCaster:type_casters_string},dom_id:{typeCaster:nullable_string},domNode:{typeCaster:dom_node},fileUploadMediaTypes:{typeCaster:type_casters_array,defaultValue:BT.fileUploadMediaTypes},filter:{typeCaster:type_casters_filter},fn:{typeCaster:type_casters_object},initialState:{typeCaster:type_casters_object},layout:{typeCaster:type_casters_string},maxDisplayedTags:{typeCaster:type_casters_number,defaultValue:BT.maxDisplayedTags},modelPropertyMacro:{typeCaster:nullable_function},oauth2RedirectUrl:{typeCaster:undefined_string},onComplete:{typeCaster:nullable_function},operationsSorter:{typeCaster:sorter},paramaterMacro:{typeCaster:nullable_function},persistAuthorization:{typeCaster:type_casters_boolean,defaultValue:BT.persistAuthorization},plugins:{typeCaster:type_casters_array,defaultValue:BT.plugins},presets:{typeCaster:type_casters_array,defaultValue:BT.presets},requestInterceptor:{typeCaster:type_casters_function,defaultValue:BT.requestInterceptor},requestSnippets:{typeCaster:type_casters_object,defaultValue:BT.requestSnippets},requestSnippetsEnabled:{typeCaster:type_casters_boolean,defaultValue:BT.requestSnippetsEnabled},responseInterceptor:{typeCaster:type_casters_function,defaultValue:BT.responseInterceptor},showCommonExtensions:{typeCaster:type_casters_boolean,defaultValue:BT.showCommonExtensions},showExtensions:{typeCaster:type_casters_boolean,defaultValue:BT.showExtensions},showMutatedRequest:{typeCaster:type_casters_boolean,defaultValue:BT.showMutatedRequest},spec:{typeCaster:type_casters_object,defaultValue:BT.spec},supportedSubmitMethods:{typeCaster:type_casters_array,defaultValue:BT.supportedSubmitMethods},syntaxHighlight:{typeCaster:syntax_highlight,defaultValue:BT.syntaxHighlight},"syntaxHighlight.activated":{typeCaster:type_casters_boolean,defaultValue:BT.syntaxHighlight.activated},"syntaxHighlight.theme":{typeCaster:type_casters_string},tagsSorter:{typeCaster:sorter},tryItOutEnabled:{typeCaster:type_casters_boolean,defaultValue:BT.tryItOutEnabled},url:{typeCaster:type_casters_string},urls:{typeCaster:nullable_array},"urls.primaryName":{typeCaster:type_casters_string},validatorUrl:{typeCaster:nullable_string},withCredentials:{typeCaster:type_casters_boolean,defaultValue:BT.withCredentials},uncaughtExceptionHandler:{typeCaster:nullable_function}},type_cast=s=>Object.entries(zT).reduce(((s,[o,{typeCaster:i,defaultValue:a}])=>{if(qT()(s,o)){const u=i(Cn()(s,o),a);s=VT()(o,u,s)}return s}),{...s}),config_merge=(s,...o)=>{let i=Symbol.for("domNode"),a=Symbol.for("primaryName");const u=[];for(const s of o){const o={...s};Object.hasOwn(o,"domNode")&&(i=o.domNode,delete o.domNode),Object.hasOwn(o,"urls.primaryName")?(a=o["urls.primaryName"],delete o["urls.primaryName"]):Array.isArray(o.urls)&&Object.hasOwn(o.urls,"primaryName")&&(a=o.urls.primaryName,delete o.urls.primaryName),u.push(o)}const _=Ye()(s,...u);return i!==Symbol.for("domNode")&&(_.domNode=i),a!==Symbol.for("primaryName")&&Array.isArray(_.urls)&&(_.urls.primaryName=a),type_cast(_)};function SwaggerUI(s){const o=sources_query()(s),i=runtime()(),a=SwaggerUI.config.merge({},SwaggerUI.config.defaults,i,s,o),u=factorization_system(a),_=inline_plugin(a),w=new Store(u);w.register([a.plugins,_]);const x=w.getSystem(),persistConfigs=s=>{w.setConfigs(s),x.configsActions.loaded()},updateSpec=s=>{!o.url&&"object"==typeof s.spec&&Object.keys(s.spec).length>0?(x.specActions.updateUrl(""),x.specActions.updateLoadingStatus("success"),x.specActions.updateSpec(JSON.stringify(s.spec))):"function"==typeof x.specActions.download&&s.url&&!s.urls&&(x.specActions.updateUrl(s.url),x.specActions.download(s.url))},render=s=>{if(s.domNode)x.render(s.domNode,"App");else if(s.dom_id){const o=document.querySelector(s.dom_id);x.render(o,"App")}else null===s.dom_id||null===s.domNode||console.error("Skipped rendering: no `dom_id` or `domNode` was specified")};return a.configUrl?((async()=>{const{configUrl:s}=a,i=await sources_url({url:s,system:x})(a),u=SwaggerUI.config.merge({},a,i,o);persistConfigs(u),null!==i&&updateSpec(u),render(u)})(),x):(persistConfigs(a),updateSpec(a),render(a),x)}SwaggerUI.System=Store,SwaggerUI.config={defaults:BT,merge:config_merge,typeCast:type_cast,typeCastMappings:zT},SwaggerUI.presets={base,apis:PresetApis},SwaggerUI.plugins={Auth:auth,Configs:configsPlugin,DeepLining:deep_linking,Err:err,Filter:filter,Icons:icons,JSONSchema5:json_schema_5,JSONSchema5Samples:json_schema_5_samples,JSONSchema202012:json_schema_2020_12,JSONSchema202012Samples:json_schema_2020_12_samples,Layout:plugins_layout,Logs:logs,OpenAPI30:oas3,OpenAPI31:oas3,OnComplete:on_complete,RequestSnippets:plugins_request_snippets,Spec:plugins_spec,SwaggerClient:swagger_client,Util:util,View:view,ViewLegacy:view_legacy,DownloadUrl:downloadUrlPlugin,SyntaxHighlighting:syntax_highlighting,Versions:versions,SafeRender:safe_render};const WT=SwaggerUI})(),i=i.default})()));
diff --git a/vllm/entrypoints/serve/instrumentator/static/swagger-ui.css b/vllm/entrypoints/serve/instrumentator/static/swagger-ui.css
new file mode 100644
index 0000000000000000000000000000000000000000..d8dacd416fa6f64ec87c630fd091c2e2be1e3a3c
--- /dev/null
+++ b/vllm/entrypoints/serve/instrumentator/static/swagger-ui.css
@@ -0,0 +1,3 @@
+.swagger-ui{color:#3b4151;font-family:sans-serif}.swagger-ui html{line-height:1.15;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%}.swagger-ui body{margin:0}.swagger-ui article,.swagger-ui aside,.swagger-ui footer,.swagger-ui header,.swagger-ui nav,.swagger-ui section{display:block}.swagger-ui h1{font-size:2em;margin:.67em 0}.swagger-ui figcaption,.swagger-ui figure,.swagger-ui main{display:block}.swagger-ui figure{margin:1em 40px}.swagger-ui hr{box-sizing:content-box;height:0;overflow:visible}.swagger-ui pre{font-family:monospace,monospace;font-size:1em}.swagger-ui a{background-color:transparent;-webkit-text-decoration-skip:objects}.swagger-ui abbr[title]{border-bottom:none;text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted}.swagger-ui b,.swagger-ui strong{font-weight:inherit;font-weight:bolder}.swagger-ui code,.swagger-ui kbd,.swagger-ui samp{font-family:monospace,monospace;font-size:1em}.swagger-ui dfn{font-style:italic}.swagger-ui mark{background-color:#ff0;color:#000}.swagger-ui small{font-size:80%}.swagger-ui sub,.swagger-ui sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}.swagger-ui sub{bottom:-.25em}.swagger-ui sup{top:-.5em}.swagger-ui audio,.swagger-ui video{display:inline-block}.swagger-ui audio:not([controls]){display:none;height:0}.swagger-ui img{border-style:none}.swagger-ui svg:not(:root){overflow:hidden}.swagger-ui button,.swagger-ui input,.swagger-ui optgroup,.swagger-ui select,.swagger-ui textarea{font-family:sans-serif;font-size:100%;line-height:1.15;margin:0}.swagger-ui button,.swagger-ui input{overflow:visible}.swagger-ui button,.swagger-ui select{text-transform:none}.swagger-ui [type=reset],.swagger-ui [type=submit],.swagger-ui button,.swagger-ui html [type=button]{-webkit-appearance:button}.swagger-ui [type=button]::-moz-focus-inner,.swagger-ui [type=reset]::-moz-focus-inner,.swagger-ui [type=submit]::-moz-focus-inner,.swagger-ui button::-moz-focus-inner{border-style:none;padding:0}.swagger-ui [type=button]:-moz-focusring,.swagger-ui [type=reset]:-moz-focusring,.swagger-ui [type=submit]:-moz-focusring,.swagger-ui button:-moz-focusring{outline:1px dotted ButtonText}.swagger-ui fieldset{padding:.35em .75em .625em}.swagger-ui legend{box-sizing:border-box;color:inherit;display:table;max-width:100%;padding:0;white-space:normal}.swagger-ui progress{display:inline-block;vertical-align:baseline}.swagger-ui textarea{overflow:auto}.swagger-ui [type=checkbox],.swagger-ui [type=radio]{box-sizing:border-box;padding:0}.swagger-ui [type=number]::-webkit-inner-spin-button,.swagger-ui [type=number]::-webkit-outer-spin-button{height:auto}.swagger-ui [type=search]{-webkit-appearance:textfield;outline-offset:-2px}.swagger-ui [type=search]::-webkit-search-cancel-button,.swagger-ui [type=search]::-webkit-search-decoration{-webkit-appearance:none}.swagger-ui ::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}.swagger-ui details,.swagger-ui menu{display:block}.swagger-ui summary{display:list-item}.swagger-ui canvas{display:inline-block}.swagger-ui [hidden],.swagger-ui template{display:none}.swagger-ui .debug *{outline:1px solid gold}.swagger-ui .debug-white *{outline:1px solid #fff}.swagger-ui .debug-black *{outline:1px solid #000}.swagger-ui .debug-grid{background:transparent url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAgAAAAICAYAAADED76LAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyhpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuNi1jMTExIDc5LjE1ODMyNSwgMjAxNS8wOS8xMC0wMToxMDoyMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iIHhtbG5zOnN0UmVmPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VSZWYjIiB4bWxuczp4bXA9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC8iIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6MTRDOTY4N0U2N0VFMTFFNjg2MzZDQjkwNkQ4MjgwMEIiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6MTRDOTY4N0Q2N0VFMTFFNjg2MzZDQjkwNkQ4MjgwMEIiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENDIDIwMTUgKE1hY2ludG9zaCkiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo3NjcyQkQ3NjY3QzUxMUU2QjJCQ0UyNDA4MTAwMjE3MSIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo3NjcyQkQ3NzY3QzUxMUU2QjJCQ0UyNDA4MTAwMjE3MSIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/PsBS+GMAAAAjSURBVHjaYvz//z8DLsD4gcGXiYEAGBIKGBne//fFpwAgwAB98AaF2pjlUQAAAABJRU5ErkJggg==) repeat 0 0}.swagger-ui .debug-grid-16{background:transparent url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyhpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuNi1jMTExIDc5LjE1ODMyNSwgMjAxNS8wOS8xMC0wMToxMDoyMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iIHhtbG5zOnN0UmVmPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VSZWYjIiB4bWxuczp4bXA9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC8iIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6ODYyRjhERDU2N0YyMTFFNjg2MzZDQjkwNkQ4MjgwMEIiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6ODYyRjhERDQ2N0YyMTFFNjg2MzZDQjkwNkQ4MjgwMEIiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENDIDIwMTUgKE1hY2ludG9zaCkiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo3NjcyQkQ3QTY3QzUxMUU2QjJCQ0UyNDA4MTAwMjE3MSIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo3NjcyQkQ3QjY3QzUxMUU2QjJCQ0UyNDA4MTAwMjE3MSIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/PvCS01IAAABMSURBVHjaYmR4/5+BFPBfAMFm/MBgx8RAGWCn1AAmSg34Q6kBDKMGMDCwICeMIemF/5QawEipAWwUhwEjMDvbAWlWkvVBwu8vQIABAEwBCph8U6c0AAAAAElFTkSuQmCC) repeat 0 0}.swagger-ui .debug-grid-8-solid{background:#fff url(data:image/jpeg;base64,/9j/4QAYRXhpZgAASUkqAAgAAAAAAAAAAAAAAP/sABFEdWNreQABAAQAAAAAAAD/4QMxaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLwA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/PiA8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJBZG9iZSBYTVAgQ29yZSA1LjYtYzExMSA3OS4xNTgzMjUsIDIwMTUvMDkvMTAtMDE6MTA6MjAgICAgICAgICI+IDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+IDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiIHhtbG5zOnhtcD0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLyIgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iIHhtbG5zOnN0UmVmPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VSZWYjIiB4bXA6Q3JlYXRvclRvb2w9IkFkb2JlIFBob3Rvc2hvcCBDQyAyMDE1IChNYWNpbnRvc2gpIiB4bXBNTTpJbnN0YW5jZUlEPSJ4bXAuaWlkOkIxMjI0OTczNjdCMzExRTZCMkJDRTI0MDgxMDAyMTcxIiB4bXBNTTpEb2N1bWVudElEPSJ4bXAuZGlkOkIxMjI0OTc0NjdCMzExRTZCMkJDRTI0MDgxMDAyMTcxIj4gPHhtcE1NOkRlcml2ZWRGcm9tIHN0UmVmOmluc3RhbmNlSUQ9InhtcC5paWQ6QjEyMjQ5NzE2N0IzMTFFNkIyQkNFMjQwODEwMDIxNzEiIHN0UmVmOmRvY3VtZW50SUQ9InhtcC5kaWQ6QjEyMjQ5NzI2N0IzMTFFNkIyQkNFMjQwODEwMDIxNzEiLz4gPC9yZGY6RGVzY3JpcHRpb24+IDwvcmRmOlJERj4gPC94OnhtcG1ldGE+IDw/eHBhY2tldCBlbmQ9InIiPz7/7gAOQWRvYmUAZMAAAAAB/9sAhAAbGhopHSlBJiZBQi8vL0JHPz4+P0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHAR0pKTQmND8oKD9HPzU/R0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0dHR0f/wAARCAAIAAgDASIAAhEBAxEB/8QAWQABAQAAAAAAAAAAAAAAAAAAAAYBAQEAAAAAAAAAAAAAAAAAAAIEEAEBAAMBAAAAAAAAAAAAAAABADECA0ERAAEDBQAAAAAAAAAAAAAAAAARITFBUWESIv/aAAwDAQACEQMRAD8AoOnTV1QTD7JJshP3vSM3P//Z) repeat 0 0}.swagger-ui .debug-grid-16-solid{background:#fff url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAIAAACQkWg2AAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyhpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuNi1jMTExIDc5LjE1ODMyNSwgMjAxNS8wOS8xMC0wMToxMDoyMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENDIDIwMTUgKE1hY2ludG9zaCkiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6NzY3MkJEN0U2N0M1MTFFNkIyQkNFMjQwODEwMDIxNzEiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6NzY3MkJEN0Y2N0M1MTFFNkIyQkNFMjQwODEwMDIxNzEiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo3NjcyQkQ3QzY3QzUxMUU2QjJCQ0UyNDA4MTAwMjE3MSIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo3NjcyQkQ3RDY3QzUxMUU2QjJCQ0UyNDA4MTAwMjE3MSIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/Pve6J3kAAAAzSURBVHjaYvz//z8D0UDsMwMjSRoYP5Gq4SPNbRjVMEQ1fCRDg+in/6+J1AJUxsgAEGAA31BAJMS0GYEAAAAASUVORK5CYII=) repeat 0 0}.swagger-ui .border-box,.swagger-ui a,.swagger-ui article,.swagger-ui body,.swagger-ui code,.swagger-ui dd,.swagger-ui div,.swagger-ui dl,.swagger-ui dt,.swagger-ui fieldset,.swagger-ui footer,.swagger-ui form,.swagger-ui h1,.swagger-ui h2,.swagger-ui h3,.swagger-ui h4,.swagger-ui h5,.swagger-ui h6,.swagger-ui header,.swagger-ui html,.swagger-ui input[type=email],.swagger-ui input[type=number],.swagger-ui input[type=password],.swagger-ui input[type=tel],.swagger-ui input[type=text],.swagger-ui input[type=url],.swagger-ui legend,.swagger-ui li,.swagger-ui main,.swagger-ui ol,.swagger-ui p,.swagger-ui pre,.swagger-ui section,.swagger-ui table,.swagger-ui td,.swagger-ui textarea,.swagger-ui th,.swagger-ui tr,.swagger-ui ul{box-sizing:border-box}.swagger-ui .aspect-ratio{height:0;position:relative}.swagger-ui .aspect-ratio--16x9{padding-bottom:56.25%}.swagger-ui .aspect-ratio--9x16{padding-bottom:177.77%}.swagger-ui .aspect-ratio--4x3{padding-bottom:75%}.swagger-ui .aspect-ratio--3x4{padding-bottom:133.33%}.swagger-ui .aspect-ratio--6x4{padding-bottom:66.6%}.swagger-ui .aspect-ratio--4x6{padding-bottom:150%}.swagger-ui .aspect-ratio--8x5{padding-bottom:62.5%}.swagger-ui .aspect-ratio--5x8{padding-bottom:160%}.swagger-ui .aspect-ratio--7x5{padding-bottom:71.42%}.swagger-ui .aspect-ratio--5x7{padding-bottom:140%}.swagger-ui .aspect-ratio--1x1{padding-bottom:100%}.swagger-ui .aspect-ratio--object{bottom:0;height:100%;left:0;position:absolute;right:0;top:0;width:100%;z-index:100}@media screen and (min-width:30em){.swagger-ui .aspect-ratio-ns{height:0;position:relative}.swagger-ui .aspect-ratio--16x9-ns{padding-bottom:56.25%}.swagger-ui .aspect-ratio--9x16-ns{padding-bottom:177.77%}.swagger-ui .aspect-ratio--4x3-ns{padding-bottom:75%}.swagger-ui .aspect-ratio--3x4-ns{padding-bottom:133.33%}.swagger-ui .aspect-ratio--6x4-ns{padding-bottom:66.6%}.swagger-ui .aspect-ratio--4x6-ns{padding-bottom:150%}.swagger-ui .aspect-ratio--8x5-ns{padding-bottom:62.5%}.swagger-ui .aspect-ratio--5x8-ns{padding-bottom:160%}.swagger-ui .aspect-ratio--7x5-ns{padding-bottom:71.42%}.swagger-ui .aspect-ratio--5x7-ns{padding-bottom:140%}.swagger-ui .aspect-ratio--1x1-ns{padding-bottom:100%}.swagger-ui .aspect-ratio--object-ns{bottom:0;height:100%;left:0;position:absolute;right:0;top:0;width:100%;z-index:100}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .aspect-ratio-m{height:0;position:relative}.swagger-ui .aspect-ratio--16x9-m{padding-bottom:56.25%}.swagger-ui .aspect-ratio--9x16-m{padding-bottom:177.77%}.swagger-ui .aspect-ratio--4x3-m{padding-bottom:75%}.swagger-ui .aspect-ratio--3x4-m{padding-bottom:133.33%}.swagger-ui .aspect-ratio--6x4-m{padding-bottom:66.6%}.swagger-ui .aspect-ratio--4x6-m{padding-bottom:150%}.swagger-ui .aspect-ratio--8x5-m{padding-bottom:62.5%}.swagger-ui .aspect-ratio--5x8-m{padding-bottom:160%}.swagger-ui .aspect-ratio--7x5-m{padding-bottom:71.42%}.swagger-ui .aspect-ratio--5x7-m{padding-bottom:140%}.swagger-ui .aspect-ratio--1x1-m{padding-bottom:100%}.swagger-ui .aspect-ratio--object-m{bottom:0;height:100%;left:0;position:absolute;right:0;top:0;width:100%;z-index:100}}@media screen and (min-width:60em){.swagger-ui .aspect-ratio-l{height:0;position:relative}.swagger-ui .aspect-ratio--16x9-l{padding-bottom:56.25%}.swagger-ui .aspect-ratio--9x16-l{padding-bottom:177.77%}.swagger-ui .aspect-ratio--4x3-l{padding-bottom:75%}.swagger-ui .aspect-ratio--3x4-l{padding-bottom:133.33%}.swagger-ui .aspect-ratio--6x4-l{padding-bottom:66.6%}.swagger-ui .aspect-ratio--4x6-l{padding-bottom:150%}.swagger-ui .aspect-ratio--8x5-l{padding-bottom:62.5%}.swagger-ui .aspect-ratio--5x8-l{padding-bottom:160%}.swagger-ui .aspect-ratio--7x5-l{padding-bottom:71.42%}.swagger-ui .aspect-ratio--5x7-l{padding-bottom:140%}.swagger-ui .aspect-ratio--1x1-l{padding-bottom:100%}.swagger-ui .aspect-ratio--object-l{bottom:0;height:100%;left:0;position:absolute;right:0;top:0;width:100%;z-index:100}}.swagger-ui img{max-width:100%}.swagger-ui .cover{background-size:cover!important}.swagger-ui .contain{background-size:contain!important}@media screen and (min-width:30em){.swagger-ui .cover-ns{background-size:cover!important}.swagger-ui .contain-ns{background-size:contain!important}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .cover-m{background-size:cover!important}.swagger-ui .contain-m{background-size:contain!important}}@media screen and (min-width:60em){.swagger-ui .cover-l{background-size:cover!important}.swagger-ui .contain-l{background-size:contain!important}}.swagger-ui .bg-center{background-position:50%;background-repeat:no-repeat}.swagger-ui .bg-top{background-position:top;background-repeat:no-repeat}.swagger-ui .bg-right{background-position:100%;background-repeat:no-repeat}.swagger-ui .bg-bottom{background-position:bottom;background-repeat:no-repeat}.swagger-ui .bg-left{background-position:0;background-repeat:no-repeat}@media screen and (min-width:30em){.swagger-ui .bg-center-ns{background-position:50%;background-repeat:no-repeat}.swagger-ui .bg-top-ns{background-position:top;background-repeat:no-repeat}.swagger-ui .bg-right-ns{background-position:100%;background-repeat:no-repeat}.swagger-ui .bg-bottom-ns{background-position:bottom;background-repeat:no-repeat}.swagger-ui .bg-left-ns{background-position:0;background-repeat:no-repeat}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .bg-center-m{background-position:50%;background-repeat:no-repeat}.swagger-ui .bg-top-m{background-position:top;background-repeat:no-repeat}.swagger-ui .bg-right-m{background-position:100%;background-repeat:no-repeat}.swagger-ui .bg-bottom-m{background-position:bottom;background-repeat:no-repeat}.swagger-ui .bg-left-m{background-position:0;background-repeat:no-repeat}}@media screen and (min-width:60em){.swagger-ui .bg-center-l{background-position:50%;background-repeat:no-repeat}.swagger-ui .bg-top-l{background-position:top;background-repeat:no-repeat}.swagger-ui .bg-right-l{background-position:100%;background-repeat:no-repeat}.swagger-ui .bg-bottom-l{background-position:bottom;background-repeat:no-repeat}.swagger-ui .bg-left-l{background-position:0;background-repeat:no-repeat}}.swagger-ui .outline{outline:1px solid}.swagger-ui .outline-transparent{outline:1px solid transparent}.swagger-ui .outline-0{outline:0}@media screen and (min-width:30em){.swagger-ui .outline-ns{outline:1px solid}.swagger-ui .outline-transparent-ns{outline:1px solid transparent}.swagger-ui .outline-0-ns{outline:0}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .outline-m{outline:1px solid}.swagger-ui .outline-transparent-m{outline:1px solid transparent}.swagger-ui .outline-0-m{outline:0}}@media screen and (min-width:60em){.swagger-ui .outline-l{outline:1px solid}.swagger-ui .outline-transparent-l{outline:1px solid transparent}.swagger-ui .outline-0-l{outline:0}}.swagger-ui .ba{border-style:solid;border-width:1px}.swagger-ui .bt{border-top-style:solid;border-top-width:1px}.swagger-ui .br{border-right-style:solid;border-right-width:1px}.swagger-ui .bb{border-bottom-style:solid;border-bottom-width:1px}.swagger-ui .bl{border-left-style:solid;border-left-width:1px}.swagger-ui .bn{border-style:none;border-width:0}@media screen and (min-width:30em){.swagger-ui .ba-ns{border-style:solid;border-width:1px}.swagger-ui .bt-ns{border-top-style:solid;border-top-width:1px}.swagger-ui .br-ns{border-right-style:solid;border-right-width:1px}.swagger-ui .bb-ns{border-bottom-style:solid;border-bottom-width:1px}.swagger-ui .bl-ns{border-left-style:solid;border-left-width:1px}.swagger-ui .bn-ns{border-style:none;border-width:0}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .ba-m{border-style:solid;border-width:1px}.swagger-ui .bt-m{border-top-style:solid;border-top-width:1px}.swagger-ui .br-m{border-right-style:solid;border-right-width:1px}.swagger-ui .bb-m{border-bottom-style:solid;border-bottom-width:1px}.swagger-ui .bl-m{border-left-style:solid;border-left-width:1px}.swagger-ui .bn-m{border-style:none;border-width:0}}@media screen and (min-width:60em){.swagger-ui .ba-l{border-style:solid;border-width:1px}.swagger-ui .bt-l{border-top-style:solid;border-top-width:1px}.swagger-ui .br-l{border-right-style:solid;border-right-width:1px}.swagger-ui .bb-l{border-bottom-style:solid;border-bottom-width:1px}.swagger-ui .bl-l{border-left-style:solid;border-left-width:1px}.swagger-ui .bn-l{border-style:none;border-width:0}}.swagger-ui .b--black{border-color:#000}.swagger-ui .b--near-black{border-color:#111}.swagger-ui .b--dark-gray{border-color:#333}.swagger-ui .b--mid-gray{border-color:#555}.swagger-ui .b--gray{border-color:#777}.swagger-ui .b--silver{border-color:#999}.swagger-ui .b--light-silver{border-color:#aaa}.swagger-ui .b--moon-gray{border-color:#ccc}.swagger-ui .b--light-gray{border-color:#eee}.swagger-ui .b--near-white{border-color:#f4f4f4}.swagger-ui .b--white{border-color:#fff}.swagger-ui .b--white-90{border-color:hsla(0,0%,100%,.9)}.swagger-ui .b--white-80{border-color:hsla(0,0%,100%,.8)}.swagger-ui .b--white-70{border-color:hsla(0,0%,100%,.7)}.swagger-ui .b--white-60{border-color:hsla(0,0%,100%,.6)}.swagger-ui .b--white-50{border-color:hsla(0,0%,100%,.5)}.swagger-ui .b--white-40{border-color:hsla(0,0%,100%,.4)}.swagger-ui .b--white-30{border-color:hsla(0,0%,100%,.3)}.swagger-ui .b--white-20{border-color:hsla(0,0%,100%,.2)}.swagger-ui .b--white-10{border-color:hsla(0,0%,100%,.1)}.swagger-ui .b--white-05{border-color:hsla(0,0%,100%,.05)}.swagger-ui .b--white-025{border-color:hsla(0,0%,100%,.025)}.swagger-ui .b--white-0125{border-color:hsla(0,0%,100%,.013)}.swagger-ui .b--black-90{border-color:rgba(0,0,0,.9)}.swagger-ui .b--black-80{border-color:rgba(0,0,0,.8)}.swagger-ui .b--black-70{border-color:rgba(0,0,0,.7)}.swagger-ui .b--black-60{border-color:rgba(0,0,0,.6)}.swagger-ui .b--black-50{border-color:rgba(0,0,0,.5)}.swagger-ui .b--black-40{border-color:rgba(0,0,0,.4)}.swagger-ui .b--black-30{border-color:rgba(0,0,0,.3)}.swagger-ui .b--black-20{border-color:rgba(0,0,0,.2)}.swagger-ui .b--black-10{border-color:rgba(0,0,0,.1)}.swagger-ui .b--black-05{border-color:rgba(0,0,0,.05)}.swagger-ui .b--black-025{border-color:rgba(0,0,0,.025)}.swagger-ui .b--black-0125{border-color:rgba(0,0,0,.013)}.swagger-ui .b--dark-red{border-color:#e7040f}.swagger-ui .b--red{border-color:#ff4136}.swagger-ui .b--light-red{border-color:#ff725c}.swagger-ui .b--orange{border-color:#ff6300}.swagger-ui .b--gold{border-color:#ffb700}.swagger-ui .b--yellow{border-color:gold}.swagger-ui .b--light-yellow{border-color:#fbf1a9}.swagger-ui .b--purple{border-color:#5e2ca5}.swagger-ui .b--light-purple{border-color:#a463f2}.swagger-ui .b--dark-pink{border-color:#d5008f}.swagger-ui .b--hot-pink{border-color:#ff41b4}.swagger-ui .b--pink{border-color:#ff80cc}.swagger-ui .b--light-pink{border-color:#ffa3d7}.swagger-ui .b--dark-green{border-color:#137752}.swagger-ui .b--green{border-color:#19a974}.swagger-ui .b--light-green{border-color:#9eebcf}.swagger-ui .b--navy{border-color:#001b44}.swagger-ui .b--dark-blue{border-color:#00449e}.swagger-ui .b--blue{border-color:#357edd}.swagger-ui .b--light-blue{border-color:#96ccff}.swagger-ui .b--lightest-blue{border-color:#cdecff}.swagger-ui .b--washed-blue{border-color:#f6fffe}.swagger-ui .b--washed-green{border-color:#e8fdf5}.swagger-ui .b--washed-yellow{border-color:#fffceb}.swagger-ui .b--washed-red{border-color:#ffdfdf}.swagger-ui .b--transparent{border-color:transparent}.swagger-ui .b--inherit{border-color:inherit}.swagger-ui .br0{border-radius:0}.swagger-ui .br1{border-radius:.125rem}.swagger-ui .br2{border-radius:.25rem}.swagger-ui .br3{border-radius:.5rem}.swagger-ui .br4{border-radius:1rem}.swagger-ui .br-100{border-radius:100%}.swagger-ui .br-pill{border-radius:9999px}.swagger-ui .br--bottom{border-top-left-radius:0;border-top-right-radius:0}.swagger-ui .br--top{border-bottom-left-radius:0;border-bottom-right-radius:0}.swagger-ui .br--right{border-bottom-left-radius:0;border-top-left-radius:0}.swagger-ui .br--left{border-bottom-right-radius:0;border-top-right-radius:0}@media screen and (min-width:30em){.swagger-ui .br0-ns{border-radius:0}.swagger-ui .br1-ns{border-radius:.125rem}.swagger-ui .br2-ns{border-radius:.25rem}.swagger-ui .br3-ns{border-radius:.5rem}.swagger-ui .br4-ns{border-radius:1rem}.swagger-ui .br-100-ns{border-radius:100%}.swagger-ui .br-pill-ns{border-radius:9999px}.swagger-ui .br--bottom-ns{border-top-left-radius:0;border-top-right-radius:0}.swagger-ui .br--top-ns{border-bottom-left-radius:0;border-bottom-right-radius:0}.swagger-ui .br--right-ns{border-bottom-left-radius:0;border-top-left-radius:0}.swagger-ui .br--left-ns{border-bottom-right-radius:0;border-top-right-radius:0}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .br0-m{border-radius:0}.swagger-ui .br1-m{border-radius:.125rem}.swagger-ui .br2-m{border-radius:.25rem}.swagger-ui .br3-m{border-radius:.5rem}.swagger-ui .br4-m{border-radius:1rem}.swagger-ui .br-100-m{border-radius:100%}.swagger-ui .br-pill-m{border-radius:9999px}.swagger-ui .br--bottom-m{border-top-left-radius:0;border-top-right-radius:0}.swagger-ui .br--top-m{border-bottom-left-radius:0;border-bottom-right-radius:0}.swagger-ui .br--right-m{border-bottom-left-radius:0;border-top-left-radius:0}.swagger-ui .br--left-m{border-bottom-right-radius:0;border-top-right-radius:0}}@media screen and (min-width:60em){.swagger-ui .br0-l{border-radius:0}.swagger-ui .br1-l{border-radius:.125rem}.swagger-ui .br2-l{border-radius:.25rem}.swagger-ui .br3-l{border-radius:.5rem}.swagger-ui .br4-l{border-radius:1rem}.swagger-ui .br-100-l{border-radius:100%}.swagger-ui .br-pill-l{border-radius:9999px}.swagger-ui .br--bottom-l{border-top-left-radius:0;border-top-right-radius:0}.swagger-ui .br--top-l{border-bottom-left-radius:0;border-bottom-right-radius:0}.swagger-ui .br--right-l{border-bottom-left-radius:0;border-top-left-radius:0}.swagger-ui .br--left-l{border-bottom-right-radius:0;border-top-right-radius:0}}.swagger-ui .b--dotted{border-style:dotted}.swagger-ui .b--dashed{border-style:dashed}.swagger-ui .b--solid{border-style:solid}.swagger-ui .b--none{border-style:none}@media screen and (min-width:30em){.swagger-ui .b--dotted-ns{border-style:dotted}.swagger-ui .b--dashed-ns{border-style:dashed}.swagger-ui .b--solid-ns{border-style:solid}.swagger-ui .b--none-ns{border-style:none}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .b--dotted-m{border-style:dotted}.swagger-ui .b--dashed-m{border-style:dashed}.swagger-ui .b--solid-m{border-style:solid}.swagger-ui .b--none-m{border-style:none}}@media screen and (min-width:60em){.swagger-ui .b--dotted-l{border-style:dotted}.swagger-ui .b--dashed-l{border-style:dashed}.swagger-ui .b--solid-l{border-style:solid}.swagger-ui .b--none-l{border-style:none}}.swagger-ui .bw0{border-width:0}.swagger-ui .bw1{border-width:.125rem}.swagger-ui .bw2{border-width:.25rem}.swagger-ui .bw3{border-width:.5rem}.swagger-ui .bw4{border-width:1rem}.swagger-ui .bw5{border-width:2rem}.swagger-ui .bt-0{border-top-width:0}.swagger-ui .br-0{border-right-width:0}.swagger-ui .bb-0{border-bottom-width:0}.swagger-ui .bl-0{border-left-width:0}@media screen and (min-width:30em){.swagger-ui .bw0-ns{border-width:0}.swagger-ui .bw1-ns{border-width:.125rem}.swagger-ui .bw2-ns{border-width:.25rem}.swagger-ui .bw3-ns{border-width:.5rem}.swagger-ui .bw4-ns{border-width:1rem}.swagger-ui .bw5-ns{border-width:2rem}.swagger-ui .bt-0-ns{border-top-width:0}.swagger-ui .br-0-ns{border-right-width:0}.swagger-ui .bb-0-ns{border-bottom-width:0}.swagger-ui .bl-0-ns{border-left-width:0}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .bw0-m{border-width:0}.swagger-ui .bw1-m{border-width:.125rem}.swagger-ui .bw2-m{border-width:.25rem}.swagger-ui .bw3-m{border-width:.5rem}.swagger-ui .bw4-m{border-width:1rem}.swagger-ui .bw5-m{border-width:2rem}.swagger-ui .bt-0-m{border-top-width:0}.swagger-ui .br-0-m{border-right-width:0}.swagger-ui .bb-0-m{border-bottom-width:0}.swagger-ui .bl-0-m{border-left-width:0}}@media screen and (min-width:60em){.swagger-ui .bw0-l{border-width:0}.swagger-ui .bw1-l{border-width:.125rem}.swagger-ui .bw2-l{border-width:.25rem}.swagger-ui .bw3-l{border-width:.5rem}.swagger-ui .bw4-l{border-width:1rem}.swagger-ui .bw5-l{border-width:2rem}.swagger-ui .bt-0-l{border-top-width:0}.swagger-ui .br-0-l{border-right-width:0}.swagger-ui .bb-0-l{border-bottom-width:0}.swagger-ui .bl-0-l{border-left-width:0}}.swagger-ui .shadow-1{box-shadow:0 0 4px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-2{box-shadow:0 0 8px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-3{box-shadow:2px 2px 4px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-4{box-shadow:2px 2px 8px 0 rgba(0,0,0,.2)}.swagger-ui .shadow-5{box-shadow:4px 4px 8px 0 rgba(0,0,0,.2)}@media screen and (min-width:30em){.swagger-ui .shadow-1-ns{box-shadow:0 0 4px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-2-ns{box-shadow:0 0 8px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-3-ns{box-shadow:2px 2px 4px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-4-ns{box-shadow:2px 2px 8px 0 rgba(0,0,0,.2)}.swagger-ui .shadow-5-ns{box-shadow:4px 4px 8px 0 rgba(0,0,0,.2)}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .shadow-1-m{box-shadow:0 0 4px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-2-m{box-shadow:0 0 8px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-3-m{box-shadow:2px 2px 4px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-4-m{box-shadow:2px 2px 8px 0 rgba(0,0,0,.2)}.swagger-ui .shadow-5-m{box-shadow:4px 4px 8px 0 rgba(0,0,0,.2)}}@media screen and (min-width:60em){.swagger-ui .shadow-1-l{box-shadow:0 0 4px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-2-l{box-shadow:0 0 8px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-3-l{box-shadow:2px 2px 4px 2px rgba(0,0,0,.2)}.swagger-ui .shadow-4-l{box-shadow:2px 2px 8px 0 rgba(0,0,0,.2)}.swagger-ui .shadow-5-l{box-shadow:4px 4px 8px 0 rgba(0,0,0,.2)}}.swagger-ui .pre{overflow-x:auto;overflow-y:hidden;overflow:scroll}.swagger-ui .top-0{top:0}.swagger-ui .right-0{right:0}.swagger-ui .bottom-0{bottom:0}.swagger-ui .left-0{left:0}.swagger-ui .top-1{top:1rem}.swagger-ui .right-1{right:1rem}.swagger-ui .bottom-1{bottom:1rem}.swagger-ui .left-1{left:1rem}.swagger-ui .top-2{top:2rem}.swagger-ui .right-2{right:2rem}.swagger-ui .bottom-2{bottom:2rem}.swagger-ui .left-2{left:2rem}.swagger-ui .top--1{top:-1rem}.swagger-ui .right--1{right:-1rem}.swagger-ui .bottom--1{bottom:-1rem}.swagger-ui .left--1{left:-1rem}.swagger-ui .top--2{top:-2rem}.swagger-ui .right--2{right:-2rem}.swagger-ui .bottom--2{bottom:-2rem}.swagger-ui .left--2{left:-2rem}.swagger-ui .absolute--fill{bottom:0;left:0;right:0;top:0}@media screen and (min-width:30em){.swagger-ui .top-0-ns{top:0}.swagger-ui .left-0-ns{left:0}.swagger-ui .right-0-ns{right:0}.swagger-ui .bottom-0-ns{bottom:0}.swagger-ui .top-1-ns{top:1rem}.swagger-ui .left-1-ns{left:1rem}.swagger-ui .right-1-ns{right:1rem}.swagger-ui .bottom-1-ns{bottom:1rem}.swagger-ui .top-2-ns{top:2rem}.swagger-ui .left-2-ns{left:2rem}.swagger-ui .right-2-ns{right:2rem}.swagger-ui .bottom-2-ns{bottom:2rem}.swagger-ui .top--1-ns{top:-1rem}.swagger-ui .right--1-ns{right:-1rem}.swagger-ui .bottom--1-ns{bottom:-1rem}.swagger-ui .left--1-ns{left:-1rem}.swagger-ui .top--2-ns{top:-2rem}.swagger-ui .right--2-ns{right:-2rem}.swagger-ui .bottom--2-ns{bottom:-2rem}.swagger-ui .left--2-ns{left:-2rem}.swagger-ui .absolute--fill-ns{bottom:0;left:0;right:0;top:0}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .top-0-m{top:0}.swagger-ui .left-0-m{left:0}.swagger-ui .right-0-m{right:0}.swagger-ui .bottom-0-m{bottom:0}.swagger-ui .top-1-m{top:1rem}.swagger-ui .left-1-m{left:1rem}.swagger-ui .right-1-m{right:1rem}.swagger-ui .bottom-1-m{bottom:1rem}.swagger-ui .top-2-m{top:2rem}.swagger-ui .left-2-m{left:2rem}.swagger-ui .right-2-m{right:2rem}.swagger-ui .bottom-2-m{bottom:2rem}.swagger-ui .top--1-m{top:-1rem}.swagger-ui .right--1-m{right:-1rem}.swagger-ui .bottom--1-m{bottom:-1rem}.swagger-ui .left--1-m{left:-1rem}.swagger-ui .top--2-m{top:-2rem}.swagger-ui .right--2-m{right:-2rem}.swagger-ui .bottom--2-m{bottom:-2rem}.swagger-ui .left--2-m{left:-2rem}.swagger-ui .absolute--fill-m{bottom:0;left:0;right:0;top:0}}@media screen and (min-width:60em){.swagger-ui .top-0-l{top:0}.swagger-ui .left-0-l{left:0}.swagger-ui .right-0-l{right:0}.swagger-ui .bottom-0-l{bottom:0}.swagger-ui .top-1-l{top:1rem}.swagger-ui .left-1-l{left:1rem}.swagger-ui .right-1-l{right:1rem}.swagger-ui .bottom-1-l{bottom:1rem}.swagger-ui .top-2-l{top:2rem}.swagger-ui .left-2-l{left:2rem}.swagger-ui .right-2-l{right:2rem}.swagger-ui .bottom-2-l{bottom:2rem}.swagger-ui .top--1-l{top:-1rem}.swagger-ui .right--1-l{right:-1rem}.swagger-ui .bottom--1-l{bottom:-1rem}.swagger-ui .left--1-l{left:-1rem}.swagger-ui .top--2-l{top:-2rem}.swagger-ui .right--2-l{right:-2rem}.swagger-ui .bottom--2-l{bottom:-2rem}.swagger-ui .left--2-l{left:-2rem}.swagger-ui .absolute--fill-l{bottom:0;left:0;right:0;top:0}}.swagger-ui .cf:after,.swagger-ui .cf:before{content:" ";display:table}.swagger-ui .cf:after{clear:both}.swagger-ui .cf{zoom:1}.swagger-ui .cl{clear:left}.swagger-ui .cr{clear:right}.swagger-ui .cb{clear:both}.swagger-ui .cn{clear:none}@media screen and (min-width:30em){.swagger-ui .cl-ns{clear:left}.swagger-ui .cr-ns{clear:right}.swagger-ui .cb-ns{clear:both}.swagger-ui .cn-ns{clear:none}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .cl-m{clear:left}.swagger-ui .cr-m{clear:right}.swagger-ui .cb-m{clear:both}.swagger-ui .cn-m{clear:none}}@media screen and (min-width:60em){.swagger-ui .cl-l{clear:left}.swagger-ui .cr-l{clear:right}.swagger-ui .cb-l{clear:both}.swagger-ui .cn-l{clear:none}}.swagger-ui .flex{display:flex}.swagger-ui .inline-flex{display:inline-flex}.swagger-ui .flex-auto{flex:1 1 auto;min-height:0;min-width:0}.swagger-ui .flex-none{flex:none}.swagger-ui .flex-column{flex-direction:column}.swagger-ui .flex-row{flex-direction:row}.swagger-ui .flex-wrap{flex-wrap:wrap}.swagger-ui .flex-nowrap{flex-wrap:nowrap}.swagger-ui .flex-wrap-reverse{flex-wrap:wrap-reverse}.swagger-ui .flex-column-reverse{flex-direction:column-reverse}.swagger-ui .flex-row-reverse{flex-direction:row-reverse}.swagger-ui .items-start{align-items:flex-start}.swagger-ui .items-end{align-items:flex-end}.swagger-ui .items-center{align-items:center}.swagger-ui .items-baseline{align-items:baseline}.swagger-ui .items-stretch{align-items:stretch}.swagger-ui .self-start{align-self:flex-start}.swagger-ui .self-end{align-self:flex-end}.swagger-ui .self-center{align-self:center}.swagger-ui .self-baseline{align-self:baseline}.swagger-ui .self-stretch{align-self:stretch}.swagger-ui .justify-start{justify-content:flex-start}.swagger-ui .justify-end{justify-content:flex-end}.swagger-ui .justify-center{justify-content:center}.swagger-ui .justify-between{justify-content:space-between}.swagger-ui .justify-around{justify-content:space-around}.swagger-ui .content-start{align-content:flex-start}.swagger-ui .content-end{align-content:flex-end}.swagger-ui .content-center{align-content:center}.swagger-ui .content-between{align-content:space-between}.swagger-ui .content-around{align-content:space-around}.swagger-ui .content-stretch{align-content:stretch}.swagger-ui .order-0{order:0}.swagger-ui .order-1{order:1}.swagger-ui .order-2{order:2}.swagger-ui .order-3{order:3}.swagger-ui .order-4{order:4}.swagger-ui .order-5{order:5}.swagger-ui .order-6{order:6}.swagger-ui .order-7{order:7}.swagger-ui .order-8{order:8}.swagger-ui .order-last{order:99999}.swagger-ui .flex-grow-0{flex-grow:0}.swagger-ui .flex-grow-1{flex-grow:1}.swagger-ui .flex-shrink-0{flex-shrink:0}.swagger-ui .flex-shrink-1{flex-shrink:1}@media screen and (min-width:30em){.swagger-ui .flex-ns{display:flex}.swagger-ui .inline-flex-ns{display:inline-flex}.swagger-ui .flex-auto-ns{flex:1 1 auto;min-height:0;min-width:0}.swagger-ui .flex-none-ns{flex:none}.swagger-ui .flex-column-ns{flex-direction:column}.swagger-ui .flex-row-ns{flex-direction:row}.swagger-ui .flex-wrap-ns{flex-wrap:wrap}.swagger-ui .flex-nowrap-ns{flex-wrap:nowrap}.swagger-ui .flex-wrap-reverse-ns{flex-wrap:wrap-reverse}.swagger-ui .flex-column-reverse-ns{flex-direction:column-reverse}.swagger-ui .flex-row-reverse-ns{flex-direction:row-reverse}.swagger-ui .items-start-ns{align-items:flex-start}.swagger-ui .items-end-ns{align-items:flex-end}.swagger-ui .items-center-ns{align-items:center}.swagger-ui .items-baseline-ns{align-items:baseline}.swagger-ui .items-stretch-ns{align-items:stretch}.swagger-ui .self-start-ns{align-self:flex-start}.swagger-ui .self-end-ns{align-self:flex-end}.swagger-ui .self-center-ns{align-self:center}.swagger-ui .self-baseline-ns{align-self:baseline}.swagger-ui .self-stretch-ns{align-self:stretch}.swagger-ui .justify-start-ns{justify-content:flex-start}.swagger-ui .justify-end-ns{justify-content:flex-end}.swagger-ui .justify-center-ns{justify-content:center}.swagger-ui .justify-between-ns{justify-content:space-between}.swagger-ui .justify-around-ns{justify-content:space-around}.swagger-ui .content-start-ns{align-content:flex-start}.swagger-ui .content-end-ns{align-content:flex-end}.swagger-ui .content-center-ns{align-content:center}.swagger-ui .content-between-ns{align-content:space-between}.swagger-ui .content-around-ns{align-content:space-around}.swagger-ui .content-stretch-ns{align-content:stretch}.swagger-ui .order-0-ns{order:0}.swagger-ui .order-1-ns{order:1}.swagger-ui .order-2-ns{order:2}.swagger-ui .order-3-ns{order:3}.swagger-ui .order-4-ns{order:4}.swagger-ui .order-5-ns{order:5}.swagger-ui .order-6-ns{order:6}.swagger-ui .order-7-ns{order:7}.swagger-ui .order-8-ns{order:8}.swagger-ui .order-last-ns{order:99999}.swagger-ui .flex-grow-0-ns{flex-grow:0}.swagger-ui .flex-grow-1-ns{flex-grow:1}.swagger-ui .flex-shrink-0-ns{flex-shrink:0}.swagger-ui .flex-shrink-1-ns{flex-shrink:1}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .flex-m{display:flex}.swagger-ui .inline-flex-m{display:inline-flex}.swagger-ui .flex-auto-m{flex:1 1 auto;min-height:0;min-width:0}.swagger-ui .flex-none-m{flex:none}.swagger-ui .flex-column-m{flex-direction:column}.swagger-ui .flex-row-m{flex-direction:row}.swagger-ui .flex-wrap-m{flex-wrap:wrap}.swagger-ui .flex-nowrap-m{flex-wrap:nowrap}.swagger-ui .flex-wrap-reverse-m{flex-wrap:wrap-reverse}.swagger-ui .flex-column-reverse-m{flex-direction:column-reverse}.swagger-ui .flex-row-reverse-m{flex-direction:row-reverse}.swagger-ui .items-start-m{align-items:flex-start}.swagger-ui .items-end-m{align-items:flex-end}.swagger-ui .items-center-m{align-items:center}.swagger-ui .items-baseline-m{align-items:baseline}.swagger-ui .items-stretch-m{align-items:stretch}.swagger-ui .self-start-m{align-self:flex-start}.swagger-ui .self-end-m{align-self:flex-end}.swagger-ui .self-center-m{align-self:center}.swagger-ui .self-baseline-m{align-self:baseline}.swagger-ui .self-stretch-m{align-self:stretch}.swagger-ui .justify-start-m{justify-content:flex-start}.swagger-ui .justify-end-m{justify-content:flex-end}.swagger-ui .justify-center-m{justify-content:center}.swagger-ui .justify-between-m{justify-content:space-between}.swagger-ui .justify-around-m{justify-content:space-around}.swagger-ui .content-start-m{align-content:flex-start}.swagger-ui .content-end-m{align-content:flex-end}.swagger-ui .content-center-m{align-content:center}.swagger-ui .content-between-m{align-content:space-between}.swagger-ui .content-around-m{align-content:space-around}.swagger-ui .content-stretch-m{align-content:stretch}.swagger-ui .order-0-m{order:0}.swagger-ui .order-1-m{order:1}.swagger-ui .order-2-m{order:2}.swagger-ui .order-3-m{order:3}.swagger-ui .order-4-m{order:4}.swagger-ui .order-5-m{order:5}.swagger-ui .order-6-m{order:6}.swagger-ui .order-7-m{order:7}.swagger-ui .order-8-m{order:8}.swagger-ui .order-last-m{order:99999}.swagger-ui .flex-grow-0-m{flex-grow:0}.swagger-ui .flex-grow-1-m{flex-grow:1}.swagger-ui .flex-shrink-0-m{flex-shrink:0}.swagger-ui .flex-shrink-1-m{flex-shrink:1}}@media screen and (min-width:60em){.swagger-ui .flex-l{display:flex}.swagger-ui .inline-flex-l{display:inline-flex}.swagger-ui .flex-auto-l{flex:1 1 auto;min-height:0;min-width:0}.swagger-ui .flex-none-l{flex:none}.swagger-ui .flex-column-l{flex-direction:column}.swagger-ui .flex-row-l{flex-direction:row}.swagger-ui .flex-wrap-l{flex-wrap:wrap}.swagger-ui .flex-nowrap-l{flex-wrap:nowrap}.swagger-ui .flex-wrap-reverse-l{flex-wrap:wrap-reverse}.swagger-ui .flex-column-reverse-l{flex-direction:column-reverse}.swagger-ui .flex-row-reverse-l{flex-direction:row-reverse}.swagger-ui .items-start-l{align-items:flex-start}.swagger-ui .items-end-l{align-items:flex-end}.swagger-ui .items-center-l{align-items:center}.swagger-ui .items-baseline-l{align-items:baseline}.swagger-ui .items-stretch-l{align-items:stretch}.swagger-ui .self-start-l{align-self:flex-start}.swagger-ui .self-end-l{align-self:flex-end}.swagger-ui .self-center-l{align-self:center}.swagger-ui .self-baseline-l{align-self:baseline}.swagger-ui .self-stretch-l{align-self:stretch}.swagger-ui .justify-start-l{justify-content:flex-start}.swagger-ui .justify-end-l{justify-content:flex-end}.swagger-ui .justify-center-l{justify-content:center}.swagger-ui .justify-between-l{justify-content:space-between}.swagger-ui .justify-around-l{justify-content:space-around}.swagger-ui .content-start-l{align-content:flex-start}.swagger-ui .content-end-l{align-content:flex-end}.swagger-ui .content-center-l{align-content:center}.swagger-ui .content-between-l{align-content:space-between}.swagger-ui .content-around-l{align-content:space-around}.swagger-ui .content-stretch-l{align-content:stretch}.swagger-ui .order-0-l{order:0}.swagger-ui .order-1-l{order:1}.swagger-ui .order-2-l{order:2}.swagger-ui .order-3-l{order:3}.swagger-ui .order-4-l{order:4}.swagger-ui .order-5-l{order:5}.swagger-ui .order-6-l{order:6}.swagger-ui .order-7-l{order:7}.swagger-ui .order-8-l{order:8}.swagger-ui .order-last-l{order:99999}.swagger-ui .flex-grow-0-l{flex-grow:0}.swagger-ui .flex-grow-1-l{flex-grow:1}.swagger-ui .flex-shrink-0-l{flex-shrink:0}.swagger-ui .flex-shrink-1-l{flex-shrink:1}}.swagger-ui .dn{display:none}.swagger-ui .di{display:inline}.swagger-ui .db{display:block}.swagger-ui .dib{display:inline-block}.swagger-ui .dit{display:inline-table}.swagger-ui .dt{display:table}.swagger-ui .dtc{display:table-cell}.swagger-ui .dt-row{display:table-row}.swagger-ui .dt-row-group{display:table-row-group}.swagger-ui .dt-column{display:table-column}.swagger-ui .dt-column-group{display:table-column-group}.swagger-ui .dt--fixed{table-layout:fixed;width:100%}@media screen and (min-width:30em){.swagger-ui .dn-ns{display:none}.swagger-ui .di-ns{display:inline}.swagger-ui .db-ns{display:block}.swagger-ui .dib-ns{display:inline-block}.swagger-ui .dit-ns{display:inline-table}.swagger-ui .dt-ns{display:table}.swagger-ui .dtc-ns{display:table-cell}.swagger-ui .dt-row-ns{display:table-row}.swagger-ui .dt-row-group-ns{display:table-row-group}.swagger-ui .dt-column-ns{display:table-column}.swagger-ui .dt-column-group-ns{display:table-column-group}.swagger-ui .dt--fixed-ns{table-layout:fixed;width:100%}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .dn-m{display:none}.swagger-ui .di-m{display:inline}.swagger-ui .db-m{display:block}.swagger-ui .dib-m{display:inline-block}.swagger-ui .dit-m{display:inline-table}.swagger-ui .dt-m{display:table}.swagger-ui .dtc-m{display:table-cell}.swagger-ui .dt-row-m{display:table-row}.swagger-ui .dt-row-group-m{display:table-row-group}.swagger-ui .dt-column-m{display:table-column}.swagger-ui .dt-column-group-m{display:table-column-group}.swagger-ui .dt--fixed-m{table-layout:fixed;width:100%}}@media screen and (min-width:60em){.swagger-ui .dn-l{display:none}.swagger-ui .di-l{display:inline}.swagger-ui .db-l{display:block}.swagger-ui .dib-l{display:inline-block}.swagger-ui .dit-l{display:inline-table}.swagger-ui .dt-l{display:table}.swagger-ui .dtc-l{display:table-cell}.swagger-ui .dt-row-l{display:table-row}.swagger-ui .dt-row-group-l{display:table-row-group}.swagger-ui .dt-column-l{display:table-column}.swagger-ui .dt-column-group-l{display:table-column-group}.swagger-ui .dt--fixed-l{table-layout:fixed;width:100%}}.swagger-ui .fl{_display:inline;float:left}.swagger-ui .fr{_display:inline;float:right}.swagger-ui .fn{float:none}@media screen and (min-width:30em){.swagger-ui .fl-ns{_display:inline;float:left}.swagger-ui .fr-ns{_display:inline;float:right}.swagger-ui .fn-ns{float:none}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .fl-m{_display:inline;float:left}.swagger-ui .fr-m{_display:inline;float:right}.swagger-ui .fn-m{float:none}}@media screen and (min-width:60em){.swagger-ui .fl-l{_display:inline;float:left}.swagger-ui .fr-l{_display:inline;float:right}.swagger-ui .fn-l{float:none}}.swagger-ui .sans-serif{font-family:-apple-system,BlinkMacSystemFont,avenir next,avenir,helvetica,helvetica neue,ubuntu,roboto,noto,segoe ui,arial,sans-serif}.swagger-ui .serif{font-family:georgia,serif}.swagger-ui .system-sans-serif{font-family:sans-serif}.swagger-ui .system-serif{font-family:serif}.swagger-ui .code,.swagger-ui code{font-family:Consolas,monaco,monospace}.swagger-ui .courier{font-family:Courier Next,courier,monospace}.swagger-ui .helvetica{font-family:helvetica neue,helvetica,sans-serif}.swagger-ui .avenir{font-family:avenir next,avenir,sans-serif}.swagger-ui .athelas{font-family:athelas,georgia,serif}.swagger-ui .georgia{font-family:georgia,serif}.swagger-ui .times{font-family:times,serif}.swagger-ui .bodoni{font-family:Bodoni MT,serif}.swagger-ui .calisto{font-family:Calisto MT,serif}.swagger-ui .garamond{font-family:garamond,serif}.swagger-ui .baskerville{font-family:baskerville,serif}.swagger-ui .i{font-style:italic}.swagger-ui .fs-normal{font-style:normal}@media screen and (min-width:30em){.swagger-ui .i-ns{font-style:italic}.swagger-ui .fs-normal-ns{font-style:normal}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .i-m{font-style:italic}.swagger-ui .fs-normal-m{font-style:normal}}@media screen and (min-width:60em){.swagger-ui .i-l{font-style:italic}.swagger-ui .fs-normal-l{font-style:normal}}.swagger-ui .normal{font-weight:400}.swagger-ui .b{font-weight:700}.swagger-ui .fw1{font-weight:100}.swagger-ui .fw2{font-weight:200}.swagger-ui .fw3{font-weight:300}.swagger-ui .fw4{font-weight:400}.swagger-ui .fw5{font-weight:500}.swagger-ui .fw6{font-weight:600}.swagger-ui .fw7{font-weight:700}.swagger-ui .fw8{font-weight:800}.swagger-ui .fw9{font-weight:900}@media screen and (min-width:30em){.swagger-ui .normal-ns{font-weight:400}.swagger-ui .b-ns{font-weight:700}.swagger-ui .fw1-ns{font-weight:100}.swagger-ui .fw2-ns{font-weight:200}.swagger-ui .fw3-ns{font-weight:300}.swagger-ui .fw4-ns{font-weight:400}.swagger-ui .fw5-ns{font-weight:500}.swagger-ui .fw6-ns{font-weight:600}.swagger-ui .fw7-ns{font-weight:700}.swagger-ui .fw8-ns{font-weight:800}.swagger-ui .fw9-ns{font-weight:900}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .normal-m{font-weight:400}.swagger-ui .b-m{font-weight:700}.swagger-ui .fw1-m{font-weight:100}.swagger-ui .fw2-m{font-weight:200}.swagger-ui .fw3-m{font-weight:300}.swagger-ui .fw4-m{font-weight:400}.swagger-ui .fw5-m{font-weight:500}.swagger-ui .fw6-m{font-weight:600}.swagger-ui .fw7-m{font-weight:700}.swagger-ui .fw8-m{font-weight:800}.swagger-ui .fw9-m{font-weight:900}}@media screen and (min-width:60em){.swagger-ui .normal-l{font-weight:400}.swagger-ui .b-l{font-weight:700}.swagger-ui .fw1-l{font-weight:100}.swagger-ui .fw2-l{font-weight:200}.swagger-ui .fw3-l{font-weight:300}.swagger-ui .fw4-l{font-weight:400}.swagger-ui .fw5-l{font-weight:500}.swagger-ui .fw6-l{font-weight:600}.swagger-ui .fw7-l{font-weight:700}.swagger-ui .fw8-l{font-weight:800}.swagger-ui .fw9-l{font-weight:900}}.swagger-ui .input-reset{-webkit-appearance:none;-moz-appearance:none}.swagger-ui .button-reset::-moz-focus-inner,.swagger-ui .input-reset::-moz-focus-inner{border:0;padding:0}.swagger-ui .h1{height:1rem}.swagger-ui .h2{height:2rem}.swagger-ui .h3{height:4rem}.swagger-ui .h4{height:8rem}.swagger-ui .h5{height:16rem}.swagger-ui .h-25{height:25%}.swagger-ui .h-50{height:50%}.swagger-ui .h-75{height:75%}.swagger-ui .h-100{height:100%}.swagger-ui .min-h-100{min-height:100%}.swagger-ui .vh-25{height:25vh}.swagger-ui .vh-50{height:50vh}.swagger-ui .vh-75{height:75vh}.swagger-ui .vh-100{height:100vh}.swagger-ui .min-vh-100{min-height:100vh}.swagger-ui .h-auto{height:auto}.swagger-ui .h-inherit{height:inherit}@media screen and (min-width:30em){.swagger-ui .h1-ns{height:1rem}.swagger-ui .h2-ns{height:2rem}.swagger-ui .h3-ns{height:4rem}.swagger-ui .h4-ns{height:8rem}.swagger-ui .h5-ns{height:16rem}.swagger-ui .h-25-ns{height:25%}.swagger-ui .h-50-ns{height:50%}.swagger-ui .h-75-ns{height:75%}.swagger-ui .h-100-ns{height:100%}.swagger-ui .min-h-100-ns{min-height:100%}.swagger-ui .vh-25-ns{height:25vh}.swagger-ui .vh-50-ns{height:50vh}.swagger-ui .vh-75-ns{height:75vh}.swagger-ui .vh-100-ns{height:100vh}.swagger-ui .min-vh-100-ns{min-height:100vh}.swagger-ui .h-auto-ns{height:auto}.swagger-ui .h-inherit-ns{height:inherit}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .h1-m{height:1rem}.swagger-ui .h2-m{height:2rem}.swagger-ui .h3-m{height:4rem}.swagger-ui .h4-m{height:8rem}.swagger-ui .h5-m{height:16rem}.swagger-ui .h-25-m{height:25%}.swagger-ui .h-50-m{height:50%}.swagger-ui .h-75-m{height:75%}.swagger-ui .h-100-m{height:100%}.swagger-ui .min-h-100-m{min-height:100%}.swagger-ui .vh-25-m{height:25vh}.swagger-ui .vh-50-m{height:50vh}.swagger-ui .vh-75-m{height:75vh}.swagger-ui .vh-100-m{height:100vh}.swagger-ui .min-vh-100-m{min-height:100vh}.swagger-ui .h-auto-m{height:auto}.swagger-ui .h-inherit-m{height:inherit}}@media screen and (min-width:60em){.swagger-ui .h1-l{height:1rem}.swagger-ui .h2-l{height:2rem}.swagger-ui .h3-l{height:4rem}.swagger-ui .h4-l{height:8rem}.swagger-ui .h5-l{height:16rem}.swagger-ui .h-25-l{height:25%}.swagger-ui .h-50-l{height:50%}.swagger-ui .h-75-l{height:75%}.swagger-ui .h-100-l{height:100%}.swagger-ui .min-h-100-l{min-height:100%}.swagger-ui .vh-25-l{height:25vh}.swagger-ui .vh-50-l{height:50vh}.swagger-ui .vh-75-l{height:75vh}.swagger-ui .vh-100-l{height:100vh}.swagger-ui .min-vh-100-l{min-height:100vh}.swagger-ui .h-auto-l{height:auto}.swagger-ui .h-inherit-l{height:inherit}}.swagger-ui .tracked{letter-spacing:.1em}.swagger-ui .tracked-tight{letter-spacing:-.05em}.swagger-ui .tracked-mega{letter-spacing:.25em}@media screen and (min-width:30em){.swagger-ui .tracked-ns{letter-spacing:.1em}.swagger-ui .tracked-tight-ns{letter-spacing:-.05em}.swagger-ui .tracked-mega-ns{letter-spacing:.25em}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .tracked-m{letter-spacing:.1em}.swagger-ui .tracked-tight-m{letter-spacing:-.05em}.swagger-ui .tracked-mega-m{letter-spacing:.25em}}@media screen and (min-width:60em){.swagger-ui .tracked-l{letter-spacing:.1em}.swagger-ui .tracked-tight-l{letter-spacing:-.05em}.swagger-ui .tracked-mega-l{letter-spacing:.25em}}.swagger-ui .lh-solid{line-height:1}.swagger-ui .lh-title{line-height:1.25}.swagger-ui .lh-copy{line-height:1.5}@media screen and (min-width:30em){.swagger-ui .lh-solid-ns{line-height:1}.swagger-ui .lh-title-ns{line-height:1.25}.swagger-ui .lh-copy-ns{line-height:1.5}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .lh-solid-m{line-height:1}.swagger-ui .lh-title-m{line-height:1.25}.swagger-ui .lh-copy-m{line-height:1.5}}@media screen and (min-width:60em){.swagger-ui .lh-solid-l{line-height:1}.swagger-ui .lh-title-l{line-height:1.25}.swagger-ui .lh-copy-l{line-height:1.5}}.swagger-ui .link{-webkit-text-decoration:none;text-decoration:none}.swagger-ui .link,.swagger-ui .link:active,.swagger-ui .link:focus,.swagger-ui .link:hover,.swagger-ui .link:link,.swagger-ui .link:visited{transition:color .15s ease-in}.swagger-ui .link:focus{outline:1px dotted currentColor}.swagger-ui .list{list-style-type:none}.swagger-ui .mw-100{max-width:100%}.swagger-ui .mw1{max-width:1rem}.swagger-ui .mw2{max-width:2rem}.swagger-ui .mw3{max-width:4rem}.swagger-ui .mw4{max-width:8rem}.swagger-ui .mw5{max-width:16rem}.swagger-ui .mw6{max-width:32rem}.swagger-ui .mw7{max-width:48rem}.swagger-ui .mw8{max-width:64rem}.swagger-ui .mw9{max-width:96rem}.swagger-ui .mw-none{max-width:none}@media screen and (min-width:30em){.swagger-ui .mw-100-ns{max-width:100%}.swagger-ui .mw1-ns{max-width:1rem}.swagger-ui .mw2-ns{max-width:2rem}.swagger-ui .mw3-ns{max-width:4rem}.swagger-ui .mw4-ns{max-width:8rem}.swagger-ui .mw5-ns{max-width:16rem}.swagger-ui .mw6-ns{max-width:32rem}.swagger-ui .mw7-ns{max-width:48rem}.swagger-ui .mw8-ns{max-width:64rem}.swagger-ui .mw9-ns{max-width:96rem}.swagger-ui .mw-none-ns{max-width:none}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .mw-100-m{max-width:100%}.swagger-ui .mw1-m{max-width:1rem}.swagger-ui .mw2-m{max-width:2rem}.swagger-ui .mw3-m{max-width:4rem}.swagger-ui .mw4-m{max-width:8rem}.swagger-ui .mw5-m{max-width:16rem}.swagger-ui .mw6-m{max-width:32rem}.swagger-ui .mw7-m{max-width:48rem}.swagger-ui .mw8-m{max-width:64rem}.swagger-ui .mw9-m{max-width:96rem}.swagger-ui .mw-none-m{max-width:none}}@media screen and (min-width:60em){.swagger-ui .mw-100-l{max-width:100%}.swagger-ui .mw1-l{max-width:1rem}.swagger-ui .mw2-l{max-width:2rem}.swagger-ui .mw3-l{max-width:4rem}.swagger-ui .mw4-l{max-width:8rem}.swagger-ui .mw5-l{max-width:16rem}.swagger-ui .mw6-l{max-width:32rem}.swagger-ui .mw7-l{max-width:48rem}.swagger-ui .mw8-l{max-width:64rem}.swagger-ui .mw9-l{max-width:96rem}.swagger-ui .mw-none-l{max-width:none}}.swagger-ui .w1{width:1rem}.swagger-ui .w2{width:2rem}.swagger-ui .w3{width:4rem}.swagger-ui .w4{width:8rem}.swagger-ui .w5{width:16rem}.swagger-ui .w-10{width:10%}.swagger-ui .w-20{width:20%}.swagger-ui .w-25{width:25%}.swagger-ui .w-30{width:30%}.swagger-ui .w-33{width:33%}.swagger-ui .w-34{width:34%}.swagger-ui .w-40{width:40%}.swagger-ui .w-50{width:50%}.swagger-ui .w-60{width:60%}.swagger-ui .w-70{width:70%}.swagger-ui .w-75{width:75%}.swagger-ui .w-80{width:80%}.swagger-ui .w-90{width:90%}.swagger-ui .w-100{width:100%}.swagger-ui .w-third{width:33.3333333333%}.swagger-ui .w-two-thirds{width:66.6666666667%}.swagger-ui .w-auto{width:auto}@media screen and (min-width:30em){.swagger-ui .w1-ns{width:1rem}.swagger-ui .w2-ns{width:2rem}.swagger-ui .w3-ns{width:4rem}.swagger-ui .w4-ns{width:8rem}.swagger-ui .w5-ns{width:16rem}.swagger-ui .w-10-ns{width:10%}.swagger-ui .w-20-ns{width:20%}.swagger-ui .w-25-ns{width:25%}.swagger-ui .w-30-ns{width:30%}.swagger-ui .w-33-ns{width:33%}.swagger-ui .w-34-ns{width:34%}.swagger-ui .w-40-ns{width:40%}.swagger-ui .w-50-ns{width:50%}.swagger-ui .w-60-ns{width:60%}.swagger-ui .w-70-ns{width:70%}.swagger-ui .w-75-ns{width:75%}.swagger-ui .w-80-ns{width:80%}.swagger-ui .w-90-ns{width:90%}.swagger-ui .w-100-ns{width:100%}.swagger-ui .w-third-ns{width:33.3333333333%}.swagger-ui .w-two-thirds-ns{width:66.6666666667%}.swagger-ui .w-auto-ns{width:auto}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .w1-m{width:1rem}.swagger-ui .w2-m{width:2rem}.swagger-ui .w3-m{width:4rem}.swagger-ui .w4-m{width:8rem}.swagger-ui .w5-m{width:16rem}.swagger-ui .w-10-m{width:10%}.swagger-ui .w-20-m{width:20%}.swagger-ui .w-25-m{width:25%}.swagger-ui .w-30-m{width:30%}.swagger-ui .w-33-m{width:33%}.swagger-ui .w-34-m{width:34%}.swagger-ui .w-40-m{width:40%}.swagger-ui .w-50-m{width:50%}.swagger-ui .w-60-m{width:60%}.swagger-ui .w-70-m{width:70%}.swagger-ui .w-75-m{width:75%}.swagger-ui .w-80-m{width:80%}.swagger-ui .w-90-m{width:90%}.swagger-ui .w-100-m{width:100%}.swagger-ui .w-third-m{width:33.3333333333%}.swagger-ui .w-two-thirds-m{width:66.6666666667%}.swagger-ui .w-auto-m{width:auto}}@media screen and (min-width:60em){.swagger-ui .w1-l{width:1rem}.swagger-ui .w2-l{width:2rem}.swagger-ui .w3-l{width:4rem}.swagger-ui .w4-l{width:8rem}.swagger-ui .w5-l{width:16rem}.swagger-ui .w-10-l{width:10%}.swagger-ui .w-20-l{width:20%}.swagger-ui .w-25-l{width:25%}.swagger-ui .w-30-l{width:30%}.swagger-ui .w-33-l{width:33%}.swagger-ui .w-34-l{width:34%}.swagger-ui .w-40-l{width:40%}.swagger-ui .w-50-l{width:50%}.swagger-ui .w-60-l{width:60%}.swagger-ui .w-70-l{width:70%}.swagger-ui .w-75-l{width:75%}.swagger-ui .w-80-l{width:80%}.swagger-ui .w-90-l{width:90%}.swagger-ui .w-100-l{width:100%}.swagger-ui .w-third-l{width:33.3333333333%}.swagger-ui .w-two-thirds-l{width:66.6666666667%}.swagger-ui .w-auto-l{width:auto}}.swagger-ui .overflow-visible{overflow:visible}.swagger-ui .overflow-hidden{overflow:hidden}.swagger-ui .overflow-scroll{overflow:scroll}.swagger-ui .overflow-auto{overflow:auto}.swagger-ui .overflow-x-visible{overflow-x:visible}.swagger-ui .overflow-x-hidden{overflow-x:hidden}.swagger-ui .overflow-x-scroll{overflow-x:scroll}.swagger-ui .overflow-x-auto{overflow-x:auto}.swagger-ui .overflow-y-visible{overflow-y:visible}.swagger-ui .overflow-y-hidden{overflow-y:hidden}.swagger-ui .overflow-y-scroll{overflow-y:scroll}.swagger-ui .overflow-y-auto{overflow-y:auto}@media screen and (min-width:30em){.swagger-ui .overflow-visible-ns{overflow:visible}.swagger-ui .overflow-hidden-ns{overflow:hidden}.swagger-ui .overflow-scroll-ns{overflow:scroll}.swagger-ui .overflow-auto-ns{overflow:auto}.swagger-ui .overflow-x-visible-ns{overflow-x:visible}.swagger-ui .overflow-x-hidden-ns{overflow-x:hidden}.swagger-ui .overflow-x-scroll-ns{overflow-x:scroll}.swagger-ui .overflow-x-auto-ns{overflow-x:auto}.swagger-ui .overflow-y-visible-ns{overflow-y:visible}.swagger-ui .overflow-y-hidden-ns{overflow-y:hidden}.swagger-ui .overflow-y-scroll-ns{overflow-y:scroll}.swagger-ui .overflow-y-auto-ns{overflow-y:auto}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .overflow-visible-m{overflow:visible}.swagger-ui .overflow-hidden-m{overflow:hidden}.swagger-ui .overflow-scroll-m{overflow:scroll}.swagger-ui .overflow-auto-m{overflow:auto}.swagger-ui .overflow-x-visible-m{overflow-x:visible}.swagger-ui .overflow-x-hidden-m{overflow-x:hidden}.swagger-ui .overflow-x-scroll-m{overflow-x:scroll}.swagger-ui .overflow-x-auto-m{overflow-x:auto}.swagger-ui .overflow-y-visible-m{overflow-y:visible}.swagger-ui .overflow-y-hidden-m{overflow-y:hidden}.swagger-ui .overflow-y-scroll-m{overflow-y:scroll}.swagger-ui .overflow-y-auto-m{overflow-y:auto}}@media screen and (min-width:60em){.swagger-ui .overflow-visible-l{overflow:visible}.swagger-ui .overflow-hidden-l{overflow:hidden}.swagger-ui .overflow-scroll-l{overflow:scroll}.swagger-ui .overflow-auto-l{overflow:auto}.swagger-ui .overflow-x-visible-l{overflow-x:visible}.swagger-ui .overflow-x-hidden-l{overflow-x:hidden}.swagger-ui .overflow-x-scroll-l{overflow-x:scroll}.swagger-ui .overflow-x-auto-l{overflow-x:auto}.swagger-ui .overflow-y-visible-l{overflow-y:visible}.swagger-ui .overflow-y-hidden-l{overflow-y:hidden}.swagger-ui .overflow-y-scroll-l{overflow-y:scroll}.swagger-ui .overflow-y-auto-l{overflow-y:auto}}.swagger-ui .static{position:static}.swagger-ui .relative{position:relative}.swagger-ui .absolute{position:absolute}.swagger-ui .fixed{position:fixed}@media screen and (min-width:30em){.swagger-ui .static-ns{position:static}.swagger-ui .relative-ns{position:relative}.swagger-ui .absolute-ns{position:absolute}.swagger-ui .fixed-ns{position:fixed}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .static-m{position:static}.swagger-ui .relative-m{position:relative}.swagger-ui .absolute-m{position:absolute}.swagger-ui .fixed-m{position:fixed}}@media screen and (min-width:60em){.swagger-ui .static-l{position:static}.swagger-ui .relative-l{position:relative}.swagger-ui .absolute-l{position:absolute}.swagger-ui .fixed-l{position:fixed}}.swagger-ui .o-100{opacity:1}.swagger-ui .o-90{opacity:.9}.swagger-ui .o-80{opacity:.8}.swagger-ui .o-70{opacity:.7}.swagger-ui .o-60{opacity:.6}.swagger-ui .o-50{opacity:.5}.swagger-ui .o-40{opacity:.4}.swagger-ui .o-30{opacity:.3}.swagger-ui .o-20{opacity:.2}.swagger-ui .o-10{opacity:.1}.swagger-ui .o-05{opacity:.05}.swagger-ui .o-025{opacity:.025}.swagger-ui .o-0{opacity:0}.swagger-ui .rotate-45{transform:rotate(45deg)}.swagger-ui .rotate-90{transform:rotate(90deg)}.swagger-ui .rotate-135{transform:rotate(135deg)}.swagger-ui .rotate-180{transform:rotate(180deg)}.swagger-ui .rotate-225{transform:rotate(225deg)}.swagger-ui .rotate-270{transform:rotate(270deg)}.swagger-ui .rotate-315{transform:rotate(315deg)}@media screen and (min-width:30em){.swagger-ui .rotate-45-ns{transform:rotate(45deg)}.swagger-ui .rotate-90-ns{transform:rotate(90deg)}.swagger-ui .rotate-135-ns{transform:rotate(135deg)}.swagger-ui .rotate-180-ns{transform:rotate(180deg)}.swagger-ui .rotate-225-ns{transform:rotate(225deg)}.swagger-ui .rotate-270-ns{transform:rotate(270deg)}.swagger-ui .rotate-315-ns{transform:rotate(315deg)}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .rotate-45-m{transform:rotate(45deg)}.swagger-ui .rotate-90-m{transform:rotate(90deg)}.swagger-ui .rotate-135-m{transform:rotate(135deg)}.swagger-ui .rotate-180-m{transform:rotate(180deg)}.swagger-ui .rotate-225-m{transform:rotate(225deg)}.swagger-ui .rotate-270-m{transform:rotate(270deg)}.swagger-ui .rotate-315-m{transform:rotate(315deg)}}@media screen and (min-width:60em){.swagger-ui .rotate-45-l{transform:rotate(45deg)}.swagger-ui .rotate-90-l{transform:rotate(90deg)}.swagger-ui .rotate-135-l{transform:rotate(135deg)}.swagger-ui .rotate-180-l{transform:rotate(180deg)}.swagger-ui .rotate-225-l{transform:rotate(225deg)}.swagger-ui .rotate-270-l{transform:rotate(270deg)}.swagger-ui .rotate-315-l{transform:rotate(315deg)}}.swagger-ui .black-90{color:rgba(0,0,0,.9)}.swagger-ui .black-80{color:rgba(0,0,0,.8)}.swagger-ui .black-70{color:rgba(0,0,0,.7)}.swagger-ui .black-60{color:rgba(0,0,0,.6)}.swagger-ui .black-50{color:rgba(0,0,0,.5)}.swagger-ui .black-40{color:rgba(0,0,0,.4)}.swagger-ui .black-30{color:rgba(0,0,0,.3)}.swagger-ui .black-20{color:rgba(0,0,0,.2)}.swagger-ui .black-10{color:rgba(0,0,0,.1)}.swagger-ui .black-05{color:rgba(0,0,0,.05)}.swagger-ui .white-90{color:hsla(0,0%,100%,.9)}.swagger-ui .white-80{color:hsla(0,0%,100%,.8)}.swagger-ui .white-70{color:hsla(0,0%,100%,.7)}.swagger-ui .white-60{color:hsla(0,0%,100%,.6)}.swagger-ui .white-50{color:hsla(0,0%,100%,.5)}.swagger-ui .white-40{color:hsla(0,0%,100%,.4)}.swagger-ui .white-30{color:hsla(0,0%,100%,.3)}.swagger-ui .white-20{color:hsla(0,0%,100%,.2)}.swagger-ui .white-10{color:hsla(0,0%,100%,.1)}.swagger-ui .black{color:#000}.swagger-ui .near-black{color:#111}.swagger-ui .dark-gray{color:#333}.swagger-ui .mid-gray{color:#555}.swagger-ui .gray{color:#777}.swagger-ui .silver{color:#999}.swagger-ui .light-silver{color:#aaa}.swagger-ui .moon-gray{color:#ccc}.swagger-ui .light-gray{color:#eee}.swagger-ui .near-white{color:#f4f4f4}.swagger-ui .white{color:#fff}.swagger-ui .dark-red{color:#e7040f}.swagger-ui .red{color:#ff4136}.swagger-ui .light-red{color:#ff725c}.swagger-ui .orange{color:#ff6300}.swagger-ui .gold{color:#ffb700}.swagger-ui .yellow{color:gold}.swagger-ui .light-yellow{color:#fbf1a9}.swagger-ui .purple{color:#5e2ca5}.swagger-ui .light-purple{color:#a463f2}.swagger-ui .dark-pink{color:#d5008f}.swagger-ui .hot-pink{color:#ff41b4}.swagger-ui .pink{color:#ff80cc}.swagger-ui .light-pink{color:#ffa3d7}.swagger-ui .dark-green{color:#137752}.swagger-ui .green{color:#19a974}.swagger-ui .light-green{color:#9eebcf}.swagger-ui .navy{color:#001b44}.swagger-ui .dark-blue{color:#00449e}.swagger-ui .blue{color:#357edd}.swagger-ui .light-blue{color:#96ccff}.swagger-ui .lightest-blue{color:#cdecff}.swagger-ui .washed-blue{color:#f6fffe}.swagger-ui .washed-green{color:#e8fdf5}.swagger-ui .washed-yellow{color:#fffceb}.swagger-ui .washed-red{color:#ffdfdf}.swagger-ui .color-inherit{color:inherit}.swagger-ui .bg-black-90{background-color:rgba(0,0,0,.9)}.swagger-ui .bg-black-80{background-color:rgba(0,0,0,.8)}.swagger-ui .bg-black-70{background-color:rgba(0,0,0,.7)}.swagger-ui .bg-black-60{background-color:rgba(0,0,0,.6)}.swagger-ui .bg-black-50{background-color:rgba(0,0,0,.5)}.swagger-ui .bg-black-40{background-color:rgba(0,0,0,.4)}.swagger-ui .bg-black-30{background-color:rgba(0,0,0,.3)}.swagger-ui .bg-black-20{background-color:rgba(0,0,0,.2)}.swagger-ui .bg-black-10{background-color:rgba(0,0,0,.1)}.swagger-ui .bg-black-05{background-color:rgba(0,0,0,.05)}.swagger-ui .bg-white-90{background-color:hsla(0,0%,100%,.9)}.swagger-ui .bg-white-80{background-color:hsla(0,0%,100%,.8)}.swagger-ui .bg-white-70{background-color:hsla(0,0%,100%,.7)}.swagger-ui .bg-white-60{background-color:hsla(0,0%,100%,.6)}.swagger-ui .bg-white-50{background-color:hsla(0,0%,100%,.5)}.swagger-ui .bg-white-40{background-color:hsla(0,0%,100%,.4)}.swagger-ui .bg-white-30{background-color:hsla(0,0%,100%,.3)}.swagger-ui .bg-white-20{background-color:hsla(0,0%,100%,.2)}.swagger-ui .bg-white-10{background-color:hsla(0,0%,100%,.1)}.swagger-ui .bg-black{background-color:#000}.swagger-ui .bg-near-black{background-color:#111}.swagger-ui .bg-dark-gray{background-color:#333}.swagger-ui .bg-mid-gray{background-color:#555}.swagger-ui .bg-gray{background-color:#777}.swagger-ui .bg-silver{background-color:#999}.swagger-ui .bg-light-silver{background-color:#aaa}.swagger-ui .bg-moon-gray{background-color:#ccc}.swagger-ui .bg-light-gray{background-color:#eee}.swagger-ui .bg-near-white{background-color:#f4f4f4}.swagger-ui .bg-white{background-color:#fff}.swagger-ui .bg-transparent{background-color:transparent}.swagger-ui .bg-dark-red{background-color:#e7040f}.swagger-ui .bg-red{background-color:#ff4136}.swagger-ui .bg-light-red{background-color:#ff725c}.swagger-ui .bg-orange{background-color:#ff6300}.swagger-ui .bg-gold{background-color:#ffb700}.swagger-ui .bg-yellow{background-color:gold}.swagger-ui .bg-light-yellow{background-color:#fbf1a9}.swagger-ui .bg-purple{background-color:#5e2ca5}.swagger-ui .bg-light-purple{background-color:#a463f2}.swagger-ui .bg-dark-pink{background-color:#d5008f}.swagger-ui .bg-hot-pink{background-color:#ff41b4}.swagger-ui .bg-pink{background-color:#ff80cc}.swagger-ui .bg-light-pink{background-color:#ffa3d7}.swagger-ui .bg-dark-green{background-color:#137752}.swagger-ui .bg-green{background-color:#19a974}.swagger-ui .bg-light-green{background-color:#9eebcf}.swagger-ui .bg-navy{background-color:#001b44}.swagger-ui .bg-dark-blue{background-color:#00449e}.swagger-ui .bg-blue{background-color:#357edd}.swagger-ui .bg-light-blue{background-color:#96ccff}.swagger-ui .bg-lightest-blue{background-color:#cdecff}.swagger-ui .bg-washed-blue{background-color:#f6fffe}.swagger-ui .bg-washed-green{background-color:#e8fdf5}.swagger-ui .bg-washed-yellow{background-color:#fffceb}.swagger-ui .bg-washed-red{background-color:#ffdfdf}.swagger-ui .bg-inherit{background-color:inherit}.swagger-ui .hover-black:focus,.swagger-ui .hover-black:hover{color:#000}.swagger-ui .hover-near-black:focus,.swagger-ui .hover-near-black:hover{color:#111}.swagger-ui .hover-dark-gray:focus,.swagger-ui .hover-dark-gray:hover{color:#333}.swagger-ui .hover-mid-gray:focus,.swagger-ui .hover-mid-gray:hover{color:#555}.swagger-ui .hover-gray:focus,.swagger-ui .hover-gray:hover{color:#777}.swagger-ui .hover-silver:focus,.swagger-ui .hover-silver:hover{color:#999}.swagger-ui .hover-light-silver:focus,.swagger-ui .hover-light-silver:hover{color:#aaa}.swagger-ui .hover-moon-gray:focus,.swagger-ui .hover-moon-gray:hover{color:#ccc}.swagger-ui .hover-light-gray:focus,.swagger-ui .hover-light-gray:hover{color:#eee}.swagger-ui .hover-near-white:focus,.swagger-ui .hover-near-white:hover{color:#f4f4f4}.swagger-ui .hover-white:focus,.swagger-ui .hover-white:hover{color:#fff}.swagger-ui .hover-black-90:focus,.swagger-ui .hover-black-90:hover{color:rgba(0,0,0,.9)}.swagger-ui .hover-black-80:focus,.swagger-ui .hover-black-80:hover{color:rgba(0,0,0,.8)}.swagger-ui .hover-black-70:focus,.swagger-ui .hover-black-70:hover{color:rgba(0,0,0,.7)}.swagger-ui .hover-black-60:focus,.swagger-ui .hover-black-60:hover{color:rgba(0,0,0,.6)}.swagger-ui .hover-black-50:focus,.swagger-ui .hover-black-50:hover{color:rgba(0,0,0,.5)}.swagger-ui .hover-black-40:focus,.swagger-ui .hover-black-40:hover{color:rgba(0,0,0,.4)}.swagger-ui .hover-black-30:focus,.swagger-ui .hover-black-30:hover{color:rgba(0,0,0,.3)}.swagger-ui .hover-black-20:focus,.swagger-ui .hover-black-20:hover{color:rgba(0,0,0,.2)}.swagger-ui .hover-black-10:focus,.swagger-ui .hover-black-10:hover{color:rgba(0,0,0,.1)}.swagger-ui .hover-white-90:focus,.swagger-ui .hover-white-90:hover{color:hsla(0,0%,100%,.9)}.swagger-ui .hover-white-80:focus,.swagger-ui .hover-white-80:hover{color:hsla(0,0%,100%,.8)}.swagger-ui .hover-white-70:focus,.swagger-ui .hover-white-70:hover{color:hsla(0,0%,100%,.7)}.swagger-ui .hover-white-60:focus,.swagger-ui .hover-white-60:hover{color:hsla(0,0%,100%,.6)}.swagger-ui .hover-white-50:focus,.swagger-ui .hover-white-50:hover{color:hsla(0,0%,100%,.5)}.swagger-ui .hover-white-40:focus,.swagger-ui .hover-white-40:hover{color:hsla(0,0%,100%,.4)}.swagger-ui .hover-white-30:focus,.swagger-ui .hover-white-30:hover{color:hsla(0,0%,100%,.3)}.swagger-ui .hover-white-20:focus,.swagger-ui .hover-white-20:hover{color:hsla(0,0%,100%,.2)}.swagger-ui .hover-white-10:focus,.swagger-ui .hover-white-10:hover{color:hsla(0,0%,100%,.1)}.swagger-ui .hover-inherit:focus,.swagger-ui .hover-inherit:hover{color:inherit}.swagger-ui .hover-bg-black:focus,.swagger-ui .hover-bg-black:hover{background-color:#000}.swagger-ui .hover-bg-near-black:focus,.swagger-ui .hover-bg-near-black:hover{background-color:#111}.swagger-ui .hover-bg-dark-gray:focus,.swagger-ui .hover-bg-dark-gray:hover{background-color:#333}.swagger-ui .hover-bg-mid-gray:focus,.swagger-ui .hover-bg-mid-gray:hover{background-color:#555}.swagger-ui .hover-bg-gray:focus,.swagger-ui .hover-bg-gray:hover{background-color:#777}.swagger-ui .hover-bg-silver:focus,.swagger-ui .hover-bg-silver:hover{background-color:#999}.swagger-ui .hover-bg-light-silver:focus,.swagger-ui .hover-bg-light-silver:hover{background-color:#aaa}.swagger-ui .hover-bg-moon-gray:focus,.swagger-ui .hover-bg-moon-gray:hover{background-color:#ccc}.swagger-ui .hover-bg-light-gray:focus,.swagger-ui .hover-bg-light-gray:hover{background-color:#eee}.swagger-ui .hover-bg-near-white:focus,.swagger-ui .hover-bg-near-white:hover{background-color:#f4f4f4}.swagger-ui .hover-bg-white:focus,.swagger-ui .hover-bg-white:hover{background-color:#fff}.swagger-ui .hover-bg-transparent:focus,.swagger-ui .hover-bg-transparent:hover{background-color:transparent}.swagger-ui .hover-bg-black-90:focus,.swagger-ui .hover-bg-black-90:hover{background-color:rgba(0,0,0,.9)}.swagger-ui .hover-bg-black-80:focus,.swagger-ui .hover-bg-black-80:hover{background-color:rgba(0,0,0,.8)}.swagger-ui .hover-bg-black-70:focus,.swagger-ui .hover-bg-black-70:hover{background-color:rgba(0,0,0,.7)}.swagger-ui .hover-bg-black-60:focus,.swagger-ui .hover-bg-black-60:hover{background-color:rgba(0,0,0,.6)}.swagger-ui .hover-bg-black-50:focus,.swagger-ui .hover-bg-black-50:hover{background-color:rgba(0,0,0,.5)}.swagger-ui .hover-bg-black-40:focus,.swagger-ui .hover-bg-black-40:hover{background-color:rgba(0,0,0,.4)}.swagger-ui .hover-bg-black-30:focus,.swagger-ui .hover-bg-black-30:hover{background-color:rgba(0,0,0,.3)}.swagger-ui .hover-bg-black-20:focus,.swagger-ui .hover-bg-black-20:hover{background-color:rgba(0,0,0,.2)}.swagger-ui .hover-bg-black-10:focus,.swagger-ui .hover-bg-black-10:hover{background-color:rgba(0,0,0,.1)}.swagger-ui .hover-bg-white-90:focus,.swagger-ui .hover-bg-white-90:hover{background-color:hsla(0,0%,100%,.9)}.swagger-ui .hover-bg-white-80:focus,.swagger-ui .hover-bg-white-80:hover{background-color:hsla(0,0%,100%,.8)}.swagger-ui .hover-bg-white-70:focus,.swagger-ui .hover-bg-white-70:hover{background-color:hsla(0,0%,100%,.7)}.swagger-ui .hover-bg-white-60:focus,.swagger-ui .hover-bg-white-60:hover{background-color:hsla(0,0%,100%,.6)}.swagger-ui .hover-bg-white-50:focus,.swagger-ui .hover-bg-white-50:hover{background-color:hsla(0,0%,100%,.5)}.swagger-ui .hover-bg-white-40:focus,.swagger-ui .hover-bg-white-40:hover{background-color:hsla(0,0%,100%,.4)}.swagger-ui .hover-bg-white-30:focus,.swagger-ui .hover-bg-white-30:hover{background-color:hsla(0,0%,100%,.3)}.swagger-ui .hover-bg-white-20:focus,.swagger-ui .hover-bg-white-20:hover{background-color:hsla(0,0%,100%,.2)}.swagger-ui .hover-bg-white-10:focus,.swagger-ui .hover-bg-white-10:hover{background-color:hsla(0,0%,100%,.1)}.swagger-ui .hover-dark-red:focus,.swagger-ui .hover-dark-red:hover{color:#e7040f}.swagger-ui .hover-red:focus,.swagger-ui .hover-red:hover{color:#ff4136}.swagger-ui .hover-light-red:focus,.swagger-ui .hover-light-red:hover{color:#ff725c}.swagger-ui .hover-orange:focus,.swagger-ui .hover-orange:hover{color:#ff6300}.swagger-ui .hover-gold:focus,.swagger-ui .hover-gold:hover{color:#ffb700}.swagger-ui .hover-yellow:focus,.swagger-ui .hover-yellow:hover{color:gold}.swagger-ui .hover-light-yellow:focus,.swagger-ui .hover-light-yellow:hover{color:#fbf1a9}.swagger-ui .hover-purple:focus,.swagger-ui .hover-purple:hover{color:#5e2ca5}.swagger-ui .hover-light-purple:focus,.swagger-ui .hover-light-purple:hover{color:#a463f2}.swagger-ui .hover-dark-pink:focus,.swagger-ui .hover-dark-pink:hover{color:#d5008f}.swagger-ui .hover-hot-pink:focus,.swagger-ui .hover-hot-pink:hover{color:#ff41b4}.swagger-ui .hover-pink:focus,.swagger-ui .hover-pink:hover{color:#ff80cc}.swagger-ui .hover-light-pink:focus,.swagger-ui .hover-light-pink:hover{color:#ffa3d7}.swagger-ui .hover-dark-green:focus,.swagger-ui .hover-dark-green:hover{color:#137752}.swagger-ui .hover-green:focus,.swagger-ui .hover-green:hover{color:#19a974}.swagger-ui .hover-light-green:focus,.swagger-ui .hover-light-green:hover{color:#9eebcf}.swagger-ui .hover-navy:focus,.swagger-ui .hover-navy:hover{color:#001b44}.swagger-ui .hover-dark-blue:focus,.swagger-ui .hover-dark-blue:hover{color:#00449e}.swagger-ui .hover-blue:focus,.swagger-ui .hover-blue:hover{color:#357edd}.swagger-ui .hover-light-blue:focus,.swagger-ui .hover-light-blue:hover{color:#96ccff}.swagger-ui .hover-lightest-blue:focus,.swagger-ui .hover-lightest-blue:hover{color:#cdecff}.swagger-ui .hover-washed-blue:focus,.swagger-ui .hover-washed-blue:hover{color:#f6fffe}.swagger-ui .hover-washed-green:focus,.swagger-ui .hover-washed-green:hover{color:#e8fdf5}.swagger-ui .hover-washed-yellow:focus,.swagger-ui .hover-washed-yellow:hover{color:#fffceb}.swagger-ui .hover-washed-red:focus,.swagger-ui .hover-washed-red:hover{color:#ffdfdf}.swagger-ui .hover-bg-dark-red:focus,.swagger-ui .hover-bg-dark-red:hover{background-color:#e7040f}.swagger-ui .hover-bg-red:focus,.swagger-ui .hover-bg-red:hover{background-color:#ff4136}.swagger-ui .hover-bg-light-red:focus,.swagger-ui .hover-bg-light-red:hover{background-color:#ff725c}.swagger-ui .hover-bg-orange:focus,.swagger-ui .hover-bg-orange:hover{background-color:#ff6300}.swagger-ui .hover-bg-gold:focus,.swagger-ui .hover-bg-gold:hover{background-color:#ffb700}.swagger-ui .hover-bg-yellow:focus,.swagger-ui .hover-bg-yellow:hover{background-color:gold}.swagger-ui .hover-bg-light-yellow:focus,.swagger-ui .hover-bg-light-yellow:hover{background-color:#fbf1a9}.swagger-ui .hover-bg-purple:focus,.swagger-ui .hover-bg-purple:hover{background-color:#5e2ca5}.swagger-ui .hover-bg-light-purple:focus,.swagger-ui .hover-bg-light-purple:hover{background-color:#a463f2}.swagger-ui .hover-bg-dark-pink:focus,.swagger-ui .hover-bg-dark-pink:hover{background-color:#d5008f}.swagger-ui .hover-bg-hot-pink:focus,.swagger-ui .hover-bg-hot-pink:hover{background-color:#ff41b4}.swagger-ui .hover-bg-pink:focus,.swagger-ui .hover-bg-pink:hover{background-color:#ff80cc}.swagger-ui .hover-bg-light-pink:focus,.swagger-ui .hover-bg-light-pink:hover{background-color:#ffa3d7}.swagger-ui .hover-bg-dark-green:focus,.swagger-ui .hover-bg-dark-green:hover{background-color:#137752}.swagger-ui .hover-bg-green:focus,.swagger-ui .hover-bg-green:hover{background-color:#19a974}.swagger-ui .hover-bg-light-green:focus,.swagger-ui .hover-bg-light-green:hover{background-color:#9eebcf}.swagger-ui .hover-bg-navy:focus,.swagger-ui .hover-bg-navy:hover{background-color:#001b44}.swagger-ui .hover-bg-dark-blue:focus,.swagger-ui .hover-bg-dark-blue:hover{background-color:#00449e}.swagger-ui .hover-bg-blue:focus,.swagger-ui .hover-bg-blue:hover{background-color:#357edd}.swagger-ui .hover-bg-light-blue:focus,.swagger-ui .hover-bg-light-blue:hover{background-color:#96ccff}.swagger-ui .hover-bg-lightest-blue:focus,.swagger-ui .hover-bg-lightest-blue:hover{background-color:#cdecff}.swagger-ui .hover-bg-washed-blue:focus,.swagger-ui .hover-bg-washed-blue:hover{background-color:#f6fffe}.swagger-ui .hover-bg-washed-green:focus,.swagger-ui .hover-bg-washed-green:hover{background-color:#e8fdf5}.swagger-ui .hover-bg-washed-yellow:focus,.swagger-ui .hover-bg-washed-yellow:hover{background-color:#fffceb}.swagger-ui .hover-bg-washed-red:focus,.swagger-ui .hover-bg-washed-red:hover{background-color:#ffdfdf}.swagger-ui .hover-bg-inherit:focus,.swagger-ui .hover-bg-inherit:hover{background-color:inherit}.swagger-ui .pa0{padding:0}.swagger-ui .pa1{padding:.25rem}.swagger-ui .pa2{padding:.5rem}.swagger-ui .pa3{padding:1rem}.swagger-ui .pa4{padding:2rem}.swagger-ui .pa5{padding:4rem}.swagger-ui .pa6{padding:8rem}.swagger-ui .pa7{padding:16rem}.swagger-ui .pl0{padding-left:0}.swagger-ui .pl1{padding-left:.25rem}.swagger-ui .pl2{padding-left:.5rem}.swagger-ui .pl3{padding-left:1rem}.swagger-ui .pl4{padding-left:2rem}.swagger-ui .pl5{padding-left:4rem}.swagger-ui .pl6{padding-left:8rem}.swagger-ui .pl7{padding-left:16rem}.swagger-ui .pr0{padding-right:0}.swagger-ui .pr1{padding-right:.25rem}.swagger-ui .pr2{padding-right:.5rem}.swagger-ui .pr3{padding-right:1rem}.swagger-ui .pr4{padding-right:2rem}.swagger-ui .pr5{padding-right:4rem}.swagger-ui .pr6{padding-right:8rem}.swagger-ui .pr7{padding-right:16rem}.swagger-ui .pb0{padding-bottom:0}.swagger-ui .pb1{padding-bottom:.25rem}.swagger-ui .pb2{padding-bottom:.5rem}.swagger-ui .pb3{padding-bottom:1rem}.swagger-ui .pb4{padding-bottom:2rem}.swagger-ui .pb5{padding-bottom:4rem}.swagger-ui .pb6{padding-bottom:8rem}.swagger-ui .pb7{padding-bottom:16rem}.swagger-ui .pt0{padding-top:0}.swagger-ui .pt1{padding-top:.25rem}.swagger-ui .pt2{padding-top:.5rem}.swagger-ui .pt3{padding-top:1rem}.swagger-ui .pt4{padding-top:2rem}.swagger-ui .pt5{padding-top:4rem}.swagger-ui .pt6{padding-top:8rem}.swagger-ui .pt7{padding-top:16rem}.swagger-ui .pv0{padding-bottom:0;padding-top:0}.swagger-ui .pv1{padding-bottom:.25rem;padding-top:.25rem}.swagger-ui .pv2{padding-bottom:.5rem;padding-top:.5rem}.swagger-ui .pv3{padding-bottom:1rem;padding-top:1rem}.swagger-ui .pv4{padding-bottom:2rem;padding-top:2rem}.swagger-ui .pv5{padding-bottom:4rem;padding-top:4rem}.swagger-ui .pv6{padding-bottom:8rem;padding-top:8rem}.swagger-ui .pv7{padding-bottom:16rem;padding-top:16rem}.swagger-ui .ph0{padding-left:0;padding-right:0}.swagger-ui .ph1{padding-left:.25rem;padding-right:.25rem}.swagger-ui .ph2{padding-left:.5rem;padding-right:.5rem}.swagger-ui .ph3{padding-left:1rem;padding-right:1rem}.swagger-ui .ph4{padding-left:2rem;padding-right:2rem}.swagger-ui .ph5{padding-left:4rem;padding-right:4rem}.swagger-ui .ph6{padding-left:8rem;padding-right:8rem}.swagger-ui .ph7{padding-left:16rem;padding-right:16rem}.swagger-ui .ma0{margin:0}.swagger-ui .ma1{margin:.25rem}.swagger-ui .ma2{margin:.5rem}.swagger-ui .ma3{margin:1rem}.swagger-ui .ma4{margin:2rem}.swagger-ui .ma5{margin:4rem}.swagger-ui .ma6{margin:8rem}.swagger-ui .ma7{margin:16rem}.swagger-ui .ml0{margin-left:0}.swagger-ui .ml1{margin-left:.25rem}.swagger-ui .ml2{margin-left:.5rem}.swagger-ui .ml3{margin-left:1rem}.swagger-ui .ml4{margin-left:2rem}.swagger-ui .ml5{margin-left:4rem}.swagger-ui .ml6{margin-left:8rem}.swagger-ui .ml7{margin-left:16rem}.swagger-ui .mr0{margin-right:0}.swagger-ui .mr1{margin-right:.25rem}.swagger-ui .mr2{margin-right:.5rem}.swagger-ui .mr3{margin-right:1rem}.swagger-ui .mr4{margin-right:2rem}.swagger-ui .mr5{margin-right:4rem}.swagger-ui .mr6{margin-right:8rem}.swagger-ui .mr7{margin-right:16rem}.swagger-ui .mb0{margin-bottom:0}.swagger-ui .mb1{margin-bottom:.25rem}.swagger-ui .mb2{margin-bottom:.5rem}.swagger-ui .mb3{margin-bottom:1rem}.swagger-ui .mb4{margin-bottom:2rem}.swagger-ui .mb5{margin-bottom:4rem}.swagger-ui .mb6{margin-bottom:8rem}.swagger-ui .mb7{margin-bottom:16rem}.swagger-ui .mt0{margin-top:0}.swagger-ui .mt1{margin-top:.25rem}.swagger-ui .mt2{margin-top:.5rem}.swagger-ui .mt3{margin-top:1rem}.swagger-ui .mt4{margin-top:2rem}.swagger-ui .mt5{margin-top:4rem}.swagger-ui .mt6{margin-top:8rem}.swagger-ui .mt7{margin-top:16rem}.swagger-ui .mv0{margin-bottom:0;margin-top:0}.swagger-ui .mv1{margin-bottom:.25rem;margin-top:.25rem}.swagger-ui .mv2{margin-bottom:.5rem;margin-top:.5rem}.swagger-ui .mv3{margin-bottom:1rem;margin-top:1rem}.swagger-ui .mv4{margin-bottom:2rem;margin-top:2rem}.swagger-ui .mv5{margin-bottom:4rem;margin-top:4rem}.swagger-ui .mv6{margin-bottom:8rem;margin-top:8rem}.swagger-ui .mv7{margin-bottom:16rem;margin-top:16rem}.swagger-ui .mh0{margin-left:0;margin-right:0}.swagger-ui .mh1{margin-left:.25rem;margin-right:.25rem}.swagger-ui .mh2{margin-left:.5rem;margin-right:.5rem}.swagger-ui .mh3{margin-left:1rem;margin-right:1rem}.swagger-ui .mh4{margin-left:2rem;margin-right:2rem}.swagger-ui .mh5{margin-left:4rem;margin-right:4rem}.swagger-ui .mh6{margin-left:8rem;margin-right:8rem}.swagger-ui .mh7{margin-left:16rem;margin-right:16rem}@media screen and (min-width:30em){.swagger-ui .pa0-ns{padding:0}.swagger-ui .pa1-ns{padding:.25rem}.swagger-ui .pa2-ns{padding:.5rem}.swagger-ui .pa3-ns{padding:1rem}.swagger-ui .pa4-ns{padding:2rem}.swagger-ui .pa5-ns{padding:4rem}.swagger-ui .pa6-ns{padding:8rem}.swagger-ui .pa7-ns{padding:16rem}.swagger-ui .pl0-ns{padding-left:0}.swagger-ui .pl1-ns{padding-left:.25rem}.swagger-ui .pl2-ns{padding-left:.5rem}.swagger-ui .pl3-ns{padding-left:1rem}.swagger-ui .pl4-ns{padding-left:2rem}.swagger-ui .pl5-ns{padding-left:4rem}.swagger-ui .pl6-ns{padding-left:8rem}.swagger-ui .pl7-ns{padding-left:16rem}.swagger-ui .pr0-ns{padding-right:0}.swagger-ui .pr1-ns{padding-right:.25rem}.swagger-ui .pr2-ns{padding-right:.5rem}.swagger-ui .pr3-ns{padding-right:1rem}.swagger-ui .pr4-ns{padding-right:2rem}.swagger-ui .pr5-ns{padding-right:4rem}.swagger-ui .pr6-ns{padding-right:8rem}.swagger-ui .pr7-ns{padding-right:16rem}.swagger-ui .pb0-ns{padding-bottom:0}.swagger-ui .pb1-ns{padding-bottom:.25rem}.swagger-ui .pb2-ns{padding-bottom:.5rem}.swagger-ui .pb3-ns{padding-bottom:1rem}.swagger-ui .pb4-ns{padding-bottom:2rem}.swagger-ui .pb5-ns{padding-bottom:4rem}.swagger-ui .pb6-ns{padding-bottom:8rem}.swagger-ui .pb7-ns{padding-bottom:16rem}.swagger-ui .pt0-ns{padding-top:0}.swagger-ui .pt1-ns{padding-top:.25rem}.swagger-ui .pt2-ns{padding-top:.5rem}.swagger-ui .pt3-ns{padding-top:1rem}.swagger-ui .pt4-ns{padding-top:2rem}.swagger-ui .pt5-ns{padding-top:4rem}.swagger-ui .pt6-ns{padding-top:8rem}.swagger-ui .pt7-ns{padding-top:16rem}.swagger-ui .pv0-ns{padding-bottom:0;padding-top:0}.swagger-ui .pv1-ns{padding-bottom:.25rem;padding-top:.25rem}.swagger-ui .pv2-ns{padding-bottom:.5rem;padding-top:.5rem}.swagger-ui .pv3-ns{padding-bottom:1rem;padding-top:1rem}.swagger-ui .pv4-ns{padding-bottom:2rem;padding-top:2rem}.swagger-ui .pv5-ns{padding-bottom:4rem;padding-top:4rem}.swagger-ui .pv6-ns{padding-bottom:8rem;padding-top:8rem}.swagger-ui .pv7-ns{padding-bottom:16rem;padding-top:16rem}.swagger-ui .ph0-ns{padding-left:0;padding-right:0}.swagger-ui .ph1-ns{padding-left:.25rem;padding-right:.25rem}.swagger-ui .ph2-ns{padding-left:.5rem;padding-right:.5rem}.swagger-ui .ph3-ns{padding-left:1rem;padding-right:1rem}.swagger-ui .ph4-ns{padding-left:2rem;padding-right:2rem}.swagger-ui .ph5-ns{padding-left:4rem;padding-right:4rem}.swagger-ui .ph6-ns{padding-left:8rem;padding-right:8rem}.swagger-ui .ph7-ns{padding-left:16rem;padding-right:16rem}.swagger-ui .ma0-ns{margin:0}.swagger-ui .ma1-ns{margin:.25rem}.swagger-ui .ma2-ns{margin:.5rem}.swagger-ui .ma3-ns{margin:1rem}.swagger-ui .ma4-ns{margin:2rem}.swagger-ui .ma5-ns{margin:4rem}.swagger-ui .ma6-ns{margin:8rem}.swagger-ui .ma7-ns{margin:16rem}.swagger-ui .ml0-ns{margin-left:0}.swagger-ui .ml1-ns{margin-left:.25rem}.swagger-ui .ml2-ns{margin-left:.5rem}.swagger-ui .ml3-ns{margin-left:1rem}.swagger-ui .ml4-ns{margin-left:2rem}.swagger-ui .ml5-ns{margin-left:4rem}.swagger-ui .ml6-ns{margin-left:8rem}.swagger-ui .ml7-ns{margin-left:16rem}.swagger-ui .mr0-ns{margin-right:0}.swagger-ui .mr1-ns{margin-right:.25rem}.swagger-ui .mr2-ns{margin-right:.5rem}.swagger-ui .mr3-ns{margin-right:1rem}.swagger-ui .mr4-ns{margin-right:2rem}.swagger-ui .mr5-ns{margin-right:4rem}.swagger-ui .mr6-ns{margin-right:8rem}.swagger-ui .mr7-ns{margin-right:16rem}.swagger-ui .mb0-ns{margin-bottom:0}.swagger-ui .mb1-ns{margin-bottom:.25rem}.swagger-ui .mb2-ns{margin-bottom:.5rem}.swagger-ui .mb3-ns{margin-bottom:1rem}.swagger-ui .mb4-ns{margin-bottom:2rem}.swagger-ui .mb5-ns{margin-bottom:4rem}.swagger-ui .mb6-ns{margin-bottom:8rem}.swagger-ui .mb7-ns{margin-bottom:16rem}.swagger-ui .mt0-ns{margin-top:0}.swagger-ui .mt1-ns{margin-top:.25rem}.swagger-ui .mt2-ns{margin-top:.5rem}.swagger-ui .mt3-ns{margin-top:1rem}.swagger-ui .mt4-ns{margin-top:2rem}.swagger-ui .mt5-ns{margin-top:4rem}.swagger-ui .mt6-ns{margin-top:8rem}.swagger-ui .mt7-ns{margin-top:16rem}.swagger-ui .mv0-ns{margin-bottom:0;margin-top:0}.swagger-ui .mv1-ns{margin-bottom:.25rem;margin-top:.25rem}.swagger-ui .mv2-ns{margin-bottom:.5rem;margin-top:.5rem}.swagger-ui .mv3-ns{margin-bottom:1rem;margin-top:1rem}.swagger-ui .mv4-ns{margin-bottom:2rem;margin-top:2rem}.swagger-ui .mv5-ns{margin-bottom:4rem;margin-top:4rem}.swagger-ui .mv6-ns{margin-bottom:8rem;margin-top:8rem}.swagger-ui .mv7-ns{margin-bottom:16rem;margin-top:16rem}.swagger-ui .mh0-ns{margin-left:0;margin-right:0}.swagger-ui .mh1-ns{margin-left:.25rem;margin-right:.25rem}.swagger-ui .mh2-ns{margin-left:.5rem;margin-right:.5rem}.swagger-ui .mh3-ns{margin-left:1rem;margin-right:1rem}.swagger-ui .mh4-ns{margin-left:2rem;margin-right:2rem}.swagger-ui .mh5-ns{margin-left:4rem;margin-right:4rem}.swagger-ui .mh6-ns{margin-left:8rem;margin-right:8rem}.swagger-ui .mh7-ns{margin-left:16rem;margin-right:16rem}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .pa0-m{padding:0}.swagger-ui .pa1-m{padding:.25rem}.swagger-ui .pa2-m{padding:.5rem}.swagger-ui .pa3-m{padding:1rem}.swagger-ui .pa4-m{padding:2rem}.swagger-ui .pa5-m{padding:4rem}.swagger-ui .pa6-m{padding:8rem}.swagger-ui .pa7-m{padding:16rem}.swagger-ui .pl0-m{padding-left:0}.swagger-ui .pl1-m{padding-left:.25rem}.swagger-ui .pl2-m{padding-left:.5rem}.swagger-ui .pl3-m{padding-left:1rem}.swagger-ui .pl4-m{padding-left:2rem}.swagger-ui .pl5-m{padding-left:4rem}.swagger-ui .pl6-m{padding-left:8rem}.swagger-ui .pl7-m{padding-left:16rem}.swagger-ui .pr0-m{padding-right:0}.swagger-ui .pr1-m{padding-right:.25rem}.swagger-ui .pr2-m{padding-right:.5rem}.swagger-ui .pr3-m{padding-right:1rem}.swagger-ui .pr4-m{padding-right:2rem}.swagger-ui .pr5-m{padding-right:4rem}.swagger-ui .pr6-m{padding-right:8rem}.swagger-ui .pr7-m{padding-right:16rem}.swagger-ui .pb0-m{padding-bottom:0}.swagger-ui .pb1-m{padding-bottom:.25rem}.swagger-ui .pb2-m{padding-bottom:.5rem}.swagger-ui .pb3-m{padding-bottom:1rem}.swagger-ui .pb4-m{padding-bottom:2rem}.swagger-ui .pb5-m{padding-bottom:4rem}.swagger-ui .pb6-m{padding-bottom:8rem}.swagger-ui .pb7-m{padding-bottom:16rem}.swagger-ui .pt0-m{padding-top:0}.swagger-ui .pt1-m{padding-top:.25rem}.swagger-ui .pt2-m{padding-top:.5rem}.swagger-ui .pt3-m{padding-top:1rem}.swagger-ui .pt4-m{padding-top:2rem}.swagger-ui .pt5-m{padding-top:4rem}.swagger-ui .pt6-m{padding-top:8rem}.swagger-ui .pt7-m{padding-top:16rem}.swagger-ui .pv0-m{padding-bottom:0;padding-top:0}.swagger-ui .pv1-m{padding-bottom:.25rem;padding-top:.25rem}.swagger-ui .pv2-m{padding-bottom:.5rem;padding-top:.5rem}.swagger-ui .pv3-m{padding-bottom:1rem;padding-top:1rem}.swagger-ui .pv4-m{padding-bottom:2rem;padding-top:2rem}.swagger-ui .pv5-m{padding-bottom:4rem;padding-top:4rem}.swagger-ui .pv6-m{padding-bottom:8rem;padding-top:8rem}.swagger-ui .pv7-m{padding-bottom:16rem;padding-top:16rem}.swagger-ui .ph0-m{padding-left:0;padding-right:0}.swagger-ui .ph1-m{padding-left:.25rem;padding-right:.25rem}.swagger-ui .ph2-m{padding-left:.5rem;padding-right:.5rem}.swagger-ui .ph3-m{padding-left:1rem;padding-right:1rem}.swagger-ui .ph4-m{padding-left:2rem;padding-right:2rem}.swagger-ui .ph5-m{padding-left:4rem;padding-right:4rem}.swagger-ui .ph6-m{padding-left:8rem;padding-right:8rem}.swagger-ui .ph7-m{padding-left:16rem;padding-right:16rem}.swagger-ui .ma0-m{margin:0}.swagger-ui .ma1-m{margin:.25rem}.swagger-ui .ma2-m{margin:.5rem}.swagger-ui .ma3-m{margin:1rem}.swagger-ui .ma4-m{margin:2rem}.swagger-ui .ma5-m{margin:4rem}.swagger-ui .ma6-m{margin:8rem}.swagger-ui .ma7-m{margin:16rem}.swagger-ui .ml0-m{margin-left:0}.swagger-ui .ml1-m{margin-left:.25rem}.swagger-ui .ml2-m{margin-left:.5rem}.swagger-ui .ml3-m{margin-left:1rem}.swagger-ui .ml4-m{margin-left:2rem}.swagger-ui .ml5-m{margin-left:4rem}.swagger-ui .ml6-m{margin-left:8rem}.swagger-ui .ml7-m{margin-left:16rem}.swagger-ui .mr0-m{margin-right:0}.swagger-ui .mr1-m{margin-right:.25rem}.swagger-ui .mr2-m{margin-right:.5rem}.swagger-ui .mr3-m{margin-right:1rem}.swagger-ui .mr4-m{margin-right:2rem}.swagger-ui .mr5-m{margin-right:4rem}.swagger-ui .mr6-m{margin-right:8rem}.swagger-ui .mr7-m{margin-right:16rem}.swagger-ui .mb0-m{margin-bottom:0}.swagger-ui .mb1-m{margin-bottom:.25rem}.swagger-ui .mb2-m{margin-bottom:.5rem}.swagger-ui .mb3-m{margin-bottom:1rem}.swagger-ui .mb4-m{margin-bottom:2rem}.swagger-ui .mb5-m{margin-bottom:4rem}.swagger-ui .mb6-m{margin-bottom:8rem}.swagger-ui .mb7-m{margin-bottom:16rem}.swagger-ui .mt0-m{margin-top:0}.swagger-ui .mt1-m{margin-top:.25rem}.swagger-ui .mt2-m{margin-top:.5rem}.swagger-ui .mt3-m{margin-top:1rem}.swagger-ui .mt4-m{margin-top:2rem}.swagger-ui .mt5-m{margin-top:4rem}.swagger-ui .mt6-m{margin-top:8rem}.swagger-ui .mt7-m{margin-top:16rem}.swagger-ui .mv0-m{margin-bottom:0;margin-top:0}.swagger-ui .mv1-m{margin-bottom:.25rem;margin-top:.25rem}.swagger-ui .mv2-m{margin-bottom:.5rem;margin-top:.5rem}.swagger-ui .mv3-m{margin-bottom:1rem;margin-top:1rem}.swagger-ui .mv4-m{margin-bottom:2rem;margin-top:2rem}.swagger-ui .mv5-m{margin-bottom:4rem;margin-top:4rem}.swagger-ui .mv6-m{margin-bottom:8rem;margin-top:8rem}.swagger-ui .mv7-m{margin-bottom:16rem;margin-top:16rem}.swagger-ui .mh0-m{margin-left:0;margin-right:0}.swagger-ui .mh1-m{margin-left:.25rem;margin-right:.25rem}.swagger-ui .mh2-m{margin-left:.5rem;margin-right:.5rem}.swagger-ui .mh3-m{margin-left:1rem;margin-right:1rem}.swagger-ui .mh4-m{margin-left:2rem;margin-right:2rem}.swagger-ui .mh5-m{margin-left:4rem;margin-right:4rem}.swagger-ui .mh6-m{margin-left:8rem;margin-right:8rem}.swagger-ui .mh7-m{margin-left:16rem;margin-right:16rem}}@media screen and (min-width:60em){.swagger-ui .pa0-l{padding:0}.swagger-ui .pa1-l{padding:.25rem}.swagger-ui .pa2-l{padding:.5rem}.swagger-ui .pa3-l{padding:1rem}.swagger-ui .pa4-l{padding:2rem}.swagger-ui .pa5-l{padding:4rem}.swagger-ui .pa6-l{padding:8rem}.swagger-ui .pa7-l{padding:16rem}.swagger-ui .pl0-l{padding-left:0}.swagger-ui .pl1-l{padding-left:.25rem}.swagger-ui .pl2-l{padding-left:.5rem}.swagger-ui .pl3-l{padding-left:1rem}.swagger-ui .pl4-l{padding-left:2rem}.swagger-ui .pl5-l{padding-left:4rem}.swagger-ui .pl6-l{padding-left:8rem}.swagger-ui .pl7-l{padding-left:16rem}.swagger-ui .pr0-l{padding-right:0}.swagger-ui .pr1-l{padding-right:.25rem}.swagger-ui .pr2-l{padding-right:.5rem}.swagger-ui .pr3-l{padding-right:1rem}.swagger-ui .pr4-l{padding-right:2rem}.swagger-ui .pr5-l{padding-right:4rem}.swagger-ui .pr6-l{padding-right:8rem}.swagger-ui .pr7-l{padding-right:16rem}.swagger-ui .pb0-l{padding-bottom:0}.swagger-ui .pb1-l{padding-bottom:.25rem}.swagger-ui .pb2-l{padding-bottom:.5rem}.swagger-ui .pb3-l{padding-bottom:1rem}.swagger-ui .pb4-l{padding-bottom:2rem}.swagger-ui .pb5-l{padding-bottom:4rem}.swagger-ui .pb6-l{padding-bottom:8rem}.swagger-ui .pb7-l{padding-bottom:16rem}.swagger-ui .pt0-l{padding-top:0}.swagger-ui .pt1-l{padding-top:.25rem}.swagger-ui .pt2-l{padding-top:.5rem}.swagger-ui .pt3-l{padding-top:1rem}.swagger-ui .pt4-l{padding-top:2rem}.swagger-ui .pt5-l{padding-top:4rem}.swagger-ui .pt6-l{padding-top:8rem}.swagger-ui .pt7-l{padding-top:16rem}.swagger-ui .pv0-l{padding-bottom:0;padding-top:0}.swagger-ui .pv1-l{padding-bottom:.25rem;padding-top:.25rem}.swagger-ui .pv2-l{padding-bottom:.5rem;padding-top:.5rem}.swagger-ui .pv3-l{padding-bottom:1rem;padding-top:1rem}.swagger-ui .pv4-l{padding-bottom:2rem;padding-top:2rem}.swagger-ui .pv5-l{padding-bottom:4rem;padding-top:4rem}.swagger-ui .pv6-l{padding-bottom:8rem;padding-top:8rem}.swagger-ui .pv7-l{padding-bottom:16rem;padding-top:16rem}.swagger-ui .ph0-l{padding-left:0;padding-right:0}.swagger-ui .ph1-l{padding-left:.25rem;padding-right:.25rem}.swagger-ui .ph2-l{padding-left:.5rem;padding-right:.5rem}.swagger-ui .ph3-l{padding-left:1rem;padding-right:1rem}.swagger-ui .ph4-l{padding-left:2rem;padding-right:2rem}.swagger-ui .ph5-l{padding-left:4rem;padding-right:4rem}.swagger-ui .ph6-l{padding-left:8rem;padding-right:8rem}.swagger-ui .ph7-l{padding-left:16rem;padding-right:16rem}.swagger-ui .ma0-l{margin:0}.swagger-ui .ma1-l{margin:.25rem}.swagger-ui .ma2-l{margin:.5rem}.swagger-ui .ma3-l{margin:1rem}.swagger-ui .ma4-l{margin:2rem}.swagger-ui .ma5-l{margin:4rem}.swagger-ui .ma6-l{margin:8rem}.swagger-ui .ma7-l{margin:16rem}.swagger-ui .ml0-l{margin-left:0}.swagger-ui .ml1-l{margin-left:.25rem}.swagger-ui .ml2-l{margin-left:.5rem}.swagger-ui .ml3-l{margin-left:1rem}.swagger-ui .ml4-l{margin-left:2rem}.swagger-ui .ml5-l{margin-left:4rem}.swagger-ui .ml6-l{margin-left:8rem}.swagger-ui .ml7-l{margin-left:16rem}.swagger-ui .mr0-l{margin-right:0}.swagger-ui .mr1-l{margin-right:.25rem}.swagger-ui .mr2-l{margin-right:.5rem}.swagger-ui .mr3-l{margin-right:1rem}.swagger-ui .mr4-l{margin-right:2rem}.swagger-ui .mr5-l{margin-right:4rem}.swagger-ui .mr6-l{margin-right:8rem}.swagger-ui .mr7-l{margin-right:16rem}.swagger-ui .mb0-l{margin-bottom:0}.swagger-ui .mb1-l{margin-bottom:.25rem}.swagger-ui .mb2-l{margin-bottom:.5rem}.swagger-ui .mb3-l{margin-bottom:1rem}.swagger-ui .mb4-l{margin-bottom:2rem}.swagger-ui .mb5-l{margin-bottom:4rem}.swagger-ui .mb6-l{margin-bottom:8rem}.swagger-ui .mb7-l{margin-bottom:16rem}.swagger-ui .mt0-l{margin-top:0}.swagger-ui .mt1-l{margin-top:.25rem}.swagger-ui .mt2-l{margin-top:.5rem}.swagger-ui .mt3-l{margin-top:1rem}.swagger-ui .mt4-l{margin-top:2rem}.swagger-ui .mt5-l{margin-top:4rem}.swagger-ui .mt6-l{margin-top:8rem}.swagger-ui .mt7-l{margin-top:16rem}.swagger-ui .mv0-l{margin-bottom:0;margin-top:0}.swagger-ui .mv1-l{margin-bottom:.25rem;margin-top:.25rem}.swagger-ui .mv2-l{margin-bottom:.5rem;margin-top:.5rem}.swagger-ui .mv3-l{margin-bottom:1rem;margin-top:1rem}.swagger-ui .mv4-l{margin-bottom:2rem;margin-top:2rem}.swagger-ui .mv5-l{margin-bottom:4rem;margin-top:4rem}.swagger-ui .mv6-l{margin-bottom:8rem;margin-top:8rem}.swagger-ui .mv7-l{margin-bottom:16rem;margin-top:16rem}.swagger-ui .mh0-l{margin-left:0;margin-right:0}.swagger-ui .mh1-l{margin-left:.25rem;margin-right:.25rem}.swagger-ui .mh2-l{margin-left:.5rem;margin-right:.5rem}.swagger-ui .mh3-l{margin-left:1rem;margin-right:1rem}.swagger-ui .mh4-l{margin-left:2rem;margin-right:2rem}.swagger-ui .mh5-l{margin-left:4rem;margin-right:4rem}.swagger-ui .mh6-l{margin-left:8rem;margin-right:8rem}.swagger-ui .mh7-l{margin-left:16rem;margin-right:16rem}}.swagger-ui .na1{margin:-.25rem}.swagger-ui .na2{margin:-.5rem}.swagger-ui .na3{margin:-1rem}.swagger-ui .na4{margin:-2rem}.swagger-ui .na5{margin:-4rem}.swagger-ui .na6{margin:-8rem}.swagger-ui .na7{margin:-16rem}.swagger-ui .nl1{margin-left:-.25rem}.swagger-ui .nl2{margin-left:-.5rem}.swagger-ui .nl3{margin-left:-1rem}.swagger-ui .nl4{margin-left:-2rem}.swagger-ui .nl5{margin-left:-4rem}.swagger-ui .nl6{margin-left:-8rem}.swagger-ui .nl7{margin-left:-16rem}.swagger-ui .nr1{margin-right:-.25rem}.swagger-ui .nr2{margin-right:-.5rem}.swagger-ui .nr3{margin-right:-1rem}.swagger-ui .nr4{margin-right:-2rem}.swagger-ui .nr5{margin-right:-4rem}.swagger-ui .nr6{margin-right:-8rem}.swagger-ui .nr7{margin-right:-16rem}.swagger-ui .nb1{margin-bottom:-.25rem}.swagger-ui .nb2{margin-bottom:-.5rem}.swagger-ui .nb3{margin-bottom:-1rem}.swagger-ui .nb4{margin-bottom:-2rem}.swagger-ui .nb5{margin-bottom:-4rem}.swagger-ui .nb6{margin-bottom:-8rem}.swagger-ui .nb7{margin-bottom:-16rem}.swagger-ui .nt1{margin-top:-.25rem}.swagger-ui .nt2{margin-top:-.5rem}.swagger-ui .nt3{margin-top:-1rem}.swagger-ui .nt4{margin-top:-2rem}.swagger-ui .nt5{margin-top:-4rem}.swagger-ui .nt6{margin-top:-8rem}.swagger-ui .nt7{margin-top:-16rem}@media screen and (min-width:30em){.swagger-ui .na1-ns{margin:-.25rem}.swagger-ui .na2-ns{margin:-.5rem}.swagger-ui .na3-ns{margin:-1rem}.swagger-ui .na4-ns{margin:-2rem}.swagger-ui .na5-ns{margin:-4rem}.swagger-ui .na6-ns{margin:-8rem}.swagger-ui .na7-ns{margin:-16rem}.swagger-ui .nl1-ns{margin-left:-.25rem}.swagger-ui .nl2-ns{margin-left:-.5rem}.swagger-ui .nl3-ns{margin-left:-1rem}.swagger-ui .nl4-ns{margin-left:-2rem}.swagger-ui .nl5-ns{margin-left:-4rem}.swagger-ui .nl6-ns{margin-left:-8rem}.swagger-ui .nl7-ns{margin-left:-16rem}.swagger-ui .nr1-ns{margin-right:-.25rem}.swagger-ui .nr2-ns{margin-right:-.5rem}.swagger-ui .nr3-ns{margin-right:-1rem}.swagger-ui .nr4-ns{margin-right:-2rem}.swagger-ui .nr5-ns{margin-right:-4rem}.swagger-ui .nr6-ns{margin-right:-8rem}.swagger-ui .nr7-ns{margin-right:-16rem}.swagger-ui .nb1-ns{margin-bottom:-.25rem}.swagger-ui .nb2-ns{margin-bottom:-.5rem}.swagger-ui .nb3-ns{margin-bottom:-1rem}.swagger-ui .nb4-ns{margin-bottom:-2rem}.swagger-ui .nb5-ns{margin-bottom:-4rem}.swagger-ui .nb6-ns{margin-bottom:-8rem}.swagger-ui .nb7-ns{margin-bottom:-16rem}.swagger-ui .nt1-ns{margin-top:-.25rem}.swagger-ui .nt2-ns{margin-top:-.5rem}.swagger-ui .nt3-ns{margin-top:-1rem}.swagger-ui .nt4-ns{margin-top:-2rem}.swagger-ui .nt5-ns{margin-top:-4rem}.swagger-ui .nt6-ns{margin-top:-8rem}.swagger-ui .nt7-ns{margin-top:-16rem}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .na1-m{margin:-.25rem}.swagger-ui .na2-m{margin:-.5rem}.swagger-ui .na3-m{margin:-1rem}.swagger-ui .na4-m{margin:-2rem}.swagger-ui .na5-m{margin:-4rem}.swagger-ui .na6-m{margin:-8rem}.swagger-ui .na7-m{margin:-16rem}.swagger-ui .nl1-m{margin-left:-.25rem}.swagger-ui .nl2-m{margin-left:-.5rem}.swagger-ui .nl3-m{margin-left:-1rem}.swagger-ui .nl4-m{margin-left:-2rem}.swagger-ui .nl5-m{margin-left:-4rem}.swagger-ui .nl6-m{margin-left:-8rem}.swagger-ui .nl7-m{margin-left:-16rem}.swagger-ui .nr1-m{margin-right:-.25rem}.swagger-ui .nr2-m{margin-right:-.5rem}.swagger-ui .nr3-m{margin-right:-1rem}.swagger-ui .nr4-m{margin-right:-2rem}.swagger-ui .nr5-m{margin-right:-4rem}.swagger-ui .nr6-m{margin-right:-8rem}.swagger-ui .nr7-m{margin-right:-16rem}.swagger-ui .nb1-m{margin-bottom:-.25rem}.swagger-ui .nb2-m{margin-bottom:-.5rem}.swagger-ui .nb3-m{margin-bottom:-1rem}.swagger-ui .nb4-m{margin-bottom:-2rem}.swagger-ui .nb5-m{margin-bottom:-4rem}.swagger-ui .nb6-m{margin-bottom:-8rem}.swagger-ui .nb7-m{margin-bottom:-16rem}.swagger-ui .nt1-m{margin-top:-.25rem}.swagger-ui .nt2-m{margin-top:-.5rem}.swagger-ui .nt3-m{margin-top:-1rem}.swagger-ui .nt4-m{margin-top:-2rem}.swagger-ui .nt5-m{margin-top:-4rem}.swagger-ui .nt6-m{margin-top:-8rem}.swagger-ui .nt7-m{margin-top:-16rem}}@media screen and (min-width:60em){.swagger-ui .na1-l{margin:-.25rem}.swagger-ui .na2-l{margin:-.5rem}.swagger-ui .na3-l{margin:-1rem}.swagger-ui .na4-l{margin:-2rem}.swagger-ui .na5-l{margin:-4rem}.swagger-ui .na6-l{margin:-8rem}.swagger-ui .na7-l{margin:-16rem}.swagger-ui .nl1-l{margin-left:-.25rem}.swagger-ui .nl2-l{margin-left:-.5rem}.swagger-ui .nl3-l{margin-left:-1rem}.swagger-ui .nl4-l{margin-left:-2rem}.swagger-ui .nl5-l{margin-left:-4rem}.swagger-ui .nl6-l{margin-left:-8rem}.swagger-ui .nl7-l{margin-left:-16rem}.swagger-ui .nr1-l{margin-right:-.25rem}.swagger-ui .nr2-l{margin-right:-.5rem}.swagger-ui .nr3-l{margin-right:-1rem}.swagger-ui .nr4-l{margin-right:-2rem}.swagger-ui .nr5-l{margin-right:-4rem}.swagger-ui .nr6-l{margin-right:-8rem}.swagger-ui .nr7-l{margin-right:-16rem}.swagger-ui .nb1-l{margin-bottom:-.25rem}.swagger-ui .nb2-l{margin-bottom:-.5rem}.swagger-ui .nb3-l{margin-bottom:-1rem}.swagger-ui .nb4-l{margin-bottom:-2rem}.swagger-ui .nb5-l{margin-bottom:-4rem}.swagger-ui .nb6-l{margin-bottom:-8rem}.swagger-ui .nb7-l{margin-bottom:-16rem}.swagger-ui .nt1-l{margin-top:-.25rem}.swagger-ui .nt2-l{margin-top:-.5rem}.swagger-ui .nt3-l{margin-top:-1rem}.swagger-ui .nt4-l{margin-top:-2rem}.swagger-ui .nt5-l{margin-top:-4rem}.swagger-ui .nt6-l{margin-top:-8rem}.swagger-ui .nt7-l{margin-top:-16rem}}.swagger-ui .collapse{border-collapse:collapse;border-spacing:0}.swagger-ui .striped--light-silver:nth-child(odd){background-color:#aaa}.swagger-ui .striped--moon-gray:nth-child(odd){background-color:#ccc}.swagger-ui .striped--light-gray:nth-child(odd){background-color:#eee}.swagger-ui .striped--near-white:nth-child(odd){background-color:#f4f4f4}.swagger-ui .stripe-light:nth-child(odd){background-color:hsla(0,0%,100%,.1)}.swagger-ui .stripe-dark:nth-child(odd){background-color:rgba(0,0,0,.1)}.swagger-ui .strike{-webkit-text-decoration:line-through;text-decoration:line-through}.swagger-ui .underline{-webkit-text-decoration:underline;text-decoration:underline}.swagger-ui .no-underline{-webkit-text-decoration:none;text-decoration:none}@media screen and (min-width:30em){.swagger-ui .strike-ns{-webkit-text-decoration:line-through;text-decoration:line-through}.swagger-ui .underline-ns{-webkit-text-decoration:underline;text-decoration:underline}.swagger-ui .no-underline-ns{-webkit-text-decoration:none;text-decoration:none}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .strike-m{-webkit-text-decoration:line-through;text-decoration:line-through}.swagger-ui .underline-m{-webkit-text-decoration:underline;text-decoration:underline}.swagger-ui .no-underline-m{-webkit-text-decoration:none;text-decoration:none}}@media screen and (min-width:60em){.swagger-ui .strike-l{-webkit-text-decoration:line-through;text-decoration:line-through}.swagger-ui .underline-l{-webkit-text-decoration:underline;text-decoration:underline}.swagger-ui .no-underline-l{-webkit-text-decoration:none;text-decoration:none}}.swagger-ui .tl{text-align:left}.swagger-ui .tr{text-align:right}.swagger-ui .tc{text-align:center}.swagger-ui .tj{text-align:justify}@media screen and (min-width:30em){.swagger-ui .tl-ns{text-align:left}.swagger-ui .tr-ns{text-align:right}.swagger-ui .tc-ns{text-align:center}.swagger-ui .tj-ns{text-align:justify}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .tl-m{text-align:left}.swagger-ui .tr-m{text-align:right}.swagger-ui .tc-m{text-align:center}.swagger-ui .tj-m{text-align:justify}}@media screen and (min-width:60em){.swagger-ui .tl-l{text-align:left}.swagger-ui .tr-l{text-align:right}.swagger-ui .tc-l{text-align:center}.swagger-ui .tj-l{text-align:justify}}.swagger-ui .ttc{text-transform:capitalize}.swagger-ui .ttl{text-transform:lowercase}.swagger-ui .ttu{text-transform:uppercase}.swagger-ui .ttn{text-transform:none}@media screen and (min-width:30em){.swagger-ui .ttc-ns{text-transform:capitalize}.swagger-ui .ttl-ns{text-transform:lowercase}.swagger-ui .ttu-ns{text-transform:uppercase}.swagger-ui .ttn-ns{text-transform:none}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .ttc-m{text-transform:capitalize}.swagger-ui .ttl-m{text-transform:lowercase}.swagger-ui .ttu-m{text-transform:uppercase}.swagger-ui .ttn-m{text-transform:none}}@media screen and (min-width:60em){.swagger-ui .ttc-l{text-transform:capitalize}.swagger-ui .ttl-l{text-transform:lowercase}.swagger-ui .ttu-l{text-transform:uppercase}.swagger-ui .ttn-l{text-transform:none}}.swagger-ui .f-6,.swagger-ui .f-headline{font-size:6rem}.swagger-ui .f-5,.swagger-ui .f-subheadline{font-size:5rem}.swagger-ui .f1{font-size:3rem}.swagger-ui .f2{font-size:2.25rem}.swagger-ui .f3{font-size:1.5rem}.swagger-ui .f4{font-size:1.25rem}.swagger-ui .f5{font-size:1rem}.swagger-ui .f6{font-size:.875rem}.swagger-ui .f7{font-size:.75rem}@media screen and (min-width:30em){.swagger-ui .f-6-ns,.swagger-ui .f-headline-ns{font-size:6rem}.swagger-ui .f-5-ns,.swagger-ui .f-subheadline-ns{font-size:5rem}.swagger-ui .f1-ns{font-size:3rem}.swagger-ui .f2-ns{font-size:2.25rem}.swagger-ui .f3-ns{font-size:1.5rem}.swagger-ui .f4-ns{font-size:1.25rem}.swagger-ui .f5-ns{font-size:1rem}.swagger-ui .f6-ns{font-size:.875rem}.swagger-ui .f7-ns{font-size:.75rem}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .f-6-m,.swagger-ui .f-headline-m{font-size:6rem}.swagger-ui .f-5-m,.swagger-ui .f-subheadline-m{font-size:5rem}.swagger-ui .f1-m{font-size:3rem}.swagger-ui .f2-m{font-size:2.25rem}.swagger-ui .f3-m{font-size:1.5rem}.swagger-ui .f4-m{font-size:1.25rem}.swagger-ui .f5-m{font-size:1rem}.swagger-ui .f6-m{font-size:.875rem}.swagger-ui .f7-m{font-size:.75rem}}@media screen and (min-width:60em){.swagger-ui .f-6-l,.swagger-ui .f-headline-l{font-size:6rem}.swagger-ui .f-5-l,.swagger-ui .f-subheadline-l{font-size:5rem}.swagger-ui .f1-l{font-size:3rem}.swagger-ui .f2-l{font-size:2.25rem}.swagger-ui .f3-l{font-size:1.5rem}.swagger-ui .f4-l{font-size:1.25rem}.swagger-ui .f5-l{font-size:1rem}.swagger-ui .f6-l{font-size:.875rem}.swagger-ui .f7-l{font-size:.75rem}}.swagger-ui .measure{max-width:30em}.swagger-ui .measure-wide{max-width:34em}.swagger-ui .measure-narrow{max-width:20em}.swagger-ui .indent{margin-bottom:0;margin-top:0;text-indent:1em}.swagger-ui .small-caps{font-feature-settings:"smcp";font-variant:small-caps}.swagger-ui .truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}@media screen and (min-width:30em){.swagger-ui .measure-ns{max-width:30em}.swagger-ui .measure-wide-ns{max-width:34em}.swagger-ui .measure-narrow-ns{max-width:20em}.swagger-ui .indent-ns{margin-bottom:0;margin-top:0;text-indent:1em}.swagger-ui .small-caps-ns{font-feature-settings:"smcp";font-variant:small-caps}.swagger-ui .truncate-ns{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .measure-m{max-width:30em}.swagger-ui .measure-wide-m{max-width:34em}.swagger-ui .measure-narrow-m{max-width:20em}.swagger-ui .indent-m{margin-bottom:0;margin-top:0;text-indent:1em}.swagger-ui .small-caps-m{font-feature-settings:"smcp";font-variant:small-caps}.swagger-ui .truncate-m{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}}@media screen and (min-width:60em){.swagger-ui .measure-l{max-width:30em}.swagger-ui .measure-wide-l{max-width:34em}.swagger-ui .measure-narrow-l{max-width:20em}.swagger-ui .indent-l{margin-bottom:0;margin-top:0;text-indent:1em}.swagger-ui .small-caps-l{font-feature-settings:"smcp";font-variant:small-caps}.swagger-ui .truncate-l{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}}.swagger-ui .overflow-container{overflow-y:scroll}.swagger-ui .center{margin-left:auto;margin-right:auto}.swagger-ui .mr-auto{margin-right:auto}.swagger-ui .ml-auto{margin-left:auto}@media screen and (min-width:30em){.swagger-ui .center-ns{margin-left:auto;margin-right:auto}.swagger-ui .mr-auto-ns{margin-right:auto}.swagger-ui .ml-auto-ns{margin-left:auto}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .center-m{margin-left:auto;margin-right:auto}.swagger-ui .mr-auto-m{margin-right:auto}.swagger-ui .ml-auto-m{margin-left:auto}}@media screen and (min-width:60em){.swagger-ui .center-l{margin-left:auto;margin-right:auto}.swagger-ui .mr-auto-l{margin-right:auto}.swagger-ui .ml-auto-l{margin-left:auto}}.swagger-ui .clip{position:fixed!important;_position:absolute!important;clip:rect(1px 1px 1px 1px);clip:rect(1px,1px,1px,1px)}@media screen and (min-width:30em){.swagger-ui .clip-ns{position:fixed!important;_position:absolute!important;clip:rect(1px 1px 1px 1px);clip:rect(1px,1px,1px,1px)}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .clip-m{position:fixed!important;_position:absolute!important;clip:rect(1px 1px 1px 1px);clip:rect(1px,1px,1px,1px)}}@media screen and (min-width:60em){.swagger-ui .clip-l{position:fixed!important;_position:absolute!important;clip:rect(1px 1px 1px 1px);clip:rect(1px,1px,1px,1px)}}.swagger-ui .ws-normal{white-space:normal}.swagger-ui .nowrap{white-space:nowrap}.swagger-ui .pre{white-space:pre}@media screen and (min-width:30em){.swagger-ui .ws-normal-ns{white-space:normal}.swagger-ui .nowrap-ns{white-space:nowrap}.swagger-ui .pre-ns{white-space:pre}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .ws-normal-m{white-space:normal}.swagger-ui .nowrap-m{white-space:nowrap}.swagger-ui .pre-m{white-space:pre}}@media screen and (min-width:60em){.swagger-ui .ws-normal-l{white-space:normal}.swagger-ui .nowrap-l{white-space:nowrap}.swagger-ui .pre-l{white-space:pre}}.swagger-ui .v-base{vertical-align:baseline}.swagger-ui .v-mid{vertical-align:middle}.swagger-ui .v-top{vertical-align:top}.swagger-ui .v-btm{vertical-align:bottom}@media screen and (min-width:30em){.swagger-ui .v-base-ns{vertical-align:baseline}.swagger-ui .v-mid-ns{vertical-align:middle}.swagger-ui .v-top-ns{vertical-align:top}.swagger-ui .v-btm-ns{vertical-align:bottom}}@media screen and (min-width:30em)and (max-width:60em){.swagger-ui .v-base-m{vertical-align:baseline}.swagger-ui .v-mid-m{vertical-align:middle}.swagger-ui .v-top-m{vertical-align:top}.swagger-ui .v-btm-m{vertical-align:bottom}}@media screen and (min-width:60em){.swagger-ui .v-base-l{vertical-align:baseline}.swagger-ui .v-mid-l{vertical-align:middle}.swagger-ui .v-top-l{vertical-align:top}.swagger-ui .v-btm-l{vertical-align:bottom}}.swagger-ui .dim{opacity:1;transition:opacity .15s ease-in}.swagger-ui .dim:focus,.swagger-ui .dim:hover{opacity:.5;transition:opacity .15s ease-in}.swagger-ui .dim:active{opacity:.8;transition:opacity .15s ease-out}.swagger-ui .glow{transition:opacity .15s ease-in}.swagger-ui .glow:focus,.swagger-ui .glow:hover{opacity:1;transition:opacity .15s ease-in}.swagger-ui .hide-child .child{opacity:0;transition:opacity .15s ease-in}.swagger-ui .hide-child:active .child,.swagger-ui .hide-child:focus .child,.swagger-ui .hide-child:hover .child{opacity:1;transition:opacity .15s ease-in}.swagger-ui .underline-hover:focus,.swagger-ui .underline-hover:hover{-webkit-text-decoration:underline;text-decoration:underline}.swagger-ui .grow{-moz-osx-font-smoothing:grayscale;backface-visibility:hidden;transform:translateZ(0);transition:transform .25s ease-out}.swagger-ui .grow:focus,.swagger-ui .grow:hover{transform:scale(1.05)}.swagger-ui .grow:active{transform:scale(.9)}.swagger-ui .grow-large{-moz-osx-font-smoothing:grayscale;backface-visibility:hidden;transform:translateZ(0);transition:transform .25s ease-in-out}.swagger-ui .grow-large:focus,.swagger-ui .grow-large:hover{transform:scale(1.2)}.swagger-ui .grow-large:active{transform:scale(.95)}.swagger-ui .pointer:hover{cursor:pointer}.swagger-ui .shadow-hover{cursor:pointer;position:relative;transition:all .5s cubic-bezier(.165,.84,.44,1)}.swagger-ui .shadow-hover:after{border-radius:inherit;box-shadow:0 0 16px 2px rgba(0,0,0,.2);content:"";height:100%;left:0;opacity:0;position:absolute;top:0;transition:opacity .5s cubic-bezier(.165,.84,.44,1);width:100%;z-index:-1}.swagger-ui .shadow-hover:focus:after,.swagger-ui .shadow-hover:hover:after{opacity:1}.swagger-ui .bg-animate,.swagger-ui .bg-animate:focus,.swagger-ui .bg-animate:hover{transition:background-color .15s ease-in-out}.swagger-ui .z-0{z-index:0}.swagger-ui .z-1{z-index:1}.swagger-ui .z-2{z-index:2}.swagger-ui .z-3{z-index:3}.swagger-ui .z-4{z-index:4}.swagger-ui .z-5{z-index:5}.swagger-ui .z-999{z-index:999}.swagger-ui .z-9999{z-index:9999}.swagger-ui .z-max{z-index:2147483647}.swagger-ui .z-inherit{z-index:inherit}.swagger-ui .z-initial,.swagger-ui .z-unset{z-index:auto}.swagger-ui .nested-copy-line-height ol,.swagger-ui .nested-copy-line-height p,.swagger-ui .nested-copy-line-height ul{line-height:1.5}.swagger-ui .nested-headline-line-height h1,.swagger-ui .nested-headline-line-height h2,.swagger-ui .nested-headline-line-height h3,.swagger-ui .nested-headline-line-height h4,.swagger-ui .nested-headline-line-height h5,.swagger-ui .nested-headline-line-height h6{line-height:1.25}.swagger-ui .nested-list-reset ol,.swagger-ui .nested-list-reset ul{list-style-type:none;margin-left:0;padding-left:0}.swagger-ui .nested-copy-indent p+p{margin-bottom:0;margin-top:0;text-indent:.1em}.swagger-ui .nested-copy-seperator p+p{margin-top:1.5em}.swagger-ui .nested-img img{display:block;max-width:100%;width:100%}.swagger-ui .nested-links a{color:#357edd;transition:color .15s ease-in}.swagger-ui .nested-links a:focus,.swagger-ui .nested-links a:hover{color:#96ccff;transition:color .15s ease-in}.swagger-ui .wrapper{box-sizing:border-box;margin:0 auto;max-width:1460px;padding:0 20px;width:100%}.swagger-ui .opblock-tag-section{display:flex;flex-direction:column}.swagger-ui .try-out.btn-group{display:flex;flex:.1 2 auto;padding:0}.swagger-ui .try-out__btn{margin-left:1.25rem}.swagger-ui .opblock-tag{align-items:center;border-bottom:1px solid rgba(59,65,81,.3);cursor:pointer;display:flex;padding:10px 20px 10px 10px;transition:all .2s}.swagger-ui .opblock-tag:hover{background:rgba(0,0,0,.02)}.swagger-ui .opblock-tag{color:#3b4151;font-family:sans-serif;font-size:24px;margin:0 0 5px}.swagger-ui .opblock-tag.no-desc span{flex:1}.swagger-ui .opblock-tag svg{transition:all .4s}.swagger-ui .opblock-tag small{color:#3b4151;flex:2;font-family:sans-serif;font-size:14px;font-weight:400;padding:0 10px}.swagger-ui .opblock-tag>div{flex:1 1 150px;font-weight:400;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}@media(max-width:640px){.swagger-ui .opblock-tag small,.swagger-ui .opblock-tag>div{flex:1}}.swagger-ui .opblock-tag .info__externaldocs{text-align:right}.swagger-ui .parameter__type{color:#3b4151;font-family:monospace;font-size:12px;font-weight:600;padding:5px 0}.swagger-ui .parameter-controls{margin-top:.75em}.swagger-ui .examples__title{display:block;font-size:1.1em;font-weight:700;margin-bottom:.75em}.swagger-ui .examples__section{margin-top:1.5em}.swagger-ui .examples__section-header{font-size:.9rem;font-weight:700;margin-bottom:.5rem}.swagger-ui .examples-select{display:inline-block;margin-bottom:.75em}.swagger-ui .examples-select .examples-select-element{width:100%}.swagger-ui .examples-select__section-label{font-size:.9rem;font-weight:700;margin-right:.5rem}.swagger-ui .example__section{margin-top:1.5em}.swagger-ui .example__section-header{font-size:.9rem;font-weight:700;margin-bottom:.5rem}.swagger-ui .view-line-link{cursor:pointer;margin:0 5px;position:relative;top:3px;transition:all .5s;width:20px}.swagger-ui .opblock{border:1px solid #000;border-radius:4px;box-shadow:0 0 3px rgba(0,0,0,.19);margin:0 0 15px}.swagger-ui .opblock .tab-header{display:flex;flex:1}.swagger-ui .opblock .tab-header .tab-item{cursor:pointer;padding:0 40px}.swagger-ui .opblock .tab-header .tab-item:first-of-type{padding:0 40px 0 0}.swagger-ui .opblock .tab-header .tab-item.active h4 span{position:relative}.swagger-ui .opblock .tab-header .tab-item.active h4 span:after{background:grey;bottom:-15px;content:"";height:4px;left:50%;position:absolute;transform:translateX(-50%);width:120%}.swagger-ui .opblock.is-open .opblock-summary{border-bottom:1px solid #000}.swagger-ui .opblock .opblock-section-header{align-items:center;background:hsla(0,0%,100%,.8);box-shadow:0 1px 2px rgba(0,0,0,.1);display:flex;min-height:50px;padding:8px 20px}.swagger-ui .opblock .opblock-section-header>label{align-items:center;color:#3b4151;display:flex;font-family:sans-serif;font-size:12px;font-weight:700;margin:0 0 0 auto}.swagger-ui .opblock .opblock-section-header>label>span{padding:0 10px 0 0}.swagger-ui .opblock .opblock-section-header h4{color:#3b4151;flex:1;font-family:sans-serif;font-size:14px;margin:0}.swagger-ui .opblock .opblock-summary-method{background:#000;border-radius:3px;color:#fff;font-family:sans-serif;font-size:14px;font-weight:700;min-width:80px;padding:6px 0;text-align:center;text-shadow:0 1px 0 rgba(0,0,0,.1)}@media(max-width:768px){.swagger-ui .opblock .opblock-summary-method{font-size:12px}}.swagger-ui .opblock .opblock-summary-operation-id,.swagger-ui .opblock .opblock-summary-path,.swagger-ui .opblock .opblock-summary-path__deprecated{align-items:center;color:#3b4151;display:flex;font-family:monospace;font-size:16px;font-weight:600;word-break:break-word}@media(max-width:768px){.swagger-ui .opblock .opblock-summary-operation-id,.swagger-ui .opblock .opblock-summary-path,.swagger-ui .opblock .opblock-summary-path__deprecated{font-size:12px}}.swagger-ui .opblock .opblock-summary-path{flex-shrink:1}@media(max-width:640px){.swagger-ui .opblock .opblock-summary-path{max-width:100%}}.swagger-ui .opblock .opblock-summary-path__deprecated{-webkit-text-decoration:line-through;text-decoration:line-through}.swagger-ui .opblock .opblock-summary-operation-id{font-size:14px}.swagger-ui .opblock .opblock-summary-description{color:#3b4151;font-family:sans-serif;font-size:13px;word-break:break-word}.swagger-ui .opblock .opblock-summary-path-description-wrapper{align-items:center;display:flex;flex-direction:row;flex-grow:1;flex-wrap:wrap;gap:0 10px;padding:0 10px}@media(max-width:550px){.swagger-ui .opblock .opblock-summary-path-description-wrapper{align-items:flex-start;flex-direction:column}}.swagger-ui .opblock .opblock-summary{align-items:center;cursor:pointer;display:flex;padding:5px}.swagger-ui .opblock .opblock-summary .view-line-link{cursor:pointer;margin:0;position:relative;top:2px;transition:all .5s;width:0}.swagger-ui .opblock .opblock-summary:hover .view-line-link{margin:0 5px;width:18px}.swagger-ui .opblock .opblock-summary:hover .view-line-link.copy-to-clipboard{width:24px}.swagger-ui .opblock.opblock-post{background:rgba(73,204,144,.1);border-color:#49cc90}.swagger-ui .opblock.opblock-post .opblock-summary-method{background:#49cc90}.swagger-ui .opblock.opblock-post .opblock-summary{border-color:#49cc90}.swagger-ui .opblock.opblock-post .tab-header .tab-item.active h4 span:after{background:#49cc90}.swagger-ui .opblock.opblock-put{background:rgba(252,161,48,.1);border-color:#fca130}.swagger-ui .opblock.opblock-put .opblock-summary-method{background:#fca130}.swagger-ui .opblock.opblock-put .opblock-summary{border-color:#fca130}.swagger-ui .opblock.opblock-put .tab-header .tab-item.active h4 span:after{background:#fca130}.swagger-ui .opblock.opblock-delete{background:rgba(249,62,62,.1);border-color:#f93e3e}.swagger-ui .opblock.opblock-delete .opblock-summary-method{background:#f93e3e}.swagger-ui .opblock.opblock-delete .opblock-summary{border-color:#f93e3e}.swagger-ui .opblock.opblock-delete .tab-header .tab-item.active h4 span:after{background:#f93e3e}.swagger-ui .opblock.opblock-get{background:rgba(97,175,254,.1);border-color:#61affe}.swagger-ui .opblock.opblock-get .opblock-summary-method{background:#61affe}.swagger-ui .opblock.opblock-get .opblock-summary{border-color:#61affe}.swagger-ui .opblock.opblock-get .tab-header .tab-item.active h4 span:after{background:#61affe}.swagger-ui .opblock.opblock-patch{background:rgba(80,227,194,.1);border-color:#50e3c2}.swagger-ui .opblock.opblock-patch .opblock-summary-method{background:#50e3c2}.swagger-ui .opblock.opblock-patch .opblock-summary{border-color:#50e3c2}.swagger-ui .opblock.opblock-patch .tab-header .tab-item.active h4 span:after{background:#50e3c2}.swagger-ui .opblock.opblock-head{background:rgba(144,18,254,.1);border-color:#9012fe}.swagger-ui .opblock.opblock-head .opblock-summary-method{background:#9012fe}.swagger-ui .opblock.opblock-head .opblock-summary{border-color:#9012fe}.swagger-ui .opblock.opblock-head .tab-header .tab-item.active h4 span:after{background:#9012fe}.swagger-ui .opblock.opblock-options{background:rgba(13,90,167,.1);border-color:#0d5aa7}.swagger-ui .opblock.opblock-options .opblock-summary-method{background:#0d5aa7}.swagger-ui .opblock.opblock-options .opblock-summary{border-color:#0d5aa7}.swagger-ui .opblock.opblock-options .tab-header .tab-item.active h4 span:after{background:#0d5aa7}.swagger-ui .opblock.opblock-deprecated{background:hsla(0,0%,92%,.1);border-color:#ebebeb;opacity:.6}.swagger-ui .opblock.opblock-deprecated .opblock-summary-method{background:#ebebeb}.swagger-ui .opblock.opblock-deprecated .opblock-summary{border-color:#ebebeb}.swagger-ui .opblock.opblock-deprecated .tab-header .tab-item.active h4 span:after{background:#ebebeb}.swagger-ui .opblock .opblock-schemes{padding:8px 20px}.swagger-ui .opblock .opblock-schemes .schemes-title{padding:0 10px 0 0}.swagger-ui .filter .operation-filter-input{border:2px solid #d8dde7;margin:20px 0;padding:10px;width:100%}.swagger-ui .download-url-wrapper .failed,.swagger-ui .filter .failed{color:red}.swagger-ui .download-url-wrapper .loading,.swagger-ui .filter .loading{color:#aaa}.swagger-ui .model-example{margin-top:1em}.swagger-ui .model-example .model-container{overflow-x:auto;width:100%}.swagger-ui .model-example .model-container .model-hint:not(.model-hint--embedded){top:-1.15em}.swagger-ui .tab{display:flex;list-style:none;padding:0}.swagger-ui .tab li{color:#3b4151;cursor:pointer;font-family:sans-serif;font-size:12px;min-width:60px;padding:0}.swagger-ui .tab li:first-of-type{padding-left:0;padding-right:12px;position:relative}.swagger-ui .tab li:first-of-type:after{background:rgba(0,0,0,.2);content:"";height:100%;position:absolute;right:6px;top:0;width:1px}.swagger-ui .tab li.active{font-weight:700}.swagger-ui .tab li button.tablinks{background:none;border:0;color:inherit;font-family:inherit;font-weight:inherit;padding:0}.swagger-ui .opblock-description-wrapper,.swagger-ui .opblock-external-docs-wrapper,.swagger-ui .opblock-title_normal{color:#3b4151;font-family:sans-serif;font-size:12px;margin:0 0 5px;padding:15px 20px}.swagger-ui .opblock-description-wrapper h4,.swagger-ui .opblock-external-docs-wrapper h4,.swagger-ui .opblock-title_normal h4{color:#3b4151;font-family:sans-serif;font-size:12px;margin:0 0 5px}.swagger-ui .opblock-description-wrapper p,.swagger-ui .opblock-external-docs-wrapper p,.swagger-ui .opblock-title_normal p{color:#3b4151;font-family:sans-serif;font-size:14px;margin:0}.swagger-ui .opblock-external-docs-wrapper h4{padding-left:0}.swagger-ui .execute-wrapper{padding:20px;text-align:right}.swagger-ui .execute-wrapper .btn{padding:8px 40px;width:100%}.swagger-ui .body-param-options{display:flex;flex-direction:column}.swagger-ui .body-param-options .body-param-edit{padding:10px 0}.swagger-ui .body-param-options label{padding:8px 0}.swagger-ui .body-param-options label select{margin:3px 0 0}.swagger-ui .responses-inner{padding:20px}.swagger-ui .responses-inner h4,.swagger-ui .responses-inner h5{color:#3b4151;font-family:sans-serif;font-size:12px;margin:10px 0 5px}.swagger-ui .responses-inner .curl{max-height:400px;min-height:6em;overflow-y:auto}.swagger-ui .response-col_status{color:#3b4151;font-family:sans-serif;font-size:14px}.swagger-ui .response-col_status .response-undocumented{color:#909090;font-family:monospace;font-size:11px;font-weight:600}.swagger-ui .response-col_links{color:#3b4151;font-family:sans-serif;font-size:14px;max-width:40em;padding-left:2em}.swagger-ui .response-col_links .response-undocumented{color:#909090;font-family:monospace;font-size:11px;font-weight:600}.swagger-ui .response-col_links .operation-link{margin-bottom:1.5em}.swagger-ui .response-col_links .operation-link .description{margin-bottom:.5em}.swagger-ui .opblock-body .opblock-loading-animation{display:block;margin:3em auto}.swagger-ui .opblock-body pre.microlight{background:#333;border-radius:4px;font-size:12px;hyphens:auto;margin:0;padding:10px;white-space:pre-wrap;word-break:break-all;word-break:break-word;word-wrap:break-word;color:#fff;font-family:monospace;font-weight:600}.swagger-ui .opblock-body pre.microlight .headerline{display:block}.swagger-ui .highlight-code{position:relative}.swagger-ui .highlight-code>.microlight{max-height:400px;min-height:6em;overflow-y:auto}.swagger-ui .highlight-code>.microlight code{white-space:pre-wrap!important;word-break:break-all}.swagger-ui .curl-command{position:relative}.swagger-ui .download-contents{align-items:center;background:#7d8293;border:none;border-radius:4px;bottom:10px;color:#fff;display:flex;font-family:sans-serif;font-size:14px;font-weight:600;height:30px;justify-content:center;padding:5px;position:absolute;right:10px;text-align:center}.swagger-ui .scheme-container{background:#fff;box-shadow:0 1px 2px 0 rgba(0,0,0,.15);margin:0 0 20px;padding:30px 0}.swagger-ui .scheme-container .schemes{align-items:flex-end;display:flex;flex-wrap:wrap;gap:10px;justify-content:space-between}.swagger-ui .scheme-container .schemes>.schemes-server-container{display:flex;flex-wrap:wrap;gap:10px}.swagger-ui .scheme-container .schemes>.schemes-server-container>label{color:#3b4151;display:flex;flex-direction:column;font-family:sans-serif;font-size:12px;font-weight:700;margin:-20px 15px 0 0}.swagger-ui .scheme-container .schemes>.schemes-server-container>label select{min-width:130px;text-transform:uppercase}.swagger-ui .scheme-container .schemes:not(:has(.schemes-server-container)){justify-content:flex-end}.swagger-ui .scheme-container .schemes .auth-wrapper{flex:none;justify-content:start}.swagger-ui .scheme-container .schemes .auth-wrapper .authorize{display:flex;flex-wrap:nowrap;margin:0;padding-right:20px}.swagger-ui .loading-container{align-items:center;display:flex;flex-direction:column;justify-content:center;margin-top:1em;min-height:1px;padding:40px 0 60px}.swagger-ui .loading-container .loading{position:relative}.swagger-ui .loading-container .loading:after{color:#3b4151;content:"loading";font-family:sans-serif;font-size:10px;font-weight:700;left:50%;position:absolute;text-transform:uppercase;top:50%;transform:translate(-50%,-50%)}.swagger-ui .loading-container .loading:before{animation:rotation 1s linear infinite,opacity .5s;backface-visibility:hidden;border:2px solid rgba(85,85,85,.1);border-radius:100%;border-top-color:rgba(0,0,0,.6);content:"";display:block;height:60px;left:50%;margin:-30px;opacity:1;position:absolute;top:50%;width:60px}@keyframes rotation{to{transform:rotate(1turn)}}.swagger-ui .response-controls{display:flex;padding-top:1em}.swagger-ui .response-control-media-type{margin-right:1em}.swagger-ui .response-control-media-type--accept-controller select{border-color:green}.swagger-ui .response-control-media-type__accept-message{color:green;font-size:.7em}.swagger-ui .response-control-examples__title,.swagger-ui .response-control-media-type__title{display:block;font-size:.7em;margin-bottom:.2em}@keyframes blinker{50%{opacity:0}}.swagger-ui .hidden{display:none}.swagger-ui .no-margin{border:none;height:auto;margin:0;padding:0}.swagger-ui .float-right{float:right}.swagger-ui .svg-assets{height:0;position:absolute;width:0}.swagger-ui section h3{color:#3b4151;font-family:sans-serif}.swagger-ui a.nostyle{display:inline}.swagger-ui a.nostyle,.swagger-ui a.nostyle:visited{color:inherit;cursor:pointer;text-decoration:inherit}.swagger-ui .fallback{color:#aaa;padding:1em}.swagger-ui .version-pragma{height:100%;padding:5em 0}.swagger-ui .version-pragma__message{display:flex;font-size:1.2em;height:100%;justify-content:center;line-height:1.5em;padding:0 .6em;text-align:center}.swagger-ui .version-pragma__message>div{flex:1;max-width:55ch}.swagger-ui .version-pragma__message code{background-color:#dedede;padding:4px 4px 2px;white-space:pre}.swagger-ui .opblock-link{font-weight:400}.swagger-ui .opblock-link.shown{font-weight:700}.swagger-ui span.token-string{color:#555}.swagger-ui span.token-not-formatted{color:#555;font-weight:700}.swagger-ui .btn{background:transparent;border:2px solid grey;border-radius:4px;box-shadow:0 1px 2px rgba(0,0,0,.1);color:#3b4151;font-family:sans-serif;font-size:14px;font-weight:700;padding:5px 23px;transition:all .3s}.swagger-ui .btn.btn-sm{font-size:12px;padding:4px 23px}.swagger-ui .btn[disabled]{cursor:not-allowed;opacity:.3}.swagger-ui .btn:hover{box-shadow:0 0 5px rgba(0,0,0,.3)}.swagger-ui .btn.cancel{background-color:transparent;border-color:#ff6060;color:#ff6060;font-family:sans-serif}.swagger-ui .btn.authorize{background-color:transparent;border-color:#49cc90;color:#49cc90;display:inline;line-height:1}.swagger-ui .btn.authorize span{float:left;padding:4px 20px 0 0}.swagger-ui .btn.authorize svg{fill:#49cc90}.swagger-ui .btn.execute{background-color:#4990e2;border-color:#4990e2;color:#fff}.swagger-ui .btn-group{display:flex;padding:30px}.swagger-ui .btn-group .btn{flex:1}.swagger-ui .btn-group .btn:first-child{border-radius:4px 0 0 4px}.swagger-ui .btn-group .btn:last-child{border-radius:0 4px 4px 0}.swagger-ui .authorization__btn{background:none;border:none;padding:0 0 0 10px}.swagger-ui .authorization__btn .locked{opacity:1}.swagger-ui .authorization__btn .unlocked{opacity:.4}.swagger-ui .model-box-control,.swagger-ui .models-control,.swagger-ui .opblock-summary-control{all:inherit;border-bottom:0;cursor:pointer;flex:1;padding:0}.swagger-ui .model-box-control:focus,.swagger-ui .models-control:focus,.swagger-ui .opblock-summary-control:focus{outline:auto}.swagger-ui .expand-methods,.swagger-ui .expand-operation{background:none;border:none}.swagger-ui .expand-methods svg,.swagger-ui .expand-operation svg{height:20px;width:20px}.swagger-ui .expand-methods{padding:0 10px}.swagger-ui .expand-methods:hover svg{fill:#404040}.swagger-ui .expand-methods svg{transition:all .3s;fill:#707070}.swagger-ui button{cursor:pointer}.swagger-ui button.invalid{animation:shake .4s 1;background:#feebeb;border-color:#f93e3e}.swagger-ui .copy-to-clipboard{align-items:center;background:#7d8293;border:none;border-radius:4px;bottom:10px;display:flex;height:30px;justify-content:center;position:absolute;right:100px;width:30px}.swagger-ui .copy-to-clipboard button{background:url("data:image/svg+xml;charset=utf-8,<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"16\" height=\"15\" aria-hidden=\"true\"><path fill=\"%23fff\" fill-rule=\"evenodd\" d=\"M4 12h4v1H4zm5-6H4v1h5zm2 3V7l-3 3 3 3v-2h5V9zM6.5 8H4v1h2.5zM4 11h2.5v-1H4zm9 1h1v2c-.02.28-.11.52-.3.7s-.42.28-.7.3H3c-.55 0-1-.45-1-1V3c0-.55.45-1 1-1h3c0-1.11.89-2 2-2s2 .89 2 2h3c.55 0 1 .45 1 1v5h-1V5H3v9h10zM4 4h8c0-.55-.45-1-1-1h-1c-.55 0-1-.45-1-1s-.45-1-1-1-1 .45-1 1-.45 1-1 1H5c-.55 0-1 .45-1 1\"/></svg>") 50% no-repeat;border:none;flex-grow:1;flex-shrink:1;height:25px}.swagger-ui .copy-to-clipboard:active{background:#5e626f}.swagger-ui .opblock-control-arrow{background:none;border:none;text-align:center}.swagger-ui .curl-command .copy-to-clipboard{bottom:5px;height:20px;right:10px;width:20px}.swagger-ui .curl-command .copy-to-clipboard button{height:18px}.swagger-ui .opblock .opblock-summary .view-line-link.copy-to-clipboard{height:26px;position:static}.swagger-ui select{-webkit-appearance:none;-moz-appearance:none;appearance:none;background:#f7f7f7 url("data:image/svg+xml;charset=utf-8,<svg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"0 0 20 20\"><path d=\"M13.418 7.859a.695.695 0 0 1 .978 0 .68.68 0 0 1 0 .969l-3.908 3.83a.697.697 0 0 1-.979 0l-3.908-3.83a.68.68 0 0 1 0-.969.695.695 0 0 1 .978 0L10 11z\"/></svg>") right 10px center no-repeat;background-size:20px;border:2px solid #41444e;border-radius:4px;box-shadow:0 1px 2px 0 rgba(0,0,0,.25);color:#3b4151;font-family:sans-serif;font-size:14px;font-weight:700;padding:5px 40px 5px 10px}.swagger-ui select[multiple]{background:#f7f7f7;margin:5px 0;padding:5px}.swagger-ui select.invalid{animation:shake .4s 1;background:#feebeb;border-color:#f93e3e}.swagger-ui .opblock-body select{min-width:230px}@media(max-width:768px){.swagger-ui .opblock-body select{min-width:180px}}@media(max-width:640px){.swagger-ui .opblock-body select{min-width:100%;width:100%}}.swagger-ui label{color:#3b4151;font-family:sans-serif;font-size:12px;font-weight:700;margin:0 0 5px}.swagger-ui input[type=email],.swagger-ui input[type=file],.swagger-ui input[type=password],.swagger-ui input[type=search],.swagger-ui input[type=text]{line-height:1}@media(max-width:768px){.swagger-ui input[type=email],.swagger-ui input[type=file],.swagger-ui input[type=password],.swagger-ui input[type=search],.swagger-ui input[type=text]{max-width:175px}}.swagger-ui input[type=email],.swagger-ui input[type=file],.swagger-ui input[type=password],.swagger-ui input[type=search],.swagger-ui input[type=text],.swagger-ui textarea{background:#fff;border:1px solid #d9d9d9;border-radius:4px;margin:5px 0;min-width:100px;padding:8px 10px}.swagger-ui input[type=email].invalid,.swagger-ui input[type=file].invalid,.swagger-ui input[type=password].invalid,.swagger-ui input[type=search].invalid,.swagger-ui input[type=text].invalid,.swagger-ui textarea.invalid{animation:shake .4s 1;background:#feebeb;border-color:#f93e3e}.swagger-ui input[disabled],.swagger-ui select[disabled],.swagger-ui textarea[disabled]{background-color:#fafafa;color:#888;cursor:not-allowed}.swagger-ui select[disabled]{border-color:#888}.swagger-ui textarea[disabled]{background-color:#41444e;color:#fff}@keyframes shake{10%,90%{transform:translate3d(-1px,0,0)}20%,80%{transform:translate3d(2px,0,0)}30%,50%,70%{transform:translate3d(-4px,0,0)}40%,60%{transform:translate3d(4px,0,0)}}.swagger-ui textarea{background:hsla(0,0%,100%,.8);border:none;border-radius:4px;color:#3b4151;font-family:monospace;font-size:12px;font-weight:600;min-height:280px;outline:none;padding:10px;width:100%}.swagger-ui textarea:focus{border:2px solid #61affe}.swagger-ui textarea.curl{background:#41444e;border-radius:4px;color:#fff;font-family:monospace;font-size:12px;font-weight:600;margin:0;min-height:100px;padding:10px;resize:none}.swagger-ui .checkbox{color:#303030;padding:5px 0 10px;transition:opacity .5s}.swagger-ui .checkbox label{display:flex}.swagger-ui .checkbox p{color:#3b4151;font-family:monospace;font-style:italic;font-weight:400!important;font-weight:600;margin:0!important}.swagger-ui .checkbox input[type=checkbox]{display:none}.swagger-ui .checkbox input[type=checkbox]+label>.item{background:#e8e8e8;border-radius:1px;box-shadow:0 0 0 2px #e8e8e8;cursor:pointer;display:inline-block;flex:none;height:16px;margin:0 8px 0 0;padding:5px;position:relative;top:3px;width:16px}.swagger-ui .checkbox input[type=checkbox]+label>.item:active{transform:scale(.9)}.swagger-ui .checkbox input[type=checkbox]:checked+label>.item{background:#e8e8e8 url("data:image/svg+xml;charset=utf-8,<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"10\" height=\"8\" viewBox=\"3 7 10 8\"><path fill=\"%2341474E\" fill-rule=\"evenodd\" d=\"M6.333 15 3 11.667l1.333-1.334 2 2L11.667 7 13 8.333z\"/></svg>") 50% no-repeat}.swagger-ui .dialog-ux{bottom:0;left:0;position:fixed;right:0;top:0;z-index:9999}.swagger-ui .dialog-ux .backdrop-ux{background:rgba(0,0,0,.8);bottom:0;left:0;position:fixed;right:0;top:0}.swagger-ui .dialog-ux .modal-ux{background:#fff;border:1px solid #ebebeb;border-radius:4px;box-shadow:0 10px 30px 0 rgba(0,0,0,.2);left:50%;max-width:650px;min-width:300px;position:absolute;top:50%;transform:translate(-50%,-50%);width:100%;z-index:9999}.swagger-ui .dialog-ux .modal-ux-content{max-height:540px;overflow-y:auto;padding:20px}.swagger-ui .dialog-ux .modal-ux-content p{color:#41444e;color:#3b4151;font-family:sans-serif;font-size:12px;margin:0 0 5px}.swagger-ui .dialog-ux .modal-ux-content h4{color:#3b4151;font-family:sans-serif;font-size:18px;font-weight:600;margin:15px 0 0}.swagger-ui .dialog-ux .modal-ux-header{align-items:center;border-bottom:1px solid #ebebeb;display:flex;padding:12px 0}.swagger-ui .dialog-ux .modal-ux-header .close-modal{-webkit-appearance:none;-moz-appearance:none;appearance:none;background:none;border:none;padding:0 10px}.swagger-ui .dialog-ux .modal-ux-header h3{color:#3b4151;flex:1;font-family:sans-serif;font-size:20px;font-weight:600;margin:0;padding:0 20px}.swagger-ui .model{color:#3b4151;font-family:monospace;font-size:12px;font-weight:300;font-weight:600}.swagger-ui .model .deprecated span,.swagger-ui .model .deprecated td{color:#a0a0a0!important}.swagger-ui .model .deprecated>td:first-of-type{-webkit-text-decoration:line-through;text-decoration:line-through}.swagger-ui .model-toggle{cursor:pointer;display:inline-block;font-size:10px;margin:auto .3em;position:relative;top:6px;transform:rotate(90deg);transform-origin:50% 50%;transition:transform .15s ease-in}.swagger-ui .model-toggle.collapsed{transform:rotate(0deg)}.swagger-ui .model-toggle:after{background:url("data:image/svg+xml;charset=utf-8,<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\"><path d=\"M10 6 8.59 7.41 13.17 12l-4.58 4.59L10 18l6-6z\"/></svg>") 50% no-repeat;background-size:100%;content:"";display:block;height:20px;width:20px}.swagger-ui .model-jump-to-path{cursor:pointer;position:relative}.swagger-ui .model-jump-to-path .view-line-link{cursor:pointer;position:absolute;top:-.4em}.swagger-ui .model-title{position:relative}.swagger-ui .model-title:hover .model-hint{display:block}.swagger-ui .model-hint{background:rgba(0,0,0,.7);border-radius:4px;color:#ebebeb;display:none;padding:.1em .5em;position:absolute;top:-1.8em;white-space:nowrap}.swagger-ui .model p{margin:0 0 1em}.swagger-ui .model .property{color:#999;font-style:italic}.swagger-ui .model .property.primitive{color:#6b6b6b}.swagger-ui .model .property.primitive.extension{display:block}.swagger-ui .model .property.primitive.extension>td:first-child{padding-left:0;padding-right:0;width:auto}.swagger-ui .model .property.primitive.extension>td:first-child:after{content:": "}.swagger-ui .model .external-docs,.swagger-ui table.model tr.description{color:#666;font-weight:400}.swagger-ui table.model tr.description td:first-child,.swagger-ui table.model tr.property-row.required td:first-child{font-weight:700}.swagger-ui table.model tr.property-row td{vertical-align:top}.swagger-ui table.model tr.property-row td:first-child{padding-right:.2em}.swagger-ui table.model tr.property-row .star{color:red}.swagger-ui table.model tr.extension{color:#777}.swagger-ui table.model tr.extension td:last-child{vertical-align:top}.swagger-ui table.model tr.external-docs td:first-child{font-weight:700}.swagger-ui table.model tr .renderedMarkdown p:first-child{margin-top:0}.swagger-ui section.models{border:1px solid rgba(59,65,81,.3);border-radius:4px;margin:30px 0}.swagger-ui section.models .pointer{cursor:pointer}.swagger-ui section.models.is-open{padding:0 0 20px}.swagger-ui section.models.is-open h4{border-bottom:1px solid rgba(59,65,81,.3);margin:0 0 5px}.swagger-ui section.models h4{align-items:center;color:#606060;cursor:pointer;display:flex;font-family:sans-serif;font-size:16px;margin:0;padding:10px 20px 10px 10px;transition:all .2s}.swagger-ui section.models h4 svg{transition:all .4s}.swagger-ui section.models h4 span{flex:1}.swagger-ui section.models h4:hover{background:rgba(0,0,0,.02)}.swagger-ui section.models h5{color:#707070;font-family:sans-serif;font-size:16px;margin:0 0 10px}.swagger-ui section.models .model-jump-to-path{position:relative;top:5px}.swagger-ui section.models .model-container{background:rgba(0,0,0,.05);border-radius:4px;margin:0 20px 15px;position:relative;transition:all .5s}.swagger-ui section.models .model-container:hover{background:rgba(0,0,0,.07)}.swagger-ui section.models .model-container:first-of-type{margin:20px}.swagger-ui section.models .model-container:last-of-type{margin:0 20px}.swagger-ui section.models .model-container .models-jump-to-path{opacity:.65;position:absolute;right:5px;top:8px}.swagger-ui section.models .model-box{background:none}.swagger-ui section.models .model-box:has(.model-box){overflow-x:auto;width:100%}.swagger-ui .model-box{background:rgba(0,0,0,.1);border-radius:4px;display:inline-block;padding:10px}.swagger-ui .model-box .model-jump-to-path{position:relative;top:4px}.swagger-ui .model-box.deprecated{opacity:.5}.swagger-ui .model-title{color:#505050;font-family:sans-serif;font-size:16px}.swagger-ui .model-title img{bottom:0;margin-left:1em;position:relative}.swagger-ui .model-deprecated-warning{color:#f93e3e;font-family:sans-serif;font-size:16px;font-weight:600;margin-right:1em}.swagger-ui span>span.model .brace-close{padding:0 0 0 10px}.swagger-ui .prop-name{display:inline-block;margin-right:1em}.swagger-ui .prop-type{color:#55a}.swagger-ui .prop-enum{display:block}.swagger-ui .prop-format{color:#606060}.swagger-ui .servers>label{color:#3b4151;font-family:sans-serif;font-size:12px;margin:-20px 15px 0 0}.swagger-ui .servers>label select{max-width:100%;min-width:130px;width:100%}.swagger-ui .servers h4.message{padding-bottom:2em}.swagger-ui .servers table tr{width:30em}.swagger-ui .servers table td{display:inline-block;max-width:15em;padding-bottom:10px;padding-top:10px;vertical-align:middle}.swagger-ui .servers table td:first-of-type{padding-right:1em}.swagger-ui .servers table td input{height:100%;width:100%}.swagger-ui .servers .computed-url{margin:2em 0}.swagger-ui .servers .computed-url code{display:inline-block;font-size:16px;margin:0 1em;padding:4px}.swagger-ui .servers-title{font-size:12px;font-weight:700}.swagger-ui .operation-servers h4.message{margin-bottom:2em}.swagger-ui table{border-collapse:collapse;padding:0 10px;width:100%}.swagger-ui table.model tbody tr td{padding:0 0 0 1em;vertical-align:top}.swagger-ui table.model tbody tr td:first-of-type{padding:0 0 0 2em;width:174px}.swagger-ui table.headers td{color:#3b4151;font-family:monospace;font-size:12px;font-weight:300;font-weight:600;vertical-align:middle}.swagger-ui table.headers .header-example{color:#999;font-style:italic}.swagger-ui table tbody tr td{padding:10px 0 0;vertical-align:top}.swagger-ui table tbody tr td:first-of-type{min-width:6em;padding:10px 0}.swagger-ui table tbody tr td:has(.model-box){max-width:1px}.swagger-ui table thead tr td,.swagger-ui table thead tr th{border-bottom:1px solid rgba(59,65,81,.2);color:#3b4151;font-family:sans-serif;font-size:12px;font-weight:700;padding:12px 0;text-align:left}.swagger-ui .parameters-col_description{margin-bottom:2em;width:99%}.swagger-ui .parameters-col_description input{max-width:340px;width:100%}.swagger-ui .parameters-col_description select{border-width:1px}.swagger-ui .parameters-col_description .markdown:first-child p:first-child,.swagger-ui .parameters-col_description .renderedMarkdown:first-child p:first-child{margin:0}.swagger-ui .parameter__name{color:#3b4151;font-family:sans-serif;font-size:16px;font-weight:400;margin-right:.75em}.swagger-ui .parameter__name.required{font-weight:700}.swagger-ui .parameter__name.required span{color:red}.swagger-ui .parameter__name.required:after{color:rgba(255,0,0,.6);content:"required";font-size:10px;padding:5px;position:relative;top:-6px}.swagger-ui .parameter__extension,.swagger-ui .parameter__in{color:grey;font-family:monospace;font-size:12px;font-style:italic;font-weight:600}.swagger-ui .parameter__deprecated{color:red;font-family:monospace;font-size:12px;font-style:italic;font-weight:600}.swagger-ui .parameter__empty_value_toggle{display:block;font-size:13px;padding-bottom:12px;padding-top:5px}.swagger-ui .parameter__empty_value_toggle input{margin-right:7px;width:auto}.swagger-ui .parameter__empty_value_toggle.disabled{opacity:.7}.swagger-ui .table-container{padding:20px}.swagger-ui .response-col_description{width:99%}.swagger-ui .response-col_description .markdown p:first-child,.swagger-ui .response-col_description .renderedMarkdown p:first-child{margin:0}.swagger-ui .response-col_description .markdown p:last-child,.swagger-ui .response-col_description .renderedMarkdown p:last-child{margin-bottom:0}.swagger-ui .response-col_links{min-width:6em}.swagger-ui .response__extension{color:grey;font-family:monospace;font-size:12px;font-style:italic;font-weight:600}.swagger-ui .topbar{background-color:#1b1b1b;padding:10px 0}.swagger-ui .topbar .topbar-wrapper{align-items:center;display:flex;flex-wrap:wrap;gap:10px}@media(max-width:550px){.swagger-ui .topbar .topbar-wrapper{align-items:start;flex-direction:column}}.swagger-ui .topbar a{align-items:center;color:#fff;display:flex;flex:1;font-family:sans-serif;font-size:1.5em;font-weight:700;max-width:300px;-webkit-text-decoration:none;text-decoration:none}.swagger-ui .topbar a span{margin:0;padding:0 10px}.swagger-ui .topbar .download-url-wrapper{display:flex;flex:3;justify-content:flex-end;margin-left:auto;max-width:600px}.swagger-ui .topbar .download-url-wrapper input[type=text]{border:2px solid #62a03f;border-radius:4px 0 0 4px;margin:0;max-width:100%;outline:none;width:100%}.swagger-ui .topbar .download-url-wrapper .select-label{align-items:center;color:#f0f0f0;display:flex;margin:0;max-width:600px;width:100%}.swagger-ui .topbar .download-url-wrapper .select-label span{flex:1;font-size:16px;padding:0 10px 0 0;text-align:right}.swagger-ui .topbar .download-url-wrapper .select-label select{border:2px solid #62a03f;box-shadow:none;flex:2;outline:none;width:100%}.swagger-ui .topbar .download-url-wrapper .download-url-button{background:#62a03f;border:none;border-radius:0 4px 4px 0;color:#fff;font-family:sans-serif;font-size:16px;font-weight:700;padding:4px 30px}@media(max-width:550px){.swagger-ui .topbar .download-url-wrapper{width:100%}}.swagger-ui .topbar .dark-mode-toggle{cursor:pointer;margin-left:10px;opacity:.8;transition:all .2s}.swagger-ui .topbar .dark-mode-toggle button{background:none;border:none;padding:0}.swagger-ui .topbar .dark-mode-toggle button svg{fill:#e4e6e6}.swagger-ui .topbar .dark-mode-toggle:hover{opacity:1}.swagger-ui .info{margin:50px 0}.swagger-ui .info.failed-config{margin-left:auto;margin-right:auto;max-width:880px;text-align:center}.swagger-ui .info hgroup.main{margin:0 0 20px}.swagger-ui .info hgroup.main a{font-size:12px}.swagger-ui .info li,.swagger-ui .info p,.swagger-ui .info pre,.swagger-ui .info table{font-size:14px}.swagger-ui .info h1,.swagger-ui .info h2,.swagger-ui .info h3,.swagger-ui .info h4,.swagger-ui .info h5,.swagger-ui .info li,.swagger-ui .info p,.swagger-ui .info table{color:#3b4151;font-family:sans-serif}.swagger-ui .info a{color:#4990e2;font-family:sans-serif;font-size:14px;transition:all .4s}.swagger-ui .info a:hover{color:#1f69c0}.swagger-ui .info>div{margin:0 0 5px}.swagger-ui .info .base-url{color:#3b4151;font-family:monospace;font-size:12px;font-weight:300!important;font-weight:600;margin:0}.swagger-ui .info .title{color:#3b4151;font-family:sans-serif;font-size:36px;margin:0}.swagger-ui .info .title small{background:#7d8492;border-radius:57px;display:inline-block;font-size:10px;margin:0 0 0 5px;padding:2px 4px;position:relative;top:-5px;vertical-align:super}.swagger-ui .info .title small.version-stamp{background-color:#89bf04}.swagger-ui .info .title small pre{color:#fff;font-family:sans-serif;margin:0;padding:0}.swagger-ui .auth-btn-wrapper{display:flex;justify-content:center;padding:10px 0}.swagger-ui .auth-btn-wrapper .btn-done{margin-right:1em}.swagger-ui .auth-wrapper{display:flex;flex:1;justify-content:flex-end}.swagger-ui .auth-wrapper .authorize{margin-left:10px;margin-right:10px;padding-right:20px}.swagger-ui .auth-container{border-bottom:1px solid #ebebeb;margin:0 0 10px;padding:10px 20px}.swagger-ui .auth-container:last-of-type{border:0;margin:0;padding:10px 20px}.swagger-ui .auth-container h4{margin:5px 0 15px!important}.swagger-ui .auth-container .wrapper{margin:0;padding:0}.swagger-ui .auth-container input[type=password],.swagger-ui .auth-container input[type=text]{min-width:230px}.swagger-ui .auth-container .errors{background-color:#fee;border-radius:4px;color:red;color:#3b4151;font-family:monospace;font-size:12px;font-weight:600;margin:1em;padding:10px}.swagger-ui .auth-container .errors b{margin-right:1em;text-transform:capitalize}.swagger-ui .scopes h2{color:#3b4151;font-family:sans-serif;font-size:14px}.swagger-ui .scopes h2 a{color:#4990e2;cursor:pointer;font-size:12px;padding-left:10px;-webkit-text-decoration:underline;text-decoration:underline}.swagger-ui .scope-def{padding:0 0 20px}.swagger-ui .errors-wrapper{animation:scaleUp .5s;background:rgba(249,62,62,.1);border:2px solid #f93e3e;border-radius:4px;margin:20px;padding:10px 20px}.swagger-ui .errors-wrapper .error-wrapper{margin:0 0 10px}.swagger-ui .errors-wrapper .errors h4{color:#3b4151;font-family:monospace;font-size:14px;font-weight:600;margin:0}.swagger-ui .errors-wrapper .errors small{color:#606060}.swagger-ui .errors-wrapper .errors .message{white-space:pre-line}.swagger-ui .errors-wrapper .errors .message.thrown{max-width:100%}.swagger-ui .errors-wrapper .errors .error-line{cursor:pointer;-webkit-text-decoration:underline;text-decoration:underline}.swagger-ui .errors-wrapper hgroup{align-items:center;display:flex}.swagger-ui .errors-wrapper hgroup h4{color:#3b4151;flex:1;font-family:sans-serif;font-size:20px;margin:0}@keyframes scaleUp{0%{opacity:0;transform:scale(.8)}to{opacity:1;transform:scale(1)}}.swagger-ui .Resizer.vertical.disabled{display:none}.swagger-ui .markdown p,.swagger-ui .markdown pre,.swagger-ui .renderedMarkdown p,.swagger-ui .renderedMarkdown pre{margin:1em auto;word-break:break-all;word-break:break-word}.swagger-ui .markdown pre,.swagger-ui .renderedMarkdown pre{background:none;color:#000;font-weight:400;padding:0;white-space:pre-wrap}.swagger-ui .markdown code,.swagger-ui .renderedMarkdown code{background:rgba(0,0,0,.05);border-radius:4px;color:#9012fe;font-family:monospace;font-size:14px;font-weight:600;padding:5px 7px}.swagger-ui .markdown pre>code,.swagger-ui .renderedMarkdown pre>code{display:block}.swagger-ui .json-schema-2020-12-keyword--\$vocabulary ul{border-left:1px dashed rgba(0,0,0,.1);margin:0 0 0 20px}.swagger-ui .json-schema-2020-12-\$vocabulary-uri{margin-left:35px}.swagger-ui .json-schema-2020-12-\$vocabulary-uri--disabled{-webkit-text-decoration:line-through;text-decoration:line-through}.swagger-ui .json-schema-2020-12-keyword--const .json-schema-2020-12-json-viewer__name,.swagger-ui .json-schema-2020-12-keyword--const .json-schema-2020-12-json-viewer__value{color:#3b4151;font-style:normal}.swagger-ui .json-schema-2020-12__constraint{background-color:#805ad5;border-radius:4px;color:#3b4151;color:#fff;font-family:monospace;font-weight:600;line-height:1.5;margin-left:10px;padding:1px 3px}.swagger-ui .json-schema-2020-12__constraint--string{background-color:#d69e2e;color:#fff}.swagger-ui .json-schema-2020-12-keyword--default .json-schema-2020-12-json-viewer__name,.swagger-ui .json-schema-2020-12-keyword--default .json-schema-2020-12-json-viewer__value{color:#3b4151;font-style:normal}.swagger-ui .json-schema-2020-12-keyword--dependentRequired>ul{display:inline-block;margin:0;padding:0}.swagger-ui .json-schema-2020-12-keyword--dependentRequired>ul li{display:inline;list-style-type:none}.swagger-ui .json-schema-2020-12-keyword--description{color:#6b6b6b;font-size:12px;margin-left:20px}.swagger-ui .json-schema-2020-12-keyword--description p{margin:0}.swagger-ui .json-schema-2020-12-keyword--enum .json-schema-2020-12-json-viewer__name,.swagger-ui .json-schema-2020-12-keyword--enum .json-schema-2020-12-json-viewer__value,.swagger-ui .json-schema-2020-12-keyword--examples .json-schema-2020-12-json-viewer__name,.swagger-ui .json-schema-2020-12-keyword--examples .json-schema-2020-12-json-viewer__value{color:#3b4151;font-style:normal}.swagger-ui .json-schema-2020-12-json-viewer-extension-keyword .json-schema-2020-12-json-viewer__name,.swagger-ui .json-schema-2020-12-json-viewer-extension-keyword .json-schema-2020-12-json-viewer__value{color:#929292;font-style:italic}.swagger-ui .json-schema-2020-12-keyword--patternProperties ul{border:none;margin:0;padding:0}.swagger-ui .json-schema-2020-12-keyword--patternProperties .json-schema-2020-12__title:first-of-type:after,.swagger-ui .json-schema-2020-12-keyword--patternProperties .json-schema-2020-12__title:first-of-type:before{color:#55a;content:"/"}.swagger-ui .json-schema-2020-12-keyword--properties>ul{border:none;margin:0;padding:0}.swagger-ui .json-schema-2020-12-property{list-style-type:none}.swagger-ui .json-schema-2020-12-property--required>.json-schema-2020-12:first-of-type>.json-schema-2020-12-head .json-schema-2020-12__title:after{color:red;content:"*";font-weight:700}.swagger-ui .json-schema-2020-12__title{color:#505050;display:inline-block;font-family:sans-serif;font-size:12px;font-weight:700;line-height:normal}.swagger-ui .json-schema-2020-12__title .json-schema-2020-12-keyword__name{margin:0}.swagger-ui .json-schema-2020-12-property{margin:7px 0}.swagger-ui .json-schema-2020-12-property .json-schema-2020-12__title{color:#3b4151;font-family:monospace;font-size:12px;font-weight:600;vertical-align:middle}.swagger-ui .json-schema-2020-12-keyword{margin:5px 0}.swagger-ui .json-schema-2020-12-keyword__children{border-left:1px dashed rgba(0,0,0,.1);margin:0 0 0 20px;padding:0}.swagger-ui .json-schema-2020-12-keyword__children--collapsed{display:none}.swagger-ui .json-schema-2020-12-keyword__name{font-size:12px;font-weight:700;margin-left:20px}.swagger-ui .json-schema-2020-12-keyword__name--primary{color:#3b4151;font-style:normal}.swagger-ui .json-schema-2020-12-keyword__name--secondary{color:#6b6b6b;font-style:italic}.swagger-ui .json-schema-2020-12-keyword__name--extension{color:#929292;font-style:italic}.swagger-ui .json-schema-2020-12-keyword__value{color:#6b6b6b;font-size:12px;font-style:italic;font-weight:400}.swagger-ui .json-schema-2020-12-keyword__value--primary{color:#3b4151;font-style:normal}.swagger-ui .json-schema-2020-12-keyword__value--secondary{color:#6b6b6b;font-style:italic}.swagger-ui .json-schema-2020-12-keyword__value--extension{color:#929292;font-style:italic}.swagger-ui .json-schema-2020-12-keyword__value--warning{border:1px dashed red;border-radius:4px;color:#3b4151;color:red;display:inline-block;font-family:monospace;font-style:normal;font-weight:600;line-height:1.5;margin-left:10px;padding:1px 4px}.swagger-ui .json-schema-2020-12-keyword__name--secondary+.json-schema-2020-12-keyword__value--secondary:before{content:"="}.swagger-ui .json-schema-2020-12__attribute{color:#3b4151;font-family:monospace;font-size:12px;padding-left:10px;text-transform:lowercase}.swagger-ui .json-schema-2020-12__attribute--primary{color:#55a}.swagger-ui .json-schema-2020-12__attribute--muted{color:gray}.swagger-ui .json-schema-2020-12__attribute--warning{color:red}.swagger-ui .json-schema-2020-12-json-viewer{margin:5px 0}.swagger-ui .json-schema-2020-12-json-viewer__children{border-left:1px dashed rgba(0,0,0,.1);margin:0 0 0 20px;padding:0}.swagger-ui .json-schema-2020-12-json-viewer__children--collapsed{display:none}.swagger-ui .json-schema-2020-12-json-viewer__name{font-size:12px;font-weight:700;margin-left:20px}.swagger-ui .json-schema-2020-12-json-viewer__name--primary{color:#3b4151;font-style:normal}.swagger-ui .json-schema-2020-12-json-viewer__name--secondary{color:#6b6b6b;font-style:italic}.swagger-ui .json-schema-2020-12-json-viewer__name--extension{color:#929292;font-style:italic}.swagger-ui .json-schema-2020-12-json-viewer__value{color:#6b6b6b;font-size:12px;font-style:italic;font-weight:400}.swagger-ui .json-schema-2020-12-json-viewer__value--primary{color:#3b4151;font-style:normal}.swagger-ui .json-schema-2020-12-json-viewer__value--secondary{color:#6b6b6b;font-style:italic}.swagger-ui .json-schema-2020-12-json-viewer__value--extension{color:#929292;font-style:italic}.swagger-ui .json-schema-2020-12-json-viewer__value--warning{border:1px dashed red;border-radius:4px;color:#3b4151;color:red;display:inline-block;font-family:monospace;font-style:normal;font-weight:600;line-height:1.5;margin-left:10px;padding:1px 4px}.swagger-ui .json-schema-2020-12-json-viewer__name--secondary+.json-schema-2020-12-json-viewer__value--secondary:before{content:"="}.swagger-ui .json-schema-2020-12{background-color:rgba(0,0,0,.05);border-radius:4px;margin:0 20px 15px;padding:12px 0 12px 20px}.swagger-ui .json-schema-2020-12:first-of-type{margin:20px}.swagger-ui .json-schema-2020-12:last-of-type{margin:0 20px}.swagger-ui .json-schema-2020-12--embedded{background-color:inherit;padding-bottom:0;padding-left:inherit;padding-right:inherit;padding-top:0}.swagger-ui .json-schema-2020-12-body{border-left:1px dashed rgba(0,0,0,.1);margin:2px 0}.swagger-ui .json-schema-2020-12-body--collapsed{display:none}.swagger-ui .json-schema-2020-12-accordion{border:none;outline:none;padding-left:0}.swagger-ui .json-schema-2020-12-accordion__children{display:inline-block}.swagger-ui .json-schema-2020-12-accordion__icon{display:inline-block;height:18px;vertical-align:bottom;width:18px}.swagger-ui .json-schema-2020-12-accordion__icon--expanded{transform:rotate(-90deg);transform-origin:50% 50%;transition:transform .15s ease-in}.swagger-ui .json-schema-2020-12-accordion__icon--collapsed{transform:rotate(0deg);transform-origin:50% 50%;transition:transform .15s ease-in}.swagger-ui .json-schema-2020-12-accordion__icon svg{height:20px;width:20px}.swagger-ui .json-schema-2020-12-expand-deep-button{border:none;color:#505050;color:#afaeae;font-family:sans-serif;font-size:12px;padding-right:0}.swagger-ui .model-box .json-schema-2020-12:not(.json-schema-2020-12--embedded)>.json-schema-2020-12-head .json-schema-2020-12__title:first-of-type{font-size:16px}.swagger-ui .model-box>.json-schema-2020-12{margin:0}.swagger-ui .model-box .json-schema-2020-12{background-color:transparent;padding:0}.swagger-ui .model-box .json-schema-2020-12-accordion,.swagger-ui .model-box .json-schema-2020-12-expand-deep-button{background-color:transparent}.swagger-ui .models .json-schema-2020-12:not(.json-schema-2020-12--embedded)>.json-schema-2020-12-head .json-schema-2020-12__title:first-of-type{font-size:16px}.swagger-ui .models .json-schema-2020-12:not(.json-schema-2020-12--embedded){overflow-x:auto;width:calc(100% - 40px)}html.dark-mode{background:#1c2022}html.dark-mode .swagger-ui{background:#1c2022;color:#e4e6e6}html.dark-mode .swagger-ui .authorization__btn svg,html.dark-mode .swagger-ui .expand-operation svg,html.dark-mode .swagger-ui .opblock-control-arrow svg{fill:#b7bcbf;opacity:1}html.dark-mode .swagger-ui .markdown p,html.dark-mode .swagger-ui .markdown pre,html.dark-mode .swagger-ui .renderedMarkdown p,html.dark-mode .swagger-ui .renderedMarkdown pre,html.dark-mode .swagger-ui section h3,html.dark-mode .swagger-ui table thead tr td,html.dark-mode .swagger-ui table thead tr th{color:#e4e6e6}html.dark-mode .swagger-ui .markdown code,html.dark-mode .swagger-ui .renderedMarkdown code{background:#080a0b;color:#b68ae1}html.dark-mode .swagger-ui input{background:#1c2022;border-color:#b7bcbf;color:#f0f1f1}html.dark-mode .swagger-ui input:focus:not(.download-url-input){border-color:#51a8ff!important;box-shadow:none;outline:none}html.dark-mode .swagger-ui textarea{background:#0d1014;border:1px solid #0d1014;color:#f0f1f1}html.dark-mode .swagger-ui textarea:focus{border-color:#51a8ff}html.dark-mode .swagger-ui textarea[disabled]{background-color:#202225;border-color:#202225;color:#8c969a}html.dark-mode .swagger-ui select{background:#1c2022 url("data:image/svg+xml;charset=utf-8,<svg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"0 0 20 20\"><path fill=\"%23B7BCBF\" d=\"M13.418 7.859a.695.695 0 0 1 .978 0 .68.68 0 0 1 0 .969l-3.908 3.83a.697.697 0 0 1-.979 0l-3.908-3.83a.68.68 0 0 1 0-.969.695.695 0 0 1 .978 0L10 11z\"/></svg>") right 10px center no-repeat;border-color:#b7bcbf;box-shadow:none;color:#f0f1f1;outline:none}html.dark-mode .swagger-ui select[multiple]{background:#1c2022}html.dark-mode .swagger-ui select:focus{border-color:#51a8ff}html.dark-mode .swagger-ui input::-moz-placeholder, html.dark-mode .swagger-ui textarea::-moz-placeholder{color:#f0f1f1;opacity:.5}html.dark-mode .swagger-ui input::placeholder,html.dark-mode .swagger-ui textarea::placeholder{color:#f0f1f1;opacity:.5}html.dark-mode .swagger-ui input.invalid,html.dark-mode .swagger-ui select.invalid,html.dark-mode .swagger-ui textarea.invalid{background:#1c2022;border-color:#ff5f5f}html.dark-mode .swagger-ui .topbar{background:#2a2e30}html.dark-mode .swagger-ui .topbar .download-url-wrapper .download-url-button{background:#1d632e;color:#e4e6e6}html.dark-mode .swagger-ui .topbar .download-url-wrapper .download-url-input{border-color:#1d632e}html.dark-mode .swagger-ui .topbar .download-url-wrapper .download-url-input.failed{color:#ff5f5f}html.dark-mode .swagger-ui .dialog-ux .modal-ux{background-color:#2a2e30;border:none;color:#e4e6e6}html.dark-mode .swagger-ui .dialog-ux .modal-ux-header{border-color:#545d61}html.dark-mode .swagger-ui .dialog-ux .modal-ux-header .close-modal svg{fill:#e4e6e6}html.dark-mode .swagger-ui .dialog-ux .modal-ux h2,html.dark-mode .swagger-ui .dialog-ux .modal-ux h3,html.dark-mode .swagger-ui .dialog-ux .modal-ux h4,html.dark-mode .swagger-ui .dialog-ux .modal-ux h5,html.dark-mode .swagger-ui .dialog-ux .modal-ux label,html.dark-mode .swagger-ui .dialog-ux .modal-ux p{color:#e4e6e6}html.dark-mode .swagger-ui .dialog-ux .modal-ux .scopes a{color:#51a8ff}html.dark-mode .swagger-ui .dialog-ux .modal-ux .btn.modal-btn{border-color:#3ece90;color:#3ece90}html.dark-mode .swagger-ui .dialog-ux .modal-ux .btn.modal-btn.btn-done{border-color:#e4e6e6;color:#e4e6e6}html.dark-mode .swagger-ui .dialog-ux .modal-ux .auth-container{border-color:#545d61}html.dark-mode .swagger-ui .dialog-ux .modal-ux .checkbox input[type=checkbox]+label>.item{background:#545d61;box-shadow:none;color:#f0f1f1!important}html.dark-mode .swagger-ui .dialog-ux .modal-ux .checkbox input[type=checkbox]:checked+label>.item{background:#545d61 url("data:image/svg+xml;charset=utf-8,<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"10\" height=\"8\" viewBox=\"3 7 10 8\"><path fill=\"%23E4E6E6\" fill-rule=\"evenodd\" d=\"M6.333 15 3 11.667l1.333-1.334 2 2L11.667 7 13 8.333z\"/></svg>") 50% no-repeat}html.dark-mode .swagger-ui .loading-container .loading:before{border-color:#e4e6e6 #545d61 #545d61}html.dark-mode .swagger-ui .loading-container .loading:after{color:#e4e6e6}html.dark-mode .swagger-ui .scheme-container{background:#1c2022;box-shadow:0 1px 2px 0 #545d61}html.dark-mode .swagger-ui .scheme-container .schemes>.schemes-server-container>label{color:#e4e6e6}html.dark-mode .swagger-ui .scheme-container .btn.authorize{border-color:#3ece90;color:#3ece90}html.dark-mode .swagger-ui .scheme-container .btn.authorize svg{fill:#3ece90}html.dark-mode .swagger-ui .info .title,html.dark-mode .swagger-ui .info h1,html.dark-mode .swagger-ui .info h2,html.dark-mode .swagger-ui .info h3,html.dark-mode .swagger-ui .info h4,html.dark-mode .swagger-ui .info h5{color:#d2d6d7}html.dark-mode .swagger-ui .info .base-url,html.dark-mode .swagger-ui .info li,html.dark-mode .swagger-ui .info p,html.dark-mode .swagger-ui .info table{color:#e4e6e6}html.dark-mode .swagger-ui .info a{color:#51a8ff}html.dark-mode .swagger-ui .info .title small{background:#434b4f}html.dark-mode .swagger-ui .info .title small.version-stamp{background:#1d632e}html.dark-mode .swagger-ui .info .errors-wrapper{background:#434b4f;border-color:#ff5f5f}html.dark-mode .swagger-ui .info .errors-wrapper h4,html.dark-mode .swagger-ui .info .errors-wrapper span{color:#e4e6e6}html.dark-mode .swagger-ui .info .errors-wrapper .btn.errors__clear-btn{border-color:#e4e6e6;color:#e4e6e6}html.dark-mode .swagger-ui .copy-to-clipboard,html.dark-mode .swagger-ui .download-contents{background:#545d61;color:#e4e6e6}html.dark-mode .swagger-ui .copy-to-clipboard button,html.dark-mode .swagger-ui .download-contents button{background:url("data:image/svg+xml;charset=utf-8,<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"16\" height=\"15\" aria-hidden=\"true\"><path fill=\"%23E4E6E6\" fill-rule=\"evenodd\" d=\"M4 12h4v1H4zm5-6H4v1h5zm2 3V7l-3 3 3 3v-2h5V9zM6.5 8H4v1h2.5zM4 11h2.5v-1H4zm9 1h1v2c-.02.28-.11.52-.3.7s-.42.28-.7.3H3c-.55 0-1-.45-1-1V3c0-.55.45-1 1-1h3c0-1.11.89-2 2-2s2 .89 2 2h3c.55 0 1 .45 1 1v5h-1V5H3v9h10zM4 4h8c0-.55-.45-1-1-1h-1c-.55 0-1-.45-1-1s-.45-1-1-1-1 .45-1 1-.45 1-1 1H5c-.55 0-1 .45-1 1\"/></svg>") 50% no-repeat}html.dark-mode .swagger-ui .opblock-tag{border-bottom-color:#545d61;color:#e4e6e6}html.dark-mode .swagger-ui .opblock-tag small{color:#e4e6e6}html.dark-mode .swagger-ui .opblock-tag a.link{color:#51a8ff}html.dark-mode .swagger-ui .opblock.opblock-post{background:#112929;border-color:#104834}html.dark-mode .swagger-ui .opblock.opblock-post thead tr td,html.dark-mode .swagger-ui .opblock.opblock-post thead tr th{border-color:#104834;opacity:1}html.dark-mode .swagger-ui .opblock.opblock-post .opblock-section-header{background:#14392c;border-bottom:1px solid #104834;border-top:1px solid #104834}html.dark-mode .swagger-ui .opblock.opblock-post .opblock-section-header .tab-header .tab-item .opblock-title span:after{background:#00b572}html.dark-mode .swagger-ui .opblock.opblock-post .opblock-summary{border-bottom:none;border-color:#104834}html.dark-mode .swagger-ui .opblock.opblock-post .opblock-summary-control:focus{outline:none}html.dark-mode .swagger-ui .opblock.opblock-post .opblock-summary-method{background:#00b572;color:#080a0b;text-shadow:none}html.dark-mode .swagger-ui .opblock.opblock-post .opblock-body>.opblock-description-wrapper,html.dark-mode .swagger-ui .opblock.opblock-post .opblock-body>.opblock-title_normal{border-top:1px solid #104834}html.dark-mode .swagger-ui .opblock.opblock-deprecated{background:#272c34;border-color:#495361}html.dark-mode .swagger-ui .opblock.opblock-deprecated thead tr td,html.dark-mode .swagger-ui .opblock.opblock-deprecated thead tr th{border-color:#495361;opacity:1}html.dark-mode .swagger-ui .opblock.opblock-deprecated .opblock-section-header{background:#262e36;border-bottom:1px solid #495361;border-top:1px solid #495361}html.dark-mode .swagger-ui .opblock.opblock-deprecated .opblock-section-header .tab-header .tab-item .opblock-title span:after{background:#6a6a6a}html.dark-mode .swagger-ui .opblock.opblock-deprecated .opblock-summary{border-bottom:none;border-color:#495361}html.dark-mode .swagger-ui .opblock.opblock-deprecated .opblock-summary-control:focus{outline:none}html.dark-mode .swagger-ui .opblock.opblock-deprecated .opblock-summary-method{background:#6a6a6a;color:#080a0b;text-shadow:none}html.dark-mode .swagger-ui .opblock.opblock-deprecated .opblock-body>.opblock-description-wrapper,html.dark-mode .swagger-ui .opblock.opblock-deprecated .opblock-body>.opblock-title_normal{border-top:1px solid #495361}html.dark-mode .swagger-ui .opblock.opblock-put{background:#27201e;border-color:#523524}html.dark-mode .swagger-ui .opblock.opblock-put thead tr td,html.dark-mode .swagger-ui .opblock.opblock-put thead tr th{border-color:#523524;opacity:1}html.dark-mode .swagger-ui .opblock.opblock-put .opblock-section-header{background:#9a5b3e;border-bottom:1px solid #523524;border-top:1px solid #523524}html.dark-mode .swagger-ui .opblock.opblock-put .opblock-section-header .tab-header .tab-item .opblock-title span:after{background:#ff7d35}html.dark-mode .swagger-ui .opblock.opblock-put .opblock-summary{border-bottom:none;border-color:#523524}html.dark-mode .swagger-ui .opblock.opblock-put .opblock-summary-control:focus{outline:none}html.dark-mode .swagger-ui .opblock.opblock-put .opblock-summary-method{background:#ff7d35;color:#080a0b;text-shadow:none}html.dark-mode .swagger-ui .opblock.opblock-put .opblock-body>.opblock-description-wrapper,html.dark-mode .swagger-ui .opblock.opblock-put .opblock-body>.opblock-title_normal{border-top:1px solid #523524}html.dark-mode .swagger-ui .opblock.opblock-get{background:#182536;border-color:#294262}html.dark-mode .swagger-ui .opblock.opblock-get thead tr td,html.dark-mode .swagger-ui .opblock.opblock-get thead tr th{border-color:#294262;opacity:1}html.dark-mode .swagger-ui .opblock.opblock-get .opblock-section-header{background:#1c3043;border-bottom:1px solid #294262;border-top:1px solid #294262}html.dark-mode .swagger-ui .opblock.opblock-get .opblock-section-header .tab-header .tab-item .opblock-title span:after{background:#55a1ff}html.dark-mode .swagger-ui .opblock.opblock-get .opblock-summary{border-bottom:none;border-color:#294262}html.dark-mode .swagger-ui .opblock.opblock-get .opblock-summary-control:focus{outline:none}html.dark-mode .swagger-ui .opblock.opblock-get .opblock-summary-method{background:#55a1ff;color:#080a0b;text-shadow:none}html.dark-mode .swagger-ui .opblock.opblock-get .opblock-body>.opblock-description-wrapper,html.dark-mode .swagger-ui .opblock.opblock-get .opblock-body>.opblock-title_normal{border-top:1px solid #294262}html.dark-mode .swagger-ui .opblock.opblock-delete{background:#241a20;border-color:#4b2420}html.dark-mode .swagger-ui .opblock.opblock-delete thead tr td,html.dark-mode .swagger-ui .opblock.opblock-delete thead tr th{border-color:#4b2420;opacity:1}html.dark-mode .swagger-ui .opblock.opblock-delete .opblock-section-header{background:#2f2020;border-bottom:1px solid #4b2420;border-top:1px solid #4b2420}html.dark-mode .swagger-ui .opblock.opblock-delete .opblock-section-header .tab-header .tab-item .opblock-title span:after{background:#eb6156}html.dark-mode .swagger-ui .opblock.opblock-delete .opblock-summary{border-bottom:none;border-color:#4b2420}html.dark-mode .swagger-ui .opblock.opblock-delete .opblock-summary-control:focus{outline:none}html.dark-mode .swagger-ui .opblock.opblock-delete .opblock-summary-method{background:#eb6156;color:#080a0b;text-shadow:none}html.dark-mode .swagger-ui .opblock.opblock-delete .opblock-body>.opblock-description-wrapper,html.dark-mode .swagger-ui .opblock.opblock-delete .opblock-body>.opblock-title_normal{border-top:1px solid #4b2420}html.dark-mode .swagger-ui .opblock.opblock-patch{background:#11282f;border-color:#16494b}html.dark-mode .swagger-ui .opblock.opblock-patch thead tr td,html.dark-mode .swagger-ui .opblock.opblock-patch thead tr th{border-color:#16494b;opacity:1}html.dark-mode .swagger-ui .opblock.opblock-patch .opblock-section-header{background:#113239;border-bottom:1px solid #16494b;border-top:1px solid #16494b}html.dark-mode .swagger-ui .opblock.opblock-patch .opblock-section-header .tab-header .tab-item .opblock-title span:after{background:#03b7bf}html.dark-mode .swagger-ui .opblock.opblock-patch .opblock-summary{border-bottom:none;border-color:#16494b}html.dark-mode .swagger-ui .opblock.opblock-patch .opblock-summary-control:focus{outline:none}html.dark-mode .swagger-ui .opblock.opblock-patch .opblock-summary-method{background:#03b7bf;color:#080a0b;text-shadow:none}html.dark-mode .swagger-ui .opblock.opblock-patch .opblock-body>.opblock-description-wrapper,html.dark-mode .swagger-ui .opblock.opblock-patch .opblock-body>.opblock-title_normal{border-top:1px solid #16494b}html.dark-mode .swagger-ui .opblock.opblock-head{background:#282231;border-color:#44336a}html.dark-mode .swagger-ui .opblock.opblock-head thead tr td,html.dark-mode .swagger-ui .opblock.opblock-head thead tr th{border-color:#44336a;opacity:1}html.dark-mode .swagger-ui .opblock.opblock-head .opblock-section-header{background:#352c45;border-bottom:1px solid #44336a;border-top:1px solid #44336a}html.dark-mode .swagger-ui .opblock.opblock-head .opblock-section-header .tab-header .tab-item .opblock-title span:after{background:#b889ff}html.dark-mode .swagger-ui .opblock.opblock-head .opblock-summary{border-bottom:none;border-color:#44336a}html.dark-mode .swagger-ui .opblock.opblock-head .opblock-summary-control:focus{outline:none}html.dark-mode .swagger-ui .opblock.opblock-head .opblock-summary-method{background:#b889ff;color:#080a0b;text-shadow:none}html.dark-mode .swagger-ui .opblock.opblock-head .opblock-body>.opblock-description-wrapper,html.dark-mode .swagger-ui .opblock.opblock-head .opblock-body>.opblock-title_normal{border-top:1px solid #44336a}html.dark-mode .swagger-ui .opblock.opblock-options{background:#202c3c;border-color:#33465e}html.dark-mode .swagger-ui .opblock.opblock-options thead tr td,html.dark-mode .swagger-ui .opblock.opblock-options thead tr th{border-color:#33465e;opacity:1}html.dark-mode .swagger-ui .opblock.opblock-options .opblock-section-header{background:#314558;border-bottom:1px solid #33465e;border-top:1px solid #33465e}html.dark-mode .swagger-ui .opblock.opblock-options .opblock-section-header .tab-header .tab-item .opblock-title span:after{background:#6895c8}html.dark-mode .swagger-ui .opblock.opblock-options .opblock-summary{border-bottom:none;border-color:#33465e}html.dark-mode .swagger-ui .opblock.opblock-options .opblock-summary-control:focus{outline:none}html.dark-mode .swagger-ui .opblock.opblock-options .opblock-summary-method{background:#6895c8;color:#080a0b;text-shadow:none}html.dark-mode .swagger-ui .opblock.opblock-options .opblock-body>.opblock-description-wrapper,html.dark-mode .swagger-ui .opblock.opblock-options .opblock-body>.opblock-title_normal{border-top:1px solid #33465e}html.dark-mode .swagger-ui .opblock .opblock-section-header{box-shadow:none}html.dark-mode .swagger-ui .opblock .opblock-section-header h4,html.dark-mode .swagger-ui .opblock .opblock-section-header label{color:#e4e6e6}html.dark-mode .swagger-ui .opblock .opblock-section-header .try-out__btn{border-color:#b7bcbf;box-shadow:none;color:#e4e6e6}html.dark-mode .swagger-ui .opblock .opblock-section-header .try-out__btn.cancel{border-color:#ff5f5f;color:#ff5f5f}html.dark-mode .swagger-ui .opblock .btn.json-schema-form-item-add,html.dark-mode .swagger-ui .opblock .btn.json-schema-form-item-remove{border-color:#e4e6e6;color:#e4e6e6}html.dark-mode .swagger-ui .opblock .validation-errors.errors-wrapper{background:#434b4f;border-color:#ff5f5f;color:#e4e6e6}html.dark-mode .swagger-ui .opblock .body-param-options label span,html.dark-mode .swagger-ui .opblock .opblock-description-wrapper i,html.dark-mode .swagger-ui .opblock .opblock-description-wrapper p,html.dark-mode .swagger-ui .opblock .opblock-external-docs-wrapper,html.dark-mode .swagger-ui .opblock .opblock-summary-description,html.dark-mode .swagger-ui .opblock .opblock-summary-operation-id,html.dark-mode .swagger-ui .opblock .opblock-summary-path,html.dark-mode .swagger-ui .opblock .opblock-summary-path__deprecated,html.dark-mode .swagger-ui .opblock .opblock-title_normal,html.dark-mode .swagger-ui .opblock .parameter__in,html.dark-mode .swagger-ui .opblock .parameter__name,html.dark-mode .swagger-ui .opblock .parameter__type,html.dark-mode .swagger-ui .opblock .parameter__type .prop-format,html.dark-mode .swagger-ui .opblock .response-col_links,html.dark-mode .swagger-ui .opblock .response-col_status,html.dark-mode .swagger-ui .opblock .response-col_undocumented{color:#e4e6e6}html.dark-mode .swagger-ui .opblock .opblock-external-docs a.link{color:#51a8ff}html.dark-mode .swagger-ui .opblock .parameter__name.required span,html.dark-mode .swagger-ui .opblock .parameter__name.required:after{color:#ff5f5f}html.dark-mode .swagger-ui .opblock .parameter__empty_value_toggle{color:#e4e6e6}html.dark-mode .swagger-ui .opblock .btn.execute{background:#51a8ff;border-color:#51a8ff;color:#080a0b}html.dark-mode .swagger-ui .opblock .btn.btn-clear{border-color:#e4e6e6;color:#e4e6e6}html.dark-mode .swagger-ui .opblock .highlight-code pre.microlight{background:#2a2e30!important;color:#f0f1f1}html.dark-mode .swagger-ui .opblock .curl-command .btn{background:#3b424d!important;border-color:#2a2e30!important;color:#ebebeb!important}html.dark-mode .swagger-ui .opblock .curl-command .btn.active{background:#2a2e30!important;color:#e4e6e6!important}html.dark-mode .swagger-ui .opblock pre.microlight{background:#2a2e30!important;color:#f0f1f1}html.dark-mode .swagger-ui .opblock .model-example .tab button{color:#e4e6e6}html.dark-mode .swagger-ui .opblock .model-example .tabitem:after{background:#6b757a}html.dark-mode .swagger-ui .opblock .responses-inner h4,html.dark-mode .swagger-ui .opblock .responses-inner h5{color:#e4e6e6}html.dark-mode .swagger-ui .opblock .response-control-media-type--accept-controller select.content-type{border-color:#4ac966}html.dark-mode .swagger-ui .opblock .response-control-media-type--accept-controller .response-control-media-type__accept-message{color:#4ac966}html.dark-mode .swagger-ui .model-toggle:after{background:url("data:image/svg+xml;charset=utf-8,<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\"><path fill=\"%23e4e6e6\" d=\"M10 6 8.59 7.41 13.17 12l-4.58 4.59L10 18l6-6z\"/></svg>") 50% no-repeat;background-size:100%}html.dark-mode .swagger-ui .model .prop-type{color:#b68ae1}html.dark-mode .swagger-ui .model .brace-close,html.dark-mode .swagger-ui .model .brace-open,html.dark-mode .swagger-ui .model .description,html.dark-mode .swagger-ui .model .prop-format,html.dark-mode .swagger-ui .model .property,html.dark-mode .swagger-ui .model .property-row{color:#e4e6e6}html.dark-mode .swagger-ui .model .property-row.required .star{color:#ff5f5f}html.dark-mode .swagger-ui .model-box{background:#2a2e30}html.dark-mode .swagger-ui .model-box .model,html.dark-mode .swagger-ui .model-box .model-title{color:#e4e6e6}html.dark-mode .swagger-ui .model-box-control:focus{outline:none}html.dark-mode .swagger-ui .model-box-control:not(.prop){color:#e4e6e6}html.dark-mode .swagger-ui .json-schema-2020-12,html.dark-mode .swagger-ui .json-schema-2020-12 button{background:#2a2e30}html.dark-mode .swagger-ui .json-schema-2020-12 button svg{fill:#e4e6e6}html.dark-mode .swagger-ui .json-schema-2020-12 a{color:#51a8ff}html.dark-mode .swagger-ui .json-schema-2020-12__title{color:#e4e6e6}html.dark-mode .swagger-ui .json-schema-2020-12-property--required>.json-schema-2020-12:first-of-type>.json-schema-2020-12-head .json-schema-2020-12__title:after{color:#ff5f5f}html.dark-mode .swagger-ui .json-schema-2020-12-expand-deep-button{color:#b7bcbf}html.dark-mode .swagger-ui .json-schema-2020-12-body{border-color:#b7bcbf}html.dark-mode .swagger-ui .json-schema-2020-12-keyword__name--primary{color:#e4e6e6}html.dark-mode .swagger-ui .json-schema-2020-12-keyword__name--secondary,html.dark-mode .swagger-ui .json-schema-2020-12-keyword__value--secondary{color:#b7bcbf}html.dark-mode .swagger-ui .json-schema-2020-12-keyword__value--warning{border-color:#ff5f5f;color:#ff5f5f}html.dark-mode .swagger-ui .json-schema-2020-12-keyword--\$vocabulary ul{border-color:#b7bcbf}html.dark-mode .swagger-ui .json-schema-2020-12-keyword--patternProperties .json-schema-2020-12__title:after,html.dark-mode .swagger-ui .json-schema-2020-12-keyword--patternProperties .json-schema-2020-12__title:before,html.dark-mode .swagger-ui .json-schema-2020-12__attribute--primary{color:#9898ff}html.dark-mode .swagger-ui .json-schema-2020-12__attribute--muted{color:#b7bcbf}html.dark-mode .swagger-ui .json-schema-2020-12__attribute--warning{color:#ff5f5f}html.dark-mode .swagger-ui .json-schema-2020-12-json-viewer__name--secondary,html.dark-mode .swagger-ui .json-schema-2020-12-json-viewer__value--secondary{color:#b7bcbf}html.dark-mode .swagger-ui .json-schema-2020-12__constraint{background:#9898ff;color:#080a0b}html.dark-mode .swagger-ui .json-schema-2020-12__constraint--string{background:#d4aa53}html.dark-mode .swagger-ui section.models,html.dark-mode .swagger-ui section.models h4{border-color:#545d61}html.dark-mode .swagger-ui section.models h4 span{color:#e4e6e6}html.dark-mode .swagger-ui section.models .model-container{background:#2a2e30}html.dark-mode .swagger-ui section.models .models-control:focus{outline:none}html.dark-mode .swagger-ui section.models .models-control svg{fill:#b7bcbf}
+
+/*# sourceMappingURL=swagger-ui.css.map*/
\ No newline at end of file
diff --git a/vllm/entrypoints/serve/rpc/__init__.py b/vllm/entrypoints/serve/rpc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vllm/entrypoints/serve/rpc/api_router.py b/vllm/entrypoints/serve/rpc/api_router.py
new file mode 100644
index 0000000000000000000000000000000000000000..54f582c408d543756e599e16e8bc10c8ca51c55c
--- /dev/null
+++ b/vllm/entrypoints/serve/rpc/api_router.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from http import HTTPStatus
+from typing import Any
+
+from fastapi import APIRouter, FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, Response
+
+import vllm.envs as envs
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+@router.post("/collective_rpc")
+async def collective_rpc(raw_request: Request):
+    try:
+        body = await raw_request.json()
+    except json.JSONDecodeError as e:
+        raise HTTPException(
+            status_code=HTTPStatus.BAD_REQUEST.value,
+            detail=f"JSON decode error: {e}",
+        ) from e
+    method = body.get("method")
+    if method is None:
+        raise HTTPException(
+            status_code=HTTPStatus.BAD_REQUEST.value,
+            detail="Missing 'method' in request body",
+        )
+    # For security reason, only serialized string args/kwargs are passed.
+    # User-defined `method` is responsible for deserialization if needed.
+    args: list[str] = body.get("args", [])
+    kwargs: dict[str, str] = body.get("kwargs", {})
+    timeout: float | None = body.get("timeout")
+    results = await engine_client(raw_request).collective_rpc(
+        method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
+    )
+    if results is None:
+        return Response(status_code=200)
+    response: list[Any] = []
+    for result in results:
+        if result is None or isinstance(result, dict | list):
+            response.append(result)
+        else:
+            response.append(str(result))
+    return JSONResponse(content={"results": response})
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/sleep/api_router.py b/vllm/entrypoints/serve/sleep/api_router.py
index bc01e185315c8823a23a2ac9c0efbd11b500eb17..c0e4c3028b2ea41465f3d5748ab557c5b49c34b6 100644
--- a/vllm/entrypoints/serve/sleep/api_router.py
+++ b/vllm/entrypoints/serve/sleep/api_router.py
@@ -52,9 +52,5 @@ async def is_sleeping(raw_request: Request):
 def attach_router(app: FastAPI):
     if not envs.VLLM_SERVER_DEV_MODE:
         return
-    logger.warning(
-        "SECURITY WARNING: Development endpoints are enabled! "
-        "This should NOT be used in production!"
-    )
 
     app.include_router(router)
diff --git a/vllm/entrypoints/serve/tokenize/api_router.py b/vllm/entrypoints/serve/tokenize/api_router.py
index a10e78c8d28ee00b304a842982e5c01bf5a56e10..77f8b79e50c8a295b834d9bf0f7e1e917db3fec4 100644
--- a/vllm/entrypoints/serve/tokenize/api_router.py
+++ b/vllm/entrypoints/serve/tokenize/api_router.py
@@ -49,14 +49,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
 
     try:
         generator = await handler.create_tokenize(request, raw_request)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
-        ) from e
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index f4a633c69cb0bba4b241537a2c71169c36e40907..4162f3df1d73d57e4d57f7a2e7ab7cc63796e294 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -7,8 +7,9 @@ import functools
 import os
 from argparse import Namespace
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
+import regex as re
 from fastapi import Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.background import BackgroundTask, BackgroundTasks
@@ -21,18 +22,25 @@ from vllm.entrypoints.chat_utils import (
     resolve_hf_chat_template,
     resolve_mistral_chat_template,
 )
-from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
-    CompletionRequest,
-    StreamOptions,
-)
-from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.tokenizers.mistral import MistralTokenizer
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.protocol import (
+        ChatCompletionRequest,
+        CompletionRequest,
+        StreamOptions,
+    )
+    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+else:
+    ChatCompletionRequest = object
+    CompletionRequest = object
+    StreamOptions = object
+    LoRAModulePath = object
+
+
 logger = init_logger(__name__)
 
 VLLM_SUBCMD_PARSER_EPILOG = (
@@ -205,7 +213,7 @@ def _validate_truncation_size(
 
 def get_max_tokens(
     max_model_len: int,
-    request: ChatCompletionRequest | CompletionRequest,
+    request: "ChatCompletionRequest | CompletionRequest",
     input_length: int,
     default_sampling_params: dict,
 ) -> int:
@@ -226,6 +234,8 @@ def get_max_tokens(
 
 
 def log_non_default_args(args: Namespace | EngineArgs):
+    from vllm.entrypoints.openai.cli_args import make_arg_parser
+
     non_default_args = {}
 
     # Handle Namespace
@@ -254,7 +264,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
 
 
 def should_include_usage(
-    stream_options: StreamOptions | None, enable_force_include_usage: bool
+    stream_options: "StreamOptions | None", enable_force_include_usage: bool
 ) -> tuple[bool, bool]:
     if stream_options:
         include_usage = stream_options.include_usage or enable_force_include_usage
@@ -269,6 +279,8 @@ def should_include_usage(
 def process_lora_modules(
     args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
 ) -> list[LoRAModulePath]:
+    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+
     lora_modules = args_lora_modules
     if default_mm_loras:
         default_mm_lora_paths = [
@@ -317,3 +329,8 @@ async def process_chat_template(
                     model_config.model,
                 )
     return resolved_chat_template
+
+
+def sanitize_message(message: str) -> str:
+    # Avoid leaking memory address from object reprs
+    return re.sub(r" at 0x[0-9a-f]+>", ">", message)
diff --git a/vllm/env_override.py b/vllm/env_override.py
index 667cb6321404cda83dd8fdbe46eb04c54e7edf8e..4321ffc8319d656108f1ce1064b49d48e0af1429 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -364,6 +364,30 @@ def _update_scheduler_patched(self) -> None:
         self.scheduler = Scheduler(self.operations)
 
 
+# ===================================================
+# torch 2.9 Inductor get_raw_stream workaround
+# ===================================================
+# Workaround for TorchInductor autotune using get_raw_stream() without defining it.
+# This occurs when compile_sizes > 1 in compilation_config.
+# For more context, see https://github.com/vllm-project/vllm/issues/30905.
+def _patch_get_raw_stream_if_needed():
+    """Workaround for TorchInductor autotune get_raw_stream() bug."""
+    from vllm.utils.torch_utils import is_torch_equal
+
+    # Only apply the patch for torch 2.9.0 or 2.9.1
+    if is_torch_equal("2.9.0") or is_torch_equal("2.9.1"):
+        import builtins
+
+        # Check if CUDA functionality is available without initializing CUDA
+        # _cuda_getCurrentRawStream only exists in CUDA builds of PyTorch
+        if hasattr(torch._C, "_cuda_getCurrentRawStream"):
+            from torch._C import _cuda_getCurrentRawStream as _get_raw_stream
+
+            builtins.get_raw_stream = _get_raw_stream
+
+
+_patch_get_raw_stream_if_needed()
+
 if is_torch_equal("2.9.0"):
     from torch._inductor.codegen.wrapper import PythonWrapperCodegen
     from torch._inductor.graph import GraphLowering
diff --git a/vllm/envs.py b/vllm/envs.py
index 2ca6f839619cb68b4f4f3611300d1ad076e866e9..030b5083f1694527e306520014c32a46ee03c1b8 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -24,6 +24,7 @@ if TYPE_CHECKING:
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: str | None = None
     VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    VLLM_ENGINE_READY_TIMEOUT_S: int = 600
     VLLM_API_KEY: str | None = None
     VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False
     S3_ACCESS_KEY_ID: str | None = None
@@ -74,7 +75,7 @@ if TYPE_CHECKING:
     VLLM_MEDIA_CONNECTOR: str = "http"
     VLLM_TARGET_DEVICE: str = "cuda"
     VLLM_MAIN_CUDA_VERSION: str = "12.9"
-    VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee"
+    VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
     MAX_JOBS: str | None = None
     NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
@@ -167,6 +168,7 @@ if TYPE_CHECKING:
         "relax",
     ] = "relax"
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
+    VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER: bool = False
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
@@ -202,12 +204,16 @@ if TYPE_CHECKING:
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: int | None = None
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480
+    VLLM_MORIIO_CONNECTOR_READ_MODE: bool = False
+    VLLM_MORIIO_QP_PER_TRANSFER: int = 1
+    VLLM_MORIIO_POST_BATCH_SIZE: int = -1
+    VLLM_MORIIO_NUM_WORKERS: int = 1
     VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: int = 480
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
-    VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True
     VLLM_ENABLE_RESPONSES_API_STORE: bool = False
     VLLM_USE_TRTLLM_ATTENTION: str | None = None
     VLLM_NVFP4_GEMM_BACKEND: str | None = None
@@ -244,6 +250,8 @@ if TYPE_CHECKING:
     VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
     VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
     VLLM_USE_V2_MODEL_RUNNER: bool = False
+    VLLM_LOG_MODEL_INSPECTION: bool = False
+    VLLM_DEBUG_MFU_METRICS: bool = False
     # add envs
     VLLM_OPTEST_URLS_PORT: int | None = None
     VLLM_OPTEST_MODELS_PATH: str = ""
@@ -279,7 +287,6 @@ if TYPE_CHECKING:
     VLLM_USE_OPT_RESHAPE_AND_CACHE: bool = False
     VLLM_USE_TOPK_RENORM: bool = False
     VLLM_USE_FUSED_RMS_ROPE: bool = False
-    VLLM_USE_MARLIN_W16A16_MOE:bool = False
     VLLM_W8A8_BACKEND: int = 3
 
 
@@ -317,11 +324,16 @@ def use_aot_compile() -> bool:
     from vllm.model_executor.layers.batch_invariant import (
         vllm_is_batch_invariant,
     )
+    from vllm.platforms import current_platform
     from vllm.utils.torch_utils import is_torch_equal_or_newer
 
     default_value = (
         "1"
-        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
+        if is_torch_equal_or_newer("2.10.0.dev")
+        and not disable_compile_cache()
+        # Disabling AOT_COMPILE for CPU
+        # See: https://github.com/vllm-project/vllm/issues/32033
+        and not current_platform.is_cpu()
         else "0"
     )
 
@@ -494,13 +506,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
     or "12.9",
     # Controls PyTorch float32 matmul precision mode within vLLM workers.
-    # Accepted values:
-    #   - "ieee" (default): force full IEEE FP32 matmul precision.
-    #   - "tf32": enable TensorFloat32-based fast matmul.
+    # Valid options mirror torch.set_float32_matmul_precision
     "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
         "VLLM_FLOAT32_MATMUL_PRECISION",
-        "ieee",
-        ["ieee", "tf32"],
+        "highest",
+        ["highest", "high", "medium"],
         case_sensitive=False,
     ),
     # Maximum number of compilation jobs to run in parallel.
@@ -644,6 +654,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
         os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "120")
     ),
+    # Timeout in seconds for waiting for engine cores to become ready
+    # during startup. Default is 600 seconds (10 minutes).
+    "VLLM_ENGINE_READY_TIMEOUT_S": lambda: int(
+        os.environ.get("VLLM_ENGINE_READY_TIMEOUT_S", "600")
+    ),
     # API key for vLLM API server
     "VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None),
     # Whether to log responses from API Server for debugging
@@ -714,7 +729,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
         None,
         lambda: list(
             __import__(
-                "vllm.attention.backends.registry", fromlist=["AttentionBackendEnum"]
+                "vllm.v1.attention.backends.registry", fromlist=["AttentionBackendEnum"]
             ).AttentionBackendEnum.__members__.keys()
         ),
     ),
@@ -1242,6 +1257,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_USE_FUSED_MOE_GROUPED_TOPK": lambda: bool(
         int(os.getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))
     ),
+    # Allow use of FlashInfer FP8 block-scale GEMM for linear layers.
+    # This uses TensorRT-LLM kernels and requires SM90+ (Hopper).
+    "VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER": lambda: bool(
+        int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "0"))
+    ),
     # Allow use of FlashInfer MoE kernels for fused moe ops.
     "VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
         int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))
@@ -1303,7 +1323,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_MOONCAKE_BOOTSTRAP_PORT": lambda: int(
         os.getenv("VLLM_MOONCAKE_BOOTSTRAP_PORT", "8998")
     ),
-    # all2all backend for vllm's expert parallel communication
+    # [DEPRECATED - will be removed in v0.15.0] all2all backend for vllm's
+    # expert parallel communication. Use --all2all-backend CLI argument instead.
     # Available options:
     # - "naive": naive all2all implementation using broadcasts
     # - "allgather_reducescatter": all2all implementation based on allgather and
@@ -1314,7 +1335,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl
     "VLLM_ALL2ALL_BACKEND": env_with_choices(
         "VLLM_ALL2ALL_BACKEND",
-        "allgather_reducescatter",
+        None,
         [
             "naive",
             "pplx",
@@ -1412,6 +1433,20 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
         os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")
     ),
+    # Controls the read mode for the Mori-IO connector
+    "VLLM_MORIIO_CONNECTOR_READ_MODE": lambda: (
+        os.getenv("VLLM_MORIIO_CONNECTOR_READ_MODE", "False").lower() in ("true", "1")
+    ),
+    # Controls the QP (Queue Pair) per transfer configuration for the Mori-IO connector
+    "VLLM_MORIIO_QP_PER_TRANSFER": lambda: int(
+        os.getenv("VLLM_MORIIO_QP_PER_TRANSFER", "1")
+    ),
+    # Controls the post-processing batch size for the Mori-IO connector
+    "VLLM_MORIIO_POST_BATCH_SIZE": lambda: int(
+        os.getenv("VLLM_MORIIO_POST_BATCH_SIZE", "-1")
+    ),
+    # Controls the number of workers for Mori operations for the Mori-IO connector
+    "VLLM_MORIIO_NUM_WORKERS": lambda: int(os.getenv("VLLM_MORIIO_NUM_WORKERS", "1")),
     # Timeout (in seconds) for MooncakeConnector in PD disaggregated setup.
     "VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT": lambda: int(
         os.getenv("VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT", "480")
@@ -1471,7 +1506,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # kv-cache memory usage and enable longer contexts)
     # TODO(lucas): Remove this flag once latency regression is resolved.
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(
-        int(os.getenv("VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))
+        int(os.getenv("VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "1"))
     ),
     # Enables support for the "store" option in the OpenAI Responses API.
     # When set to 1, vLLM's OpenAI server will retain the input and output
@@ -1606,6 +1641,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_USE_V2_MODEL_RUNNER": lambda: bool(
         int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0"))
     ),
+    # Log model inspection after loading.
+    # If enabled, logs a transformers-style hierarchical view of the model
+    # with quantization methods and attention backends.
+    "VLLM_LOG_MODEL_INSPECTION": lambda: bool(
+        int(os.getenv("VLLM_LOG_MODEL_INSPECTION", "0"))
+    ),
+    # Debug logging for --enable-mfu-metrics
+    "VLLM_DEBUG_MFU_METRICS": lambda: bool(
+        int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0"))
+    ),
     
     # add envs
     
@@ -1658,7 +1703,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     
     # flag to control vllm to use optimized kernels
     "VLLM_CUSTOM_CACHE":
-    lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "1"))),
+    lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "0"))),
     
     # flag to control vllm to use optimized kernels
     "VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX":
@@ -1763,10 +1808,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_USE_FUSED_RMS_ROPE":
         lambda: (os.environ.get("VLLM_USE_FUSED_RMS_ROPE", "True").lower() in
                  ("true", "1")),
-    # vLLM will use Marlin W16A16 kernel for MoE experts
-    "VLLM_USE_MARLIN_W16A16_MOE":
-        lambda: (os.environ.get("VLLM_USE_MARLIN_W16A16_MOE", "False").lower() in
-                 ("true", "1")),
     # W8A8 GEMM backend selection for vLLM quantized models.
     # lightop/triton: 1
     # cutlass: 2 (will remove in the future)
@@ -1862,6 +1903,7 @@ def compile_factors() -> dict[str, object]:
         "VLLM_CI_USE_S3",
         "VLLM_MODEL_REDIRECT_PATH",
         "VLLM_HOST_IP",
+        "VLLM_FORCE_AOT_LOAD",
         "S3_ACCESS_KEY_ID",
         "S3_SECRET_ACCESS_KEY",
         "S3_ENDPOINT_URL",
diff --git a/vllm/exceptions.py b/vllm/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..411c5138210202836380750a87c5847dfdef6d5e
--- /dev/null
+++ b/vllm/exceptions.py
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Custom exceptions for vLLM."""
+
+from typing import Any
+
+
+class VLLMValidationError(ValueError):
+    """vLLM-specific validation error for request validation failures.
+
+    Args:
+        message: The error message describing the validation failure.
+        parameter: Optional parameter name that failed validation.
+        value: Optional value that was rejected during validation.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        parameter: str | None = None,
+        value: Any = None,
+    ) -> None:
+        super().__init__(message)
+        self.parameter = parameter
+        self.value = value
+
+    def __str__(self):
+        base = super().__str__()
+        extras = []
+        if self.parameter is not None:
+            extras.append(f"parameter={self.parameter}")
+        if self.value is not None:
+            extras.append(f"value={self.value}")
+        return f"{base} ({', '.join(extras)})" if extras else base
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 9f5f20832cf6be29e2c5220e339fab1efab0d1eb..f8e5136890faa7d3a241835b1bf7ff19263cfe84 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -5,16 +5,17 @@ import os
 import time
 from collections import defaultdict
 from contextlib import contextmanager
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any, NamedTuple
 
 import torch
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 
+from vllm.platforms import current_platform
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
 from vllm.v1.worker.ubatch_utils import UBatchSlices
 
@@ -104,6 +105,7 @@ class DPMetadata:
     ) -> "DPMetadata":
         assert num_tokens_across_dp_cpu is not None
         assert parallel_config.data_parallel_size > 1
+        assert parallel_config.is_moe_model is not False
         dp_rank = parallel_config.data_parallel_rank
         batchsize = num_tokens
 
@@ -188,6 +190,7 @@ class DPMetadata:
 class ForwardContext:
     # copy from vllm_config.compilation_config.static_forward_context
     no_compile_layers: dict[str, Any]
+    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     """
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each 
     attention layer to its attention metadata
@@ -195,7 +198,6 @@ class ForwardContext:
     for each microbatch.
     Set dynamically for each forward pass
     """
-    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
@@ -207,6 +209,8 @@ class ForwardContext:
 
     ubatch_slices: UBatchSlices | None = None
 
+    additional_kwargs: dict[str, Any] = field(default_factory=dict)
+
     def __post_init__(self):
         assert self.cudagraph_runtime_mode.valid_runtime_modes(), (
             f"Invalid cudagraph runtime mode: {self.cudagraph_runtime_mode}"
@@ -237,6 +241,7 @@ def create_forward_context(
     cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
     batch_descriptor: BatchDescriptor | None = None,
     ubatch_slices: UBatchSlices | None = None,
+    additional_kwargs: dict[str, Any] | None = None,
 ):
     return ForwardContext(
         no_compile_layers=vllm_config.compilation_config.static_forward_context,
@@ -246,6 +251,7 @@ def create_forward_context(
         cudagraph_runtime_mode=cudagraph_runtime_mode,
         batch_descriptor=batch_descriptor,
         ubatch_slices=ubatch_slices,
+        additional_kwargs=additional_kwargs or {},
     )
 
 
@@ -311,6 +317,18 @@ def set_forward_context(
     if cudagraph_runtime_mode != CUDAGraphMode.NONE and num_tokens is not None:
         batch_descriptor = batch_descriptor or BatchDescriptor(num_tokens=num_tokens)
 
+    additional_kwargs = current_platform.set_additional_forward_context(
+        attn_metadata=attn_metadata,
+        vllm_config=vllm_config,
+        virtual_engine=virtual_engine,
+        dp_metadata=dp_metadata,
+        num_tokens=num_tokens,
+        num_tokens_across_dp=num_tokens_across_dp,
+        cudagraph_runtime_mode=cudagraph_runtime_mode,
+        batch_descriptor=batch_descriptor,
+        ubatch_slices=ubatch_slices,
+    )
+
     forward_context = create_forward_context(
         attn_metadata,
         vllm_config,
@@ -319,6 +337,7 @@ def set_forward_context(
         cudagraph_runtime_mode,
         batch_descriptor,
         ubatch_slices,
+        additional_kwargs,
     )
 
     try:
@@ -331,8 +350,6 @@ def set_forward_context(
             # we use synchronous scheduling right now,
             # adding a sync point here should not affect
             # scheduling of the next batch
-            from vllm.platforms import current_platform
-
             synchronize = current_platform.synchronize
             if synchronize is not None:
                 synchronize()
diff --git a/vllm/grpc/__init__.py b/vllm/grpc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59ee96fb986a4e43c4660acada3fba951777c9b
--- /dev/null
+++ b/vllm/grpc/__init__.py
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+vLLM gRPC protocol definitions.
+
+This module contains the protocol buffer definitions for vLLM's gRPC API.
+The protobuf files are compiled into Python code using grpcio-tools.
+"""
+
+# These imports will be available after protobuf compilation
+# from vllm.grpc import vllm_engine_pb2
+# from vllm.grpc import vllm_engine_pb2_grpc
+
+__all__ = [
+    "vllm_engine_pb2",
+    "vllm_engine_pb2_grpc",
+]
diff --git a/vllm/grpc/compile_protos.py b/vllm/grpc/compile_protos.py
new file mode 100755
index 0000000000000000000000000000000000000000..92ad46e160a59445a50bc6a9790d34452b6b2c33
--- /dev/null
+++ b/vllm/grpc/compile_protos.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Compile vLLM protobuf definitions into Python code.
+
+This script uses grpcio-tools to generate *_pb2.py, *_pb2_grpc.py, and
+*_pb2.pyi (type stubs) files from the vllm_engine.proto definition.
+
+NOTE: Proto compilation happens automatically during package build (via setup.py).
+This script is provided for developers who want to regenerate protos manually,
+e.g., after modifying vllm_engine.proto.
+
+Usage:
+    python vllm/grpc/compile_protos.py
+
+Requirements:
+    pip install grpcio-tools
+"""
+
+import sys
+from pathlib import Path
+
+
+def compile_protos():
+    """Compile protobuf definitions."""
+    # Get the vllm package root directory
+    script_dir = Path(__file__).parent
+    vllm_package_root = script_dir.parent.parent  # vllm/vllm/grpc -> vllm/
+
+    proto_file = script_dir / "vllm_engine.proto"
+
+    if not proto_file.exists():
+        print(f"Error: Proto file not found at {proto_file}")
+        return 1
+
+    print(f"Compiling protobuf: {proto_file}")
+    print(f"Output directory: {script_dir}")
+
+    # Compile the proto file
+    # We use vllm/vllm as the proto_path so that the package is vllm.grpc.engine
+    try:
+        from grpc_tools import protoc
+
+        result = protoc.main(
+            [
+                "grpc_tools.protoc",
+                f"--proto_path={vllm_package_root}",
+                f"--python_out={vllm_package_root}",
+                f"--grpc_python_out={vllm_package_root}",
+                f"--pyi_out={vllm_package_root}",  # Generate type stubs
+                str(script_dir / "vllm_engine.proto"),
+            ]
+        )
+
+        if result == 0:
+            # Add SPDX headers to generated files
+            spdx_header = (
+                "# SPDX-License-Identifier: Apache-2.0\n"
+                "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
+            )
+
+            for generated_file in [
+                script_dir / "vllm_engine_pb2.py",
+                script_dir / "vllm_engine_pb2_grpc.py",
+                script_dir / "vllm_engine_pb2.pyi",
+            ]:
+                if generated_file.exists():
+                    content = generated_file.read_text()
+                    if not content.startswith("# SPDX-License-Identifier"):
+                        # Add mypy ignore-errors comment for all generated files
+                        header = spdx_header + "# mypy: ignore-errors\n"
+                        generated_file.write_text(header + content)
+
+            print("✓ Protobuf compilation successful!")
+            print(f"  Generated: {script_dir / 'vllm_engine_pb2.py'}")
+            print(f"  Generated: {script_dir / 'vllm_engine_pb2_grpc.py'}")
+            print(f"  Generated: {script_dir / 'vllm_engine_pb2.pyi'} (type stubs)")
+            return 0
+        else:
+            print(f"Error: protoc returned {result}")
+            return result
+
+    except ImportError:
+        print("Error: grpcio-tools not installed")
+        print("Install with: pip install grpcio-tools")
+        return 1
+    except Exception as e:
+        print(f"Error during compilation: {e}")
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(compile_protos())
diff --git a/vllm/grpc/vllm_engine.proto b/vllm/grpc/vllm_engine.proto
new file mode 100644
index 0000000000000000000000000000000000000000..bbb1b9b00370fa96d1174d298ded8721760f2609
--- /dev/null
+++ b/vllm/grpc/vllm_engine.proto
@@ -0,0 +1,195 @@
+syntax = "proto3";
+
+package vllm.grpc.engine;
+
+// Service definition for vLLM engine communication
+// This protocol is designed for efficient binary communication between
+// the Rust router and vLLM Python engine (AsyncLLM).
+service VllmEngine {
+  // Submit a generation request (supports streaming)
+  rpc Generate(GenerateRequest) returns (stream GenerateResponse);
+
+  // Submit an embedding request
+  rpc Embed(EmbedRequest) returns (EmbedResponse);
+
+  // Health check
+  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);
+
+  // Abort a running request
+  rpc Abort(AbortRequest) returns (AbortResponse);
+
+  // Get model information
+  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);
+
+  // Get server information
+  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
+}
+
+// =====================
+// Common Types
+// =====================
+
+// Sampling parameters for text generation
+message SamplingParams {
+  optional float temperature = 1;
+  float top_p = 2;
+  uint32 top_k = 3;
+  float min_p = 4;
+  float frequency_penalty = 5;
+  float presence_penalty = 6;
+  float repetition_penalty = 7;
+
+  optional uint32 max_tokens = 8;
+  uint32 min_tokens = 9;
+
+  repeated string stop = 10;
+  repeated uint32 stop_token_ids = 11;
+
+  bool skip_special_tokens = 12;
+  bool spaces_between_special_tokens = 13;
+  bool ignore_eos = 14;
+
+  uint32 n = 15;  // Number of parallel samples
+
+  // Logprobs configuration
+  optional int32 logprobs = 22;  // Number of log probabilities per output token (-1 for all)
+  optional int32 prompt_logprobs = 23;  // Number of log probabilities per prompt token (-1 for all)
+
+  // Additional vLLM fields
+  optional int32 seed = 24;  // Random seed for reproducibility
+  bool include_stop_str_in_output = 25;  // Whether to include stop strings in output
+  map<int32, float> logit_bias = 26;  // Token ID to bias mapping (-100 to 100)
+  optional int32 truncate_prompt_tokens = 27;  // Prompt truncation (-1 for model max)
+
+  // Structured outputs (one of) - matches vLLM's StructuredOutputsParams
+  oneof constraint {
+    string json_schema = 16;  // JSON schema for structured output
+    string regex = 17;  // Regex pattern
+    string grammar = 18;  // Grammar/EBNF for structured output
+    string structural_tag = 19;  // Structural tag (e.g., Harmony models)
+    bool json_object = 20;  // Force JSON object output
+    ChoiceConstraint choice = 21;  // List of allowed choices
+  }
+}
+
+// Choice constraint for structured outputs
+message ChoiceConstraint {
+  repeated string choices = 1;
+}
+
+// Pre-tokenized input from Rust router
+message TokenizedInput {
+  string original_text = 1;  // For reference/debugging
+  repeated uint32 input_ids = 2;  // Actual token IDs to process
+}
+
+// =====================
+// Generate Request
+// =====================
+
+message GenerateRequest {
+  string request_id = 1;
+
+  // Prompt input
+  oneof input {
+    TokenizedInput tokenized = 2;
+    string text = 3;
+  }
+
+  // Generation parameters (includes logprobs config)
+  SamplingParams sampling_params = 4;
+
+  // Streaming
+  bool stream = 5;
+}
+
+// =====================
+// Generate Response
+// =====================
+
+message GenerateResponse {
+  oneof response {
+    GenerateStreamChunk chunk = 1;     // For streaming
+    GenerateComplete complete = 2;     // For final/non-streaming
+  }
+}
+
+message GenerateStreamChunk {
+  repeated uint32 token_ids = 1;       // Incremental tokens
+  uint32 prompt_tokens = 2;
+  uint32 completion_tokens = 3;
+  uint32 cached_tokens = 4;
+
+  // Logprobs support (TODO: implement in Phase 4)
+  // OutputLogProbs output_logprobs = 5;
+  // InputLogProbs input_logprobs = 6;  // Only in first chunk
+}
+
+message GenerateComplete {
+  repeated uint32 output_ids = 1;      // All output tokens
+  string finish_reason = 2;            // "stop", "length", "abort"
+  uint32 prompt_tokens = 3;
+  uint32 completion_tokens = 4;
+  uint32 cached_tokens = 5;
+
+  // Logprobs support (TODO: implement in Phase 4)
+  // OutputLogProbs output_logprobs = 6;
+  // InputLogProbs input_logprobs = 7;
+}
+
+// =====================
+// Embedding Request
+// =====================
+
+message EmbedRequest {
+  string request_id = 1;
+  TokenizedInput tokenized = 2;
+}
+
+message EmbedResponse {
+  repeated float embedding = 1;
+  uint32 prompt_tokens = 2;
+  uint32 embedding_dim = 3;
+}
+
+// =====================
+// Management Operations
+// =====================
+
+message HealthCheckRequest {}
+
+message HealthCheckResponse {
+  bool healthy = 1;
+  string message = 2;
+}
+
+message AbortRequest {
+  repeated string request_ids = 1;
+}
+
+message AbortResponse {
+}
+
+// =====================
+// Model and Server Info
+// =====================
+
+message GetModelInfoRequest {}
+
+message GetModelInfoResponse {
+  string model_path = 1;
+  bool is_generation = 2;
+  uint32 max_context_length = 3;
+  uint32 vocab_size = 4;
+  bool supports_vision = 5;
+}
+
+message GetServerInfoRequest {}
+
+message GetServerInfoResponse {
+  uint32 active_requests = 1;
+  bool is_paused = 2;
+  double last_receive_timestamp = 3;
+  double uptime_seconds = 4;
+  string server_type = 5;  // "vllm-grpc"
+}
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index 71289277eb987636040827db03d778cfa4128d45..5e7795a14072fff806f03c958105f6e4f9f4e59f 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -45,16 +45,17 @@ def parse_raw_prompts(
 
         # case 4: array of token arrays
         if is_list_of(prompt, list):
-            first = prompt[0]
-            if not isinstance(first, list):
-                raise ValueError("prompt expected to be a list of lists")
-
-            if len(first) == 0:
-                raise ValueError("Please provide at least one prompt")
-
-            # strict validation: every nested list must be list[int]
-            if not all(is_list_of(elem, int) for elem in prompt):
-                raise TypeError("Nested lists must contain only integers")
+            if len(prompt) == 1 and isinstance(prompt[0], list) and len(prompt[0]) == 0:
+                raise ValueError("please provide at least one prompt")
+            for elem in prompt:
+                if not isinstance(elem, list):
+                    raise TypeError(
+                        "prompt must be a list of lists, but found a non-list element."
+                    )
+                if not is_list_of(elem, int):
+                    raise TypeError(
+                        "Nested lists of tokens must contain only integers."
+                    )
 
             prompt = cast(list[list[int]], prompt)
             return [TokensPrompt(prompt_token_ids=elem) for elem in prompt]
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 0372b06d0017f00f2752c29bbf98bf7290d22630..6723809b51e05cac0a6ed865b01117712615e051 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -6,7 +6,7 @@ from typing import Any, cast
 
 from typing_extensions import assert_never
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, ObservabilityConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
@@ -47,6 +47,7 @@ class InputPreprocessor:
         self,
         model_config: ModelConfig,
         tokenizer: TokenizerLike | None,
+        observability_config: ObservabilityConfig | None = None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         mm_processor_cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
@@ -54,6 +55,7 @@ class InputPreprocessor:
 
         self.model_config = model_config
         self.tokenizer = tokenizer
+        self.observability_config = observability_config
         self.mm_registry = mm_registry
         self.mm_processor_cache = mm_processor_cache
 
@@ -232,6 +234,7 @@ class InputPreprocessor:
         if not hasattr(self, "_mm_processor"):
             self._mm_processor = self.mm_registry.create_processor(
                 self.model_config,
+                self.observability_config,
                 tokenizer=self.tokenizer,
                 cache=self.mm_processor_cache,
             )
@@ -686,11 +689,7 @@ class InputPreprocessor:
         mm_uuids: MultiModalUUIDDict | None = None,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
-        res = self._preprocess(
-            prompt,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
-        )
+        res = self._preprocess(prompt, tokenization_kwargs, mm_uuids=mm_uuids)
 
         if self.mm_processor_cache and self.mm_cache_stats is not None:
             delta = self.mm_processor_cache.make_stats(delta=True)
diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py
index 25364a5881364182a09add9c49564bdfe1c033e0..b002b16516f746735c214bc0feb4df98b5bbf95c 100644
--- a/vllm/lora/layers/__init__.py
+++ b/vllm/lora/layers/__init__.py
@@ -18,7 +18,7 @@ from vllm.lora.layers.row_parallel_linear import (
     RowParallelLinearWithLoRA,
     RowParallelLinearWithShardedLoRA,
 )
-from vllm.lora.layers.utils import LoRAMapping
+from vllm.lora.layers.utils import LoRAMapping, LoRAMappingType
 from vllm.lora.layers.vocal_parallel_embedding import VocabParallelEmbeddingWithLoRA
 
 __all__ = [
@@ -37,6 +37,7 @@ __all__ = [
     "RowParallelLinearWithShardedLoRA",
     "ReplicatedLinearWithLoRA",
     "LoRAMapping",
+    "LoRAMappingType",
     "FusedMoEWithLoRA",
     "FusedMoE3DWithLoRA",
 ]
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index 06ecc8d2f634c221c2b6fe566b35f74d5fbdd39c..fc79aec5d650fd39f7ccb1ebee3c978697d4a0a5 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -122,7 +122,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
     def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
 
-        # In Transformers modeling backend, x and output have extra batch dimension like
+        original_shape = output.shape if output.ndim == 3 else None
+
+        # In transformers backend, x and output have extra batch dimension like
         # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
         # therefore we need to flatten the batch dimensions.
         if x.ndim == 3 and output.ndim == 3:
@@ -135,6 +137,11 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
         if not current_platform.can_update_inplace():
             output = lora_output
 
+        # Reshape the flattened output back to its original shape,
+        # as some MM encoders cannot handle flattened inputs.
+        if original_shape is not None:
+            output = output.reshape(original_shape)
+
         return output
 
     @property
diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index 24cab79a72443927fa887ca52be48ab85e5494df..9e78b616490920e3526b67ef3b07c86e10e093c8 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -24,7 +24,6 @@ from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     TritonExperts,
-    try_get_optimal_moe_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
     FusedMoEModularMethod,
@@ -39,7 +38,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
     MoEPrepareAndFinalizeNoEP,
 )
 
-from .utils import _get_lora_device
+from .utils import _get_lora_device, try_get_optimal_moe_lora_config
 
 
 class FusedMoEWithLoRA(BaseLayerWithLoRA):
@@ -103,15 +102,21 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             )
         else:  # fall back to the default config
             get_config_func = functools.partial(
-                try_get_optimal_moe_config,
-                layer.w13_weight.size(),
-                layer.w2_weight.size(),
-                top_k,
-                config_dtype,
+                try_get_optimal_moe_lora_config,
+                w1_shape=layer.w13_weight.size(),
+                w2_shape=layer.w2_weight.size(),
+                rank=rank,
+                top_k=top_k,
+                dtype=config_dtype,
+                M=M,
                 block_shape=layer.quant_method.moe_quant_config.block_shape,
             )
-            shrink_config = get_config_func(M)
-            expand_config = get_config_func(M)
+            shrink_config = get_config_func(
+                op_type=f"fused_moe_lora_{op_prefix}_shrink"
+            )
+            expand_config = get_config_func(
+                op_type=f"fused_moe_lora_{op_prefix}_expand"
+            )
         shrink_config = self._normalize_keys(shrink_config)
         expand_config = self._normalize_keys(expand_config)
         return shrink_config, expand_config
@@ -130,7 +135,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 prepare_finalize, self.base_layer
             ),
             self.base_layer.shared_experts,
-            getattr(self.base_layer, "shared_experts_stream", None),
         )
         if quant_config.use_mxfp4_w4a16:
             assert isinstance(
@@ -429,9 +433,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
         current_lora_rank = w13_lora_a.shape[1]
         assert current_lora_rank % self.tp_size == 0
         # Based on S-LoRA, we slice W13/W1/W3 A along the rank dim.
-        sliced_rank = current_lora_rank // self.tp_size
-        start_idx = self.tp_rank * sliced_rank
-        end_idx = (self.tp_rank + 1) * sliced_rank
+        shard_size = self.w13_lora_a_stacked[0].shape[2]
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
         return w13_lora_a[:, start_idx:end_idx, :]
 
     def _slice_w13_b(self, w13_lora_b: torch.Tensor):
@@ -466,11 +470,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             return w2_lora_b
         # Based on S-LoRA, we slice W2 B along the hidden_size dim.
         # w2_lora_b shape (num_experts,output_size,rank)
-        current_lora_size = w2_lora_b.shape[1]
+        shard_size = self.w2_lora_b_stacked[0].shape[2]
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
 
-        sliced_size = current_lora_size // self.tp_size
-        start_idx = self.tp_rank * sliced_size
-        end_idx = (self.tp_rank + 1) * sliced_size
         return w2_lora_b[:, start_idx:end_idx, :]
 
     def reset_lora(self, index: int):
@@ -672,20 +675,9 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA):
         self.reset_lora(index)
         self.adapter_enabled[index] = 1
 
-        num_experts = self.w13_lora_a_stacked[0].shape[1]
         w13_lora_a, w2_lora_a = lora_a
         w13_lora_b, w2_lora_b = lora_b
 
-        # (num_experts,rank,input_size)
-        w13_lora_a = w13_lora_a.reshape(num_experts, -1, w13_lora_a.shape[-1])
-        w2_lora_a = w2_lora_a.reshape(num_experts, -1, w2_lora_a.shape[-1])
-        # (output_size,num_experts,rank)
-        w13_lora_b = w13_lora_b.reshape(w13_lora_b.shape[0], num_experts, -1)
-        w2_lora_b = w2_lora_b.reshape(w2_lora_b.shape[0], num_experts, -1)
-        # (num_experts,output_size,rank)
-        w13_lora_b = w13_lora_b.permute(1, 0, 2)
-        w2_lora_b = w2_lora_b.permute(1, 0, 2)
-
         sliced_w13_lora_a = self._slice_w13_a(w13_lora_a)
         sliced_w13_lora_b = self._slice_w13_b(w13_lora_b)
 
@@ -732,7 +724,7 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA):
         """
         Full size
         """
-        return self.w2_lora_a_stacked[0].shape[-2]
+        return self.base_layer.hidden_size
 
     @classmethod
     def can_replace_layer(
diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py
index 74403240f6cc2ae98ea732dd4f55c21790a5acb4..26d1a53f013aec9044d21f47988e244188c27fcf 100644
--- a/vllm/lora/layers/utils.py
+++ b/vllm/lora/layers/utils.py
@@ -2,16 +2,27 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
+from enum import Enum
 
 import torch
 import torch.nn as nn
 
+from vllm.model_executor.layers.fused_moe.fused_moe import try_get_optimal_moe_config
+from vllm.utils.math_utils import next_power_of_2
+
+
+class LoRAMappingType(Enum):
+    LANGUAGE = 1
+    TOWER = 2
+    CONNECTOR = 3
+
 
 @dataclass
 class LoRAMapping:
     index_mapping: tuple[int, ...]
     prompt_mapping: tuple[int, ...]
     is_prefill: bool = False
+    type: LoRAMappingType = LoRAMappingType.LANGUAGE
 
     def __post_init__(self):
         self.index_mapping = tuple(self.index_mapping)
@@ -72,3 +83,33 @@ def _fully_sharded_can_replace(can_replace):
         )
 
     return dec
+
+
+def try_get_optimal_moe_lora_config(
+    op_type: str,
+    w1_shape: tuple[int, ...],
+    w2_shape: tuple[int, ...],
+    rank: int,
+    top_k: int,
+    dtype: str | None,
+    M: int,
+    block_shape: list[int] | None = None,
+) -> dict[str, int | None]:
+    config = try_get_optimal_moe_config(
+        w1_shape, w2_shape, top_k, dtype, M, block_shape
+    ).copy()
+    if op_type in [
+        "fused_moe_lora_w13_shrink",
+        "fused_moe_lora_w2_shrink",
+    ]:
+        config["BLOCK_SIZE_N"] = min(
+            config.get("BLOCK_SIZE_N", 64), next_power_of_2(rank)
+        )
+    elif op_type in [
+        "fused_moe_lora_w13_expand",
+        "fused_moe_lora_w2_expand",
+    ]:
+        config["BLOCK_SIZE_K"] = max(
+            16, min(config.get("BLOCK_SIZE_K", 32), next_power_of_2(rank))
+        )
+    return config
diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py
index f5e36697ed18ca39b2a62117910b907519778d14..bc88c71eaf8d9951cad836b66809464ff5bc0745 100644
--- a/vllm/lora/lora_model.py
+++ b/vllm/lora/lora_model.py
@@ -12,7 +12,6 @@ from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import (
     get_lora_id,
     is_base_embeddding_weights,
-    is_regex_target_modules,
     parse_fine_tuned_lora_name,
 )
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -201,37 +200,13 @@ class LoRAModel:
                 for module in f.keys():  # noqa
                     tensors[module] = f.get_tensor(module)
         elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path):
-            # When a bin/pt file is provided, we rely on config to find
-            # unexpected modules.
-            unexpected_modules = []
-            target_modules = peft_helper.target_modules
-            if not isinstance(target_modules, list):
-                target_modules = [target_modules]
-            for module in target_modules:
-                # Compatible with more modules,
-                # such as:layers.11.self_attn.k_proj
-                part_name = module.split(".")[-1]
-                if part_name not in expected_lora_modules:
-                    unexpected_modules.append(module)
-            # loaded lora's target modules must be a subset of
-            # expected_lora_modules. It is not reliable. See
-            # https://github.com/vllm-project/vllm/pull/5909. But there's no
-            # other better mechanism.
-            if unexpected_modules and not is_regex_target_modules(
-                peft_helper.target_modules, expected_lora_modules
-            ):
-                raise ValueError(
-                    f"While loading {lora_dir}, expected"
-                    f" target modules in {expected_lora_modules}"
-                    f" but received {unexpected_modules}."
-                    f" Please verify that the loaded LoRA module is correct"
-                )
             lora_file_path = (
                 lora_bin_file_path
                 if os.path.isfile(lora_bin_file_path)
                 else lora_pt_file_path
             )
             tensors = torch.load(lora_file_path, map_location=device, weights_only=True)
+            check_unexpected_modules(tensors)
         else:
             raise ValueError(f"{lora_dir} doesn't contain tensors")
 
diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py
index 44e0448d92de01dfff57dc1ed0b1f03ddcee5cef..70da246f288fcad7c5a7eb7acfa44801e0dc3571 100644
--- a/vllm/lora/model_manager.py
+++ b/vllm/lora/model_manager.py
@@ -9,12 +9,18 @@ import regex as re
 import torch
 from torch import nn
 
-from vllm.config.lora import LoRAConfig
+from vllm.config import VllmConfig
+from vllm.config.lora import LoRAConfig, ModelConfig
 from vllm.logger import init_logger
-from vllm.lora.layers import BaseLayerWithLoRA, FusedMoE3DWithLoRA, LoRAMapping
+from vllm.lora.layers import (
+    BaseLayerWithLoRA,
+    FusedMoE3DWithLoRA,
+    LoRAMapping,
+    LoRAMappingType,
+)
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.lora.punica_wrapper import get_punica_wrapper
+from vllm.lora.punica_wrapper import PunicaWrapperBase, get_punica_wrapper
 from vllm.lora.utils import (
     from_layer,
     from_layer_logits_processor,
@@ -28,12 +34,15 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.utils.cache import LRUCache
 from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.worker.utils import MultiModalBudget
 
 logger = init_logger(__name__)
 
 T = TypeVar("T")
+DEFAULT_LANGUAGE_WRAPPER_KEY = "language_model"
 
 
 class AdapterLRUCache(LRUCache[int, T]):
@@ -58,6 +67,7 @@ class LoRAModelManager:
         vocab_size: int,
         lora_config: LoRAConfig,
         device: torch.device,
+        vllm_config: VllmConfig | None = None,
     ):
         """Create a LoRAModelManager and adapter for a given model.
 
@@ -71,6 +81,11 @@ class LoRAModelManager:
             lora_config: the LoRA configuration.
         """
         self.model: SupportsLoRA = model
+        self.supported_lora_modules = get_supported_lora_modules(self.model)
+        assert self.supported_lora_modules, (
+            f"No supported LoRA modules found in {self.model.__class__.__name__}."
+        )
+
         self._registered_adapters: dict[int, LoRAModel] = {}
         # Dict instead of a set for compatibility with LRUCache.
         self._active_adapters: dict[int, None] = {}
@@ -82,35 +97,121 @@ class LoRAModelManager:
         self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8
         self.lora_index_to_id: list[int | None] = [None] * self.lora_slots
         self.vocab_size = vocab_size
-        self.punica_wrapper = get_punica_wrapper(
-            max_num_batched_tokens,
-            max_batches=self.max_num_seqs,
-            device=self.device,
-            max_loras=self.lora_config.max_loras,
-        )
-
-        self.supported_lora_modules = get_supported_lora_modules(self.model)
-        assert self.supported_lora_modules, "No supported LoRA modules found in"
-        f" {self.model.__class__.__name__}."
-
         self.packed_modules_mapping = process_packed_modules_mapping(self.model)
-        # Used to indicate whether the model is a multimodal model
-        self.supports_mm: bool = (
-            supports_multimodal(self.model)
-            # In case the model only supports LoRA for
-            # text modules (e.g. ChatGLM)
-            and hasattr(self.model, "get_mm_mapping")
-        )
+
         self.is_pooling_model = is_pooling_model(self.model)
         self.packed_modules: dict[str, list[str]] = {}
         self.modules: dict[str, BaseLayerWithLoRA] = {}
         # Dict instead of a set for compatibility with LRUCache.
         self._last_mapping: LoRAMapping | None = None
         self._is_3d_moe_model = is_moe_model(self.model) and self.model.is_3d_moe_weight
+        self._init_punica_wrapper(max_num_batched_tokens, vllm_config)
         self._create_lora_modules()
 
         self.model.lora_manager = self
 
+    def _init_punica_wrapper(
+        self, max_num_batched_tokens: int, vllm_config: VllmConfig
+    ) -> None:
+        # Used to indicate whether the model is a multimodal model
+        self.supports_mm: bool = (
+            supports_multimodal(self.model)
+            # In case the model only supports LoRA for
+            # text modules (e.g. ChatGLM)
+            and hasattr(self.model, "get_mm_mapping")
+        )
+        self.punica_wrapper_mapping: dict[str, PunicaWrapperBase] = {}
+        if self.supports_mm:
+            self._maybe_init_mm(vllm_config, max_num_batched_tokens)
+        else:
+            llm_punica_wrapper = get_punica_wrapper(
+                max_num_batched_tokens,
+                max_batches=self.max_num_seqs,
+                device=self.device,
+                lora_config=self.lora_config,
+            )
+
+            self.punica_wrapper_mapping[DEFAULT_LANGUAGE_WRAPPER_KEY] = (
+                llm_punica_wrapper
+            )
+
+    def _maybe_init_mm(self, vllm_config: VllmConfig, max_num_batched_tokens) -> None:
+        self.supports_tower_connector_lora = False
+        model_config: ModelConfig = vllm_config.model_config
+        self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping()
+
+        # Only one language model can be included in the model.
+        assert len(self.mm_mapping.language_model) == 1
+
+        # Language model punica wrapper
+        llm_punica_wrapper = get_punica_wrapper(
+            max_num_batched_tokens,
+            max_batches=self.max_num_seqs,
+            device=self.device,
+            lora_config=self.lora_config,
+        )
+        lm_prefix = self.mm_mapping.language_model[0]
+        self.punica_wrapper_mapping[lm_prefix] = llm_punica_wrapper
+
+        if self.lora_config.enable_tower_connector_lora:
+            self.mm_processor_info = MULTIMODAL_REGISTRY.create_processor(
+                model_config
+            ).info
+            self.supports_tower_connector_lora = self.supports_mm and hasattr(
+                self.model, "get_num_mm_encoder_tokens"
+            )
+        if not self.supports_tower_connector_lora:
+            return
+
+        logger.warning(
+            "LoRA for the tower and connector of multimodal models is "
+            "experimental and may contain bugs. Please report any related issues on "
+            "GitHub if you encounter them."
+        )
+
+        mm_budget = MultiModalBudget(
+            model_config,
+            vllm_config.scheduler_config,
+            MULTIMODAL_REGISTRY,
+        )
+        limit_per_prompt: int = max(
+            self.mm_processor_info.get_allowed_mm_limits().values()
+        )
+        num_encoder_tokens = self.model.get_num_mm_encoder_tokens(
+            mm_budget.get_encoder_budget()
+        )
+
+        # Tower wrappers
+        tower_punica_wrapper = get_punica_wrapper(
+            num_encoder_tokens,
+            max_batches=self.max_num_seqs * limit_per_prompt,
+            device=self.device,
+            lora_config=self.lora_config,
+        )
+        for prefix in self.mm_mapping.tower_model:
+            self.punica_wrapper_mapping[prefix] = tower_punica_wrapper
+
+        # Use wrapper for connector if present.
+        if self.mm_mapping.connector:
+            if hasattr(self.model, "get_num_mm_connector_tokens"):
+                connector_tokens = self.model.get_num_mm_connector_tokens(
+                    num_encoder_tokens
+                )
+                connector_punica_wrapper = get_punica_wrapper(
+                    connector_tokens,
+                    max_batches=self.max_num_seqs * limit_per_prompt,
+                    device=self.device,
+                    lora_config=self.lora_config,
+                )
+                for prefix in self.mm_mapping.connector:
+                    self.punica_wrapper_mapping[prefix] = connector_punica_wrapper
+            else:
+                logger.warning_once(
+                    "Connector LoRA support disabled: model does not implement "
+                    "get_num_mm_connector_tokens(). This method is required to "
+                    "determine the connector's token budget for LoRA operations."
+                )
+
     def __len__(self) -> int:
         return len(self._registered_adapters)
 
@@ -155,61 +256,7 @@ class LoRAModelManager:
             if not module_lora:
                 module.reset_lora(index)
                 continue
-            # Note (gnovack) - If MOE lora weights are not split into
-            # num_experts chunks, we split them here
-            if isinstance(module, FusedMoE3DWithLoRA) and torch.is_tensor(
-                module_lora.lora_a
-            ):
-                # Handle PEFT file format where experts.base_layer is the
-                # gate_up_proj and experts is the down_proj
-                gate_up_proj_lora = self._get_lora_layer_weights(
-                    lora_model, module_name + ".base_layer"
-                )
-                down_proj_lora = module_lora
-                # FIXME Edge case where LoRA is not added to gate_up_proj
-                # or down_proj
-                assert gate_up_proj_lora is not None
-                assert down_proj_lora is not None
-                if self._is_3d_moe_model:
-                    module_lora.lora_a = [
-                        gate_up_proj_lora.lora_a,
-                        down_proj_lora.lora_a,
-                    ]
-                    module_lora.lora_b = [
-                        gate_up_proj_lora.lora_b,
-                        down_proj_lora.lora_b,
-                    ]
-                else:
-                    # Some 3D MoE models haven't added the `is_3d_moe_weight`
-                    # attribute yet, so fallback here
-                    num_experts = module_lora.lora_a.shape[0] // module_lora.rank
-
-                    gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
-                    up_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
-
-                    gate_proj_b = gate_up_proj_lora.lora_b[::2, ...].chunk(
-                        num_experts, dim=-1
-                    )
-                    up_proj_b = gate_up_proj_lora.lora_b[1::2, ...].chunk(
-                        num_experts, dim=-1
-                    )
-
-                    down_proj_a = down_proj_lora.lora_a.chunk(num_experts, dim=0)
-                    down_proj_b = down_proj_lora.lora_b.chunk(num_experts, dim=-1)
-
-                    lora_a = []
-                    lora_b = []
-                    for i in range(num_experts):
-                        lora_a.append(gate_proj_a[i])
-                        lora_a.append(down_proj_a[i])
-                        lora_a.append(up_proj_a[i])
 
-                        lora_b.append(gate_proj_b[i])
-                        lora_b.append(down_proj_b[i])
-                        lora_b.append(up_proj_b[i])
-
-                    module_lora.lora_a = lora_a
-                    module_lora.lora_b = lora_b
             module.set_lora(
                 index,
                 module_lora.lora_a,
@@ -237,8 +284,24 @@ class LoRAModelManager:
         )  # type: ignore
 
     def _set_adapter_mapping(self, mapping: LoRAMapping) -> None:
-        # update lora states
-        self.punica_wrapper.update_metadata(
+        # Default to the main language model wrapper
+        if not (self.supports_mm and self.supports_tower_connector_lora):
+            target_prefix = (
+                self.mm_mapping.language_model[0]
+                if self.supports_mm
+                else DEFAULT_LANGUAGE_WRAPPER_KEY
+            )
+        elif mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model:
+            target_prefix = self.mm_mapping.tower_model[0]
+        elif mapping.type == LoRAMappingType.CONNECTOR and self.mm_mapping.connector:
+            target_prefix = self.mm_mapping.connector[0]
+        else:
+            target_prefix = self.mm_mapping.language_model[0]
+
+        punica_wrapper = self._get_punica_wrapper(target_prefix)
+        assert punica_wrapper is not None
+
+        punica_wrapper.update_metadata(
             mapping,
             self.lora_index_to_id,
             self.lora_slots + 1,
@@ -265,15 +328,17 @@ class LoRAModelManager:
 
             if not self._match_target_modules(module_name):
                 continue
-            # A temporary approach for multimodal models to support LoRA
-            # TODO: Remove this restriction
-            if self._filter_unsupported_mm_module(module_name):
+
+            punica_wrapper = self._get_punica_wrapper(module_name)
+            if punica_wrapper is None:
                 logger.warning(
-                    "Regarding multimodal models, vLLM currently only supports "
-                    "adding LoRA to language model, %s will be ignored.",
+                    "Regarding %s, vLLM currently only supports adding LoRA to"
+                    " language model, %s will be ignored.",
+                    self.model.__class__.__name__,
                     module_name,
                 )
                 continue
+
             parts = module_name.split(".")[-1]
             packed_moduled_lst = self.packed_modules_mapping.get(parts, [])
             if isinstance(module, FusedMoE):
@@ -328,10 +393,10 @@ class LoRAModelManager:
             if self.supports_mm and not isinstance(new_module, BaseLayerWithLoRA):
                 continue
             self.register_module(module_name, new_module)
+
             self._register_packed_modules(module_name)
             # All lora layers share the same punica_wrapper based on reference.
-            new_module.set_mapping(self.punica_wrapper)
-        pass
+            new_module.set_mapping(punica_wrapper)
 
     def register_module(self, module_name: str, module: "BaseLayerWithLoRA"):
         assert isinstance(module, BaseLayerWithLoRA), (
@@ -352,7 +417,7 @@ class LoRAModelManager:
             if (
                 not self._match_target_modules(module_name)
                 or not isinstance(module, BaseLayerWithLoRA)
-                or self._filter_unsupported_mm_module(module_name)
+                or self._get_punica_wrapper(module_name) is None
             ):
                 continue
             parts = module_name.split(".")
@@ -441,17 +506,22 @@ class LoRAModelManager:
             for target_module in self.supported_lora_modules
         )
 
-    def _filter_unsupported_mm_module(self, module_name: str) -> bool:
+    def _get_punica_wrapper(self, module_name: str) -> PunicaWrapperBase | None:
         """
-        Regarding multimodal models, vLLM currently only supports adding LoRA to
-        language model. LoRA for other modules, such as the vision tower, will
-        be filtered out.
+        Determine whether this module supports LoRA and which wrapper to use.
         """
-        if self.supports_mm:
-            module_mapping: MultiModelKeys = self.model.get_mm_mapping()
-            prefix_lst = module_mapping.connector + module_mapping.tower_model
-            return any([module_name.startswith(prefix) for prefix in prefix_lst])
-        return False
+        # For language model (early return)
+        if not self.supports_mm:
+            return self.punica_wrapper_mapping[DEFAULT_LANGUAGE_WRAPPER_KEY]
+
+        # For multimodal model
+        # NOTE Sort by prefix length (descending) to match the longest prefix first
+        # e.g., 'visual.merger' should match 'visual.merger' instead of 'visual.'
+        for prefix in sorted(self.punica_wrapper_mapping.keys(), key=len, reverse=True):
+            if module_name.startswith(prefix):
+                return self.punica_wrapper_mapping[prefix]
+
+        return None
 
     def _register_packed_modules(self, module_full_name: str) -> None:
         parts = module_full_name.split(".")
@@ -503,6 +573,10 @@ class LoRAModelManager:
         for lora in lora_model.loras.values():
             lora.optimize()
 
+        for module_name, module in self.modules.items():
+            if isinstance(module, FusedMoE3DWithLoRA):
+                self._stack_moe_lora_weights(lora_model, module, module_name)
+
         first_lora: LoRALayerWeights = next(iter(lora_model.loras.values()))
         assert first_lora.lora_a is not None
         if isinstance(first_lora.lora_a, list):
@@ -529,6 +603,91 @@ class LoRAModelManager:
                     lora.lora_a = lora.lora_a.pin_memory()
                     lora.lora_b = lora.lora_b.pin_memory()
 
+    def _stack_moe_lora_weights(
+        self, lora_model: LoRAModel, module: FusedMoE3DWithLoRA, module_name: str
+    ):
+        module_lora = self._get_lora_layer_weights(lora_model, module_name)
+
+        # Note (gnovack) - If MOE lora weights are not split into
+        # num_experts chunks, we split them here
+        if module_lora and torch.is_tensor(module_lora.lora_a):
+            # Handle PEFT file format where experts.base_layer is the
+            # gate_up_proj and experts is the down_proj
+            gate_up_proj_lora = self._get_lora_layer_weights(
+                lora_model, module_name + ".base_layer"
+            )
+            down_proj_lora = module_lora
+            # FIXME Edge case where LoRA is not added to gate_up_proj
+            # or down_proj
+            assert gate_up_proj_lora is not None
+            assert down_proj_lora is not None
+            if self._is_3d_moe_model:
+                num_experts = module.w13_lora_a_stacked[0].shape[1]
+
+                # (num_experts,rank,input_size)
+                gate_up_proj_lora.lora_a = gate_up_proj_lora.lora_a.reshape(
+                    num_experts, -1, gate_up_proj_lora.lora_a.shape[-1]
+                )
+                down_proj_lora.lora_a = down_proj_lora.lora_a.reshape(
+                    num_experts, -1, down_proj_lora.lora_a.shape[-1]
+                )
+
+                # (output_size,rank,num_experts)
+                gate_up_proj_lora.lora_b = gate_up_proj_lora.lora_b.reshape(
+                    gate_up_proj_lora.lora_b.shape[0], -1, num_experts
+                )
+                down_proj_lora.lora_b = down_proj_lora.lora_b.reshape(
+                    down_proj_lora.lora_b.shape[0], -1, num_experts
+                )
+
+                # (num_experts,output_size,rank)
+                gate_up_proj_lora.lora_b = gate_up_proj_lora.lora_b.permute(
+                    2, 0, 1
+                ).contiguous()
+                down_proj_lora.lora_b = down_proj_lora.lora_b.permute(
+                    2, 0, 1
+                ).contiguous()
+
+                module_lora.lora_a = [
+                    gate_up_proj_lora.lora_a,
+                    down_proj_lora.lora_a,
+                ]
+                module_lora.lora_b = [
+                    gate_up_proj_lora.lora_b,
+                    down_proj_lora.lora_b,
+                ]
+            else:
+                # Some 3D MoE models haven't added the `is_3d_moe_weight`
+                # attribute yet, so fallback here
+                num_experts = module_lora.lora_a.shape[0] // module_lora.rank
+
+                gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
+                up_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
+
+                gate_proj_b = gate_up_proj_lora.lora_b[::2, ...].chunk(
+                    num_experts, dim=-1
+                )
+                up_proj_b = gate_up_proj_lora.lora_b[1::2, ...].chunk(
+                    num_experts, dim=-1
+                )
+
+                down_proj_a = down_proj_lora.lora_a.chunk(num_experts, dim=0)
+                down_proj_b = down_proj_lora.lora_b.chunk(num_experts, dim=-1)
+
+                lora_a = []
+                lora_b = []
+                for i in range(num_experts):
+                    lora_a.append(gate_proj_a[i])
+                    lora_a.append(down_proj_a[i])
+                    lora_a.append(up_proj_a[i])
+
+                    lora_b.append(gate_proj_b[i])
+                    lora_b.append(down_proj_b[i])
+                    lora_b.append(up_proj_b[i])
+
+                module_lora.lora_a = lora_a
+                module_lora.lora_b = lora_b
+
     def _get_lora_layer_weights(
         self, lora_model: LoRAModel, module_name: str
     ) -> LoRALayerWeights | None:
@@ -596,9 +755,16 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
         vocab_size: int,
         lora_config: LoRAConfig,
         device: torch.device,
+        vllm_config: VllmConfig | None = None,
     ):
         super().__init__(
-            model, max_num_seqs, max_num_batched_tokens, vocab_size, lora_config, device
+            model,
+            max_num_seqs,
+            max_num_batched_tokens,
+            vocab_size,
+            lora_config,
+            device,
+            vllm_config,
         )
         self._registered_adapters: LoRALRUCache = LoRALRUCache(
             self.capacity, self.deactivate_adapter
@@ -671,6 +837,7 @@ def create_lora_manager(
     max_num_batched_tokens: int,
     vocab_size: int,
     lora_config: LoRAConfig,
+    vllm_config: VllmConfig,
     device: torch.device,
     lora_manager_cls: type[LoRAModelManager] = LoRAModelManager,
     **kwargs,
@@ -684,6 +851,7 @@ def create_lora_manager(
         max_num_batched_tokens=max_num_batched_tokens,
         vocab_size=vocab_size,
         lora_config=lora_config,
+        vllm_config=vllm_config,
         device=device,
         **kwargs,
     )
diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
index 34383cdf1767c6823eb3c38d4014a825a8f09602..77103569109026aec0d69b8de2a0ca9d924dc615 100644
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -156,14 +156,22 @@ def _fused_moe_lora_kernel(
         + offs_bn[None, :] * stride_bn
     )
 
+    if USE_GDC and IS_PRIMARY:
+        # GDC launch dependents hints the runtime system to launch dependent kernels.
+        tl.extra.cuda.gdc_launch_dependents()
+
     # accumulator
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    if USE_GDC and not IS_PRIMARY:
+        tl.extra.cuda.gdc_wait()
+
     for k in range(0, grid_k):
         k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K)
-        # pre-fetch lora weight
-        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
         # GDC wait waits for ALL programs in the prior kernel to complete
         # before continuing.
+        # pre-fetch lora weight
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
         if USE_GDC and not IS_PRIMARY:
             tl.extra.cuda.gdc_wait()
         a = tl.load(
@@ -179,9 +187,6 @@ def _fused_moe_lora_kernel(
     if MUL_ROUTED_WEIGHT:
         moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
         accumulator = accumulator * moe_weight[:, None]
-    if USE_GDC and IS_PRIMARY:
-        # GDC launch dependents hints the runtime system to launch dependent kernels.
-        tl.extra.cuda.gdc_launch_dependents()
     accumulator = accumulator.to(c_ptr.dtype.element_ty)
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
@@ -226,9 +231,9 @@ def _fused_moe_lora_shrink(
     num_stages: int,
     split_k: int,
     mul_routed_weight: bool = False,
+    use_gdc: bool = False,
 ) -> None:
     w1_lora_a_stacked = lora_a_stacked[0]
-    use_gdc = supports_pdl(qcurr_hidden_states.device)
     shrink_config = {
         "BLOCK_SIZE_M": block_size_m,
         "BLOCK_SIZE_N": block_size_n,
@@ -290,6 +295,7 @@ def _fused_moe_lora_shrink(
 def _fused_moe_lora_expand(
     output: torch.Tensor,  # (num_tokens, top_k_num, N*len(lora_a_stacked),)
     a_intermediate_cache1: torch.Tensor,  # (num_slices, M, top_k_num, max_lora_rank)
+    b_intermediate_cache1: torch.Tensor,  # (num_slices, M, top_k_num, output_dim_size)
     lora_b_stacked: list[
         torch.Tensor
     ],  # [(max_loras, num_experts, max_lora_rank, K,),...]
@@ -320,6 +326,7 @@ def _fused_moe_lora_expand(
     split_k: int,
     mul_routed_weight: bool = False,
     offset: int = 0,
+    use_gdc: bool = False,
 ) -> None:
     b_ptr = _get_ptr(lora_b_stacked, device)
     K = max_lora_rank
@@ -331,12 +338,6 @@ def _fused_moe_lora_expand(
         -1, a_intermediate_cache1.shape[3]
     )
 
-    b_intermediate_cache1 = torch.zeros(
-        (num_slices, M, top_k_num, w1_output_dim_size),
-        dtype=output.dtype,
-        device=device,
-    )
-    use_gdc = supports_pdl(a_intermediate_cache1.device)
     expand_config = {
         "BLOCK_SIZE_M": block_size_m,
         "BLOCK_SIZE_N": block_size_n,
@@ -460,6 +461,12 @@ def _fused_moe_lora(
         device=device,
     )
 
+    b_intermediate_cache1 = torch.zeros(
+        (num_slices, M, top_k_num, w1_output_dim_size),
+        dtype=output.dtype,
+        device=device,
+    )
+    use_gdc = supports_pdl(device) and not fully_sharded
     _fused_moe_lora_shrink(
         a_intermediate_cache1,
         qcurr_hidden_states,
@@ -488,6 +495,7 @@ def _fused_moe_lora(
         shrink_num_stages,
         shrink_split_k,
         mul_routed_weight,
+        use_gdc=use_gdc,
     )
 
     if fully_sharded:
@@ -506,6 +514,7 @@ def _fused_moe_lora(
     _fused_moe_lora_expand(
         output,
         a_intermediate_cache1,
+        b_intermediate_cache1,
         lora_b_stacked,
         topk_weights,
         sorted_token_ids,
@@ -534,6 +543,7 @@ def _fused_moe_lora(
         expand_split_k,
         mul_routed_weight,
         offset,
+        use_gdc=use_gdc,
     )
 
 
@@ -596,6 +606,7 @@ def _fused_moe_lora_shrink_fake(
     num_stages: int,
     split_k: int,
     mul_routed_weight: bool = False,
+    use_gdc: bool = False,
 ) -> None:
     return
 
@@ -629,6 +640,7 @@ def _fused_moe_lora_expand_fake(
     num_stages: int,
     split_k: int,
     mul_routed_weight: bool = False,
+    use_gdc: bool = False,
 ) -> None:
     return
 
diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py
index 311c4b191859741b03edc30445aaef671f650bc1..862f5f6b2431e627e44b6d0e92686a0619723c3a 100644
--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -14,8 +14,6 @@ from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr, get_lora_op_configs
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import direct_register_custom_op
 
-from .utils import supports_pdl
-
 
 @triton.jit
 def _lora_expand_kernel(
@@ -241,7 +239,9 @@ def _lora_expand(
         # thread blocks simply exit.
         MAX_LORAS,
     )
-    use_gdc = supports_pdl(inputs.device)
+    # We disable PDL temporarily because LoRA kernels are not launching back-to-back,
+    # making PDL invalid and affecting the kernel performance.
+    use_gdc = False  # supports_pdl(inputs.device)
     _lora_expand_kernel[grid](
         inputs,
         lora_ptr_tensor,
diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py
index 71bd5e3614667b626c7a9ea42c790e3f3c727ecc..9ba82b396a48a10522f3545db712bf1a4044244f 100644
--- a/vllm/lora/ops/triton_ops/lora_shrink_op.py
+++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py
@@ -14,8 +14,6 @@ from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr, get_lora_op_configs
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import direct_register_custom_op
 
-from .utils import supports_pdl
-
 
 @triton.jit
 def _lora_shrink_kernel(
@@ -221,7 +219,9 @@ def _lora_shrink(
         # thread blocks exit early.
         MAX_LORAS,
     )
-    use_gdc = supports_pdl(inputs.device)
+    # We disable PDL temporarily because LoRA kernels are not launching back-to-back,
+    # making PDL invalid and affecting the kernel performance.
+    use_gdc = False  # supports_pdl(inputs.device)
     _lora_shrink_kernel[grid](
         inputs,
         lora_ptr_tensor,
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index 8ed42382e3a86a5ac6196f8361777c32527c5696..51535f32c136c87e90eba0291088bb15515208f5 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -11,9 +11,12 @@ import torch
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
 from vllm.platforms import current_platform
+from vllm.utils.math_utils import next_power_of_2
 
 logger = init_logger(__name__)
+is_batch_invariant = vllm_is_batch_invariant()
 
 _LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {}
 _LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {}
@@ -150,7 +153,8 @@ def _get_lora_b_ptr(
 @functools.lru_cache
 def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None:
     user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER
-    if user_defined_config_folder is not None:
+    # Avoid optimizing for the batch invariant case. Use default config
+    if user_defined_config_folder is not None and not is_batch_invariant:
         gpu_name = torch.cuda.get_device_name()
         gpu_name = gpu_name.replace(" ", "_")
         gpu_name = gpu_name.replace("-", "_")
@@ -166,7 +170,7 @@ def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None:
 
         config_path = Path(f"{user_defined_config_folder}/{config_fname}")
         if not config_path.exists():
-            logger.warning_once(f"No LoRA kernel configs founded in {config_path}")
+            logger.warning_once(f"No LoRA kernel configs found in {config_path}")
             return None
 
         # Load json
@@ -203,11 +207,14 @@ def get_lora_op_configs(
     # default config
     default = {}
     if op_type == "shrink":
+        split_k = 64 if batch < 128 else 8
+        if is_batch_invariant:
+            split_k = 1
         default = {
             "block_m": 32,
             "block_n": 16,
             "block_k": 256 if batch < 128 else 32,
-            "split_k": 64 if batch < 128 else 8,
+            "split_k": split_k,
             "num_warps": 4,
             "num_ctas": 1,
             "group_size_m": 8,
@@ -217,14 +224,25 @@ def get_lora_op_configs(
     # The default config for fused_moe_lora ops
     elif op_type in [
         "fused_moe_lora_w13_shrink",
-        "fused_moe_lora_w13_expand",
         "fused_moe_lora_w2_shrink",
+    ]:
+        default = {
+            "block_m": 64,
+            "block_n": min(64, next_power_of_2(rank)),
+            "block_k": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+            "group_size_m": 8,
+            "split_k": 1,
+        }
+    elif op_type in [
+        "fused_moe_lora_w13_expand",
         "fused_moe_lora_w2_expand",
     ]:
         default = {
             "block_m": 64,
             "block_n": 64,
-            "block_k": 32,
+            "block_k": max(16, min(32, next_power_of_2(rank))),
             "num_warps": 4,
             "num_stages": 3,
             "group_size_m": 8,
diff --git a/vllm/lora/ops/xla_ops/__init__.py b/vllm/lora/ops/xla_ops/__init__.py
deleted file mode 100644
index b5570ceca68cab53130dad43cfc9c39e89fdd936..0000000000000000000000000000000000000000
--- a/vllm/lora/ops/xla_ops/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm.lora.ops.xla_ops.lora_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
-
-__all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"]
diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py
deleted file mode 100644
index 4924890b388cbd35510b724139764f6483503b62..0000000000000000000000000000000000000000
--- a/vllm/lora/ops/xla_ops/lora_ops.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import jax
-import jax.numpy as jnp
-import torch
-import torch.nn.functional as F
-import torch_xla.core.xla_builder as xb
-from torch.library import impl
-from torch_xla.experimental.custom_kernel import XLA_LIB, jax_import_guard
-
-
-@jax.jit
-def bgmv_jax(inputs, loras, idxs):
-    return jnp.einsum(
-        "td,tX,Xld->tl",
-        inputs,
-        jax.nn.one_hot(idxs, loras.shape[0], dtype=inputs.dtype),
-        loras,
-    )
-
-
-XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor")
-
-
-@impl(XLA_LIB, "bgmv", "XLA")
-def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
-    if len(loras.shape) == 4:
-        loras = loras.squeeze(axis=1)
-
-    jax_import_guard()
-    return xb.call_jax(bgmv_jax, (inputs, loras, idxs))
-
-
-@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd")
-def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
-    T, _ = inputs.shape
-    if len(loras.shape) == 4:
-        loras = loras.squeeze(axis=1)
-    _, L, _ = loras.shape
-
-    return torch.empty((T, L), device=inputs.device)
-
-
-def bgmv_expand(
-    inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
-    output_tensor: torch.Tensor,
-    lora_indices_tensor: torch.Tensor,
-    add_inputs: bool = True,
-):
-    """
-    Args:
-        inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-
-        lora_b_weights (torch.Tensor): LoRA weights of shape
-            [num_loras, lora_rank, hidden_size].
-
-        output_tensor (torch.Tensor): output tensor of shape
-            [num_tokens, hidden_size * num_slices].
-
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
-            indicating which LoRA matrix to use for each token.
-        add_inputs (bool): Whether or not to add the input tensor to the output
-            tensor.
-    """
-
-    outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-
-    limit = output_tensor.shape[0]
-    if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
-        limit = 1
-
-    if output_tensor.shape[1] > outputs.shape[1]:
-        outputs = F.pad(outputs, (0, output_tensor.shape[1] - outputs.shape[1], 0, 0))
-
-    if add_inputs:
-        return output_tensor + outputs[:limit, : output_tensor.shape[1]]
-    else:
-        return outputs[:limit, : output_tensor.shape[1]]
-
-
-def bgmv_shrink(
-    inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
-    lora_indices_tensor: torch.Tensor,
-    scaling: float = 1.0,
-):
-    """
-    Args:
-        inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        lora_b_weights (torch.Tensor): LoRA weights of shape
-            [num_loras, lora_rank, hidden_size].
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
-            indicating which LoRA matrix to use for each token.
-        scaling (float, optional): Scalar multiplier applied to the output.
-    """
-
-    return scaling * torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-
-
-def bgmv_expand_slice(
-    inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
-    output_tensor: torch.Tensor,
-    lora_indices_tensor: torch.Tensor,
-    slice_offset: int,
-    slice_size: int,
-    add_inputs: bool = True,
-):
-    """
-    Args:
-        inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-
-        lora_b_weights (torch.Tensor): LoRA weights of shape
-            [num_loras, lora_rank, hidden_size].
-
-        output_tensor (torch.Tensor): output tensor of shape
-            [num_tokens, hidden_size * num_slices].
-
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
-            indicating which LoRA matrix to use for each token.
-        add_inputs (bool): Whether or not to add the input tensor to the output
-            tensor.
-    """
-    outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-
-    outputs = F.pad(
-        outputs,
-        (
-            slice_offset,
-            output_tensor.shape[1] - (slice_offset + slice_size),
-            0,
-            0,
-        ),
-    )
-
-    if add_inputs:
-        return output_tensor + outputs
-    else:
-        return outputs
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index ef4b4ab7c34979ea2a82de090bf86175a74d2483..f765df0b390346043eda8d62a231e09c71e0949c 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -45,7 +45,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
     ):
         PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
 
-        self.max_loras = kwargs["max_loras"]
+        self.lora_config = kwargs["lora_config"]
+        self.max_loras = self.lora_config.max_loras
 
         self.token_mapping_meta = LoRAKernelMeta.make(
             self.max_loras, max_num_batched_tokens, device=device
diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py
deleted file mode 100644
index 0888772db54e7dfd4288ee3aadcfbb765cb511ee..0000000000000000000000000000000000000000
--- a/vllm/lora/punica_wrapper/punica_tpu.py
+++ /dev/null
@@ -1,358 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import math
-from typing import TYPE_CHECKING
-
-import torch
-import torch.nn.functional as F
-import torch_xla
-
-from vllm.lora.ops.xla_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
-from vllm.lora.punica_wrapper.utils import convert_mapping
-
-if TYPE_CHECKING:
-    # avoid circuit import
-    from vllm.lora.layers import LoRAMapping
-
-from .punica_base import PunicaWrapperBase
-
-
-class PunicaWrapperTPU(PunicaWrapperBase):
-    """
-    PunicaWrapperTPU is designed to manage and provide metadata for the punica
-    kernel. The main function is to maintain the state information for
-    Multi-LoRA, and to provide the interface for the pytorch punica ops.
-    """
-
-    def __init__(
-        self,
-        max_num_batched_tokens: int,
-        max_batches: int,
-        device: torch.device | str,
-        **kwargs,
-    ):
-        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
-
-        # PunicaWrapperBase defines some tensors with dtype=torch.int64, which
-        # isn't supported by the TPU. So convert those tensors to int32.
-        # Not all of them are used by the TPU so only convert the useful ones.
-        self._token_lora_indices = self._token_lora_indices.to(dtype=torch.int32)
-        self._sampler_indices = self._sampler_indices.to(dtype=torch.int32)
-        self._sampler_indices_padded = self._sampler_indices_padded.to(
-            dtype=torch.int32
-        )
-
-        torch.ops.xla.dynamo_set_buffer_donor_(self._token_lora_indices, True)
-        torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices, True)
-        torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded, True)
-        torch.ops.xla.dynamo_set_buffer_donor_(self._embeddings_indices, True)
-        torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch, True)
-
-        torch._dynamo.mark_dynamic(self._token_lora_indices, 0)
-        torch._dynamo.mark_dynamic(self._embeddings_indices, 1)
-        torch._dynamo.mark_dynamic(self._sampler_indices_padded, 0)
-
-    def _get_token_lora_indices(self, x: torch.Tensor) -> torch.IntTensor:
-        return torch.narrow(self._token_lora_indices, 0, 0, x.size(0))
-
-    @property
-    def embeddings_indices(self) -> torch.Tensor:
-        """
-        This property provides access to the indices used for lora embeddings,
-        specifically for VocabParallelEmbeddingWithLoRA.
-        """
-        return self._embeddings_indices[:]
-
-    @property
-    def sampler_indices_padded(self) -> torch.Tensor:
-        """
-        This property provides access to padded sampler indices.
-        """
-        return self._sampler_indices_padded[:]
-
-    def shrink(
-        self,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        scale: float,
-    ):
-        return bgmv_shrink(x, w_t_all, self._get_token_lora_indices(x), scale)
-
-    def expand(
-        self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, add_inputs: bool
-    ):
-        return bgmv_expand(x, w_t_all, y, self._get_token_lora_indices(x), add_inputs)
-
-    def expand_slice(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        y_offset: int,
-        y_slice_size: int,
-        add_inputs: bool,
-    ) -> torch.Tensor:
-        return bgmv_expand_slice(
-            x,
-            w_t_all,
-            y,
-            self._get_token_lora_indices(x),
-            y_offset,
-            y_slice_size,
-            add_inputs,
-        )
-
-    def add_shrink(
-        self,
-        y: tuple[torch.Tensor, ...] | torch.Tensor,
-        x: torch.Tensor,
-        lora_a_stacked: tuple[torch.Tensor, ...],
-        scale: float,
-        **kwargs,
-    ) -> torch.Tensor | None:
-        """
-        Performs GEMM for multiple slices of lora_a.
-
-        Semantics:
-        for i in range(len(lora_a_stacked)):
-            y[i] += (x @ lora_a_stacked[i]) * scale
-
-        Args:
-            y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
-            x (torch.Tensor): Input tensor
-            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
-            scale (float): Scaling factor for the operation
-        """
-
-        torch.ops.xla.dynamo_set_buffer_donor_(y, True)
-        x = x.view(-1, x.shape[-1])
-
-        for slice_idx in range(len(lora_a_stacked)):
-            lora_s = lora_a_stacked[slice_idx]
-            y_s = self.shrink(x, lora_s, scale)
-            y[slice_idx, :, :] = y_s  # type: ignore[index]
-        return y
-
-    def add_expand(
-        self,
-        y: torch.Tensor,
-        x: tuple[torch.Tensor, ...] | torch.Tensor,
-        lora_b_stacked: tuple[torch.Tensor, ...],
-        output_slices: tuple[int, ...],
-        offset_start: int = 0,
-        add_inputs=True,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        Performs GEMM for multiple slices of lora_b.
-
-        Semantics:
-            for i in range(len(lora_b_stacked)):
-                slice = output_slices[i]
-                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
-                offset += slice
-
-        Args:
-            y (torch.Tensor): Output tensor.
-            x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
-            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
-            output_slices (tuple[int, ...]): Every slice's size
-            add_inputs (bool):  Defaults to True.
-        """
-        y_org = y
-        y = y.view(-1, y.shape[-1])
-        offset_left = 0
-
-        for slice_idx in range(len(lora_b_stacked)):
-            y = self.expand_slice(
-                y,
-                x[slice_idx],
-                lora_b_stacked[slice_idx],
-                offset_left,
-                output_slices[slice_idx],
-                add_inputs=add_inputs,
-            )
-            offset_left += output_slices[slice_idx]
-        return y.view_as(y_org)
-
-    def add_lora_embedding(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        lora_b_stacked: torch.Tensor,
-        add_inputs: bool = True,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
-
-        Semantics:
-            y += x @ lora_b_stacked
-
-        Args:
-            y (torch.Tensor): Output tensor.
-            x (torch.Tensor): Input tensor.
-            lora_b_stacked (torch.Tensor): lora_b's weights.
-            add_inputs (bool): Default to True.
-        """
-
-        # Embedding layer only needs the expand op
-        return self.expand(y, x, lora_b_stacked, add_inputs)
-
-    def add_lora_linear(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        lora_a_stacked: tuple[torch.Tensor, ...],
-        lora_b_stacked: tuple[torch.Tensor, ...],
-        scale: float,
-        output_slices: tuple[int, ...],
-        *,
-        buffer: tuple[torch.Tensor, ...] | None = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        Applicable to linear-related lora.
-
-        Semantics:
-            for i in range(len(lora_a_stacked)):
-                y[i] += (
-                    x[i].unsqueeze(0)
-                    @ lora_a_stacked[indices[i], layer_idx, :, :]
-                    @ lora_b_stacked[indices[i], layer_idx, :, :]
-                    * scale
-                    ).squeeze(0)
-
-        Args:
-            y (torch.Tensor): Output tensor. Will not be changed in-place.
-            x (torch.Tensor): Input tensor (T, E)
-            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
-            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
-            scale (float): Scaling factor.
-            output_slices (tuple[int, ...]): Every slice's size.
-            buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None.
-        """
-
-        assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
-
-        if buffer is None:
-            r = lora_b_stacked[0].size(-1)
-            T = x.size(0)
-            buffer = torch.zeros(
-                (len(output_slices), T, r),
-                dtype=x.dtype,
-                device=x.device,
-            )
-        buffer = self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
-        return self.add_expand(
-            y, buffer, lora_b_stacked, output_slices, add_inputs=True, **kwargs
-        )
-
-    def add_lora_logits(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        lora_a_stacked: torch.Tensor,
-        lora_b_stacked: torch.Tensor,
-        scale,
-        *,
-        buffer: torch.Tensor | None = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        Applies lora specifically for LogitsProcessorWithLoRA.
-
-        Semantics:
-            buffer = (x @ lora_a_stacked) * scale
-            y += buffer @ lora_b_stacked
-
-        Args:
-            y (torch.Tensor): Output tensor.
-            x (torch.Tensor): Input tensor.
-            lora_a_stacked (torch.Tensor): lora_a's weights.
-            lora_b_stacked (torch.Tensor):lora_b's weights.
-            scale (float): Scaling factor.
-            buffer (Optional[torch.Tensor]):Default to None.
-        """
-        y_org = y
-        y = y.view(-1, y.shape[-1])
-        x = x.view(-1, x.shape[-1])
-
-        sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0))
-        buffer = bgmv_shrink(x, lora_a_stacked, sampler_indices, scale)
-        y = bgmv_expand(buffer, lora_b_stacked, y, sampler_indices, add_inputs=True)
-        return y.view_as(y_org)
-
-    # This performs the same tensor ops as the base method, except it does them
-    # on the CPU then transfers the results to the TPU
-    def _update_base_metadata(
-        self,
-        mapping: "LoRAMapping",
-        lora_index_to_id: list[int | None],
-        max_loras: int,
-        vocab_size: int,
-    ):
-        # Make sure we don't accidentally collect outside operations
-        torch_xla.sync()
-
-        # Pad the prompt mapping to avoid running into recompiles on the TPU
-        # TODO: Should this happen inside mapping internally? If so how can we
-        # avoid having backend specific LoRAMapping classes?
-        mapping.prompt_mapping = self._pad_prompt_mapping(mapping.prompt_mapping)
-
-        (
-            base_indices,
-            sampler_indices,
-            sampler_indices_padded,
-            embeddings_indices,
-            indices_len,
-        ) = convert_mapping(
-            mapping,
-            lora_index_to_id,
-            max_loras,
-            vocab_size,
-            0,  # extra_vocab_size
-            "cpu",
-        )
-        self._token_lora_indices = self._pad_to_shape(
-            base_indices, self._token_lora_indices.shape, dims=1
-        ).to(self.device)
-        self._sampler_indices = self._pad_to_shape(
-            sampler_indices, self._sampler_indices.shape, dims=1
-        ).to(self.device)
-        self._sampler_indices_padded = self._pad_to_shape(
-            sampler_indices_padded, self._sampler_indices_padded.shape, dims=1
-        ).to(self.device)
-        self._embeddings_indices = self._pad_to_shape(
-            embeddings_indices, self._embeddings_indices.shape, dims=2
-        ).to(self.device)
-        self.indices_len[:] = indices_len
-
-    def _update_prefill_metadata(self, token_lora_tensor: torch.Tensor) -> None:
-        self.batch_size = 1
-        self._lora_indices_per_batch[: self.batch_size] = token_lora_tensor[
-            : self.batch_size
-        ]
-
-    def _pad_prompt_mapping(self, prompt_mapping: tuple[int, ...]) -> tuple[int, ...]:
-        num_reqs = len(prompt_mapping)
-
-        # From vllm/v1/worker/tpu_model_runner:51, but need to avoid a circular
-        # import
-        MIN_NUM_SEQS = 8
-
-        padded_num_reqs = max(2 ** math.ceil(math.log2(num_reqs)), MIN_NUM_SEQS)
-        pad_len = padded_num_reqs - num_reqs
-
-        padding = [-1] * pad_len
-        return tuple(list(prompt_mapping) + padding)
-
-    def _pad_to_shape(self, src, target_shape, dims=1):
-        if dims == 1:
-            pad_len = target_shape[0] - src.shape[0]
-            return F.pad(src, (0, pad_len), value=0).to(torch.int32)
-        else:
-            pad_rows = target_shape[0] - src.shape[0]
-            pad_cols = target_shape[1] - src.shape[1]
-            return F.pad(src, (0, pad_cols, 0, pad_rows), value=0).to(torch.int32)
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index c97e435e32165f2d0f13ca580f549a84a2f11768..2811fee1dfb2d963048c2ff1ec416faa20dce5e3 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import warnings
 
 import msgspec
 
@@ -14,11 +13,6 @@ class LoRARequest(
     """
     Request for a LoRA adapter.
 
-    Note that this class should be used internally. For online
-    serving, it is recommended to not allow users to use this class but
-    instead provide another layer of abstraction to prevent users from
-    accessing unauthorized LoRA adapters.
-
     lora_int_id must be globally unique for a given adapter.
     This is currently not enforced in vLLM.
     """
@@ -26,24 +20,12 @@ class LoRARequest(
     lora_name: str
     lora_int_id: int
     lora_path: str = ""
-    lora_local_path: str | None = msgspec.field(default=None)
-    long_lora_max_len: int | None = None
     base_model_name: str | None = msgspec.field(default=None)
     tensorizer_config_dict: dict | None = None
 
     def __post_init__(self):
         if self.lora_int_id < 1:
             raise ValueError(f"id must be > 0, got {self.lora_int_id}")
-        if self.lora_local_path:
-            warnings.warn(
-                "The 'lora_local_path' attribute is deprecated "
-                "and will be removed in a future version. "
-                "Please use 'lora_path' instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            if not self.lora_path:
-                self.lora_path = self.lora_local_path or ""
 
         # Ensure lora_path is not empty
         assert self.lora_path, "lora_path cannot be empty"
@@ -60,28 +42,6 @@ class LoRARequest(
     def path(self):
         return self.lora_path
 
-    @property
-    def local_path(self):
-        warnings.warn(
-            "The 'local_path' attribute is deprecated "
-            "and will be removed in a future version. "
-            "Please use 'path' instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        return self.lora_path
-
-    @local_path.setter
-    def local_path(self, value):
-        warnings.warn(
-            "The 'local_path' attribute is deprecated "
-            "and will be removed in a future version. "
-            "Please use 'path' instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        self.lora_path = value
-
     def __eq__(self, value: object) -> bool:
         """
         Overrides the equality method to compare LoRARequest
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 4d264c06826b82bce33ba1a90bd86241addbec6d..75aeccd004422f43794d9767c28603a9becedca0 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -5,7 +5,6 @@ import os
 from typing import TYPE_CHECKING, Optional
 
 import huggingface_hub
-import regex as re
 from huggingface_hub.utils import (
     EntryNotFoundError,
     HfHubHTTPError,
@@ -186,39 +185,6 @@ def is_base_embeddding_weights(name: str) -> bool:
     return name.endswith(embedding_suffixes)
 
 
-def is_regex_target_modules(
-    load_modules: str | list[str], expected_lora_modules: set[str]
-) -> bool:
-    """
-    PEFT supports passing `target_modules` in the form of regular expressions,
-    such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to
-    determine whether the suffix in the regular expression is present in the
-    `expected_lora_modules`.
-    """
-
-    def is_valid_regex(pattern):
-        try:
-            re.compile(pattern)
-            return True
-        except re.error:
-            return False
-
-    def is_subset(sub_list, full_set):
-        return set(sub_list).issubset(full_set)
-
-    # Similar to PEFT's processing logic, regex-related operations are only
-    #  executed when the load_modules is a `str`.
-    if not isinstance(load_modules, str):
-        return False
-
-    if is_valid_regex(load_modules):
-        match = re.search(r"\((.*?)\)\$?$", load_modules)
-        if match:
-            suffix = match.group(1).split("|")
-            return is_subset(suffix, expected_lora_modules)
-    return False
-
-
 def get_supported_lora_modules(model: nn.Module) -> list[str]:
     """
     In vLLM, all linear layers support LoRA.
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 28c2a53d84e42c29896ca388b16cde3e8a4d372d..277e462a39e008c6fb04ff0ebb79b8b5faaf3231 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -69,6 +69,7 @@ class WorkerLoRAManager:
     def create_lora_manager(
         self,
         model: torch.nn.Module,
+        vllm_config: VllmConfig | None = None,
     ) -> Any:
         lora_manager = create_lora_manager(
             model,
@@ -78,6 +79,7 @@ class WorkerLoRAManager:
             lora_config=self.lora_config,
             device=self.device,
             lora_manager_cls=self._manager_cls,
+            vllm_config=vllm_config,
         )
         self._adapter_manager = lora_manager
         return lora_manager.model
@@ -161,6 +163,12 @@ class WorkerLoRAManager:
         if mapping is not None:
             self._adapter_manager.set_adapter_mapping(mapping)
 
+    def supports_tower_connector_lora(self) -> bool:
+        return (
+            self._adapter_manager.supports_mm
+            and self._adapter_manager.supports_tower_connector_lora
+        )
+
     def _apply_adapters(self, adapter_requests: set[Any]) -> None:
         existing_adapters = self.list_adapters()
         models_map = {
@@ -210,6 +218,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
     def create_lora_manager(
         self,
         model: torch.nn.Module,
+        vllm_config: VllmConfig | None = None,
     ) -> Any:
         lora_manager = create_lora_manager(
             model,
@@ -219,6 +228,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
             lora_config=self.lora_config,
             device=self.device,
             max_num_batched_tokens=self.max_num_batched_tokens,
+            vllm_config=vllm_config,
         )
         self._adapter_manager = lora_manager
         return lora_manager.model
diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py
index b50f0cb3a61a2f38ffed1a3b8bfa4e9f2af7bd6d..8d79940b858f217aa28f28fe7eb3df7037dc6e17 100644
--- a/vllm/model_executor/__init__.py
+++ b/vllm/model_executor/__init__.py
@@ -2,10 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.model_executor.parameter import BasevLLMParameter, PackedvLLMParameter
-from vllm.model_executor.utils import set_random_seed
 
 __all__ = [
-    "set_random_seed",
     "BasevLLMParameter",
     "PackedvLLMParameter",
 ]
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 66250f816f45918becac11b3296c62e3b4ff9117..81ba544b481315a128804333962935514514b0ad 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -67,8 +67,9 @@ class CustomOp(nn.Module):
         return self.forward_native(*args, **kwargs)
 
     def forward_cpu(self, *args, **kwargs):
-        # By default, we assume that CPU ops are compatible with CUDA ops.
-        return self.forward_cuda(*args, **kwargs)
+        # By default, we assume that CPU ops are compatible with the
+        # PyTorch-native implementation.
+        return self.forward_native(*args, **kwargs)
 
     def forward_tpu(self, *args, **kwargs):
         # By default, we assume that TPU ops are compatible with the
@@ -86,9 +87,12 @@ class CustomOp(nn.Module):
         # specific backend. Currently, we do not support dynamic dispatching.
         compilation_config = get_cached_compilation_config()
 
-        # CustomOp object can be enforce enabled, e.g., enable device-specific
-        # kernels in ViT models when enabling graph mode. By default, it will
-        # follow the compilation_config to determine whether enable itself.
+        # NOTE(shen-shanshan): CustomOp object can be enforce enabled, e.g.,
+        # enable device-specific kernels in ViT models when enabling graph
+        # mode. By default, it will follow the compilation_config to determine
+        # whether enable itself.
+        # This enforce_enable mechanism will be removed after we adding a
+        # separate compilation_config for multi-modal part.
         enabled = self._enforce_enable or self.enabled()
         if enabled:
             compilation_config.enabled_custom_ops.update([self.__class__.name])
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 465a0977273176adecd459913771825d6096bc2a..09dc0ab169b8a7037a6446a0dda4fb5fd5cce9b9 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -23,6 +23,7 @@ import vllm.envs as envs
 logger = init_logger(__name__)
 
 
+# --8<-- [start:fatrelu_and_mul]
 @CustomOp.register("fatrelu_and_mul")
 class FatreluAndMul(CustomOp):
     """An activation function for FATReLU.
@@ -36,6 +37,8 @@ class FatreluAndMul(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    # --8<-- [end:fatrelu_and_mul]
+
     def __init__(self, threshold: float = 0.0):
         super().__init__()
         self.threshold = threshold
@@ -59,6 +62,7 @@ class FatreluAndMul(CustomOp):
         return out
 
 
+# --8<-- [start:silu_and_mul]
 @CustomOp.register("silu_and_mul")
 class SiluAndMul(CustomOp):
     """An activation function for SwiGLU.
@@ -70,6 +74,8 @@ class SiluAndMul(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    # --8<-- [end:silu_and_mul]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike():
@@ -109,6 +115,7 @@ class SiluAndMul(CustomOp):
         return out
 
 
+# --8<-- [start:mul_and_silu]
 @CustomOp.register("mul_and_silu")
 class MulAndSilu(CustomOp):
     """An activation function for SwiGLU.
@@ -120,6 +127,8 @@ class MulAndSilu(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    # --8<-- [end:mul_and_silu]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike():
@@ -147,6 +156,7 @@ class MulAndSilu(CustomOp):
     # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
 
 
+# --8<-- [start:gelu_and_mul_sparse]
 @CustomOp.register("gelu_and_mul_sparse")
 class GeluAndMulSparse(CustomOp):
     """An activation function for GeluAndMulSparse.
@@ -161,6 +171,8 @@ class GeluAndMulSparse(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    # --8<-- [end:gelu_and_mul_sparse]
+
     def __init__(self, activation_sparsity: float, approximate: str = "none"):
         super().__init__()
         # Gelu.
@@ -203,6 +215,7 @@ class GeluAndMulSparse(CustomOp):
         return self.forward_native(x)
 
 
+# --8<-- [start:gelu_and_mul]
 @CustomOp.register("gelu_and_mul")
 class GeluAndMul(CustomOp):
     """An activation function for GeGLU.
@@ -214,6 +227,8 @@ class GeluAndMul(CustomOp):
         return: (batch_size, seq_len, d) or (num_tokens, d)
     """
 
+    # --8<-- [end:gelu_and_mul]
+
     def __init__(self, approximate: str = "none"):
         super().__init__()
         self.approximate = approximate
@@ -270,9 +285,12 @@ class GeluAndMul(CustomOp):
         return f"approximate={repr(self.approximate)}"
 
 
+# --8<-- [start:swigluoai_and_mul]
 @CustomOp.register("swigluoai_and_mul")
 class SwigluOAIAndMul(CustomOp):
     # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110
+    # --8<-- [end:swigluoai_and_mul]
+
     def __init__(self, alpha: float = 1.702, limit: float = 7.0):
         super().__init__()
         self.alpha = alpha
@@ -299,8 +317,11 @@ class SwigluOAIAndMul(CustomOp):
         return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}"
 
 
+# --8<-- [start:gelu_new]
 @CustomOp.register("gelu_new")
 class NewGELU(CustomOp):
+    # --8<-- [end:gelu_new]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
@@ -324,8 +345,11 @@ class NewGELU(CustomOp):
         return self.op(x)
 
 
+# --8<-- [start:gelu_fast]
 @CustomOp.register("gelu_fast")
 class FastGELU(CustomOp):
+    # --8<-- [end:gelu_fast]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
@@ -348,9 +372,12 @@ class FastGELU(CustomOp):
         return self.op(x)
 
 
+# --8<-- [start:quick_gelu]
 @CustomOp.register("quick_gelu")
 class QuickGELU(CustomOp):
     # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
+    # --8<-- [end:quick_gelu]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
@@ -378,12 +405,15 @@ class QuickGELU(CustomOp):
     # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
 
 
+# --8<-- [start:relu2]
 @CustomOp.register("relu2")
 class ReLUSquaredActivation(CustomOp):
     """
     Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
     """
 
+    # --8<-- [end:relu2]
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         return torch.square(F.relu(x))
@@ -393,6 +423,7 @@ class ReLUSquaredActivation(CustomOp):
         return self.forward_native(x)
 
 
+# --8<-- [start:xielu]
 @CustomOp.register("xielu")
 class XIELU(CustomOp):
     """
@@ -401,6 +432,8 @@ class XIELU(CustomOp):
     Otherwise, we emit a single warning and use xIELU Python
     """
 
+    # --8<-- [end:xielu]
+
     def __init__(
         self,
         alpha_p_init: float = 0.8,
diff --git a/vllm/model_executor/layers/attention/__init__.py b/vllm/model_executor/layers/attention/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py
similarity index 84%
rename from vllm/attention/layers/chunked_local_attention.py
rename to vllm/model_executor/layers/attention/chunked_local_attention.py
index 0ced0028ded9ef61e50aba9ace5b05168311fea2..8916ff0c4c6dcd760a65eb893afdbc17a69ab7e5 100644
--- a/vllm/attention/layers/chunked_local_attention.py
+++ b/vllm/model_executor/layers/attention/chunked_local_attention.py
@@ -4,19 +4,21 @@ import functools
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
 from vllm.config.vllm import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.utils import (
     make_local_attention_virtual_batches,
     subclass_attention_backend,
 )
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import (
     AttentionSpec,
     ChunkedLocalAttentionSpec,
@@ -51,11 +53,19 @@ def create_chunked_local_attention_backend(
             common_prefix_len: int,
             common_attn_metadata: CommonAttentionMetadata,
             fast_build: bool = False,
-        ) -> AttentionMetadata:
-            common_attn_metadata = make_local_attention_virtual_batches(
+        ):
+            cm, make_virtual_batches_block_table = make_local_attention_virtual_batches(
                 attention_chunk_size, common_attn_metadata, block_size
             )
-            return super().build(common_prefix_len, common_attn_metadata, fast_build)
+            metadata = super().build(common_prefix_len, cm, fast_build)
+            metadata.make_virtual_batches_block_table = make_virtual_batches_block_table
+            return metadata
+
+        def update_block_table(
+            self, metadata, blk_table: torch.Tensor, slot_mapping: torch.Tensor
+        ):
+            blk_table = metadata.make_virtual_batches_block_table(blk_table)
+            return super().update_block_table(metadata, blk_table, slot_mapping)
 
     attn_backend = subclass_attention_backend(
         name_prefix=prefix,
diff --git a/vllm/attention/layers/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py
similarity index 96%
rename from vllm/attention/layers/cross_attention.py
rename to vllm/model_executor/layers/attention/cross_attention.py
index cfd203bdd37b93485c05c8889670d9f1571b1100..a16981a831897e3d9159932336a3a67d9c466140 100644
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/model_executor/layers/attention/cross_attention.py
@@ -6,20 +6,20 @@ from copy import copy
 import numpy as np
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionMetadata,
-    AttentionType,
-)
 from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
     CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.utils import (
     subclass_attention_backend,
 )
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import CrossAttentionSpec, KVCacheSpec
 
 logger = init_logger(__name__)
@@ -149,16 +149,20 @@ class CrossAttention(Attention):
             kv_cache_dtype = "auto"
             block_size = 16
 
-        underlying_attn_backend = get_attn_backend(
-            head_size, dtype, kv_cache_dtype, block_size
-        )
-        attn_backend = create_cross_attention_backend(underlying_attn_backend)
-
         if attn_type is not None:
             assert attn_type == AttentionType.ENCODER_DECODER, (
                 "CrossAttention only supports AttentionType.ENCODER_DECODER"
             )
 
+        underlying_attn_backend = get_attn_backend(
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            attn_type=AttentionType.ENCODER_DECODER,
+        )
+        attn_backend = create_cross_attention_backend(underlying_attn_backend)
+
         super().__init__(
             num_heads=num_heads,
             head_size=head_size,
diff --git a/vllm/attention/layers/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py
similarity index 96%
rename from vllm/attention/layers/encoder_only_attention.py
rename to vllm/model_executor/layers/attention/encoder_only_attention.py
index 5e99c990100347875b9dc9cbdd8d2757ad220c55..8df9e05c8a0891256ed3865b671a96f93a3266e6 100644
--- a/vllm/attention/layers/encoder_only_attention.py
+++ b/vllm/model_executor/layers/attention/encoder_only_attention.py
@@ -5,19 +5,19 @@ from copy import copy
 
 import torch
 
-from vllm.attention.backends.abstract import (
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig
+from vllm.config.vllm import VllmConfig
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionMetadata,
     AttentionType,
+    CommonAttentionMetadata,
 )
-from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
-from vllm.config import CacheConfig
-from vllm.config.vllm import VllmConfig
 from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
     subclass_attention_backend,
 )
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import KVCacheSpec
 
 
diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py
similarity index 66%
rename from vllm/attention/layers/mm_encoder_attention.py
rename to vllm/model_executor/layers/attention/mm_encoder_attention.py
index c9107ebcab8568ef556217972fa3185f39d36184..44e990d29c16be008b0a14c3c4d1c195f6fa5fcd 100644
--- a/vllm/attention/layers/mm_encoder_attention.py
+++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py
@@ -1,48 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.ops.vit_attn_wrappers import (
-    vit_flash_attn_wrapper,
-    vit_torch_sdpa_wrapper,
-)
 from vllm.config import MultiModalConfig
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.models.vision import get_vit_attn_backend
+from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.ops.vit_attn_wrappers import (
+    vit_flash_attn_wrapper,
+    vit_torch_sdpa_wrapper,
+)
 
 logger = init_logger(__name__)
 
 
-def maybe_get_vit_flash_attn_backend(
-    attn_backend: AttentionBackendEnum | None,
-) -> Callable | None:
-    # At this point,
-    # we already have the attn_backend,
-    # overriding logic is done in the platform-specific implementation.
-    # so we don't need to override backend here.
-    # Just return the attn_backend and flash_attn_varlen_func.
-
-    if attn_backend == AttentionBackendEnum.FLASH_ATTN:
-        from vllm.attention.utils.fa_utils import flash_attn_varlen_func
-    elif attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
-        from aiter import flash_attn_varlen_func
-    else:
-        flash_attn_varlen_func = None
-
-    # if attn_backend is TORCH_SDPA,
-    # it will reach here and the flash_attn_varlen_func will be None.
-    return flash_attn_varlen_func
-
-
+# --8<-- [start:mm_encoder_attn]
 @CustomOp.register("mm_encoder_attn")
 class MMEncoderAttention(CustomOp):
     """Multi-headed attention without any cache, used for multimodal encoder."""
 
+    # --8<-- [end:mm_encoder_attn]
+
     def __init__(
         self,
         num_heads: int,
@@ -97,8 +79,8 @@ class MMEncoderAttention(CustomOp):
             AttentionBackendEnum.ROCM_AITER_FA,
         }
 
-        self.flash_attn_varlen_func = maybe_get_vit_flash_attn_backend(
-            self.attn_backend,
+        self._fa_version = (
+            get_flash_attn_version() if self.is_flash_attn_backend else None
         )
 
         logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
@@ -107,7 +89,7 @@ class MMEncoderAttention(CustomOp):
     def enabled(cls) -> bool:
         return True
 
-    def reshape_qkv_to_4d(
+    def maybe_reshape_qkv_to_4d(
         self,
         query: torch.Tensor,
         key: torch.Tensor,
@@ -131,30 +113,6 @@ class MMEncoderAttention(CustomOp):
 
         return query, key, value
 
-    def reshape_qkv_to_3d(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        bsz: int,
-        q_len: int,
-        kv_len: int,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Reshape query, key, value to 3D tensors:
-        (batch_size * seq_len, num_heads, head_size)
-        """
-        query = query.view(bsz * q_len, self.num_heads, self.head_size)
-        key = key.view(bsz * kv_len, self.num_kv_heads, self.head_size)
-        value = value.view(bsz * kv_len, self.num_kv_heads, self.head_size)
-
-        if (num_repeat := self.num_queries_per_kv) > 1:
-            # Handle MQA and GQA
-            key = torch.repeat_interleave(key, num_repeat, dim=1)
-            value = torch.repeat_interleave(value, num_repeat, dim=1)
-
-        return query, key, value
-
     def _forward_sdpa(
         self,
         query: torch.Tensor,
@@ -162,13 +120,15 @@ class MMEncoderAttention(CustomOp):
         value: torch.Tensor,
         cu_seqlens: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        # TODO(Isotr0py): Migrate MultiHeadAttention
-        assert cu_seqlens is not None
-
+        """Input shape:
+        (batch_size x seq_len x hidden_size) or
+        (batch_size x seq_len x num_heads x head_size)
+        """
         bsz, q_len = query.size()[:2]
         kv_len = key.size(1)
+        is_reshaped = query.dim() != 4
 
-        query, key, value = self.reshape_qkv_to_4d(
+        query, key, value = self.maybe_reshape_qkv_to_4d(
             query, key, value, bsz, q_len, kv_len
         )
 
@@ -176,8 +136,11 @@ class MMEncoderAttention(CustomOp):
             q=query,
             k=key,
             v=value,
+            scale=self.scale,
             cu_seqlens=cu_seqlens,
         )
+        if is_reshaped:
+            output = output.reshape(bsz, q_len, -1)
         return output
 
     def _forward_fa(
@@ -188,23 +151,35 @@ class MMEncoderAttention(CustomOp):
         cu_seqlens: torch.Tensor | None = None,
         max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
     ) -> torch.Tensor:
-        assert self.flash_attn_varlen_func is not None, (
-            "Flash attention function is not set."
-        )
-        # # TODO(Isotr0py): Migrate MultiHeadAttention
-        assert cu_seqlens is not None and max_seqlen is not None
+        """Input shape:
+        (batch_size x seq_len x hidden_size) or
+        (batch_size x seq_len x num_heads x head_size)
+        """
+        assert (cu_seqlens is not None and max_seqlen is not None) or (
+            cu_seqlens is None and max_seqlen is None
+        ), "cu_seqlens and max_seqlen should be both set or both None."
 
-        bsz = query.shape[0]
+        bsz, q_len = query.size()[:2]
+        kv_len = key.size(1)
+        is_reshaped = query.dim() != 4
+
+        query, key, value = self.maybe_reshape_qkv_to_4d(
+            query, key, value, bsz, q_len, kv_len
+        )
 
         output = vit_flash_attn_wrapper(
             q=query,
             k=key,
             v=value,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=max_seqlen,
             batch_size=bsz,
             is_rocm_aiter=(self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA),
+            fa_version=self._fa_version,
+            scale=self.scale,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
         )
+        if is_reshaped:
+            output = output.reshape(bsz, q_len, -1)
         return output
 
     def forward_native(
@@ -257,28 +232,3 @@ class MMEncoderAttention(CustomOp):
             "XPU only supports FLASH_ATTN for vision attention."
         )
         return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)
-
-    def forward_tpu(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        cu_seqlens: torch.Tensor | None = None,
-        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
-    ) -> torch.Tensor:
-        assert self.attn_backend == AttentionBackendEnum.PALLAS, (
-            f"MMEncoderAttention on TPU only supports PALLAS backend, "
-            f"but got {self.attn_backend}."
-        )
-        if cu_seqlens is None:
-            query, key, value = (x.transpose(1, 2) for x in (query, key, value))
-            from torch_xla.experimental.custom_kernel import flash_attention
-
-            out = flash_attention(query, key, value, sm_scale=self.scale)
-            out = out.transpose(1, 2)
-            return out
-        logger.warning_once(
-            "PALLAS backend with cu_seqlens is not supported for ViT yet. ",
-            "Falling back to SDPA implementation.",
-        )
-        return self._forward_sdpa(query, key, value, cu_seqlens)
diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7ec382b334cce9b9469cfcbf2e59159443cdb78
--- /dev/null
+++ b/vllm/model_executor/layers/attention/static_sink_attention.py
@@ -0,0 +1,254 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
+
+import torch
+
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, VllmConfig
+from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.logger import init_logger
+from vllm.model_executor.custom_op import CustomOp
+from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+    CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.utils import (
+    subclass_attention_backend,
+)
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash_diffkv,
+)
+from vllm.v1.attention.selector import get_attn_backend
+from vllm.v1.kv_cache_interface import (
+    AttentionSpec,
+    KVCacheSpec,
+    SinkFullAttentionSpec,
+)
+
+logger = init_logger(__name__)
+
+
+@functools.lru_cache
+def create_static_sink_attention_backend(
+    underlying_attn_backend: type[AttentionBackend],
+    sink_len: int = 0,
+) -> type[AttentionBackend]:
+    prefix = "StaticSink_"
+    underlying_builder = underlying_attn_backend.get_builder_cls()
+
+    class StaticSinkAttentionBuilder(underlying_builder):  # type: ignore
+        def __init__(
+            self,
+            kv_cache_spec: AttentionSpec,
+            layer_names: list[str],
+            vllm_config: VllmConfig,
+            device: torch.device,
+        ):
+            super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+            model_config = vllm_config.model_config
+            scheduler_config = vllm_config.scheduler_config
+            self.sink_len = sink_len
+            self.block_size = vllm_config.cache_config.block_size
+            self.num_sink_blocks = self.sink_len // vllm_config.cache_config.block_size
+            self.max_num_blocks = cdiv(
+                model_config.max_model_len, vllm_config.cache_config.block_size
+            )
+            self.block_table_with_sink = torch.zeros(
+                (
+                    scheduler_config.max_num_seqs,
+                    self.max_num_blocks + self.num_sink_blocks,
+                ),
+                device=device,
+                dtype=torch.int32,
+            )
+            self.block_table_with_sink[:, : self.num_sink_blocks] = torch.arange(
+                1,
+                self.num_sink_blocks + 1,
+                device=device,
+                dtype=torch.int32,
+            )
+
+        def build(
+            self,
+            common_prefix_len: int,
+            common_attn_metadata: CommonAttentionMetadata,
+            fast_build: bool = False,
+        ) -> AttentionMetadata:
+            common_attn_metadata.seq_lens[:] = (
+                common_attn_metadata.seq_lens + self.sink_len
+            )
+            common_attn_metadata.seq_lens[
+                common_attn_metadata.seq_lens == self.sink_len
+            ] = 0
+            common_attn_metadata.max_seq_len = (
+                common_attn_metadata.max_seq_len + self.sink_len
+            )
+            max_num_blocks = cdiv(common_attn_metadata.max_seq_len, self.block_size)
+            num_reqs = common_attn_metadata.num_reqs
+            self.block_table_with_sink[
+                :num_reqs, self.num_sink_blocks : self.num_sink_blocks + max_num_blocks
+            ] = common_attn_metadata.block_table_tensor[:, :max_num_blocks]
+            common_attn_metadata.block_table_tensor = self.block_table_with_sink[
+                :num_reqs
+            ]
+
+            return super().build(common_prefix_len, common_attn_metadata, fast_build)
+
+    attn_backend = subclass_attention_backend(
+        name_prefix=prefix,
+        attention_backend_cls=underlying_attn_backend,
+        builder_cls=StaticSinkAttentionBuilder,
+    )
+
+    return attn_backend
+
+
+@CustomOp.register("static_sink_attention")
+class StaticSinkAttention(Attention, CustomOp):
+    """
+    Attention with static sink tokens
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        sink_len: int,
+        attn_backend: type[AttentionBackend] | None = None,
+        cache_config: CacheConfig | None = None,
+        **kwargs,
+    ):
+        dtype = torch.get_default_dtype()
+
+        if cache_config is not None:
+            kv_cache_dtype = cache_config.cache_dtype
+            block_size = cache_config.block_size
+        else:
+            kv_cache_dtype = "auto"
+            block_size = 16
+
+        if attn_backend is not None:
+            underlying_attn_backend = attn_backend
+        else:
+            underlying_attn_backend = get_attn_backend(
+                head_size, dtype, kv_cache_dtype, block_size
+            )
+        attn_backend = create_static_sink_attention_backend(
+            underlying_attn_backend,  # type: ignore[arg-type]
+            sink_len=sink_len,
+        )
+        Attention.__init__(
+            self=self,
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            cache_config=cache_config,
+            attn_backend=attn_backend,
+            **kwargs,
+        )
+        CustomOp.__init__(self)
+
+        self.sink_len = sink_len
+        self.block_size = block_size
+        self.sink_populated = False
+        self.sink_key = None
+        self.sink_value = None
+
+    def update_sink_kv(self, sink_key, sink_value) -> None:
+        self.sink_key = sink_key
+        self.sink_value = sink_value
+
+    def forward_native(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        output_shape: torch.Size | None = None,
+    ) -> torch.Tensor:
+        assert self.sink_key is not None and self.sink_value is not None, (
+            "sink_key and sink_value have not been prepared"
+        )
+        if not self.sink_populated:
+            forward_context: ForwardContext = get_forward_context()
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+            torch.ops.vllm.maybe_populate_sink(self_kv_cache, self.layer_name)
+
+        return super().forward(query, key, value, output_shape)
+
+    def forward_cuda(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        output_shape: torch.Size | None = None,
+    ) -> torch.Tensor:
+        return self.forward_native(query, key, value, output_shape)
+
+    def forward(self, *args, **kwargs):
+        return self._forward_method(*args, **kwargs)
+
+    def populate_sink_kv(self, self_kv_cache):
+        sink_kv_slot_mapping = torch.arange(
+            self.block_size,
+            self.sink_len + self.block_size,
+            device=torch.cuda.current_device(),
+            dtype=torch.long,
+        )
+        triton_reshape_and_cache_flash_diffkv(
+            self.sink_key,
+            self.sink_value,
+            self_kv_cache,
+            sink_kv_slot_mapping,
+            self.kv_cache_dtype,
+            self._k_scale,
+            self._v_scale,
+        )
+        # We only populate the sink_key and sink_value once
+        self.sink_populated = True
+
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        # Block size may get updated after model loading, refresh it
+        block_size = vllm_config.cache_config.block_size
+        # Should not be called for enc-dec or encoder-only attention.
+        assert self.attn_type == AttentionType.DECODER
+
+        return SinkFullAttentionSpec(
+            block_size=block_size,
+            num_kv_heads=self.num_kv_heads,
+            head_size=self.head_size,
+            head_size_v=self.head_size_v,
+            sink_len=self.sink_len,
+            dtype=self.kv_cache_torch_dtype,
+        )
+
+
+def maybe_populate_sink(
+    self_kv_cache: torch.Tensor,
+    layer_name: str,
+) -> None:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    if self.sink_populated or self_kv_cache.numel() == 0:
+        return
+    self.populate_sink_kv(self_kv_cache)
+
+
+def maybe_populate_sink_fake(
+    self_kv_cache: torch.Tensor,
+    layer_name: str,
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="maybe_populate_sink",
+    op_func=maybe_populate_sink,
+    mutates_args=["self_kv_cache"],
+    fake_impl=maybe_populate_sink_fake,
+)
diff --git a/vllm/model_executor/layers/attention_layer_base.py b/vllm/model_executor/layers/attention_layer_base.py
index a60cf787135c0f428f46c79e15815a4e14c4c262..97395b641497a93d43207ffcc325948b62f5a1b0 100644
--- a/vllm/model_executor/layers/attention_layer_base.py
+++ b/vllm/model_executor/layers/attention_layer_base.py
@@ -4,8 +4,8 @@
 
 from abc import ABC, abstractmethod
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
+from vllm.v1.attention.backend import AttentionBackend, AttentionImpl
 from vllm.v1.kv_cache_interface import KVCacheSpec
 
 
@@ -18,6 +18,8 @@ class AttentionLayerBase(ABC):
     from different layer types.
     """
 
+    impl: "AttentionImpl"
+
     @abstractmethod
     def get_attn_backend(self) -> type[AttentionBackend]:
         """Get the attention backend class for this layer."""
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index fde0826779eb1c745ad88bc22c00baef53e3f72c..d3cf9739fcf4230649fb1355dfb133affdfe70b3 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -6,11 +6,11 @@ from typing import Any
 
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 logger = init_logger(__name__)
 
@@ -933,30 +933,26 @@ def enable_batch_invariant_mode():
     _batch_invariant_MODE = True
     _batch_invariant_LIB = torch.library.Library("aten", "IMPL")
 
-    # Batch invariant matmuls are no longer needed after cublas overrides
-    if not is_torch_equal_or_newer("2.10.0.dev"):
-        if (
-            current_platform.is_device_capability_family(100)
-            or current_platform.is_device_capability(80)
-            or current_platform.is_device_capability(89)
-        ):
-            # For PyTorch 2.9, B200 uses GEMV for bs=1
-            # Requires https://github.com/pytorch/pytorch/pull/166735
-            _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
-            _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA")
-            _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA")
-            _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA")
-        else:
-            # Only source of batch invariance for Hopper is split-k, can disable through
-            # cuBLAS workspace config
-            _original_cublas_workspace_cfg = os.environ.get(
-                "CUBLAS_WORKSPACE_CONFIG", None
-            )
-            _original_cublaslt_workspace_size = os.environ.get(
-                "CUBLASLT_WORKSPACE_SIZE", None
-            )
-            os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
-            os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"
+    if (
+        current_platform.is_device_capability_family(100)
+        or current_platform.is_device_capability(80)
+        or current_platform.is_device_capability(89)
+    ):
+        # For PyTorch 2.9, B200 uses GEMV for bs=1
+        # Requires https://github.com/pytorch/pytorch/pull/166735
+        _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
+        _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA")
+        _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA")
+        _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA")
+    else:
+        # Only source of batch invariance for Hopper is split-k, can disable through
+        # cuBLAS workspace config
+        _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None)
+        _original_cublaslt_workspace_size = os.environ.get(
+            "CUBLASLT_WORKSPACE_SIZE", None
+        )
+        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+        os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"
 
     _batch_invariant_LIB.impl(
         "aten::_log_softmax", _log_softmax_batch_invariant, "CUDA"
diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py
index 8d51e5bd9920ad97463893add475a0a98e4cc242..f4709f2f4d80f648348baf523329fe7faf2eadb8 100644
--- a/vllm/model_executor/layers/conv.py
+++ b/vllm/model_executor/layers/conv.py
@@ -105,10 +105,13 @@ class ConvLayerBase(CustomOp):
         return s
 
 
+# --8<-- [start:conv2d]
 @CustomOp.register("conv2d")
 class Conv2dLayer(ConvLayerBase):
     """Conv layer with Conv2d."""
 
+    # --8<-- [end:conv2d]
+
     num_dim = 2
 
     def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
@@ -204,10 +207,13 @@ class CausalConv2dLayer(Conv2dLayer):
         return x
 
 
+# --8<-- [start:conv3d]
 @CustomOp.register("conv3d")
 class Conv3dLayer(ConvLayerBase):
     """Conv layer with Conv3d."""
 
+    # --8<-- [end:conv3d]
+
     num_dim = 3
 
     def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
@@ -251,6 +257,6 @@ class Conv3dLayer(ConvLayerBase):
         # See: https://github.com/vllm-project/vllm/issues/27406
         # and https://github.com/pytorch/pytorch/issues/166122
         # By default, we use CUDNN's convolution ops with optimization.
-        if self.enable_linear and is_torch_equal("2.9.0"):
+        if self.enable_linear and (is_torch_equal("2.9.0") or is_torch_equal("2.9.1")):
             return self._forward_mulmat(x)
         return self._forward_conv(x)
diff --git a/vllm/model_executor/layers/fla/ops/fused_recurrent.py b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
index 0f27504780ac4c29f65b8eaacfd05b76fcfdc88a..91b07129dca8a63654e384f900cc54bacb102095 100644
--- a/vllm/model_executor/layers/fla/ops/fused_recurrent.py
+++ b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
@@ -189,7 +189,7 @@ def fused_recurrent_gated_delta_rule_fwd(
     B, T, H, K, V = *k.shape, v.shape[-1]
     HV = v.shape[2]
     N = B if cu_seqlens is None else len(cu_seqlens) - 1
-    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 32)
     NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
     assert NK == 1, "NK > 1 is not supported yet"
     num_stages = 3
diff --git a/vllm/model_executor/layers/fla/ops/utils.py b/vllm/model_executor/layers/fla/ops/utils.py
index 5a48e56a5fbbfa1fb21ebf18e581af0e5e696feb..18e17a5110c1ad4fa8ee72a8f5cc40d091b5f4c7 100644
--- a/vllm/model_executor/layers/fla/ops/utils.py
+++ b/vllm/model_executor/layers/fla/ops/utils.py
@@ -119,7 +119,7 @@ def input_guard(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
 def get_available_device() -> str:
     try:
         return triton.runtime.driver.active.get_current_target().backend
-    except BaseException:
+    except (RuntimeError, AttributeError):
         return "cpu"
 
 
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index d71cfc5ad82009fc82b0054e055bd655e1116435..5ba9e80fc8b819b32cf005961c2da0b3cf2378a2 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -11,6 +11,9 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe_router import (
+    FusedMoERouter,
+)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoeWeightScaleSupported,
@@ -25,6 +28,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
     UnquantizedFusedMoEMethod,
 )
 from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
+from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import (
+    ZeroExpertFusedMoE,
+)
 from vllm.triton_utils import HAS_TRITON
 
 _config: dict[str, Any] | None = None
@@ -45,6 +51,7 @@ def get_config() -> dict[str, Any] | None:
 
 __all__ = [
     "FusedMoE",
+    "FusedMoERouter",
     "FusedMoEConfig",
     "FusedMoEMethodBase",
     "UnquantizedFusedMoEMethod",
@@ -54,6 +61,7 @@ __all__ = [
     "FusedMoEPrepareAndFinalize",
     "RoutingMethodType",
     "SharedFusedMoE",
+    "ZeroExpertFusedMoE",
     "activation_without_mul",
     "override_config",
     "get_config",
@@ -68,8 +76,6 @@ if HAS_TRITON:
         CutlassBatchedExpertsFp8,
         CutlassExpertsFp8,
         CutlassExpertsW4A8Fp8,
-        cutlass_moe_fp4,
-        cutlass_moe_fp8,
         cutlass_moe_w4a8_fp8,
     )
     from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
@@ -77,11 +83,12 @@ if HAS_TRITON:
         BatchedTritonExperts,
     )
     from vllm.model_executor.layers.fused_moe.fused_moe import (
+        GroupedTopk,
         TritonExperts,
+        TritonWNA16Experts,
         fused_experts,
         fused_topk,
         get_config_file_name,
-        grouped_topk,
     )
     from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
         TritonOrDeepGemmExperts,
@@ -91,14 +98,13 @@ if HAS_TRITON:
         "fused_topk",
         "fused_experts",
         "get_config_file_name",
-        "grouped_topk",
-        "cutlass_moe_fp8",
-        "cutlass_moe_fp4",
+        "GroupedTopk",
         "cutlass_moe_w4a8_fp8",
         "CutlassExpertsFp8",
         "CutlassBatchedExpertsFp8",
         "CutlassExpertsW4A8Fp8",
         "TritonExperts",
+        "TritonWNA16Experts",
         "BatchedTritonExperts",
         "DeepGemmExperts",
         "BatchedDeepGemmExperts",
diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py
index 86c50f39f007629c8caa1fbac8333027ae2ffcf0..036b3cac4cb31cb27fb0630c14a5b61079077e58 100644
--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -77,8 +77,10 @@ def maybe_make_prepare_finalize(
 
     prepare_finalize: FusedMoEPrepareAndFinalize | None = None
 
-    # TODO: could allow this now
-    assert not moe.use_flashinfer_cutlass_kernels, "Must be created in modelopt.py"
+    # TODO(rob): update this as part of the MoE refactor.
+    assert not moe.use_flashinfer_cutlass_kernels, (
+        "Must be created in modelopt.py or fp8.py"
+    )
 
     if moe.use_pplx_kernels:
         assert quant_config is not None
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
index 15f6e3a18ed6c297e6ff46ed6f39ae3679e576c9..e598ec3acb3dfdca28df63ac14ea0806bbc14c73 100644
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -305,6 +305,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # FIXME (varun): We should be able to dispatch only from the leader
         # DP ranks in the case of TP > 1. At the moment, all the Ranks
@@ -312,8 +313,9 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         num_dispatchers = self.num_dispatchers
         num_experts = local_num_experts
         max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
         workspace13 = (num_experts, max_num_tokens * num_dispatchers, max(K, N))
-        workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2))
+        workspace2 = (num_experts, max_num_tokens * num_dispatchers, activation_out_dim)
         output = (num_experts, max_num_tokens * num_dispatchers, K)
         return (workspace13, workspace2, output)
 
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index c6b2f855128f5fd3d81980549e3c90a27854769a..1b826b0a4862dbccab65c1b13332ac839dc013ef 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
     OCP_MX_Scheme,
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 from vllm.utils.import_utils import has_triton_kernels
 from vllm.utils.math_utils import cdiv
@@ -39,6 +40,7 @@ if has_triton_kernels():
 def _get_config_dtype_str(
     dtype: torch.dtype,
     use_fp8_w8a8: bool = False,
+    use_fp8_w8a16: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
     use_int8_w8a8: bool = False,
@@ -52,6 +54,8 @@ def _get_config_dtype_str(
     """
     if use_fp8_w8a8:
         return "fp8_w8a8"
+    elif use_fp8_w8a16:
+        return "fp8_w8a16"
     elif use_int8_w8a16:
         return "int8_w8a16"
     elif use_int4_w4a16:
@@ -329,10 +333,18 @@ class FusedMoEQuantConfig:
     def use_int4_w4a8(self) -> bool:
         return (self._a1.dtype == torch.int8 and self._w1.dtype == "int4")
 
+    @property
+    def use_fp8_w8a16(self) -> bool:
+        return self._a1.dtype is None and self._w1.dtype == current_platform.fp8_dtype()
+
     @property
     def use_int4_w4a16(self) -> bool:
         return self._a1.dtype is None and self._w1.dtype == "int4"
 
+    @property
+    def use_nvfp4_w4a16(self) -> bool:
+        return self._a1.dtype is None and self._w1.dtype == "nvfp4"
+
     @property
     def ocp_mx_scheme(self) -> str | None:
         if not hasattr(self, "_ocp_mx_scheme"):
@@ -372,6 +384,7 @@ class FusedMoEQuantConfig:
         """
         return _get_config_dtype_str(
             use_fp8_w8a8=self.use_fp8_w8a8,
+            use_fp8_w8a16=self.use_fp8_w8a16,
             use_int8_w8a16=self.use_int8_w8a16,
             use_int4_w4a16=self.use_int4_w4a16,
             ocp_mx_scheme=self.ocp_mx_scheme,
@@ -453,11 +466,14 @@ class FusedMoEQuantConfig:
         - a1_scale: Optional scale to be used for a1.
         - a2_scale: Optional scale to be used for a2.
         - g1_alphas: Optional global quantization scales for w1 (for nvfp4).
-            per-channel scales for w1 (for W4A8 FP8).
+                     Optional per-channel scales for w1 (for W4A8 FP8).
+                     Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
         - g2_alphas: Optional global quantization scales for w2 (for nvfp4).
-            per-channel scales for w2 (for W4A8 FP8).
-        - a1_gscale: Optional global quantization scales for a1 (for nvfp4).
-        - a2_gscale: Optional global quantization scales for a2 (for nvfp4).
+                     Optional per-channel scales for w2 (for W4A8 FP8).
+                     Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
+        - a1_gscale: Optional global quantization scales for a1 (1.0 /a2_scale).
+        - a2_gscale: Optional global quantization scales for a2 (1.0 /a2_scale).
+
         - w1_bias: Optional biases for w1 (GPT OSS Triton).
         - w2_bias: Optional biases for w1 (GPT OSS Triton).
         - w1_zp: Optional w1 zero points for int4/int8 quantization.
@@ -681,6 +697,25 @@ def nvfp4_moe_quant_config(
     )
 
 
+def nvfp4_w4a16_moe_quant_config(
+    g1_alphas: torch.Tensor,
+    g2_alphas: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+) -> FusedMoEQuantConfig:
+    """
+    Construct a quant config for 16-but activations and nvp4 weights.
+    """
+    return FusedMoEQuantConfig.make(
+        quant_dtype=None,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        g1_alphas=g1_alphas,
+        g2_alphas=g2_alphas,
+        weight_dtype="nvfp4",
+    )
+
+
 def int4_w4a16_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
@@ -690,7 +725,6 @@ def int4_w4a16_moe_quant_config(
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for 16-bit float activations and int4 weights.
-    Note: Activations are pre-quantized.
     """
     group_shape = GroupShape(*block_shape) if block_shape is not None else None
     return FusedMoEQuantConfig(
@@ -701,6 +735,27 @@ def int4_w4a16_moe_quant_config(
     )
 
 
+def fp8_w8a16_moe_quant_config(
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    block_shape: list[int] | None = None,
+) -> FusedMoEQuantConfig:
+    """
+    Construct a quant config for 16-bit float activations and fp8 weights.
+    """
+    group_shape = GroupShape(*block_shape) if block_shape is not None else None
+    return FusedMoEQuantConfig(
+        _a1=FusedMoEQuantDesc(),
+        _a2=FusedMoEQuantDesc(),
+        _w1=FusedMoEQuantDesc(
+            current_platform.fp8_dtype(), group_shape, w1_scale, None, None
+        ),
+        _w2=FusedMoEQuantDesc(
+            current_platform.fp8_dtype(), group_shape, w2_scale, None, None
+        ),
+    )
+
+
 def int8_w8a16_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
@@ -710,7 +765,6 @@ def int8_w8a16_moe_quant_config(
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for 16-bit float activations and int8 weights.
-    Note: Activations are pre-quantized.
     """
     group_shape = GroupShape(*block_shape) if block_shape is not None else None
     return FusedMoEQuantConfig(
@@ -985,6 +1039,9 @@ class FusedMoEConfig:
     # The activation type.
     in_dtype: torch.dtype
 
+    # Defaults to in_dtype if not specified.
+    router_logits_dtype: torch.dtype | None = None
+
     max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE
 
     has_bias: bool = False
@@ -1001,6 +1058,9 @@ class FusedMoEConfig:
 
         assert self.max_num_tokens > 0
 
+        if self.router_logits_dtype is None:
+            self.router_logits_dtype = self.in_dtype
+
     @property
     def tp_size(self):
         return self.moe_parallel_config.tp_size
@@ -1009,6 +1069,10 @@ class FusedMoEConfig:
     def dp_size(self):
         return self.moe_parallel_config.dp_size
 
+    @property
+    def pcp_size(self):
+        return self.moe_parallel_config.pcp_size
+
     @property
     def ep_size(self):
         return self.moe_parallel_config.ep_size
@@ -1021,6 +1085,10 @@ class FusedMoEConfig:
     def dp_rank(self):
         return self.moe_parallel_config.dp_rank
 
+    @property
+    def pcp_rank(self):
+        return self.moe_parallel_config.pcp_rank
+
     @property
     def ep_rank(self):
         return self.moe_parallel_config.ep_rank
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=gfx938_64cu.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=gfx938_64cu.json
new file mode 100644
index 0000000000000000000000000000000000000000..10900b88282cca1f49b038302ae24af9a7e26c13
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=gfx938_64cu.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=gfx938_64cu_nn.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=gfx938_64cu_nn.json
new file mode 100644
index 0000000000000000000000000000000000000000..df2215d14dca45b15d6c73c1e88ee28999b7a6de
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=gfx938_64cu_nn.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 2,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..3859583fb31f27cf74b614a2645b0c43c12235c2
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..75dfc52cb46da0b7a4afb0c3753ab5083498b144
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json
old mode 100755
new mode 100644
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..c8b3dcc8f80d6e1e0c8c96e32a39d31082531494
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..75dfc52cb46da0b7a4afb0c3753ab5083498b144
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..291a760cb2382da37a38499af0872607a5fa95c1
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.1",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=gfx938_64cu.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=gfx938_64cu.json
new file mode 100644
index 0000000000000000000000000000000000000000..7bbc7b76c4256c014d506fc0b5106b3f72bbc852
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=gfx938_64cu.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..a081be65f613b35bad6147ec3ff84c475c88c3bc
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.1",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..49aadc8c9dfd308a4f358c6f0bc8d49cc590fe48
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.1",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b03a587294217633a9a54966f23949215b02371c
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..7e57e97eef8a74e732c91e53cc1013364700faa4
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json
new file mode 100644
index 0000000000000000000000000000000000000000..45d42491f2dc8dc3fdbf8e06614be71c5ad9f104
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json
@@ -0,0 +1,165 @@
+{
+    "triton_version": "3.1.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5,
+        "num_ldmatrixes": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4438d15c5694956a5ca3f8e2fe621bc39c3db48d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..93f7227b1126998e31d517adcb26c7a531a724ec
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json
new file mode 100644
index 0000000000000000000000000000000000000000..e57a08c740efdadec25df23dc836d2510d1f6cc4
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json
@@ -0,0 +1,165 @@
+{
+    "triton_version": "3.1.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..694dbf47b207404e117e34228d884b2bc5a49738
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
index 659a2d4ee5b39792242d9c309ea9a2eb4461ce62..7055e41aad797305795d075fbfc6f277ed5fe52b 100644
--- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
@@ -1,12 +1,40 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import weakref
 from collections.abc import Callable
 
 import torch
 from torch.nn import functional as F
 
 from vllm import _custom_ops as ops
+from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
 from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
+from vllm.model_executor.layers.quantization.utils.layer_utils import replace_parameter
+from vllm.utils.torch_utils import direct_register_custom_op
+
+_CPU_MOE_LAYER_CACHE = {}
+
+
+class _LazyActivationDict(dict):
+    """Lazily instantiate activation functions on first access.
+
+    Avoids triggering CustomOp.__init__() at module import time,
+    which would call get_current_vllm_config() before config is set.
+    """
+
+    _factories: dict[str, type[SiluAndMul] | type[SwigluOAIAndMul]] = {
+        "silu": SiluAndMul,
+        "swigluoai": SwigluOAIAndMul,
+    }
+
+    def __missing__(self, key: str) -> SiluAndMul | SwigluOAIAndMul:
+        if key not in self._factories:
+            raise KeyError(f"{key} is not a supported activation")
+        self[key] = self._factories[key]()
+        return self[key]
+
+
+_CPU_MOE_ACT = _LazyActivationDict()
 
 
 def grouped_topk(
@@ -174,8 +202,105 @@ class SGLFusedMOE:
 
 class CPUFusedMOE:
     def __init__(self, layer: torch.nn.Module) -> None:
-        use_onednn_mm = ops._supports_onednn and ops.is_onednn_acl_supported()
+        use_grouped_gemm, isa = self.check_grouped_gemm(layer)
+        self.isa = isa
+        if use_grouped_gemm:
+            self.forward_method = self.forward_grouped_gemm
+            self.init_moe_grouped_gemm(layer=layer)
+        else:
+            self.forward_method = self.forward_torch
+            self.init_moe_torch(layer=layer)
+
+    def __call__(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+        assert activation in _CPU_MOE_ACT._factories, f"{activation} is not supported."
+        assert not apply_router_weight_on_input
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            routed_scaling_factor=routed_scaling_factor,
+            e_score_correction_bias=e_score_correction_bias,
+        )
+
+        return self.forward_method(
+            layer,
+            x,
+            topk_weights,
+            topk_ids,
+            activation,
+            global_num_experts,
+        )
+
+    def check_grouped_gemm(
+        self,
+        layer: torch.nn.Module,
+    ) -> tuple[bool, str]:
+        if not hasattr(torch.ops._C, "prepack_moe_weight"):
+            return False, "none"
+
+        dtype = layer.w13_weight.dtype
+        w13_input_size = layer.w13_weight.size(2)
+        w13_output_size = layer.w13_weight.size(1)
+        w2_input_size = layer.w2_weight.size(2)
+        w2_output_size = layer.w2_weight.size(1)
+
+        if not (w13_output_size % 32 == 0 and w2_output_size % 32 == 0):
+            return False, "none"
+
+        supports_amx = torch._C._cpu._is_amx_tile_supported()
+
+        if (
+            supports_amx
+            and dtype == torch.bfloat16
+            and w13_input_size % 32 == 0
+            and w2_input_size % 32 == 0
+        ):
+            return True, "amx"
+
+        if supports_amx:
+            return False, "none"
+
+        return True, "vec"
 
+    def init_moe_grouped_gemm(
+        self,
+        layer: torch.nn.Module,
+    ) -> None:
+        new_w13 = cpu_prepack_moe_weight(layer.w13_weight, self.isa)
+        replace_parameter(layer, "w13_weight", new_w13)
+        new_w2 = cpu_prepack_moe_weight(layer.w2_weight, self.isa)
+        replace_parameter(layer, "w2_weight", new_w2)
+
+    def init_moe_torch(
+        self,
+        layer: torch.nn.Module,
+    ) -> None:
+        use_onednn_mm = ops._supports_onednn and ops.is_onednn_acl_supported()
         num_experts = layer.w13_weight.size(0)
         has_w13_bias = hasattr(layer, "w13_bias")
         has_w2_bias = hasattr(layer, "w2_bias")
@@ -208,85 +333,112 @@ class CPUFusedMOE:
                 layer.down_linear.append(
                     lambda x, w=layer_w2_weight, b=layer_w2_bias: F.linear(x, w, b)
                 )
+
         if use_onednn_mm:  # remove weight
             layer.w13_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
             layer.w2_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
 
-        self.act_to_impl = {
-            "silu": SiluAndMul(),
-            "swigluoai": SwigluOAIAndMul(),
-        }
+        _CPU_MOE_LAYER_CACHE[id(layer)] = weakref.ref(layer)
 
-    def __call__(
+    def forward_grouped_gemm(
         self,
         layer: torch.nn.Module,
-        x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
-        router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
+        input: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
         global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
     ) -> torch.Tensor:
-        assert activation in self.act_to_impl, f"{activation} is not supported."
-        assert not apply_router_weight_on_input
-        topk_weights, topk_ids = select_experts(
-            hidden_states=x,
-            router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            routed_scaling_factor=routed_scaling_factor,
-            e_score_correction_bias=e_score_correction_bias,
+        output = cpu_fused_moe(
+            input,
+            layer.w13_weight,
+            layer.w2_weight,
+            getattr(layer, "w13_bias", None),
+            getattr(layer, "w2_bias", None),
+            topk_weights,
+            topk_ids,
+            activation,
+            self.isa,
         )
+        return output
 
-        # Ref code from https://github.com/sgl-project/sglang/blob/716e682721397df103f347d22da8bd46c6016dab/python/sglang/srt/layers/moe/fused_moe_native.py#L53
-        len_experts = global_num_experts
-
-        cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts))
-        cnts.scatter_(1, topk_ids.to(torch.int64), 1)
-        tokens_per_expert = cnts.sum(dim=0)
-        idxs = topk_ids.view(-1).argsort()
-
-        sorted_tokens = x[idxs // topk_ids.shape[1]]
-        tokens_per_expert = tokens_per_expert.cpu().numpy()
-
-        outputs = []
-        start_idx = 0
-
-        for i, num_tokens in enumerate(tokens_per_expert):
-            end_idx = start_idx + num_tokens
-            if num_tokens == 0:
-                continue
-            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
-
-            gate_up = layer.gate_up_linear[i](tokens_for_this_expert)
-            gate_up = self.act_to_impl[activation].forward_native(gate_up)
-            expert_out = layer.down_linear[i](gate_up)
-            outputs.append(expert_out)
-            start_idx = end_idx
-
-        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
-        new_x = torch.empty_like(outs)
-
-        new_x[idxs] = outs
-        final_out = (
-            new_x.view(*topk_ids.shape, -1)
-            .type(topk_weights.dtype)
-            .mul_(topk_weights.unsqueeze(dim=-1))
-            .sum(dim=1)
-            .type(new_x.dtype)
+    def forward_torch(
+        self,
+        layer: torch.nn.Module,
+        input: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int = -1,
+    ) -> torch.Tensor:
+        output = torch.empty_like(input)
+        layer_id = id(layer)
+        torch.ops.vllm.cpu_fused_moe_torch(
+            layer_id,
+            output,
+            input,
+            topk_weights,
+            topk_ids,
+            activation,
+            global_num_experts,
         )
-        return final_out
+
+        return output
+
+
+def cpu_fused_moe_torch(
+    layer_id: int,
+    output: torch.Tensor,
+    input: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    activation: str,
+    global_num_experts: int = -1,
+) -> None:
+    layer = _CPU_MOE_LAYER_CACHE[layer_id]()
+
+    # Ref code from https://github.com/sgl-project/sglang/blob/716e682721397df103f347d22da8bd46c6016dab/python/sglang/srt/layers/moe/fused_moe_native.py#L53
+    len_experts = global_num_experts
+
+    cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts))
+    cnts.scatter_(1, topk_ids.to(torch.int64), 1)
+    tokens_per_expert = cnts.sum(dim=0)
+    idxs = topk_ids.view(-1).argsort()
+
+    sorted_tokens = input[idxs // topk_ids.shape[1]]
+    tokens_per_expert = tokens_per_expert.cpu().numpy()
+
+    outputs = []
+    start_idx = 0
+
+    for i, num_tokens in enumerate(tokens_per_expert):
+        end_idx = start_idx + num_tokens
+        if num_tokens == 0:
+            continue
+        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
+
+        gate_up = layer.gate_up_linear[i](tokens_for_this_expert)  # type: ignore
+        gate_up = _CPU_MOE_ACT[activation].forward_native(gate_up)
+        expert_out = layer.down_linear[i](gate_up)  # type: ignore
+        outputs.append(expert_out)
+        start_idx = end_idx
+
+    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+    new_x = torch.empty_like(outs)
+
+    new_x[idxs] = outs
+    final_out = (
+        new_x.view(*topk_ids.shape, -1)
+        .type(topk_weights.dtype)
+        .mul_(topk_weights.unsqueeze(dim=-1))
+        .sum(dim=1)
+        .type(new_x.dtype)
+    )
+    output.copy_(final_out)
+
+
+direct_register_custom_op(
+    op_name="cpu_fused_moe_torch",
+    op_func=cpu_fused_moe_torch,
+    mutates_args=["output"],
+)
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index 4a0b4e82c1b3902b0dacb90cd1c6c9a053f2f75d..c0ffa38fdb2cd9838547fc0d28d311b589c209ed 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -21,7 +21,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceDelegate,
     TopKWeightAndReduceNoOP,
 )
-from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize, _resize_cache
+from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
@@ -108,15 +108,7 @@ def run_cutlass_moe_fp8(
     assert global_num_experts != -1
     assert a1q_scale is not None
 
-    if expert_map is not None:
-        "Translate info from expert_map to topk_ids"
-        local_topk_ids = torch.where(
-            expert_map[topk_ids] != -1, expert_map[topk_ids], -1
-        )
-    else:
-        local_topk_ids = topk_ids
-
-    topk = local_topk_ids.size(1)
+    topk = topk_ids.size(1)
     local_E = w1.size(0)
 
     if use_batched_format:
@@ -164,16 +156,12 @@ def run_cutlass_moe_fp8(
         # during offset calculations
         expert_offsets = expert_offsets.to(torch.int64)
     else:
-        problem_sizes1 = torch.empty(
-            (global_num_experts, 3), dtype=torch.int32, device=device
-        )
-        problem_sizes2 = torch.empty(
-            (global_num_experts, 3), dtype=torch.int32, device=device
-        )
+        problem_sizes1 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
+        problem_sizes2 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
 
         num_expert = global_num_experts if expert_map is None else expert_map.size(0)
         # permuted a1q reuses workspace2
-        a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute(
+        a1q, a1q_scale, expert_first_token_offset, inv_perm, _ = moe_permute(
             a1q,
             a1q_scale,
             topk_ids,
@@ -182,11 +170,12 @@ def run_cutlass_moe_fp8(
             expert_map,
             permuted_hidden_states=a1q_perm,
         )
-        expert_offsets = expert_offsets[:-1]
-
-        ops.get_cutlass_moe_mm_problem_sizes(
-            local_topk_ids, problem_sizes1, problem_sizes2, global_num_experts, N, K
+        # swap_ab is a CUTLASS grouped-GEMM optimization (M <= 64 reduces padding).
+        swap_ab = a1q.size(0) <= 64
+        ops.get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
+            expert_first_token_offset, problem_sizes1, problem_sizes2, N, K, swap_ab
         )
+        expert_offsets = expert_first_token_offset[:-1]
 
     if not per_act_token and (expert_map is not None or use_batched_format):
         # this is necessary to avoid imprecise scale calculation caused by
@@ -215,9 +204,6 @@ def run_cutlass_moe_fp8(
         act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out
     )
 
-    if expert_map is not None:
-        mm2_out.fill_(0)
-
     ops.cutlass_moe_mm(
         mm2_out,
         a2q,
@@ -243,26 +229,35 @@ def run_cutlass_moe_fp8(
             permuted_hidden_states=mm2_out,
             topk_weights=topk_weights,
             inv_permuted_idx=inv_perm,
+            expert_first_token_offset=expert_first_token_offset,
         )
 
 
 class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
     def __init__(
         self,
+        e: int,
+        n: int,
+        k: int,
         out_dtype: torch.dtype | None,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
         quant_config: FusedMoEQuantConfig,
+        device: torch.dtype,
     ):
         assert quant_config.use_fp8_w8a8
         super().__init__(quant_config)
+
+        # E: num_experts
+        # N: intermediate size per partition
+        # K: hidden dim
+        ab_strides1_c_strides2 = torch.full((e,), k, device=device, dtype=torch.int64)
+        ab_strides2 = torch.full((e,), n, device=device, dtype=torch.int64)
+        c_strides1 = torch.full((e,), 2 * n, device=device, dtype=torch.int64)
+
         self.out_dtype = out_dtype
-        self.ab_strides1 = ab_strides1
+        self.ab_strides1 = ab_strides1_c_strides2
         self.ab_strides2 = ab_strides2
         self.c_strides1 = c_strides1
-        self.c_strides2 = c_strides2
+        self.c_strides2 = ab_strides1_c_strides2
 
     def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
         # Let PrepareAndFinalize::finalize() decide the impl.
@@ -329,24 +324,6 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
 
 
 class CutlassExpertsFp8(CutlassExpertsFp8Base):
-    def __init__(
-        self,
-        out_dtype: torch.dtype | None,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
-        quant_config: FusedMoEQuantConfig,
-    ):
-        super().__init__(
-            out_dtype,
-            ab_strides1,
-            ab_strides2,
-            c_strides1,
-            c_strides2,
-            quant_config,
-        )
-
     @property
     def activation_formats(
         self,
@@ -378,9 +355,11 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
         workspace1 = (M * topk, max(N, K))
-        workspace2 = (M * topk, max(N // 2, K))
+        workspace2 = (M * topk, max(activation_out_dim, K))
         output = (M, K)
         return (workspace1, workspace2, output)
 
@@ -390,21 +369,10 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
         self,
         max_experts_per_worker: int,
         num_dispatchers: int,
-        out_dtype: torch.dtype | None,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
-        quant_config: FusedMoEQuantConfig,
+        *args,
+        **kwargs,
     ):
-        super().__init__(
-            out_dtype,
-            ab_strides1,
-            ab_strides2,
-            c_strides1,
-            c_strides2,
-            quant_config,
-        )
+        super().__init__(*args, **kwargs)
         assert max_experts_per_worker > 0
         self.max_experts_per_worker = max_experts_per_worker
         self.num_dispatchers = num_dispatchers
@@ -436,122 +404,21 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         assert num_dp is not None
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
         workspace1 = (self.max_experts_per_worker, M * num_dp, max(N, K))
-        workspace2 = (self.max_experts_per_worker, M * num_dp, max(N // 2, K))
+        workspace2 = (
+            self.max_experts_per_worker,
+            M * num_dp,
+            max(activation_out_dim, K),
+        )
         output = (self.max_experts_per_worker, M, K)
         return (workspace1, workspace2, output)
 
 
-def cutlass_moe_fp8(
-    a: torch.Tensor,
-    w1_q: torch.Tensor,
-    w2_q: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    ab_strides1: torch.Tensor,
-    ab_strides2: torch.Tensor,
-    c_strides1: torch.Tensor,
-    c_strides2: torch.Tensor,
-    quant_config: FusedMoEQuantConfig,
-    activation: str = "silu",
-    expert_map: torch.Tensor | None = None,
-    apply_router_weight_on_input: bool = False,
-    global_num_experts: int = -1,
-) -> torch.Tensor:
-    """
-    This function computes a a8w8-quantized Mixture of Experts (MoE) layer
-    using two sets of quantized weights, w1_q and w2_q, and top-k gating
-    mechanism. The matrix multiplications are implemented with CUTLASS
-    grouped gemm.
-
-    Parameters:
-    - a (torch.Tensor): The input tensor to the MoE layer.
-        Shape: [M, K]
-    - w1_q (torch.Tensor): The first set of fp8-quantized expert weights.
-        Shape: [num_experts, K, 2N] (the weights are passed transposed)
-    - w2_q (torch.Tensor): The second set of fp8-quantized expert weights.
-        Shape: [num_experts, N, K] (the weights are passed transposed)
-    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
-    - topk_ids (torch.Tensor): The token->expert mappings.
-    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
-        Shape: [num_experts] or [num_experts, 2N]
-    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
-        Shape: [num_experts] or [num_experts, K]
-    - ab_strides1 (torch.Tensor): The input/weight strides for the first gemm.
-        Shape: [num_experts]
-    - ab_strides2 (torch.Tensor): The input/weight strides for the second gemm.
-        Shape: [num_experts]
-    - c_strides1 (torch.Tensor): The output strides for the first gemm.
-        Shape: [num_experts]
-    - c_strides2 (torch.Tensor): The output strides for the second gemm.
-        Shape: [num_experts]
-    - per_act_token (Optional[bool]): Whether the scale is per-token or
-                                      per-tensor.
-    - activation (str): The activation function to use.
-    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
-        Shape: scalar or [M]
-    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
-        quantize the intermediate result between the gemms.
-        Shape: scalar or [M]
-    - expert_map (Optional[torch.Tensor]): In the case of Expert parallel,
-        every Rank is responsible for a subset of experts. expert_map is a
-        mapping from global expert-id to local expert-id. When expert_map[i]
-        is -1, it means that this Rank is not responsible for global
-        expert-id i.
-    - apply_router_weight_on_input (bool): When true, the topk weights are
-        applied directly on the inputs. This is only applicable when topk is 1.
-    - global_num_experts (int): The total number of experts.
-
-    Returns:
-    - torch.Tensor: The fp16 output tensor after applying the MoE layer.
-    """
-    assert quant_config is not None
-
-    if quant_config.a1_scale is not None:
-        assert quant_config.per_act_token_quant == (quant_config.a1_scale.numel() != 1)
-    if quant_config.a2_scale is not None:
-        assert quant_config.per_act_token_quant == (quant_config.a2_scale.numel() != 1)
-
-    if quant_config.w1_scale is not None:
-        if quant_config.per_out_ch_quant:
-            assert quant_config.w1_scale.dim() > 1 and quant_config.w1_scale.size(
-                1
-            ) == w1_q.size(1)
-        else:
-            assert (
-                quant_config.w1_scale.dim() == 1 or quant_config.w1_scale.size(1) == 1
-            )
-
-    num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(0)
-
-    fn = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
-        CutlassExpertsFp8(
-            out_dtype=a.dtype,
-            ab_strides1=ab_strides1,
-            ab_strides2=ab_strides2,
-            c_strides1=c_strides1,
-            c_strides2=c_strides2,
-            quant_config=quant_config,
-        ),
-    )
-
-    return fn(
-        a,
-        w1_q,
-        w2_q,
-        topk_weights,
-        topk_ids,
-        activation=activation,
-        global_num_experts=num_experts,
-        expert_map=expert_map,
-        apply_router_weight_on_input=apply_router_weight_on_input,
-    )
-
-
 FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
 
@@ -677,7 +544,8 @@ def run_cutlass_moe_fp4(
         num_topk,
     )
     c1 = _resize_cache(workspace13, (m * topk, n * 2))
-    c2 = _resize_cache(workspace2, (m * topk, n))
+    # Note: c2 workspace is no longer needed since SiLU is fused with quantization.
+    # c3 reuses workspace13 after c1 is consumed.
     c3 = _resize_cache(workspace13, (m * topk, k))
     ops.cutlass_fp4_moe_mm(
         c1,
@@ -691,9 +559,9 @@ def run_cutlass_moe_fp4(
         blockscale_offsets[:-1],
     )
     del rep_a_fp4, rep_a_blockscale
-    torch.ops._C.silu_and_mul(c2, c1)
-    int_fp4, int_blockscale = ops.scaled_fp4_experts_quant(
-        c2, a2_gscale, expert_offsets, blockscale_offsets, num_topk
+    # Fused SiLU+Mul+NVFP4 quantization
+    int_fp4, int_blockscale = ops.silu_and_mul_scaled_fp4_experts_quant(
+        c1, a2_gscale, expert_offsets, blockscale_offsets, num_topk
     )
 
     ops.cutlass_fp4_moe_mm(
@@ -775,13 +643,15 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
         workspace1: tuple[int, ...] = ()
         workspace2: tuple[int, ...] = ()
         output: tuple[int, ...] = ()
         if self.use_batched_format:
             workspace1 = (self.max_experts_per_worker, M, max(N, K))
-            workspace2 = (self.max_experts_per_worker, M, (N // 2))
+            workspace2 = (self.max_experts_per_worker, M, activation_out_dim)
             output = (self.max_experts_per_worker, M, K)
         else:
             workspace1 = (M * topk, max(2 * N, K))
@@ -834,224 +704,6 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
         )
 
 
-def cutlass_moe_fp4(
-    a: torch.Tensor,
-    w1_fp4: torch.Tensor,
-    w2_fp4: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    quant_config: FusedMoEQuantConfig,
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    expert_map: torch.Tensor | None = None,
-    apply_router_weight_on_input: bool = False,
-) -> torch.Tensor:
-    assert expert_map is None, (
-        "Expert Parallelism / expert_map "
-        "is currently not supported for "
-        "ModelOptNvFp4FusedMoE's cutlass_moe_fp4."
-    )
-
-    # TODO(bnell): this feels a bit hacky
-    # NVFP4 requires two levels of quantization, which involves
-    # computing some scaling factors dynamically. This makes it
-    # incompatible with the typical prepare -> MoE -> finalize
-    # pipeline. Move the quantization logic into the MoE body.
-    quant_config = FusedMoEQuantConfig.make(
-        quant_dtype=None,  # skip quantization in prepare/finalize
-        per_act_token_quant=quant_config.per_act_token_quant,
-        per_out_ch_quant=quant_config.per_out_ch_quant,
-        block_shape=quant_config.block_shape,
-        g1_alphas=quant_config.g1_alphas,
-        g2_alphas=quant_config.g2_alphas,
-        a1_gscale=quant_config.a1_gscale,
-        a2_gscale=quant_config.a2_gscale,
-        w1_scale=quant_config.w1_scale,
-        w2_scale=quant_config.w2_scale,
-    )
-
-    fn = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
-        CutlassExpertsFp4(
-            max_experts_per_worker=e,
-            out_dtype=a.dtype,
-            quant_config=quant_config,
-            use_batched_format=False,
-        ),
-    )
-
-    return fn(
-        hidden_states=a,
-        w1=w1_fp4,
-        w2=w2_fp4,
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        inplace=False,
-        activation="silu",
-        global_num_experts=e,
-        expert_map=None,
-        apply_router_weight_on_input=apply_router_weight_on_input,
-    )
-
-
-def _valid_cutlass_block_scaled_grouped_gemm(
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    inplace: bool,
-    activation: str,
-    apply_router_weight_on_input: bool,
-    expert_map: torch.Tensor | None,
-) -> bool:
-    def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int):
-        return N % 128 == 0 and K % 128 == 0
-
-    _, K, N = w2.size()
-    if not _valid_cutlass_block_scaled_grouped_gemm_shape(N, K):
-        logger.debug_once(
-            "CutlassBlockScaledGroupedGemm disabled: unaligned problem size. "
-            "N: %s, K: %s",
-            N,
-            K,
-        )
-        return False
-
-    if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn:
-        logger.debug_once(
-            "CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s). "
-            "w1.dtype: %s, w2.dtype: %s",
-            w1.dtype,
-            w2.dtype,
-        )
-        return False
-
-    if expert_map is not None:
-        logger.debug_once(
-            "CutlassBlockScaledGroupedGemm disabled: expert_parallel is not supported."
-        )
-        return False
-
-    if activation != "silu":
-        logger.debug_once(
-            "CutlassBlockScaledGroupedGemm disabled: only activation silu is supported."
-        )
-        return False
-
-    if apply_router_weight_on_input:
-        logger.debug_once(
-            "CutlassBlockScaledGroupedGemm disabled:"
-            " apply_router_weight_on_input is not supported."
-        )
-        return False
-
-    if inplace:
-        logger.debug_once(
-            "CutlassBlockScaledGroupedGemm disabled: inplace is not supported."
-        )
-        return False
-
-    return True
-
-
-# TODO(bnell): would be nice combine/integrate with regular cutlass_fp8.
-def run_cutlass_block_scaled_fused_experts(
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    w1_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-) -> torch.Tensor:
-    w1_q = w1.transpose(1, 2)
-    w2_q = w2.transpose(1, 2)
-    w1_scale = w1_scale.transpose(1, 2)
-    w2_scale = w2_scale.transpose(1, 2)
-
-    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
-    assert a.shape[0] == topk_ids.shape[0], (
-        "a and topk_ids must have the same batch size"
-    )
-    assert w1_q.dtype == torch.float8_e4m3fn, "w1_q must be float8_e4m3fn"
-    assert w2_q.dtype == torch.float8_e4m3fn, "w2_q must be float8_e4m3fn"
-    assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1"
-    assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2"
-    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
-    assert w1_q.shape[0] == w1_scale.shape[0], "w1_scale expert number mismatch"
-    assert w1_q.shape[0] == w2_scale.shape[0], "w2_scale expert number mismatch"
-    assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
-
-    out_dtype = a.dtype
-    num_experts = w1_q.size(0)
-    m = a.size(0)
-    k = w1_q.size(1)
-    n = w2_q.size(1)
-
-    topk = topk_ids.size(1)
-
-    a_q, a1_scale = _fp8_quantize(
-        a, A_scale=None, per_act_token=False, block_shape=[128, 128]
-    )
-    device = a_q.device
-
-    expert_offsets = torch.empty((num_experts + 1,), dtype=torch.int32, device=device)
-    problem_sizes1 = torch.empty((num_experts, 3), dtype=torch.int32, device=device)
-    problem_sizes2 = torch.empty((num_experts, 3), dtype=torch.int32, device=device)
-
-    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
-    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
-
-    ops.get_cutlass_moe_mm_data(
-        topk_ids,
-        expert_offsets,
-        problem_sizes1,
-        problem_sizes2,
-        a_map,
-        c_map,
-        num_experts,
-        n,
-        k,
-    )
-
-    rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype)
-    rep_a1_scales = a1_scale[a_map]
-
-    c1 = torch.empty((m * topk, n * 2), dtype=out_dtype, device=device)
-    c2 = torch.empty((m * topk, k), dtype=out_dtype, device=device)
-
-    ops.cutlass_blockwise_scaled_grouped_mm(
-        c1,
-        rep_a_q,
-        w1_q,
-        rep_a1_scales,
-        w1_scale,
-        problem_sizes1,
-        expert_offsets[:-1],
-    )
-
-    intermediate = torch.empty((m * topk, n), dtype=out_dtype, device=device)
-    torch.ops._C.silu_and_mul(intermediate, c1)
-
-    intermediate_q, a2_scale = _fp8_quantize(
-        intermediate, A_scale=None, per_act_token=False, block_shape=[128, 128]
-    )
-
-    ops.cutlass_blockwise_scaled_grouped_mm(
-        c2,
-        intermediate_q,
-        w2_q,
-        a2_scale,
-        w2_scale,
-        problem_sizes2,
-        expert_offsets[:-1],
-    )
-
-    return (
-        c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype)
-    ).sum(dim=1)
-
-
 # W4A8
 def run_cutlass_moe_w4a8_fp8(
     output: torch.Tensor,
@@ -1117,15 +769,7 @@ def run_cutlass_moe_w4a8_fp8(
         f"w1 hidden size mismatch: got {w1.size(2) * 8}, expected {K=}"
     )
 
-    # Translate info from expert_map to topk_ids
-    if expert_map is not None:
-        local_topk_ids = torch.where(
-            expert_map[topk_ids] != -1, expert_map[topk_ids], -1
-        )
-    else:
-        local_topk_ids = topk_ids
-
-    topk = local_topk_ids.size(1)
+    topk = topk_ids.size(1)
     a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), (M * topk, K))
     mm1_out = _resize_cache(workspace13, (M * topk, N * 2))
     act_out = _resize_cache(workspace2, (M * topk, N))
@@ -1135,16 +779,12 @@ def run_cutlass_moe_w4a8_fp8(
     )
     mm2_out = _resize_cache(workspace2, (M * topk, K))
 
-    problem_sizes1 = torch.empty(
-        (global_num_experts, 3), dtype=torch.int32, device=device
-    )
-    problem_sizes2 = torch.empty(
-        (global_num_experts, 3), dtype=torch.int32, device=device
-    )
+    problem_sizes1 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
+    problem_sizes2 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
 
     num_expert = global_num_experts if expert_map is None else expert_map.size(0)
     # permuted a1q reuses workspace2
-    a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute(
+    a1q, a1q_scale, expert_first_token_offset, inv_perm, _ = moe_permute(
         a1q,
         a1q_scale,
         topk_ids,
@@ -1153,18 +793,11 @@ def run_cutlass_moe_w4a8_fp8(
         expert_map,
         permuted_hidden_states=a1q_perm,
     )
-    expert_offsets = expert_offsets[:-1]
-
-    # For RS gemm SwapAB is always enabled (swap logical M, N in the problem shape)
-    ops.get_cutlass_moe_mm_problem_sizes(
-        local_topk_ids,
-        problem_sizes1,
-        problem_sizes2,
-        global_num_experts,
-        N,
-        K,
-        force_swap_ab=True,
+    # for RS gemm SwapAB is always enabled (swap logical M, N in the problem shape).
+    ops.get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
+        expert_first_token_offset, problem_sizes1, problem_sizes2, N, K, True
     )
+    expert_offsets = expert_first_token_offset[:-1]
 
     ops.cutlass_w4a8_moe_mm(
         mm1_out,
@@ -1188,9 +821,6 @@ def run_cutlass_moe_w4a8_fp8(
         act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out
     )
 
-    if expert_map is not None:
-        mm2_out.fill_(0)
-
     ops.cutlass_w4a8_moe_mm(
         mm2_out,
         a2q,
@@ -1214,6 +844,7 @@ def run_cutlass_moe_w4a8_fp8(
         permuted_hidden_states=mm2_out,
         topk_weights=topk_weights,
         inv_permuted_idx=inv_perm,
+        expert_first_token_offset=expert_first_token_offset,
     )
 
 
@@ -1275,9 +906,11 @@ class CutlassExpertsW4A8Fp8(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
         workspace1 = (M * topk, max(N, K))
-        workspace2 = (M * topk, max(N // 2, K))
+        workspace2 = (M * topk, max(activation_out_dim, K))
         output = (M, K)
         return (workspace1, workspace2, output)
 
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 5ca91768c9760c38e88fe83bfa2c56d879475237..a2e5a07fbfd208587328ca6599a000cec846162b 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -143,6 +143,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         assert self.block_shape is not None
         block_m = self.block_shape[0]
@@ -151,7 +152,8 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         )
         assert M_sum % block_m == 0
 
-        workspace1 = (M_sum, max(N // 2, K))
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+        workspace1 = (M_sum, max(activation_out_dim, K))
         workspace2 = (M_sum, max(N, K))
         output = (M, K)
         return (workspace1, workspace2, output)
@@ -163,11 +165,13 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         block_k = self.block_shape[1]
         scale_fmt = DeepGemmQuantScaleFMT.from_oracle()
 
+        M_sum, N = input.size()
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+
         # 1. DeepGemm UE8M0: use packed per-token-group quant
         if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
-            M_sum, N = input.size()
             act_out = torch.empty(
-                (M_sum, N // 2), dtype=input.dtype, device=input.device
+                (M_sum, activation_out_dim), dtype=input.dtype, device=input.device
             )
             self.activation(activation, act_out, input)
             a2q, a2q_scale = per_token_group_quant_fp8_packed_for_deepgemm(
@@ -187,8 +191,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             )
 
         # 3. fallback path for non-SiLU activations in non‑UE8M0 cases.
-        M_sum, N = input.size()
-        act_out = torch.empty((M_sum, N // 2), dtype=input.dtype, device=input.device)
+        act_out = torch.empty(
+            (M_sum, activation_out_dim), dtype=input.dtype, device=input.device
+        )
         self.activation(activation, act_out, input)
         return per_token_group_quant_fp8(
             act_out, block_k, column_major_scales=True, out_q=output
@@ -254,8 +259,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             (a1q, a1q_scale), (w1, self.w1_scale), mm1_out, expert_ids
         )
 
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
         quant_out = _resize_cache(
-            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, N // 2)
+            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, activation_out_dim)
         )
         a2q, a2q_scale = self._act_mul_quant(
             input=mm1_out.view(-1, N), output=quant_out, activation=activation
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index 06e4a61133bd99b5d70ca15528655e2c95ef184e..c0e0b0b222c6a5a71907538eb1923e1d31f5b47e 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -295,6 +295,8 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             self.max_tokens_per_rank,
             num_experts,
             use_fp8=self.use_fp8_dispatch,
+            round_scale=self.use_ue8m0_dispatch,
+            use_ue8m0=self.use_ue8m0_dispatch,
             **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
             **(
                 dict(x_global_scale=qc_a1_gscale_or_scale)
diff --git a/vllm/model_executor/layers/fused_moe/fallback.py b/vllm/model_executor/layers/fused_moe/fallback.py
new file mode 100644
index 0000000000000000000000000000000000000000..4556392144a0137e2df20af2c742c9db3502d97c
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/fallback.py
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from abc import ABC, abstractmethod
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+
+
+class FallbackExperts(mk.FusedMoEPermuteExpertsUnpermute, ABC):
+    """Base class for runtime dispatching of expert implementations."""
+
+    def __init__(
+        self,
+        experts: mk.FusedMoEPermuteExpertsUnpermute,
+        fallback_experts: mk.FusedMoEPermuteExpertsUnpermute,
+    ):
+        super().__init__(experts.quant_config)
+        self.fallback_experts = fallback_experts
+        self.experts = experts
+
+    @property
+    def activation_formats(
+        self,
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        assert (
+            self.fallback_experts.activation_formats == self.experts.activation_formats
+        )
+        return self.fallback_experts.activation_formats
+
+    def supports_chunking(self) -> bool:
+        assert (
+            self.experts.supports_chunking()
+            == self.fallback_experts.supports_chunking()
+        )
+        return (
+            self.experts.supports_chunking()
+            and self.fallback_experts.supports_chunking()
+        )
+
+    def supports_expert_map(self) -> bool:
+        assert (
+            self.experts.supports_expert_map()
+            == self.fallback_experts.supports_expert_map()
+        )
+        return (
+            self.experts.supports_expert_map()
+            and self.fallback_experts.supports_expert_map()
+        )
+
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        e_war = self.experts.finalize_weight_and_reduce_impl()
+        fbe_war = self.fallback_experts.finalize_weight_and_reduce_impl()
+        is_dge_war = e_war is not None
+        is_fbe_war = fbe_war is not None
+
+        if is_dge_war and is_fbe_war:
+            assert e_war == fbe_war, (
+                "Both implementations should agree on WeightAndReduce impls. "
+                f"Got e_war: {e_war}, and fbe_war: {fbe_war}"
+            )
+
+        if e_war is not None:
+            return e_war
+        assert fbe_war is not None
+        return fbe_war
+
+    @abstractmethod
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def _select_experts_impl(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        raise NotImplementedError
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        experts = self._select_experts_impl(hidden_states, w1, w2)
+        experts.apply(
+            output,
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            activation,
+            global_num_experts,
+            expert_map,
+            a1q_scale,
+            a2_scale,
+            workspace13,
+            workspace2,
+            expert_tokens_meta,
+            apply_router_weight_on_input,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
index 6e0b57156cb3143c6ba85699220bc98e423ef335..1651f3530eef4b8475fcc8a4481c1c7493b53262 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
@@ -91,6 +91,7 @@ class FlashInferCuteDSLExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # We use global_num_experts due to how moe_align_block_size handles
         # expert_maps.
@@ -335,42 +336,3 @@ def flashinfer_cutedsl_moe_masked(
         alpha_dtype=get_cute_dtype(w2_alpha),
     )  # in logical [m, k, l]
     out = out.permute(2, 0, 1)
-
-
-def flashinfer_cutedsl_moe_fp4(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    quant_config: FusedMoEQuantConfig,
-    inplace: bool = False,
-    activation: str = "silu",
-    global_num_experts: int = -1,
-    expert_map: torch.Tensor | None = None,
-    apply_router_weight_on_input: bool = False,
-) -> torch.Tensor:
-    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-        create_flashinfer_prepare_finalize,
-    )
-
-    fused_experts = mk.FusedMoEModularKernel(
-        create_flashinfer_prepare_finalize(use_dp=False),  # could be swapped later
-        FlashInferCuteDSLExperts(
-            out_dtype=hidden_states.dtype,
-            quant_config=quant_config,
-        ),
-    )
-
-    return fused_experts(
-        hidden_states=hidden_states,
-        w1=w1,
-        w2=w2,
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        inplace=inplace,
-        activation=activation,
-        global_num_experts=global_num_experts,
-        expert_map=expert_map,
-        apply_router_weight_on_input=apply_router_weight_on_input,
-    )
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
index f864634c661768deb3f4d939915461599b7296da..ae60e15db841001d328c3b811db3db34761fec30 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -103,6 +103,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # We use global_num_experts due to how moe_align_block_size handles
         # expert_maps.
@@ -165,10 +166,10 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         ):
             # FP8 per-tensor path: use global alphas/scales; do not pass input_sf
             quant_scales = [
-                self.g1_alphas,
-                self.a2_gscale,
-                self.g2_alphas,
-                self.a1_gscale,
+                self.g1_alphas,  # w13_weight_scale * w13_input_scale
+                self.a2_gscale,  # 1.0 / w2_input_scale
+                self.g2_alphas,  # w2_weight_scale * w2_input_scale
+                self.a1_scale,
             ]
 
             a1q_scale = None  # not passing input_sf in fp8
@@ -241,7 +242,9 @@ def flashinfer_cutlass_moe_fp4(
     apply_router_weight_on_input: bool = False,
 ) -> torch.Tensor:
     fused_experts = mk.FusedMoEModularKernel(
-        create_flashinfer_prepare_finalize(use_dp=False),
+        create_flashinfer_prepare_finalize(
+            use_dp=False, use_nvfp4=True, enable_alltoallv=False
+        ),
         FlashInferExperts(
             out_dtype=hidden_states.dtype,
             quant_config=quant_config,
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
index 762890867e6054f80b220848374fcb2a50e685bf..dfff860750d61a71dea54ad14b59c7b55c618098 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
@@ -10,6 +10,9 @@ from vllm.distributed.device_communicators.base_device_communicator import (
 )
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceNoOP,
 )
@@ -181,13 +184,14 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin
         self._apply_router_weight_on_input(
             a1, topk_weights, topk_ids, apply_router_weight_on_input
         )
-        if not self.use_dp and quant_config.quant_dtype == "nvfp4":
+        is_nvfp4 = quant_config.quant_dtype == "nvfp4"
+        if not self.use_dp and is_nvfp4:
             return a1, None, None, topk_ids, topk_weights
 
         if not self.use_deepseek_fp8_block_scale:
             a1q, a1q_scale = moe_kernel_quantize_input(
                 a1,
-                quant_config.a1_gscale,
+                quant_config.a1_gscale if is_nvfp4 else quant_config.a1_scale,
                 quant_config.quant_dtype,
                 quant_config.per_act_token_quant,
                 quant_config.block_shape,
@@ -219,7 +223,7 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin
                 topk_weights, topk_ids, a1q = gathered
                 a1q_scale = None
 
-        if quant_config.quant_dtype == "nvfp4" and a1q_scale is not None:
+        if is_nvfp4 and a1q_scale is not None:
             a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
 
         return a1q, a1q_scale, None, topk_ids, topk_weights
@@ -349,14 +353,19 @@ def create_flashinfer_prepare_finalize(
     use_nvfp4: bool = False,
     enable_alltoallv: bool = False,
     use_deepseek_fp8_block_scale: bool = False,
-) -> FlashInferCutlassMoEPrepareAndFinalize:
+) -> FlashInferCutlassMoEPrepareAndFinalize | MoEPrepareAndFinalizeNoEP:
     """Factory function to create the appropriate FlashInfer implementation."""
-    if use_nvfp4:
+
+    if use_dp:
         if enable_alltoallv:
+            assert use_nvfp4
             return FlashInferAllToAllMoEPrepareAndFinalize(use_dp)
-        else:
-            return FlashInferAllGatherMoEPrepareAndFinalize(use_dp)
-    # FP8 path currently supported via AllGather; optionally enable block-scale
-    return FlashInferAllGatherMoEPrepareAndFinalize(
-        use_dp=use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
-    )
+        return FlashInferAllGatherMoEPrepareAndFinalize(
+            use_dp=True,
+            use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
+        )
+    else:
+        # CUTLASS FP8 BLOCK and CUTLASS NVFP4 apply input quantization
+        # in a single call with the MoE experts kernel.
+        defer_input_quant = use_deepseek_fp8_block_scale or use_nvfp4
+        return MoEPrepareAndFinalizeNoEP(defer_input_quant=defer_input_quant)
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
index 51e06ac54f497cc4ac2837297416f728b75c8c0e..3bb5a23abb7bcaeb87c30c55a1901d4910204d3e 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
@@ -100,7 +100,7 @@ direct_register_custom_op(
 )
 
 
-def flashinfer_fused_moe_per_tensor_scale_fp8(
+def fi_trtllm_fp8_per_tensor_moe(
     routing_logits: torch.Tensor,
     routing_bias: torch.Tensor | None,
     hidden_states: torch.Tensor,
@@ -158,7 +158,7 @@ def flashinfer_fused_moe_per_tensor_scale_fp8(
     )
 
 
-def flashinfer_fused_moe_per_tensor_scale_fp8_fake(
+def fi_trtllm_fp8_per_tensor_moe_fake(
     routing_logits: torch.Tensor,
     routing_bias: torch.Tensor | None,
     hidden_states: torch.Tensor,
@@ -184,9 +184,9 @@ def flashinfer_fused_moe_per_tensor_scale_fp8_fake(
 
 # TODO(bnell): Does this really need to be a torch.op?
 direct_register_custom_op(
-    op_name="flashinfer_fused_moe_per_tensor_scale_fp8",
-    op_func=flashinfer_fused_moe_per_tensor_scale_fp8,
+    op_name="fi_trtllm_fp8_per_tensor_moe",
+    op_func=fi_trtllm_fp8_per_tensor_moe,
     mutates_args=["hidden_states"],
-    fake_impl=flashinfer_fused_moe_per_tensor_scale_fp8_fake,
+    fake_impl=fi_trtllm_fp8_per_tensor_moe_fake,
     tags=(torch.Tag.needs_fixed_stride_order,),
 )
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 7fd8511e297de3835a15ecd396d2be220da0b6bb..fb93464392ea8a37ccd8237ca2188ad7b7bd9565 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -673,6 +673,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         num_experts = local_num_experts
@@ -867,12 +868,14 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         num_experts = local_num_experts
         max_num_tokens = self.max_num_tokens
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
         workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N))
-        workspace2 = (num_experts, max_num_tokens * num_dp, (N // 2))
+        workspace2 = (num_experts, max_num_tokens * num_dp, activation_out_dim)
         output = (num_experts, max_num_tokens * num_dp, K)
         return (workspace13, workspace2, output)
 
@@ -947,7 +950,10 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         # We can reuse the memory between these because by the time we need
         # cache3, we're done with cache1
         intermediate_cache1 = _resize_cache(workspace13, (E, max_num_tokens, N))
-        intermediate_cache2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2))
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+        intermediate_cache2 = _resize_cache(
+            workspace2, (E, max_num_tokens, activation_out_dim)
+        )
 
         # TODO(bnell): should this be done for any quantized type?
         if self.quant_config.use_fp8_w8a8:
@@ -978,7 +984,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         # TODO (bnell): use triton utility from batched deep gemm.
         self.activation(
             activation,
-            intermediate_cache2.view(-1, N // 2),
+            intermediate_cache2.view(-1, activation_out_dim),
             intermediate_cache1.view(-1, N),
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 71daa9529ef781137f3ffaaf39f6df836936ce9a..b89e80f19dfe6d339d98e4f7cf05d6972f600f40 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -17,9 +17,6 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
     batched_moe_align_block_size,
     moe_align_block_size,
 )
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceDelegate,
     TopKWeightAndReduceNoOP,
@@ -30,6 +27,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     marlin_moe_intermediate_size,
     marlin_quant_input,
 )
+from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
 
@@ -141,7 +139,6 @@ def _fused_marlin_moe(
         moe_block_size=block_size_m,
         top_k=num_topk,
         mul_topk_weights=apply_router_weight_on_input,
-        is_ep=expert_map is not None,
         b_q_type=quant_type,
         size_m=M,
         size_n=2 * N,
@@ -193,7 +190,6 @@ def _fused_marlin_moe(
         moe_block_size=block_size_m,
         top_k=1,
         mul_topk_weights=not apply_router_weight_on_input,
-        is_ep=expert_map is not None,
         b_q_type=quant_type,
         size_m=M * num_topk,
         size_n=K,
@@ -546,9 +542,12 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
         is_k_full: bool = True,
     ):
         # TODO (varun) : Enable activation quantization
-        assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, (
-            "Supports only mxfp4_w4a16 or int4_w4a16"
-        )
+        assert (
+            quant_config.use_mxfp4_w4a16
+            or quant_config.use_nvfp4_w4a16
+            or quant_config.use_int4_w4a16
+            or quant_config.use_fp8_w8a16
+        ), "Supports only {mxfp,nvfp,int}4_w4a16 or fp8_w8a16"
         self.w13_g_idx = w13_g_idx
         self.w2_g_idx = w2_g_idx
         self.w13_g_idx_sort_indices = w13_g_idx_sort_indices
@@ -559,11 +558,17 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
     @property
     def quant_type_id(self) -> int:
         # uint4b8 will be set for int4 weight and float4_e2m1f will be used for mxfp4
-        return (
-            scalar_types.uint4b8.id
-            if self.quant_config.use_int4_w4a16
-            else scalar_types.float4_e2m1f.id
-        )
+        if self.quant_config.use_int4_w4a16:
+            return scalar_types.uint4b8.id
+        elif self.quant_config.use_mxfp4_w4a16 or self.quant_config.use_nvfp4_w4a16:
+            return scalar_types.float4_e2m1f.id
+        elif (
+            self.quant_config.use_fp8_w8a16
+            and current_platform.fp8_dtype() == torch.float8_e4m3fn
+        ):
+            return scalar_types.float8_e4m3fn.id
+        else:
+            raise NotImplementedError("Unsupported quantization type.")
 
     def moe_problem_size(
         self,
@@ -639,6 +644,7 @@ class MarlinExperts(MarlinExpertsBase):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # Modular Kernel provisions output buffer from workspace1. However in
         # the fused_marlin_moe() function, the final torch.sum(), is defined
@@ -692,6 +698,8 @@ class MarlinExperts(MarlinExpertsBase):
             gating_output=None,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
+            global_scale1=self.g1_alphas,
+            global_scale2=self.g2_alphas,
             quant_type_id=self.quant_type_id,
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
@@ -715,16 +723,6 @@ class MarlinExperts(MarlinExpertsBase):
         ops.moe_sum(input, output)
 
 
-def modular_marlin_fused_moe(
-    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
-) -> mk.FusedMoEModularKernel:
-    return mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
-        MarlinExperts(quant_config),
-        shared_experts,
-    )
-
-
 class BatchedMarlinExperts(MarlinExpertsBase):
     def __init__(
         self,
@@ -775,6 +773,7 @@ class BatchedMarlinExperts(MarlinExpertsBase):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dispatchers = self.num_dispatchers
         num_experts = local_num_experts
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 5b11cf7dacb92c327ea4c0e2efcad04be3f5f110..c24749b307a1ceda6a79adfa8af5f4108a13c6c4 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -12,13 +12,13 @@ from collections.abc import Callable
 from typing import Any
 
 import torch
-import torch.nn.functional as F
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
+from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
@@ -27,10 +27,6 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     _get_config_dtype_str,
 )
-from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-    _valid_cutlass_block_scaled_grouped_gemm,
-    run_cutlass_block_scaled_fused_experts,
-)
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
     _valid_deep_gemm,
     deep_gemm_moe_fp8,
@@ -50,12 +46,15 @@ except Exception:
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
     MoEPrepareAndFinalizeNoEP,
 )
+from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
+    rocm_aiter_grouped_topk,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceNoOP,
 )
 from vllm.model_executor.layers.fused_moe.utils import (
     _resize_cache,
-    activation_without_mul,
+    apply_moe_activation,
     disable_inplace,
     moe_kernel_quantize_input,
 )
@@ -664,6 +663,7 @@ def fused_moe_kernel(
     # Block size for block-wise quantization
     group_n: tl.constexpr,
     group_k: tl.constexpr,
+    naive_block_assignment: tl.constexpr,
     # Meta-parameters
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
@@ -698,6 +698,9 @@ def fused_moe_kernel(
     - expert_ids: A tensor containing the indices of the expert for each
         block. It determines which expert matrix from B should be used for
         each block in A.
+    - naive_block_assignment: A boolean flag indicating whether to use naive
+        token wise block assignment. If True, each block corresponds to a
+        single token.
     This kernel performs the multiplication of a token by its corresponding
     expert matrix as determined by `expert_ids`. The sorting of
     `sorted_token_ids` by expert index and padding ensures divisibility by
@@ -736,12 +739,20 @@ def fused_moe_kernel(
     # and accumulate
     # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
     # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
+    offs = tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
     num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
     if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
         return
+    if not naive_block_assignment:
+        offs_token_id = pid_m * BLOCK_SIZE_M + offs
+        offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
+    else:
+        offs_token = tl.where(
+            offs == 0,
+            pid_m,  # first element = pid_m
+            num_valid_tokens,  # remaining elements = constant
+        )
 
-    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
     token_mask = offs_token < num_valid_tokens
 
     off_experts = tl.load(expert_ids_ptr + pid_m)
@@ -844,20 +855,39 @@ def fused_moe_kernel(
         # Advance the ptrs to the next K block.
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    # Dequantization for supported quantization schemes:
+    #   - int8_w8a16
+    #   - fp8_w8a8
+    #   - int8_w8a8
+    # Accumulator and scalings are in float32 to preserve numerical accuracy.
+    if use_int8_w8a16:
+        accumulator = accumulator * b_scale
+    elif (use_fp8_w8a8 or use_int8_w8a8) and not (group_k > 0 and group_n > 0):
+        accumulator = accumulator * a_scale * b_scale
+
+    # Bias addition:
+    # Bias must be applied after dequantization:
+    #   - Since bias is typically not quantized
+    #   - Bias should not be scaled by quantization factors
     if HAS_BIAS:
-        accumulator = accumulator + bias[None, :]
+        accumulator += bias[None, :]
+
+    # Router (MoE) weight multiplication:
+    # This multiplication MUST be performed in float32 before any precision
+    # conversion to ensure numerical stability, which is especially critical
+    # on ROCm platforms.
     if MUL_ROUTED_WEIGHT:
-        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
-        accumulator = accumulator * moe_weight[:, None]
-    if use_int8_w8a16:
-        accumulator = (accumulator * b_scale).to(compute_type)
-    elif use_fp8_w8a8 or use_int8_w8a8:
-        if group_k > 0 and group_n > 0:
-            accumulator = accumulator.to(compute_type)
-        else:
-            accumulator = (accumulator * a_scale * b_scale).to(compute_type)
-    else:
-        accumulator = accumulator.to(compute_type)
+        moe_weight = tl.load(
+            topk_weights_ptr + offs_token,
+            mask=token_mask,
+            other=0,
+        )
+        accumulator *= moe_weight[:, None]
+
+    # Final precision conversion:
+    # Cast once at the end to the desired compute/output dtype.
+    accumulator = accumulator.to(compute_type)
 
     # -----------------------------------------------------------
     # Write back the block of the output
@@ -867,11 +897,70 @@ def fused_moe_kernel(
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
 
-def invoke_fused_moe_kernel(
+# NOTE(zyongye): we can remove all the wna16 kernel
+# once we drop off sm75 support
+def invoke_fused_moe_wna16_cuda_kernel(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    C: torch.Tensor,
+    B_scale: torch.Tensor | None,
+    B_zp: torch.Tensor | None,
+    topk_weights: torch.Tensor | None,
+    sorted_token_ids: torch.Tensor | None,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    mul_routed_weight: bool,
+    top_k: int,
+    config: dict[str, Any],
+    block_shape: list[int],
+):
+    assert B_scale is not None and B_scale.ndim == 3
+    assert B_zp is None or B_zp.ndim == 3
+    assert block_shape is None or block_shape[0] == 0
+
+    M = A.size(0)
+    num_tokens = M * top_k
+    bit = 4
+
+    config = config.copy()
+    config.update(
+        get_moe_wna16_block_config(
+            config=config,
+            use_moe_wna16_cuda=True,
+            num_valid_tokens=num_tokens,
+            size_k=A.size(1),
+            size_n=B.size(1),
+            num_experts=B.size(1),
+            group_size=block_shape[1],
+            real_top_k=top_k,
+            block_size_m=config["BLOCK_SIZE_M"],
+        )
+    )
+
+    ops.moe_wna16_gemm(
+        A,
+        C,
+        B,
+        B_scale,
+        B_zp,
+        topk_weights if mul_routed_weight else None,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        top_k,
+        config["BLOCK_SIZE_M"],
+        config["BLOCK_SIZE_N"],
+        config["BLOCK_SIZE_K"],
+        bit,
+    )
+
+
+# NOTE(zyongye): we can remove all the wna16 kernel
+# once we drop off sm75 support
+def invoke_fused_moe_wna16_triton_kernel(
     A: torch.Tensor,
     B: torch.Tensor,
     C: torch.Tensor,
-    A_scale: torch.Tensor | None,
     B_scale: torch.Tensor | None,
     B_zp: torch.Tensor | None,
     topk_weights: torch.Tensor | None,
@@ -882,6 +971,135 @@ def invoke_fused_moe_kernel(
     top_k: int,
     config: dict[str, Any],
     compute_type: tl.dtype,
+    use_int8_w8a16: bool,
+    use_int4_w4a16: bool,
+    block_shape: list[int] | None,
+):
+    assert B_scale is not None and B_scale.ndim == 3
+    assert B_zp is None or B_zp.ndim == 3
+    assert block_shape is not None and block_shape[0] == 0
+
+    M = A.size(0)
+    num_tokens = M * top_k
+
+    EM = sorted_token_ids.size(0)
+    if A.size(0) < config["BLOCK_SIZE_M"]:
+        # optimize for small batch_size.
+        # We assume that top_ids of each token is unique,
+        # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
+        # and we can skip some invalid blocks.
+        EM = min(sorted_token_ids.size(0), A.size(0) * top_k * config["BLOCK_SIZE_M"])
+    grid = lambda META: (
+        triton.cdiv(EM, META["BLOCK_SIZE_M"])
+        * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]),
+    )
+    config = config.copy()
+    config.update(
+        get_moe_wna16_block_config(
+            config=config,
+            use_moe_wna16_cuda=False,
+            num_valid_tokens=num_tokens,
+            size_k=A.size(1),
+            size_n=B.size(1),
+            num_experts=B.size(1),
+            group_size=block_shape[1],
+            real_top_k=top_k,
+            block_size_m=config["BLOCK_SIZE_M"],
+        )
+    )
+
+    if os.environ.get('AWQ_MOE_SZ') == '1':
+        fused_moe_kernel_awq[grid](
+            A,
+            B,
+            C,
+            B_scale,
+            B_zp,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            B.size(1),
+            A.size(1),
+            EM,
+            topk_ids.numel(),
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(2),
+            B.stride(1),
+            C.stride(1),
+            C.stride(2),
+            B_scale.stride(0),
+            B_scale.stride(2),
+            B_scale.stride(1),
+            B_zp.stride(0) if B_zp is not None else 0,
+            B_zp.stride(2) if B_zp is not None else 0,
+            B_zp.stride(1) if B_zp is not None else 0,
+            block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0,
+            group_size=block_shape[1],
+            MUL_ROUTED_WEIGHT=mul_routed_weight,
+            top_k=top_k,
+            compute_type=compute_type,
+            has_zp=B_zp is not None,
+            use_int4_w4a16=use_int4_w4a16,
+            use_int8_w8a16=use_int8_w8a16,
+            **config,
+        )
+    else:
+        fused_moe_kernel_gptq_awq[grid](
+            A,
+            B,
+            C,
+            B_scale,
+            B_zp,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            B.size(1),
+            A.size(1),
+            EM,
+            num_tokens,
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(2),
+            B.stride(1),
+            C.stride(1),
+            C.stride(2),
+            B_scale.stride(0),
+            B_scale.stride(2),
+            B_scale.stride(1),
+            B_zp.stride(0) if B_zp is not None else 0,
+            B_zp.stride(2) if B_zp is not None else 0,
+            B_zp.stride(1) if B_zp is not None else 0,
+            block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0,
+            group_size=block_shape[1],
+            MUL_ROUTED_WEIGHT=mul_routed_weight,
+            top_k=top_k,
+            compute_type=compute_type,
+            has_zp=B_zp is not None,
+            use_int4_w4a16=use_int4_w4a16,
+            use_int8_w8a16=use_int8_w8a16,
+            **config,
+        )
+
+
+def invoke_fused_moe_triton_kernel(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    C: torch.Tensor,
+    A_scale: torch.Tensor | None,
+    B_scale: torch.Tensor | None,
+    topk_weights: torch.Tensor | None,
+    sorted_token_ids: torch.Tensor | None,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    mul_routed_weight: bool,
+    top_k: int,
+    config: dict[str, Any],
+    compute_type: tl.dtype,
     use_fp8_w8a8: bool,
     use_int8_w8a8: bool,
     use_int8_w8a16: bool,
@@ -891,10 +1109,10 @@ def invoke_fused_moe_kernel(
     block_shape: list[int] | None = None,
     B_bias: torch.Tensor | None = None,
     use_nn_moe: bool | None = False,
-) -> None:
+):
     assert topk_weights is not None or not mul_routed_weight
     assert topk_weights is None or topk_weights.stride(1) == 1
-    assert sorted_token_ids.stride(0) == 1
+    assert sorted_token_ids is None or sorted_token_ids.stride(0) == 1
 
     if use_fp8_w8a8 or use_int8_w8a8:
         assert B_scale is not None
@@ -904,7 +1122,6 @@ def invoke_fused_moe_kernel(
         assert block_shape is None or triton.cdiv(
             B.size(-1), block_shape[1]
         ) == B_scale.size(-1)
-
     elif use_int8_w8a16 or use_int4_w4a16:
         assert B_scale is not None
         assert block_shape is None or block_shape[0] == 0
@@ -914,109 +1131,119 @@ def invoke_fused_moe_kernel(
 
     M = A.size(0)
     num_tokens = M * top_k
-
-    EM = sorted_token_ids.size(0)
-    if A.size(0) < config["BLOCK_SIZE_M"]:
-        # optimize for small batch_size.
-        # We assume that top_ids of each token is unique,
-        # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
-        # and we can skip some invalid blocks.
-        EM = min(sorted_token_ids.size(0), A.size(0) * top_k * config["BLOCK_SIZE_M"])
+    if sorted_token_ids is not None:
+        EM = sorted_token_ids.size(0)
+        if A.size(0) < config["BLOCK_SIZE_M"]:
+            # optimize for small batch_size.
+            # We assume that top_ids of each token is unique,
+            # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
+            # and we can skip some invalid blocks.
+            EM = min(
+                sorted_token_ids.size(0), A.size(0) * top_k * config["BLOCK_SIZE_M"]
+            )
+    else:
+        EM = num_tokens * config["BLOCK_SIZE_M"]
     grid = lambda META: (
         triton.cdiv(EM, META["BLOCK_SIZE_M"])
         * triton.cdiv(B.size(1) if not use_nn_moe else B.size(2), META["BLOCK_SIZE_N"]),
     )
     HAS_BIAS = B_bias is not None
-    if (
-        (use_int8_w8a16 or use_int4_w4a16)
-        and block_shape is not None
-        and block_shape[1] > 0
+
+    # config = config.copy()
+    # config["SPLIT_K"] = 1
+    # BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
+    # if block_shape is not None:
+    #     BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1]))
+    fused_moe_kernel[grid](
+        A,
+        B,
+        C,
+        B_bias,
+        A_scale,
+        B_scale,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        B.size(1) if not use_nn_moe else B.size(2),
+        A.size(1), # B.size(2),
+        EM,
+        num_tokens,
+        A.stride(0),
+        A.stride(1),
+        B.stride(0),
+        B.stride(2) if not use_nn_moe else B.stride(1),
+        B.stride(1) if not use_nn_moe else B.stride(2),
+        C.stride(1),
+        C.stride(2),
+        A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
+        A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0,
+        B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0,
+        B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0,
+        B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0,
+        B_bias.stride(0) if B_bias is not None else 0,
+        B_bias.stride(1) if B_bias is not None else 0,
+        0 if block_shape is None else block_shape[0],
+        0 if block_shape is None else block_shape[1],
+        MUL_ROUTED_WEIGHT=mul_routed_weight,
+        top_k=top_k,
+        compute_type=compute_type,
+        use_fp8_w8a8=use_fp8_w8a8,
+        use_int8_w8a8=use_int8_w8a8,
+        use_int8_w8a16=use_int8_w8a16,
+        per_channel_quant=per_channel_quant,
+        naive_block_assignment=(sorted_token_ids is None),
+        HAS_BIAS=HAS_BIAS,
+        # BLOCK_SIZE_K=BLOCK_SIZE_K,
+        **config,
+    )
+
+
+def dispatch_fused_moe_kernel(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    C: torch.Tensor,
+    A_scale: torch.Tensor | None,
+    B_scale: torch.Tensor | None,
+    B_zp: torch.Tensor | None,
+    topk_weights: torch.Tensor | None,
+    sorted_token_ids: torch.Tensor | None,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    mul_routed_weight: bool,
+    top_k: int,
+    config: dict[str, Any],
+    compute_type: tl.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a8: bool,
+    use_int8_w8a16: bool,
+    use_int4_w4a16: bool,
+    per_channel_quant: bool,
+    block_shape: list[int] | None = None,
+    B_bias: torch.Tensor | None = None,
+) -> None:
+    assert topk_weights is not None or not mul_routed_weight
+    assert topk_weights is None or topk_weights.stride(1) == 1
+    assert sorted_token_ids is None or sorted_token_ids.stride(0) == 1
+
+    M = A.size(0)
+    num_tokens = M * top_k
+
+    if (use_int8_w8a16 or use_int4_w4a16) and (
+        block_shape is not None and block_shape[1] > 0
     ):
-        assert B_scale is not None and B_scale.ndim == 3
-        assert B_zp is None or B_zp.ndim == 3
-        if os.environ.get('moe_wna16_use_cuda') == '1':
-            use_moe_wna16_cuda = should_moe_wna16_use_cuda(
+        assert B_bias is None
+        
+        # if os.environ.get('moe_wna16_use_cuda') == '1':
+        use_moe_wna16_cuda = should_moe_wna16_use_cuda(
             num_valid_tokens=num_tokens,
             group_size=block_shape[1],
             num_experts=B.size(0),
             bit=4 if use_int4_w4a16 else 8,
         )
-            
-            config = config.copy()
-            config.update(
-                get_moe_wna16_block_config(
-                    config=config,
-                    use_moe_wna16_cuda=use_moe_wna16_cuda,
-                    num_valid_tokens=num_tokens,
-                    size_k=A.size(1),
-                    size_n=B.size(1),
-                    num_experts=B.size(1),
-                    group_size=block_shape[1],
-                    real_top_k=top_k,
-                    block_size_m=config["BLOCK_SIZE_M"],
-                )
-            )
-            
-            if use_moe_wna16_cuda:
-                bit = 4 if use_int4_w4a16 else 8
-                ops.moe_wna16_gemm(
-                    A,
-                    C,
-                    B,
-                    B_scale,
-                    B_zp,
-                    topk_weights if mul_routed_weight else None,
-                    sorted_token_ids,
-                    expert_ids,
-                    num_tokens_post_padded,
-                    top_k,
-                    config["BLOCK_SIZE_M"],
-                    config["BLOCK_SIZE_N"],
-                    config["BLOCK_SIZE_K"],
-                    bit,
-                )
-                return
-            
-        if os.environ.get('AWQ_MOE_SZ') == '1':
-            fused_moe_kernel_awq[grid](
-                A,
-                B,
-                C,
-                B_scale,
-                B_zp,
-                topk_weights,
-                sorted_token_ids,
-                expert_ids,
-                num_tokens_post_padded,
-                B.size(1),
-                A.size(1),
-                EM,
-                topk_ids.numel(),
-                A.stride(0),
-                A.stride(1),
-                B.stride(0),
-                B.stride(2),
-                B.stride(1),
-                C.stride(1),
-                C.stride(2),
-                B_scale.stride(0),
-                B_scale.stride(2),
-                B_scale.stride(1),
-                B_zp.stride(0) if B_zp is not None else 0,
-                B_zp.stride(2) if B_zp is not None else 0,
-                B_zp.stride(1) if B_zp is not None else 0,
-                block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0,
-                group_size=block_shape[1],
-                MUL_ROUTED_WEIGHT=mul_routed_weight,
-                top_k=top_k,
-                compute_type=compute_type,
-                has_zp=B_zp is not None,
-                use_int4_w4a16=use_int4_w4a16,
-                use_int8_w8a16=use_int8_w8a16,
-                **config,
-            )
-        else:
-            fused_moe_kernel_gptq_awq[grid](
+        
+        if use_moe_wna16_cuda:
+            invoke_fused_moe_wna16_cuda_kernel(
                 A,
                 B,
                 C,
@@ -1026,82 +1253,54 @@ def invoke_fused_moe_kernel(
                 sorted_token_ids,
                 expert_ids,
                 num_tokens_post_padded,
-                B.size(1),
-                A.size(1),
-                EM,
-                num_tokens,
-                A.stride(0),
-                A.stride(1),
-                B.stride(0),
-                B.stride(2),
-                B.stride(1),
-                C.stride(1),
-                C.stride(2),
-                B_scale.stride(0),
-                B_scale.stride(2),
-                B_scale.stride(1),
-                B_zp.stride(0) if B_zp is not None else 0,
-                B_zp.stride(2) if B_zp is not None else 0,
-                B_zp.stride(1) if B_zp is not None else 0,
-                block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0,
-                group_size=block_shape[1],
-                MUL_ROUTED_WEIGHT=mul_routed_weight,
-                top_k=top_k,
-                compute_type=compute_type,
-                has_zp=B_zp is not None,
-                use_int4_w4a16=use_int4_w4a16,
-                use_int8_w8a16=use_int8_w8a16,
-                **config,
+                mul_routed_weight,
+                top_k,
+                config,
+                block_shape,
             )
-            
+            return
+        invoke_fused_moe_wna16_triton_kernel(
+            A,
+            B,
+            C,
+            B_scale,
+            B_zp,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            mul_routed_weight,
+            top_k,
+            config,
+            compute_type,
+            use_int8_w8a16,
+            use_int4_w4a16,
+            block_shape,
+        )
     else:
-        # config = config.copy()
-        # config["SPLIT_K"] = 1
-        # BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
-        # if block_shape is not None:
-        #     BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1]))
-        fused_moe_kernel[grid](
+        invoke_fused_moe_triton_kernel(
             A,
             B,
             C,
-            B_bias,
             A_scale,
             B_scale,
             topk_weights,
             sorted_token_ids,
             expert_ids,
             num_tokens_post_padded,
-            B.size(1) if not use_nn_moe else B.size(2),
-            A.size(1),
-            EM,
-            num_tokens,
-            A.stride(0),
-            A.stride(1),
-            B.stride(0),
-            B.stride(2) if not use_nn_moe else B.stride(1),
-            B.stride(1) if not use_nn_moe else B.stride(2),
-            C.stride(1),
-            C.stride(2),
-            A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
-            A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0,
-            B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0,
-            B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0,
-            B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0,
-            B_bias.stride(0) if B_bias is not None else 0,
-            B_bias.stride(1) if B_bias is not None else 0,
-            0 if block_shape is None else block_shape[0],
-            0 if block_shape is None else block_shape[1],
-            MUL_ROUTED_WEIGHT=mul_routed_weight,
-            top_k=top_k,
-            compute_type=compute_type,
-            use_fp8_w8a8=use_fp8_w8a8,
-            use_int8_w8a8=use_int8_w8a8,
-            use_int8_w8a16=use_int8_w8a16,
-            per_channel_quant=per_channel_quant,
-            HAS_BIAS=HAS_BIAS,
-            **config,
+            mul_routed_weight,
+            top_k,
+            config,
+            compute_type,
+            use_fp8_w8a8,
+            use_int8_w8a8,
+            use_int8_w8a16,
+            use_int4_w4a16,
+            per_channel_quant,
+            block_shape,
+            B_bias,
         )
-
+        
 
 @triton.jit
 def compute_identity_kernel(
@@ -1512,14 +1711,14 @@ def vllm_topk_softmax(
     gating_output: torch.Tensor,
     renormalize: bool,
 ) -> tuple[torch.Tensor, ...]:
-    if envs.VLLM_USE_TOPK_RENORM:
+    if envs.VLLM_USE_TOPK_RENORM and renormalize is True:
         from lightop import op as op
         op.topk_softmax(
             topk_weights,
             topk_indices,
             token_expert_indices,
             gating_output,
-            True,
+            renormalize,
         )
     else:
         ops.topk_softmax(
@@ -1690,6 +1889,88 @@ def grouped_topk(
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
 
 
+# --8<-- [start:grouped_topk]
+@CustomOp.register("grouped_topk")
+class GroupedTopk(CustomOp):
+    """GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model."""
+
+    # --8<-- [end:grouped_topk]
+
+    def __init__(
+        self,
+        topk: int,
+        renormalize: bool,
+        num_expert_group: int = 0,
+        topk_group: int = 0,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        num_fused_shared_experts: int = 0,
+    ) -> None:
+        super().__init__()
+        self.native_impl = grouped_topk
+        self.topk = topk
+        self.renormalize = renormalize
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.scoring_func = scoring_func
+        self.routed_scaling_factor = routed_scaling_factor
+        self.num_fused_shared_experts = num_fused_shared_experts
+
+    def forward_native(
+        self,
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        e_score_correction_bias: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.native_impl(
+            hidden_states,
+            gating_output,
+            self.topk,
+            self.renormalize,
+            self.num_expert_group,
+            self.topk_group,
+            self.scoring_func,
+            self.routed_scaling_factor,
+            e_score_correction_bias,
+        )
+
+    def forward_cuda(
+        self,
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        e_score_correction_bias: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.forward_native(
+            hidden_states, gating_output, e_score_correction_bias
+        )
+
+    def forward_hip(
+        self,
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        e_score_correction_bias: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if rocm_aiter_ops.is_fused_moe_enabled():
+            if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
+                assert self.num_fused_shared_experts == 0
+            return rocm_aiter_grouped_topk(
+                hidden_states,
+                gating_output,
+                self.topk,
+                self.renormalize,
+                self.num_expert_group,
+                self.topk_group,
+                self.scoring_func,
+                self.routed_scaling_factor,
+                e_score_correction_bias,
+                self.num_fused_shared_experts,
+            )
+        else:
+            return self.forward_native(
+                hidden_states, gating_output, e_score_correction_bias
+            )
+
+
 @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def eplb_map_to_physical_and_record(
     topk_ids: torch.Tensor,
@@ -1783,7 +2064,7 @@ def fused_grouped_topk(
             topk,
             renormalize,
             routed_scaling_factor,
-            e_score_correction_bias.to(gating_output.dtype),
+            e_score_correction_bias,
             1,  # scoring_func=1 for sigmoid
         )
     elif scoring_func == "softmax":
@@ -1797,7 +2078,7 @@ def fused_grouped_topk(
             topk,
             renormalize,
             routed_scaling_factor,
-            e_score_correction_bias.to(gating_output.dtype),
+            e_score_correction_bias,
             0,  # scoring_func=0 (no activation, scores already computed)
         )
     else:
@@ -2042,12 +2323,10 @@ def fused_experts(
     expert_map: torch.Tensor | None = None,
     quant_config: FusedMoEQuantConfig | None = None,
     allow_deep_gemm: bool = False,
-    allow_cutlass_block_scaled_grouped_gemm: bool = False,
     use_nn_moe: bool | None = False,
 ) -> torch.Tensor:
     if quant_config is None:
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
-    use_fp8_w8a8 = quant_config.use_fp8_w8a8
 
     # For now, disable DeepGemm for small N (<= 512) until better
     # permute/unpermute ops are available.
@@ -2061,7 +2340,6 @@ def fused_experts(
         and (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2))
     ):
         assert quant_config is not None
-        assert apply_router_weight_on_input is False
         return deep_gemm_moe_fp8(
             hidden_states=hidden_states,
             w1=w1,
@@ -2078,23 +2356,6 @@ def fused_experts(
             a2_scale=quant_config.a2_scale,
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
-    elif (
-        allow_cutlass_block_scaled_grouped_gemm
-        and use_fp8_w8a8
-        and _valid_cutlass_block_scaled_grouped_gemm(
-            w1, w2, inplace, activation, apply_router_weight_on_input, expert_map
-        )
-    ):
-        assert quant_config is not None
-        return run_cutlass_block_scaled_fused_experts(
-            a=hidden_states,
-            w1=w1,
-            w2=w2,
-            w1_scale=quant_config.w1_scale,
-            w2_scale=quant_config.w2_scale,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-        )
     else:
         return dispatch_fused_experts_func(inplace)(
             hidden_states=hidden_states,
@@ -2126,11 +2387,6 @@ def fused_experts(
         )
 
 
-SILU_NO_MUL: str = activation_without_mul("silu")
-GELU_NO_MUL: str = activation_without_mul("gelu")
-RELU2_NO_MUL: str = activation_without_mul("relu2")
-
-
 def _get_config_quant_dtype(
     use_fp8_w8a8: bool,
     use_int8_w8a8: bool,
@@ -2350,8 +2606,13 @@ def fused_experts_impl(
     intermediate_cache3 = cache13[: M * top_k_num * (K if not use_nn_moe else w2.shape[2])].view(M, top_k_num, K if not use_nn_moe else w2.shape[2])
 
     # This needs separate memory since it's used concurrently with cache1
+    activation_out_dim = mk.FusedMoEPermuteExpertsUnpermute.adjust_N_for_activation(
+        N, activation
+    )
     intermediate_cache2 = torch.empty(
-        (M * top_k_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype
+        (M * top_k_num, activation_out_dim),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
     )
 
     if hidden_states.dtype == torch.bfloat16:
@@ -2437,25 +2698,39 @@ def fused_experts_impl(
             block_shape=block_shape,
         )
 
-        if use_int4_w4a16:
-            sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-                curr_topk_ids, 
-                config['BLOCK_SIZE_M'], 
-                global_num_experts, 
-                expert_map, 
-                ignore_invalid_experts=True,
-                num_token=curr_hidden_states.shape[0],
+        # SPARSITY_FACTOR is a heuristic margin ensuring tokens_in_chunk * top_k
+        # activates only a small fraction of total experts
+        SPARSITY_FACTOR = 4
+        # block quantized code path is not implemented yet.
+        naive_block_assignment = (
+            expert_map is None
+            and tokens_in_chunk * top_k_num * SPARSITY_FACTOR <= global_num_experts
+            and not (
+                (use_int8_w8a16 or use_int4_w4a16)
+                and block_shape is not None
+                and block_shape[1] > 0
             )
-        else:
+        )
+
+        if not naive_block_assignment:
             sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
                 curr_topk_ids,
                 config["BLOCK_SIZE_M"],
                 global_num_experts,
                 expert_map,
                 ignore_invalid_experts=True,
+                num_token=curr_hidden_states.shape[0] if use_int4_w4a16 else None
             )
+        else:
+            max_num_tokens_padded = topk_ids.numel() * config["BLOCK_SIZE_M"]
+            expert_ids = curr_topk_ids.view(-1)
+            num_tokens_post_padded = torch.empty(
+                (1), dtype=torch.int32, device=topk_ids.device
+            )
+            num_tokens_post_padded.fill_(max_num_tokens_padded)
+            sorted_token_ids = None
 
-        invoke_fused_moe_kernel(
+        dispatch_fused_moe_kernel(
             qcurr_hidden_states,
             w1,
             intermediate_cache1,
@@ -2481,34 +2756,9 @@ def fused_experts_impl(
             use_nn_moe=use_nn_moe,
         )
 
-        # Activation function with multiplication
-        if activation == "silu":
-            if envs.VLLM_USE_FUSE_SILU_AND_MUL and intermediate_cache1.dtype == intermediate_cache2.dtype == torch.float16:
-                fuse_silu_and_mul(
-                    intermediate_cache1.view(-1, N),intermediate_cache2
-                )    
-            else:
-                torch.ops._C.silu_and_mul(
-                    intermediate_cache2, intermediate_cache1.view(-1, N)
-                )
-        elif activation == "gelu":
-            torch.ops._C.gelu_and_mul(
-                intermediate_cache2, intermediate_cache1.view(-1, N)
-            )
-        elif activation == "swigluoai":
-            # alpha = 1.702, limit = 7.0
-            torch.ops._C.swigluoai_and_mul(
-                intermediate_cache2, intermediate_cache1.view(-1, N)
-            )
-        # Activation function without multiplication
-        elif activation == SILU_NO_MUL:
-            intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N))
-        elif activation == GELU_NO_MUL:
-            intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N))
-        elif activation == RELU2_NO_MUL:
-            intermediate_cache2 = torch.square(F.relu(intermediate_cache1.view(-1, N)))
-        else:
-            raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+        apply_moe_activation(
+            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
+        )
 
         qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
             A=intermediate_cache2,
@@ -2521,7 +2771,7 @@ def fused_experts_impl(
         if expert_map is not None:
             intermediate_cache3.zero_()
 
-        invoke_fused_moe_kernel(
+        dispatch_fused_moe_kernel(
             qintermediate_cache2,
             w2,
             intermediate_cache3,
@@ -2607,8 +2857,10 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
-        workspace1 = (M, topk, max(N // 2, K))
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+        workspace1 = (M, topk, max(activation_out_dim, K))
         workspace2 = (M, topk, max(N, K))
         output = (M, K)
         return (workspace1, workspace2, output)
@@ -2648,6 +2900,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             torch.float16,
             torch.bfloat16,
             torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
         ]
 
         E, num_tokens, N, K, top_k_num = self.moe_problem_size(
@@ -2672,15 +2925,19 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             compute_type = tl.float16
         elif hidden_states.dtype == torch.float32:
             compute_type = tl.float32
-        elif hidden_states.dtype == torch.float8_e4m3fn:
+        elif (
+            hidden_states.dtype == torch.float8_e4m3fn
+            or hidden_states.dtype == torch.float8_e4m3fnuz
+        ):
             compute_type = tl.bfloat16
         else:
             raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
 
         # Note that the output tensor might be in workspace1
         intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
+        cache2_dim = self.adjust_N_for_activation(N, activation)
         intermediate_cache2 = _resize_cache(
-            workspace13, (num_tokens * top_k_num, N // 2)
+            workspace13, (num_tokens * top_k_num, cache2_dim)
         )
         intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))
 
@@ -2688,13 +2945,12 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map
         )
 
-        invoke_fused_moe_kernel(
+        invoke_fused_moe_triton_kernel(
             hidden_states,
             w1,
             intermediate_cache1,
             a1q_scale,
             self.w1_scale,
-            self.w1_zp,
             None,  # topk_weights
             sorted_token_ids,
             expert_ids,
@@ -2727,13 +2983,12 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             self.block_shape,
         )
 
-        invoke_fused_moe_kernel(
+        invoke_fused_moe_triton_kernel(
             qintermediate_cache2,
             w2,
             intermediate_cache3,
             a2q_scale,
             self.w2_scale,
-            self.w2_zp,
             topk_weights,
             sorted_token_ids,
             expert_ids,
@@ -2759,6 +3014,149 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         ops.moe_sum(input, output)
 
 
+class TritonWNA16Experts(TritonExperts):
+    def __init__(
+        self,
+        quant_config: FusedMoEQuantConfig,
+    ):
+        super().__init__(quant_config)
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        # Check constraints.
+        if self.quant_config.use_int4_w4a16:
+            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
+        else:
+            assert hidden_states.size(-1) == w1.size(2), (
+                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
+            )
+
+        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+        assert hidden_states.dim() == 2
+        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
+        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
+        assert hidden_states.dtype in [
+            torch.float32,
+            torch.float16,
+            torch.bfloat16,
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+        ]
+
+        E, num_tokens, N, K, top_k_num = self.moe_problem_size(
+            hidden_states, w1, w2, topk_ids
+        )
+
+        if global_num_experts == -1:
+            global_num_experts = E
+
+        config = try_get_optimal_moe_config(
+            w1.size(),
+            w2.size(),
+            top_k_num,
+            self.quant_config.config_name(hidden_states.dtype),
+            num_tokens,
+            block_shape=self.block_shape,
+        )
+
+        if hidden_states.dtype == torch.bfloat16:
+            compute_type = tl.bfloat16
+        elif hidden_states.dtype == torch.float16:
+            compute_type = tl.float16
+        elif hidden_states.dtype == torch.float32:
+            compute_type = tl.float32
+        elif (
+            hidden_states.dtype == torch.float8_e4m3fn
+            or hidden_states.dtype == torch.float8_e4m3fnuz
+        ):
+            compute_type = tl.bfloat16
+        else:
+            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
+
+        # Note that the output tensor might be in workspace1
+        intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+        intermediate_cache2 = _resize_cache(
+            workspace13, (num_tokens * top_k_num, activation_out_dim)
+        )
+        intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+            topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map
+        )
+
+        invoke_fused_moe_wna16_triton_kernel(
+            hidden_states,
+            w1,
+            intermediate_cache1,
+            self.w1_scale,
+            self.quant_config.w1_zp,
+            None,  # topk_weights
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            False,  # mul_routed_weights
+            top_k_num,
+            config,
+            compute_type=compute_type,
+            use_int8_w8a16=self.quant_config.use_int8_w8a16,
+            use_int4_w4a16=self.quant_config.use_int4_w4a16,
+            block_shape=self.block_shape,
+        )
+
+        self.activation(
+            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
+        )
+
+        a2q_scale: torch.Tensor | None = None
+
+        qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
+            intermediate_cache2,
+            a2_scale,
+            self.quant_dtype,
+            self.per_act_token_quant,
+            self.block_shape,
+        )
+
+        invoke_fused_moe_wna16_triton_kernel(
+            qintermediate_cache2,
+            w2,
+            intermediate_cache3,
+            self.w2_scale,
+            self.quant_config.w2_zp,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            not apply_router_weight_on_input,
+            1,
+            config,
+            compute_type=compute_type,
+            use_int8_w8a16=self.quant_config.use_int8_w8a16,
+            use_int4_w4a16=self.quant_config.use_int4_w4a16,
+            block_shape=self.block_shape,
+        )
+
+        # separate function is required for MoE + LoRA
+        self.moe_sum(intermediate_cache3, output)
+
+
 def modular_triton_fused_moe(
     quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
 ) -> mk.FusedMoEModularKernel:
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
index 8c9d8a2777d582ab3db83c48df109f9b4b57a6ea..389ccf358c56ff08ac231b048017d983f91bc7f6 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -10,6 +10,9 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe_router import (
+    FusedMoERouter,
+)
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEPermuteExpertsUnpermute,
     FusedMoEPrepareAndFinalize,
@@ -71,6 +74,18 @@ class FusedMoEMethodBase(QuantizeMethodBase):
             "implementation based on the prepare_finalize"
         )
 
+    def prepare_dp_allgather_tensor(
+        self,
+        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch."""
+        raise NotImplementedError(
+            "Method 'prepare_dp_allgather_tensor' is not implemented in "
+            f"{self.__class__.__name__}."
+        )
+
     @abstractmethod
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
@@ -97,6 +112,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
     def apply(
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
index 9c9bc2514bb4b1630bd8209d3612be3b6eaa3712..2d98433e4db023c3bb31ad9904002dd71a5caa21 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -12,6 +12,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel,
     FusedMoEPrepareAndFinalize,
@@ -20,8 +21,11 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
 logger = init_logger(__name__)
 
 
+# --8<-- [start:modular_fused_moe]
 @CustomOp.register("modular_fused_moe")
 class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
+    # --8<-- [end:modular_fused_moe]
+
     def __init__(
         self, old_quant_method: FusedMoEMethodBase, experts: FusedMoEModularKernel
     ):
@@ -49,7 +53,6 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
                 prepare_finalize,
                 old_quant_method.select_gemm_impl(prepare_finalize, moe_layer),
                 shared_experts,
-                getattr(moe_layer, "shared_experts_stream", None),
                 moe_parallel_config=moe_layer.moe_parallel_config,
             ),
         )
@@ -89,10 +92,11 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
     def apply(
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -110,10 +114,4 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
             expert_map=None if self.disable_expert_map else layer.expert_map,
         )
 
-        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
-            assert not isinstance(result, tuple), (
-                "Shared + zero experts are mutually exclusive not yet supported"
-            )
-            return result, zero_expert_result
-        else:
-            return result
+        return result
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/fused_moe_router.py
new file mode 100644
index 0000000000000000000000000000000000000000..c322a8cd4cd69455012f8506e89444efde2a0927
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_router.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+
+import torch
+
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
+
+
+class FusedMoERouter(ABC):
+    """
+    FusedMoERouter is an abstract class that provides a 'select_experts'
+    method that is used for routing hidden states based on router logits.
+    """
+
+    @property
+    @abstractmethod
+    def routing_method_type(self) -> RoutingMethodType:
+        raise NotImplementedError
+
+    @abstractmethod
+    def select_experts(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Route the input hidden states to the top-k experts based on the
+        router logits.
+
+        Returns:
+            (topk_weights, topk_ids)
+            (tuple[torch.Tensor, torch.Tensor]):
+            The weights and expert ids computation result.
+
+            **Compatibility**: When EPLB is not enabled, the returned ids are
+            equivalent to global logical ids, so should be compatible with
+            plain MoE implementations without redundant experts.
+        """
+        raise NotImplementedError
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index 0b006e15632e18be917f61877f392a9e32ac9981..c4bc1824aa1f467e8652f071b1b6ee86c93c7745 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -323,10 +323,12 @@ class OAITritonExperts(BaseOAITritonExperts):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # workspace are allocated inside the kernel
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
         workspace1 = (0, 0)
-        workspace2 = (M * topk, N // 2)
+        workspace2 = (M * topk, activation_out_dim)
         output = (M, K)
         return (workspace1, workspace2, output)
 
@@ -415,9 +417,11 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # workspace are allocated inside the kernel
-        workspace1 = (M * topk, N // 2)
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+        workspace1 = (M * topk, activation_out_dim)
         workspace2 = (M * topk, max(N, K))
         output = (M, K)
         return (workspace1, workspace2, output)
@@ -443,8 +447,10 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts):
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
-        if self.quant_config is None:
-            self.quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
+        # Use local variable to help mypy narrow the type after None check
+        quant_config = self.quant_config
+        if quant_config is None:
+            quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
 
         if expert_map is not None:
             topk_ids = expert_map[topk_ids]
@@ -462,12 +468,10 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts):
         # type check, uint8 means mxfp4
         assert hidden_states.dtype == torch.bfloat16
         assert (
-            self.quant_config.w1_bias is None
-            or self.quant_config.w1_bias.dtype == torch.float32
+            quant_config.w1_bias is None or quant_config.w1_bias.dtype == torch.float32
         )
         assert (
-            self.quant_config.w2_bias is None
-            or self.quant_config.w2_bias.dtype == torch.float32
+            quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32
         )
 
         # Shape check, only check non-mxfp4
@@ -485,38 +489,41 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts):
         # Note that the output tensor might be in workspace13
         intermediate_cache1 = _resize_cache(workspace2, (batch_dim, M * topk, N))
         intermediate_cache3 = _resize_cache(workspace2, (batch_dim, M * topk, K))
-        intermediate_cache2 = _resize_cache(workspace13, (M * topk, N // 2))
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+        intermediate_cache2 = _resize_cache(workspace13, (M * topk, activation_out_dim))
 
         gammas = routing_data.gate_scal if routing_data else None
 
         matmul_ogs(
             hidden_states,
             w1,
-            self.quant_config.w1_bias,
+            quant_config.w1_bias,
             routing_data,
             gather_indx=gather_indx,
-            precision_config=self.quant_config.w1_precision,
+            precision_config=quant_config.w1_precision,
             gammas=gammas if apply_router_weight_on_input else None,
             fused_activation=None,
             y=intermediate_cache1,
         )
 
         self.activation(
-            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
+            activation,
+            intermediate_cache2,
+            intermediate_cache1.view(-1, N)[gather_indx.dst_indx],
         )
 
         # matmul_ogs grouped reduction fuse sum across multiple experts:
-        # y[dst_ind // n_expts_act, :] += x[src_ind, :]
+        # y[dst_indx // n_expts_act, :] += x
         # Need to set n_expts_act to 1 to unfuse moe_sum
         routing_data.n_expts_act = 1
 
         matmul_ogs(
-            intermediate_cache2,
+            intermediate_cache2[gather_indx.src_indx],
             w2,
-            self.quant_config.w2_bias,
+            quant_config.w2_bias,
             routing_data,
             scatter_indx=scatter_indx,
-            precision_config=self.quant_config.w2_precision,
+            precision_config=quant_config.w2_precision,
             gammas=None if apply_router_weight_on_input else gammas,
             y=intermediate_cache3,
         )
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index d24639122c62a8fc934fd44fb679bf4d777b299d..1745d2c2ab584c948805c84830825e66cc4b5bd3 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -5,7 +5,6 @@ import os
 from collections.abc import Callable, Iterable
 from contextlib import nullcontext
 from enum import Enum
-from functools import partial
 from typing import Literal, cast, get_args, overload
 
 import torch
@@ -33,18 +32,19 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     RoutingMethodType,
 )
-from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     init_aiter_topK_meta_data,
 )
+from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
+    RoutedExpertsCapturer,
+)
 from vllm.model_executor.layers.fused_moe.routing_simulator import RoutingSimulator
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig,
 )
-from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    is_flashinfer_supporting_global_sf,
-)
 from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import (
     aux_stream,
@@ -69,17 +69,7 @@ else:
         return topk_ids
 
     eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record
-from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
-if current_platform.is_rocm():
-    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
-        rocm_aiter_grouped_topk,
-    )
-
-if current_platform.is_tpu():
-    from .moe_pallas import fused_moe as fused_moe_pallas
-else:
-    fused_moe_pallas = None  # type: ignore
-
+from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk
 from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
@@ -303,6 +293,24 @@ def maybe_roundup_hidden_size(
     return hidden_size
 
 
+class FusedMoERouterImpl(FusedMoERouter):
+    def __init__(self, layer: "FusedMoE"):
+        super().__init__()
+        self.layer = layer
+
+    @property
+    def routing_method_type(self) -> RoutingMethodType:
+        return self.layer.routing_method_type
+
+    def select_experts(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.layer._select_experts(hidden_states, router_logits)
+
+
+# --8<-- [start:fused_moe]
 @CustomOp.register("fused_moe")
 class FusedMoE(CustomOp):
     """FusedMoE layer for MoE models.
@@ -324,8 +332,11 @@ class FusedMoE(CustomOp):
         renormalize: Whether to renormalize the logits in the fused_moe kernel
         quant_config: Quantization configure.
         enable_eplb: Whether to enable expert parallelism load balancer.
+        router_logits_dtype: Data type for router logits buffers.
     """
 
+    # --8<-- [end:fused_moe]
+
     def __init__(
         self,
         num_experts: int,  # Global number of experts
@@ -355,11 +366,10 @@ class FusedMoE(CustomOp):
         num_redundant_experts: int = 0,
         has_bias: bool = False,
         is_sequence_parallel=False,
-        zero_expert_num: int | None = 0,
-        zero_expert_type: str | None = None,
         expert_mapping: list[tuple[str, str, int, str]] | None = None,
         n_shared_experts: int | None = None,
-        routing_method_type: int | None = None,
+        routing_method_type: RoutingMethodType | None = None,
+        router_logits_dtype: torch.dtype | None = None,
     ):
         super().__init__()
 
@@ -368,14 +378,14 @@ class FusedMoE(CustomOp):
         # TODO: Remove this after more extensive testings with TP/DP
         # and other execution modes
         if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM:
-            logger.info_once("Disabling MoE shared_experts cuda stream")
+            logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local")
             self.shared_experts_stream = None
         else:
             # TODO(rob): enable shared expert overlap with non-cuda-alike.
             # aux_stream() returns None on non-cuda-alike platforms.
             self.shared_experts_stream = aux_stream()
             if self.shared_experts_stream is not None:
-                logger.info_once(
+                logger.debug_once(
                     "Enabled separate cuda stream for MoE shared_experts", scope="local"
                 )
 
@@ -414,8 +424,6 @@ class FusedMoE(CustomOp):
 
         self.global_num_experts = num_experts + num_redundant_experts
         self.logical_num_experts = num_experts
-        self.zero_expert_num = zero_expert_num
-        self.zero_expert_type = zero_expert_type
 
         # Expert mapping used in self.load_weights
         self.expert_mapping = expert_mapping
@@ -543,6 +551,20 @@ class FusedMoE(CustomOp):
         self.apply_router_weight_on_input = apply_router_weight_on_input
         self.activation = activation
 
+        self._grouped_topk_impl: GroupedTopk | None = None
+        if self.use_grouped_topk:
+            assert self.num_expert_group is not None
+            assert self.topk_group is not None
+            self._grouped_topk_impl = GroupedTopk(
+                topk=self.top_k,
+                renormalize=self.renormalize,
+                num_expert_group=self.num_expert_group,
+                topk_group=self.topk_group,
+                scoring_func=self.scoring_func,
+                routed_scaling_factor=self.routed_scaling_factor,
+                num_fused_shared_experts=self.num_fused_shared_experts,
+            )
+
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError(
                 "Only softmax scoring function is supported for non-grouped topk."
@@ -550,7 +572,7 @@ class FusedMoE(CustomOp):
 
         # ToDo: Better logic to determine the routing method type
         if routing_method_type is not None:
-            self.routing_method_type = routing_method_type
+            self.routing_method_type: RoutingMethodType = routing_method_type
         else:
             if scoring_func == "sigmoid":
                 if self.use_grouped_topk:
@@ -573,6 +595,7 @@ class FusedMoE(CustomOp):
             num_local_experts=self.local_num_experts,
             moe_parallel_config=self.moe_parallel_config,
             in_dtype=moe_in_dtype,
+            router_logits_dtype=router_logits_dtype,
             max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE,
             has_bias=has_bias,
             is_act_and_mul=is_act_and_mul,
@@ -680,6 +703,8 @@ class FusedMoE(CustomOp):
         self.batched_hidden_states: torch.Tensor | None = None
         self.batched_router_logits: torch.Tensor | None = None
 
+        self.router = FusedMoERouterImpl(self)
+
     # Note: maybe_init_modular_kernel should only be called by
     # prepare_communication_buffer_for_model.
     # This is called after all weight loading and post-processing, so it
@@ -704,6 +729,13 @@ class FusedMoE(CustomOp):
     def shared_experts(self) -> torch.nn.Module | None:
         return None
 
+    @property
+    def layer_id(self):
+        # Delayed import to avoid circular dependency
+        from vllm.model_executor.models.utils import extract_layer_index
+
+        return extract_layer_index(self.layer_name)
+
     @property
     def gate(self) -> torch.nn.Module | None:
         return None
@@ -1168,14 +1200,9 @@ class FusedMoE(CustomOp):
         global_expert_id = expert_id
         expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id)
 
-        allow_flashinfer = getattr(self.quant_method, "allow_flashinfer", False)
-        moe_backend = getattr(self.quant_method, "flashinfer_moe_backend", None)
-
         use_global_sf = (
-            allow_flashinfer
-            and is_flashinfer_supporting_global_sf(moe_backend)
+            getattr(self.quant_method, "use_global_sf", False)
             and "input_scale" in weight_name
-            and quant_method_name == "ModelOptNvFp4FusedMoE"
         )
 
         if expert_id == -1 and not use_global_sf:
@@ -1557,23 +1584,25 @@ class FusedMoE(CustomOp):
         )
 
         self.batched_router_logits = torch.zeros(
-            logits_shape, dtype=moe.in_dtype, device=torch.cuda.current_device()
+            logits_shape,
+            dtype=moe.router_logits_dtype,
+            device=torch.cuda.current_device(),
         )
 
-    def select_experts(
+    def _select_experts(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         use_fused_gate: bool | None = False,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Route the input hidden states to the top-k experts based on the
         router logits.
 
         Returns:
-                (topk_weights, topk_ids, zero_expert_result)
-                (tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
-                The weights, expert ids, and zero expert computation result.
+                (topk_weights, topk_ids)
+                (tuple[torch.Tensor, torch.Tensor]):
+                The weights and expert ids.
 
             **Compatibility**: When EPLB is not enabled, the returned ids are
             equivalent to global logical ids, so should be compatible with
@@ -1626,18 +1655,7 @@ class FusedMoE(CustomOp):
 
         # DeepSeekv2 uses grouped_top_k
         elif self.use_grouped_topk and valid_grouping():
-            assert self.topk_group is not None
-            assert self.num_expert_group is not None
-            # if rocm_aiter_ops.is_fused_moe_enabled():
-            #     if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
-            #         assert self.num_fused_shared_experts == 0
-            #     grouped_topk_impl = partial(
-            #         rocm_aiter_grouped_topk,
-            #         num_fused_shared_experts=self.num_fused_shared_experts,
-            #     )
-            # else:
-            grouped_topk_impl = grouped_topk
-
+            assert self._grouped_topk_impl is not None
             if use_fused_gate:
                 # if envs.VLLM_USE_LIGHTOP:
                 if False:
@@ -1661,15 +1679,9 @@ class FusedMoE(CustomOp):
                         n_share_experts_fusion=0,
                     )
             else:
-                topk_weights, topk_ids = grouped_topk_impl(
+                topk_weights, topk_ids = self._grouped_topk_impl(
                     hidden_states=hidden_states,
                     gating_output=router_logits,
-                    topk=self.top_k,
-                    renormalize=self.renormalize,
-                    num_expert_group=self.num_expert_group,
-                    topk_group=self.topk_group,
-                    scoring_func=self.scoring_func,
-                    routed_scaling_factor=self.routed_scaling_factor,
                     e_score_correction_bias=self.e_score_correction_bias,
                 )
         elif self.e_score_correction_bias is not None:
@@ -1711,23 +1723,19 @@ class FusedMoE(CustomOp):
 
         assert topk_ids.dtype == indices_type or indices_type is None
 
-        # Compute zero expert result if needed
         if (
-            self.zero_expert_num is not None
-            and self.zero_expert_num > 0
-            and self.zero_expert_type is not None
-            and self.global_num_experts is not None
+            self.vllm_config.model_config is not None
+            and self.vllm_config.model_config.enable_return_routed_experts
         ):
-            zero_expert_result = zero_experts_compute_triton(
-                expert_indices=topk_ids,
-                expert_scales=topk_weights,
-                num_experts=self.global_num_experts,
-                zero_expert_type=self.zero_expert_type,
-                hidden_states=hidden_states,
-            )
-        else:
-            zero_expert_result = None
-        return topk_weights, topk_ids, zero_expert_result
+            # In dummy runs, the capturer is not initialized.
+            capturer = RoutedExpertsCapturer.get_instance()
+            if capturer is not None:  # in dummmy_run may be None
+                capturer.capture(  # noqa
+                    layer_id=self.layer_id,
+                    topk_ids=topk_ids,
+                )
+
+        return topk_weights, topk_ids
 
     def must_reduce_shared_expert_outputs(self) -> bool:
         """
@@ -1782,27 +1790,22 @@ class FusedMoE(CustomOp):
             return states
 
         if self.shared_experts is None:
-            if current_platform.is_tpu():
+            if current_platform.is_tpu() or current_platform.is_cpu():
                 # TODO: Once the OOM issue for the TPU backend is resolved, we
                 # will switch to using the moe_forward custom op.
+                # Note: CPU doesn't require wrapped forward_impl.
                 fused_output = self.forward_impl(hidden_states, router_logits)
                 assert not isinstance(fused_output, tuple)
             else:
                 fused_output = torch.ops.vllm.moe_forward(
                     hidden_states, router_logits, self.layer_name
                 )
-            if self.zero_expert_num is not None and self.zero_expert_num > 0:
-                assert isinstance(fused_output, tuple)
-                fused_output, zero_expert_result = fused_output
-                return (reduce_output(fused_output) + zero_expert_result)[
-                    ..., :og_hidden_states
-                ]
-            else:
-                return reduce_output(fused_output)[..., :og_hidden_states]
+            return reduce_output(fused_output)[..., :og_hidden_states]
         else:
-            if current_platform.is_tpu():
+            if current_platform.is_tpu() or current_platform.is_cpu():
                 # TODO: Once the OOM issue for the TPU backend is resolved, we
                 # will switch to using the moe_forward custom op.
+                # Note: CPU doesn't require wrapped forward_impl.
                 shared_output, fused_output = self.forward_impl(
                     hidden_states, router_logits
                 )
@@ -1910,6 +1913,7 @@ class FusedMoE(CustomOp):
             # Matrix multiply.
             final_hidden_states = self.quant_method.apply(
                 layer=self,
+                router=self.router,
                 x=staged_hidden_states,
                 router_logits=staged_router_logits,
             )
@@ -1925,13 +1929,6 @@ class FusedMoE(CustomOp):
                     final_hidden_states,
                 )
 
-            if self.zero_expert_num is not None and self.zero_expert_num > 0:
-                assert isinstance(final_hidden_states, tuple)
-                assert self.shared_experts is None
-                final_hidden_states, zero_expert_result = final_hidden_states
-                if zero_expert_result is not None:
-                    final_hidden_states += zero_expert_result
-
             if not skip_result_store:
                 if self.shared_experts is None:
                     full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
@@ -2027,10 +2024,46 @@ class FusedMoE(CustomOp):
         )
 
         with sp_ctx:
+            extra_tensors = None
             if do_naive_dispatch_combine:
-                hidden_states_combined, router_logits = get_ep_group().dispatch(
-                    hidden_states, router_logits, self.is_sequence_parallel
+                # Avoid circular import
+                from vllm.model_executor.layers.quantization.modelopt import (
+                    ModelOptNvFp4FusedMoE,
+                )
+
+                post_quant_allgather = (
+                    self.quant_method is not None
+                    and self.dp_size > 1
+                    and self.use_ep
+                    and isinstance(self.quant_method, ModelOptNvFp4FusedMoE)
+                    and has_flashinfer_trtllm_fused_moe()
                 )
+                if post_quant_allgather:
+                    hidden_states_to_dispatch, extra_tensors = (
+                        self.quant_method.prepare_dp_allgather_tensor(
+                            self, hidden_states, router_logits
+                        )
+                    )
+                else:
+                    hidden_states_to_dispatch = hidden_states
+
+                dispatch_res = get_ep_group().dispatch(
+                    hidden_states_to_dispatch,
+                    router_logits,
+                    self.is_sequence_parallel,
+                    extra_tensors=extra_tensors,
+                )
+                if extra_tensors is not None:
+                    hidden_states_combined, router_logits, extra_tensors_combined = (
+                        dispatch_res
+                    )
+                    hidden_states_combined = (
+                        hidden_states_combined,
+                        extra_tensors_combined[0],
+                    )
+                else:
+                    hidden_states_combined, router_logits = dispatch_res
+
             # Run shared experts before matrix multiply.
             # because matrix multiply maybe modify the hidden_states.
             if has_separate_shared_experts and not use_shared_experts_stream:
@@ -2053,6 +2086,7 @@ class FusedMoE(CustomOp):
             # Matrix multiply.
             final_hidden_states = self.quant_method.apply(
                 layer=self,
+                router=self.router,
                 x=hidden_states_combined
                 if do_naive_dispatch_combine
                 else hidden_states,
@@ -2080,9 +2114,6 @@ class FusedMoE(CustomOp):
                     shared_output,
                     final_hidden_states,
                 )
-            elif self.zero_expert_num is not None and self.zero_expert_num > 0:
-                assert isinstance(final_hidden_states, tuple)
-                final_hidden_states, zero_expert_result = final_hidden_states
 
             def combine_output(states: torch.Tensor) -> torch.Tensor:
                 if do_naive_dispatch_combine:
@@ -2100,15 +2131,13 @@ class FusedMoE(CustomOp):
                     final_hidden_states[0],
                     combine_output(final_hidden_states[1]),
                 )
-            elif self.zero_expert_num is not None and self.zero_expert_num > 0:
-                assert isinstance(final_hidden_states, torch.Tensor)
-                return (combine_output(final_hidden_states), zero_expert_result)
             else:
                 return combine_output(final_hidden_states)
 
     @classmethod
     def make_expert_params_mapping(
         cls,
+        model: torch.nn.Module,
         ckpt_gate_proj_name: str,
         ckpt_down_proj_name: str,
         ckpt_up_proj_name: str,
@@ -2127,13 +2156,19 @@ class FusedMoE(CustomOp):
             )
         )
 
+        base_layer = (
+            "base_layer."
+            if any(".base_layer." in name for name, _ in model.named_parameters())
+            else ""
+        )
+
         return [
             # (param_name, weight_name, expert_id, shard_id)
             (
-                "experts.w13_"
+                f"experts.{base_layer}w13_"
                 if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
-                else "experts.w2_",
-                f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.",
+                else f"experts.{base_layer}w2_",
+                f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}",
                 expert_id,
                 shard_id,
             )
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index b0834e861338fbdc5caccf198e340c753f29df84..a6df2b20af9c90a4a0241cbed1b9966fa7cea2fc 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -18,10 +18,10 @@ from vllm.model_executor.layers.fused_moe.config import (
 )
 from vllm.model_executor.layers.fused_moe.utils import (
     _resize_cache,
+    apply_moe_activation,
     count_expert_num_tokens,
     disable_inplace,
 )
-from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.v1.worker.ubatching import (
     dbo_enabled,
@@ -543,6 +543,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Compute the shapes for the temporary and final outputs of the two gemms
@@ -573,19 +574,31 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         """
         raise NotImplementedError
 
+    @staticmethod
+    def adjust_N_for_activation(N: int, activation: str) -> int:
+        """
+        Calculate the output dimension for the activation function.
+
+        For *_no_mul activations (e.g. relu2_no_mul),
+        there's no gate/up split, so output size equals input size (N).
+
+        For regular gated activations (e.g., silu, gelu, swigluoai),
+        output size is N // 2 due to gate × activation(up) multiplication.
+
+        Args:
+            N: The intermediate size (width of w1/w3 weights).
+            activation: The activation function name.
+
+        Returns:
+            The output dimension after activation.
+        """
+        is_no_mul = activation.endswith("_no_mul")
+        return N if is_no_mul else N // 2
+
     def activation(
         self, activation: str, output: torch.Tensor, input: torch.Tensor
     ) -> None:
-        assert output.size(-1) * 2 == input.size(-1)
-        if activation == "silu":
-            torch.ops._C.silu_and_mul(output, input)
-        elif activation == "gelu":
-            torch.ops._C.gelu_and_mul(output, input)
-        elif activation == "swigluoai":
-            # alpha = 1.702, limit = 7.0
-            torch.ops._C.swigluoai_and_mul(output, input)
-        else:
-            raise ValueError(f"Unsupported FusedMoe activation: {activation}")
+        apply_moe_activation(activation, output, input)
 
     def enable_chunking(self):
         return (
@@ -682,14 +695,12 @@ class FusedMoEModularKernel(torch.nn.Module):
         prepare_finalize: FusedMoEPrepareAndFinalize,
         fused_experts: FusedMoEPermuteExpertsUnpermute,
         shared_experts: torch.nn.Module | None = None,
-        shared_experts_stream: torch.cuda.Stream | None = None,
         moe_parallel_config: FusedMoEParallelConfig | None = None,
     ):
         super().__init__()
         self.prepare_finalize = prepare_finalize
         self.fused_experts = fused_experts
         self.shared_experts = shared_experts
-        self.shared_experts_stream = shared_experts_stream
 
         # prefer an explicit FusedMoEParallelConfig when available (from
         # FusedMoE layers / tests).
@@ -743,7 +754,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             1,
             (
                 M
-                if not self.fused_experts.supports_chunking()
+                if not self.fused_experts.enable_chunking()
                 else min(M, envs.VLLM_FUSED_MOE_CHUNK_SIZE)
             ),
         )
@@ -764,6 +775,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Allocate temporary and output buffers for the fused experts op.
@@ -786,7 +798,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             is_forward_context_available()
             and get_forward_context().attn_metadata is None
         )
-        if is_profile_run and self.fused_experts.supports_chunking() and self.is_dp_ep:
+        if is_profile_run and self.fused_experts.enable_chunking() and self.is_dp_ep:
             max_workspace_13, max_workspace_2, max_fused_out_shape = (
                 self.fused_experts.workspace_shapes(
                     envs.VLLM_FUSED_MOE_CHUNK_SIZE,
@@ -799,6 +811,7 @@ class FusedMoEModularKernel(torch.nn.Module):
                     # amount of workspace. Mark it None, so we allocate for
                     # the worst-case scenario.
                     expert_tokens_meta=None,
+                    activation=activation,
                 )
             )
 
@@ -817,6 +830,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             global_num_experts,
             local_num_experts,
             expert_tokens_meta,
+            activation,
         )
 
         # Get final output shape based on the full M size.
@@ -828,6 +842,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             global_num_experts,
             local_num_experts,
             expert_tokens_meta,
+            activation,
         )
 
         # We can reuse the memory between cache1 and cache3 because by the
@@ -904,34 +919,6 @@ class FusedMoEModularKernel(torch.nn.Module):
             expert_num_tokens_cpu=c_expert_num_tokens_cpu,
         )
 
-    def _maybe_setup_shared_experts_stream(
-        self, hidden_states: torch.Tensor
-    ) -> tuple[bool, torch.Tensor | None]:
-        # decide whether to run shared experts on a separate CUDA stream to
-        # overlap with the main fused MoE kernel.
-        use_shared_experts_stream = (
-            self.shared_experts is not None
-            and self.shared_experts_stream is not None
-            and hidden_states.is_cuda
-            and (
-                hidden_states.shape[0]
-                <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD
-            )
-        )
-
-        hidden_states_clone: torch.Tensor | None = None
-        if use_shared_experts_stream and self.shared_experts_stream is not None:
-            # TODO: Optimize this (complicated)
-            # Note: this clone adds overhead but is required
-            # for correctness with multiple CUDA streams and CUDA graph capture.
-            hidden_states_clone = hidden_states.clone()
-            # record that the clone will be used by the separate stream so its
-            # lifetime is correctly tracked.
-            hidden_states_clone.record_stream(self.shared_experts_stream)
-            self.shared_experts_stream.wait_stream(torch.cuda.current_stream())
-
-        return use_shared_experts_stream, hidden_states_clone
-
     def _prepare(
         self,
         hidden_states: torch.Tensor,
@@ -1074,6 +1061,7 @@ class FusedMoEModularKernel(torch.nn.Module):
                 global_num_experts,
                 local_num_experts,
                 expert_tokens_meta,
+                activation,
             )
 
         for chunk_idx in range(num_chunks):
@@ -1119,30 +1107,12 @@ class FusedMoEModularKernel(torch.nn.Module):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
-        hidden_states_clone: torch.Tensor | None = None,
-        use_shared_experts_stream: bool = False,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """
         The _finalize method is a wrapper around self.prepare_finalize.finalize
         that handles DBO, async and shared expert overlap.
         """
-
-        def maybe_run_shared_experts() -> torch.Tensor | None:
-            if self.shared_experts is None:
-                return None
-
-            if (
-                not use_shared_experts_stream
-                or self.shared_experts_stream is not None
-                and (not hidden_states.is_cuda or not torch.cuda.is_available())
-            ):
-                # fall back to running on the current stream
-                return self.shared_experts(hidden_states)
-
-            assert hidden_states_clone is not None
-            # launch shared experts on the dedicated stream.
-            with torch.cuda.stream(self.shared_experts_stream):
-                return self.shared_experts(hidden_states_clone)
+        shared_output: torch.Tensor | None = None
 
         if not self.prepare_finalize.supports_async():
             assert not dbo_enabled()
@@ -1155,7 +1125,8 @@ class FusedMoEModularKernel(torch.nn.Module):
                 apply_router_weight_on_input,
                 self.fused_experts.finalize_weight_and_reduce_impl(),
             )
-            shared_output = maybe_run_shared_experts()
+            if self.shared_experts is not None:
+                shared_output = self.shared_experts(hidden_states)
         else:
             finalize_ret = self.prepare_finalize.finalize_async(
                 output,
@@ -1165,8 +1136,8 @@ class FusedMoEModularKernel(torch.nn.Module):
                 apply_router_weight_on_input,
                 self.fused_experts.finalize_weight_and_reduce_impl(),
             )
-
-            shared_output = maybe_run_shared_experts()
+            if self.shared_experts is not None:
+                shared_output = self.shared_experts(hidden_states)
 
             # TODO(lucas): refactor this in the alternative schedules followup
             # currently unpack if we have hook + receiver pair or just
@@ -1189,28 +1160,12 @@ class FusedMoEModularKernel(torch.nn.Module):
 
             receiver()
 
-        self._wait_for_shared_experts_stream(hidden_states, use_shared_experts_stream)
-
         if self.shared_experts is None:
             return output
         else:
             assert shared_output is not None
             return shared_output, output
 
-    def _wait_for_shared_experts_stream(
-        self, hidden_states: torch.Tensor, use_shared_experts_stream: bool
-    ) -> None:
-        # ensure that any work enqueued on the shared_experts_stream is
-        # completed before the shared_output tensor is consumed
-        if (
-            self.shared_experts is not None
-            and use_shared_experts_stream
-            and self.shared_experts_stream is not None
-            and hidden_states.is_cuda
-            and current_platform.is_cuda()
-        ):
-            torch.cuda.current_stream().wait_stream(self.shared_experts_stream)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -1257,10 +1212,6 @@ class FusedMoEModularKernel(torch.nn.Module):
         else:
             output = torch.zeros_like(hidden_states)
 
-        use_shared_experts_stream, hidden_states_clone = (
-            self._maybe_setup_shared_experts_stream(hidden_states)
-        )
-
         local_num_experts = w1.size(0)
         if global_num_experts == -1:
             global_num_experts = local_num_experts
@@ -1297,6 +1248,4 @@ class FusedMoEModularKernel(torch.nn.Module):
             topk_weights,
             topk_ids,
             apply_router_weight_on_input,
-            hidden_states_clone=hidden_states_clone,
-            use_shared_experts_stream=use_shared_experts_stream,
         )
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
deleted file mode 100644
index 66c00cf89873a07837627cf5a57bb07e7c43ff8e..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-import torch.nn.functional as F
-
-
-def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
-    """
-    Compute the histogram of an int32 tensor. The bin edges are defined by the
-    min and max values, with step = 1.
-    """
-    assert input.dtype == torch.int32, "input must be of torch.int32 dtype."
-    assert min <= max, "min must be less than or equal to max."
-
-    def searchsorted(
-        sorted_sequence: torch.Tensor, values_to_search: torch.Tensor
-    ) -> torch.Tensor:
-        return (sorted_sequence.unsqueeze(1) == values_to_search).sum(dim=1)
-
-    bin_edges = torch.linspace(min, max, max - min + 1, dtype=input.dtype).to(
-        input.device
-    )
-    return searchsorted(bin_edges, input).to(torch.int32)
-
-
-def fused_moe(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    gating_output: torch.Tensor,
-    topk: int,
-    global_num_experts: int,
-    expert_map: torch.Tensor = None,
-    renormalize: bool = False,
-) -> torch.Tensor:
-    """
-    Args:
-        hidden_states: [*, hidden_size]
-        w1: [num_experts, intermediate_size * 2, hidden_size]
-        w2: [num_experts, hidden_size, intermediate_size]
-        gating_output: [*, num_experts]
-    """
-    assert expert_map is None, "expert_map is not supported for pallas MoE."
-    import torch_xla.experimental.custom_kernel  # noqa: F401
-
-    orig_shape = hidden_states.shape
-    hidden_size = hidden_states.shape[-1]
-    num_tokens = hidden_states.shape[:-1].numel()
-    num_experts = w1.shape[0]
-    intermediate_size = w2.shape[-1]
-    device = hidden_states.device
-    dtype = hidden_states.dtype
-    assert (num_tokens * topk) % 16 == 0, (
-        "The Pallas GMM kernel requires num_tokens * topk to be a multiple of "
-        f"16 but got {num_tokens * topk}"
-    )
-
-    hidden_states = hidden_states.view(num_tokens, hidden_size)
-    gating_output = gating_output.view(num_tokens, num_experts)
-    topk_weights = gating_output.softmax(dim=-1, dtype=torch.float)
-    topk_weights, topk_indices = topk_weights.topk(topk, dim=-1)
-    if renormalize:
-        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
-    topk_weights = topk_weights.to(dtype)
-
-    topk_indices = topk_indices.flatten()
-    topk_argsort_indices = topk_indices.argsort()
-    topk_argsort_revert_indices = topk_argsort_indices.argsort()
-    token_indices = torch.arange(num_tokens, device=device).repeat_interleave(topk)
-    token_indices = token_indices[topk_argsort_indices]
-    group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1)
-
-    x = hidden_states[token_indices]
-    x = torch.ops.xla.gmm(x, w1, group_sizes, transpose_rhs=True)
-    x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:]
-    x = torch.ops.xla.gmm(x, w2, group_sizes, transpose_rhs=True)
-    x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size)
-
-    x = x * topk_weights.unsqueeze(dim=-1)
-    x = x.sum(dim=-2)
-    x = x.reshape(orig_shape)
-    return x
diff --git a/vllm/model_executor/layers/fused_moe/oracle/__init__.py b/vllm/model_executor/layers/fused_moe/oracle/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..208f01a7cb5ee04c88d276fec2082cd4e830884b
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/oracle/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5c3b9af611fb0e5f6e03a913b8b2f9292fe2c49
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -0,0 +1,358 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from enum import Enum
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import envs
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+    fp8_w8a8_moe_quant_config,
+    fp8_w8a16_moe_quant_config,
+)
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    FlashinferMoeBackend,
+    get_flashinfer_moe_backend,
+    make_fp8_moe_alpha_scales_for_fi,
+    prepare_fp8_moe_layer_for_fi,
+)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    prepare_fp8_moe_layer_for_deepgemm,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+    prepare_fp8_moe_layer_for_marlin,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    cutlass_group_gemm_supported,
+)
+from vllm.platforms import current_platform
+from vllm.utils.deep_gemm import is_deep_gemm_supported
+from vllm.utils.flashinfer import has_flashinfer_moe
+from vllm.utils.import_utils import has_deep_gemm
+
+logger = init_logger(__name__)
+
+
+class Fp8MoeBackend(Enum):
+    NONE = 0
+    FLASHINFER_TRTLLM = 1
+    FLASHINFER_CUTLASS = 2
+    DEEPGEMM = 3
+    MARLIN = 4
+    TRITON = 5
+    AITER = 6
+    VLLM_CUTLASS = 7
+
+
+def select_fp8_moe_backend(
+    block_quant: bool,
+    tp_size: int,
+    with_lora_support: bool,
+    is_act_and_mul: bool = True,
+    allow_vllm_cutlass: bool = False,
+) -> Fp8MoeBackend:
+    """
+    Select the primary FP8 MoE backend
+    Note: Shape-specific fallbacks may still occur at runtime.
+    """
+    # TODO(rob): in a future PR, we will query each mk for
+    # supported features and return the mk directly, just like
+    # we do for the Attention Backend.
+
+    if with_lora_support:
+        return Fp8MoeBackend.TRITON
+
+    def _make_log_backend(backend_name: str):
+        return f"Using {backend_name} backend for FP8 MoE"
+
+    # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
+    if (
+        current_platform.is_cuda()
+        and (
+            current_platform.is_device_capability_family(100)
+            or current_platform.is_device_capability(90)
+        )
+        and envs.VLLM_USE_FLASHINFER_MOE_FP8
+        and has_flashinfer_moe()
+    ):
+        backend = get_flashinfer_moe_backend()
+        if backend == FlashinferMoeBackend.TENSORRT_LLM:
+            logger.info_once(_make_log_backend("FlashInfer TRTLLM"))
+            if not is_act_and_mul:
+                raise ValueError(
+                    "FlashInfer TRTLLM FP8 MoE backend only supports "
+                    "act_and_mul gate_up_project fusion. Please set "
+                    "VLLM_USE_FLASHINFER_MOE_FP8=throughput to use the "
+                    "FlashInfer CUTLASS backend instead."
+                )
+            return Fp8MoeBackend.FLASHINFER_TRTLLM
+        else:
+            if block_quant and current_platform.is_device_capability_family(100):
+                raise ValueError(
+                    "FlashInfer FP8 MoE throughput backend does not "
+                    "support block quantization on SM100. Please use "
+                    "VLLM_FLASHINFER_MOE_BACKEND=latency to use the "
+                    "FlashInfer TRTLLM backend instead."
+                )
+            logger.info_once(_make_log_backend("FlashInfer CUTLASS"))
+            return Fp8MoeBackend.FLASHINFER_CUTLASS
+
+    # weight-only path for older GPUs without native FP8
+    if (
+        current_platform.is_cuda() and not current_platform.has_device_capability(89)
+    ) or envs.VLLM_TEST_FORCE_FP8_MARLIN:
+        logger.info_once(_make_log_backend("Marlin"), scope="local")
+        return Fp8MoeBackend.MARLIN
+
+    # Determine if we should use DeepGEMM with block-quantized weights:
+    # - If explicitly set by user, respect their choice
+    # - If not explicitly set (default), disable when TP size is >= 8
+    moe_use_deep_gemm = envs.VLLM_MOE_USE_DEEP_GEMM
+    if not envs.is_set("VLLM_MOE_USE_DEEP_GEMM") and tp_size >= 8:
+        moe_use_deep_gemm = False
+        logger.info_once(
+            "DeepGEMM MoE is disabled by default when TP size is >= 8. "
+            "Set VLLM_MOE_USE_DEEP_GEMM=1 to enable it.",
+            scope="local",
+        )
+
+    use_deep_gemm = envs.VLLM_USE_DEEP_GEMM
+    if not is_deep_gemm_supported():
+        use_deep_gemm = False
+        logger.info_once(
+            "DeepGEMM is disabled because the platform does not support it.",
+            scope="local",
+        )
+
+    if use_deep_gemm and moe_use_deep_gemm and block_quant:
+        if not has_deep_gemm():
+            logger.warning_once(
+                "DeepGEMM backend requested but not available.", scope="local"
+            )
+        elif is_deep_gemm_supported():
+            logger.info_once(_make_log_backend("DeepGEMM"), scope="local")
+            return Fp8MoeBackend.DEEPGEMM
+
+    if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MOE:
+        logger.info_once(_make_log_backend("ROCm AITER"), scope="local")
+        return Fp8MoeBackend.AITER
+
+    if allow_vllm_cutlass and not block_quant and cutlass_group_gemm_supported():
+        logger.info_once(_make_log_backend("vLLM CUTLASS"), scope="local")
+        return Fp8MoeBackend.VLLM_CUTLASS
+
+    # default to Triton
+    logger.info_once(_make_log_backend("Triton"), scope="local")
+    return Fp8MoeBackend.TRITON
+
+
+def convert_to_fp8_moe_kernel_format(
+    fp8_backend: Fp8MoeBackend,
+    layer: torch.nn.Module,
+    w13: torch.Tensor,
+    w2: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w13_input_scale: torch.Tensor | None,
+    w2_input_scale: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    block_quant = hasattr(layer, "weight_block_size")
+    if fp8_backend == Fp8MoeBackend.DEEPGEMM:
+        assert block_quant
+        w13, w2, w13_scale, w2_scale = prepare_fp8_moe_layer_for_deepgemm(
+            w13,
+            w2,
+            w13_scale,
+            w2_scale,
+            tuple(layer.weight_block_size),
+        )
+    elif fp8_backend == Fp8MoeBackend.AITER:
+        w13, w2 = rocm_aiter_ops.shuffle_weights(w13, w2)
+    elif fp8_backend == Fp8MoeBackend.MARLIN:
+        w13, w2, w13_scale, w2_scale = prepare_fp8_moe_layer_for_marlin(
+            layer,
+            w13,
+            w2,
+            w13_scale,
+            w2_scale,
+        )
+    elif fp8_backend in [
+        Fp8MoeBackend.FLASHINFER_CUTLASS,
+        Fp8MoeBackend.FLASHINFER_TRTLLM,
+    ]:
+        w13, w2, w13_scale = prepare_fp8_moe_layer_for_fi(
+            layer=layer,
+            w13=w13,
+            w2=w2,
+            w13_scale=w13_scale,
+            w13_input_scale=w13_input_scale,
+            w2_scale=w2_scale,
+            w2_input_scale=w2_input_scale,
+            is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM),
+        )
+
+    return w13, w2, w13_scale, w2_scale
+
+
+def make_fp8_moe_quant_config(
+    fp8_backend: Fp8MoeBackend,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    a1_scale: torch.Tensor | None,
+    a2_scale: torch.Tensor | None,
+    block_shape: list[int] | None = None,
+) -> FusedMoEQuantConfig | None:
+    """
+    Create FusedMoEQuantConfig for the specifed FP8 Backend.
+    The FusedMoEQuantConfig holds the scales that are used
+    at runtime by the Modular Kernel abstraction.
+
+    Note that certain kernels (e.g. Flashinfer CUTLASS) need
+    special Quant configs to handle non-standard inputs to
+    their kernel interfaces.
+
+    In a future PR, we will have this function should be
+    a method of the modular kernel itself.
+    """
+    # TRTLLM does not use Modular Kernel abstraction yet.
+    if fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
+        return None
+
+    # MARLIN is mixed precision W8A16 config.
+    if fp8_backend == Fp8MoeBackend.MARLIN:
+        return fp8_w8a16_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            block_shape=block_shape,
+        )
+
+    # Flashinfer CUTLASS per-tensor uses single dq scale
+    # (alpha = w_scale * a_scale) and inverse a2 scale.
+    if fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS and block_shape is None:
+        assert a1_scale is not None and a2_scale is not None
+        g1_alphas, g2_alphas = make_fp8_moe_alpha_scales_for_fi(
+            w1_scale,
+            a1_scale,
+            w2_scale,
+            a2_scale,
+        )
+        return fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            a1_gscale=(1.0 / a1_scale),
+            a2_gscale=(1.0 / a2_scale),
+            g1_alphas=g1_alphas,
+            g2_alphas=g2_alphas,
+        )
+    # All other backends use normal config.
+    return fp8_w8a8_moe_quant_config(
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+    )
+
+
+def make_fp8_moe_kernel(
+    layer: torch.nn.Module,
+    moe_quant_config: FusedMoEQuantConfig,
+    moe_config: FusedMoEConfig,
+    fp8_backend: Fp8MoeBackend,
+) -> tuple[mk.FusedMoEModularKernel, bool]:
+    # Delayed import is required since the oracle is imported
+    # by CPU backends which cannot import all of these experts.
+    # TODO: update the experts to make this not happen.
+    from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+        MoEPrepareAndFinalizeNoEP,
+    )
+
+    # NOTE(rob): this is a WIP refactor. We are first migrating
+    # all of the kernels in the TP case to use mk. Once this is
+    # done, then we will initialzie the TP case and DP/EP case
+    # via the same code path (i.e. via maybe_init_modular_kernel).
+    # NOTE(rob): in progress migrating all into this format.
+    use_inplace = True
+    if fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
+        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+            FlashInferExperts,
+        )
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(
+                defer_input_quant=moe_quant_config.is_block_quantized
+            ),
+            FlashInferExperts(
+                out_dtype=layer.orig_dtype,
+                quant_config=moe_quant_config,
+                ep_rank=moe_config.ep_rank,
+                ep_size=moe_config.ep_size,
+                tp_rank=moe_config.tp_rank,
+                tp_size=moe_config.tp_size,
+                use_dp=(moe_config.dp_size > 1),
+                use_deepseek_fp8_block_scale=moe_quant_config.is_block_quantized,
+            ),
+        )
+        use_inplace = False
+
+    elif fp8_backend == Fp8MoeBackend.AITER:
+        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+            AiterExperts,
+        )
+
+        kernel = mk.FusedMoEModularKernel(
+            # TODO: make defer_input_quant an attr of the AiterExperts
+            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            AiterExperts(quant_config=moe_quant_config),
+        )
+    elif fp8_backend == Fp8MoeBackend.MARLIN:
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+            MarlinExperts,
+        )
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            MarlinExperts(quant_config=moe_quant_config),
+        )
+    elif fp8_backend == Fp8MoeBackend.VLLM_CUTLASS:
+        from vllm.model_executor.layers.fused_moe.triton_cutlass_moe import (
+            TritonOrCutlassExperts,
+        )
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            TritonOrCutlassExperts(
+                out_dtype=moe_config.in_dtype,
+                e=layer.local_num_experts,
+                n=layer.intermediate_size_per_partition,
+                k=layer.hidden_size,
+                device=layer.w13_weight.device,
+                quant_config=moe_quant_config,
+            ),
+        )
+    elif fp8_backend == Fp8MoeBackend.DEEPGEMM:
+        from vllm.model_executor.layers.fused_moe import (
+            TritonOrDeepGemmExperts,
+        )
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            TritonOrDeepGemmExperts(quant_config=moe_quant_config),
+        )
+    else:
+        from vllm.model_executor.layers.fused_moe.fused_moe import (
+            TritonExperts,
+        )
+
+        assert fp8_backend == Fp8MoeBackend.TRITON
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            TritonExperts(quant_config=moe_quant_config),
+        )
+    return kernel, use_inplace
diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
new file mode 100644
index 0000000000000000000000000000000000000000..547a2a795d1940e0bf5aaf4415431836db0f8bc8
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
@@ -0,0 +1,280 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from enum import Enum
+
+import torch
+
+import vllm.envs as envs
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+    nvfp4_moe_quant_config,
+    nvfp4_w4a16_moe_quant_config,
+)
+from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+    CutlassExpertsFp4,
+)
+from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+    FlashInferExperts,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    MarlinExperts,
+)
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
+from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
+    is_flashinfer_fp4_cutedsl_moe_available,
+    is_flashinfer_fp4_cutlass_moe_available,
+    prepare_nvfp4_moe_layer_for_fi_or_cutlass,
+)
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    FlashinferMoeBackend,
+    get_flashinfer_moe_backend,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
+    is_fp4_marlin_supported,
+    prepare_nvfp4_moe_layer_for_marlin,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    cutlass_fp4_supported,
+)
+
+logger = init_logger(__name__)
+
+
+class NvFp4MoeBackend(Enum):
+    FLASHINFER_CUTLASS = "FlashInfer CUTLASS"
+    FLASHINFER_TRTLLM = "FlashInfer TRTLLM"
+    FLASHINFER_CUTEDSL = "FlashInfer CUTEDSL"
+    VLLM_CUTLASS = "vLLM CUTASS"
+    MARLIN = "vLLM MARLIN"
+
+
+FLASHINFER_NVFP4_MOE_BACKENDS = [
+    NvFp4MoeBackend.FLASHINFER_CUTLASS,
+    NvFp4MoeBackend.FLASHINFER_TRTLLM,
+    NvFp4MoeBackend.FLASHINFER_CUTEDSL,
+]
+
+fi_2_vllm_backend_map: dict[FlashinferMoeBackend, NvFp4MoeBackend] = {
+    FlashinferMoeBackend.CUTLASS: NvFp4MoeBackend.FLASHINFER_CUTLASS,
+    FlashinferMoeBackend.TENSORRT_LLM: NvFp4MoeBackend.FLASHINFER_TRTLLM,
+    FlashinferMoeBackend.CUTEDSL: NvFp4MoeBackend.FLASHINFER_CUTEDSL,
+}
+
+
+def is_global_sf_supported_for_nvfp4_backend(backend: NvFp4MoeBackend) -> bool:
+    # Checks whether `backend` supports quantizing with scaling factors
+    # of all experts in Expert Parallel Mode when all experts are not
+    # on the same rank.
+
+    return backend in [
+        NvFp4MoeBackend.FLASHINFER_CUTLASS,
+        NvFp4MoeBackend.FLASHINFER_TRTLLM,
+    ]
+
+
+def select_nvfp4_moe_backend() -> NvFp4MoeBackend:
+    def _make_log_backend(backend: NvFp4MoeBackend):
+        return f"Using {backend.value} backend for NvFp4 MoE"
+
+    if cutlass_fp4_supported() and not envs.VLLM_TEST_FORCE_FP8_MARLIN:
+        allow_flashinfer = (
+            is_flashinfer_fp4_cutlass_moe_available()
+            or is_flashinfer_fp4_cutedsl_moe_available()
+        )
+        if allow_flashinfer and envs.VLLM_USE_FLASHINFER_MOE_FP4:
+            backend = fi_2_vllm_backend_map[get_flashinfer_moe_backend()]
+        else:
+            backend = NvFp4MoeBackend.VLLM_CUTLASS
+    elif is_fp4_marlin_supported():
+        backend = NvFp4MoeBackend.MARLIN
+    else:
+        raise ValueError("No NvFp4 kernel backend available for NvFp4 MoE.")
+
+    # Log warning if FI backend requested but not available.
+    if (
+        backend not in FLASHINFER_NVFP4_MOE_BACKENDS
+        and envs.VLLM_USE_FLASHINFER_MOE_FP4
+    ):
+        logger.warning_once(
+            "Requested FlashInfer backend for NvFp4 MoE, but it's not available. "
+            "Falling back to %s for NvFp4 MoE",
+            backend.value,
+            scope="local",
+        )
+    else:
+        logger.info_once(_make_log_backend(backend), scope="local")
+    return backend
+
+
+def convert_to_nvfp4_moe_kernel_format(
+    nvfp4_backend: NvFp4MoeBackend,
+    layer: torch.nn.Module,
+    w13: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w13_scale_2: torch.Tensor,
+    a13_scale: torch.Tensor | None,
+    w2: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w2_scale_2: torch.Tensor,
+    a2_scale: torch.Tensor | None,
+    is_act_and_mul: bool,
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
+    if (
+        nvfp4_backend in FLASHINFER_NVFP4_MOE_BACKENDS
+        or nvfp4_backend == NvFp4MoeBackend.VLLM_CUTLASS
+    ):
+        (
+            w13,
+            w13_scale,
+            w13_scale_2,
+            a13_scale,
+            w2,
+            w2_scale,
+            w2_scale_2,
+            a2_scale,
+        ) = prepare_nvfp4_moe_layer_for_fi_or_cutlass(
+            backend=nvfp4_backend,
+            layer=layer,
+            w13=w13,
+            w13_scale=w13_scale,
+            w13_scale_2=w13_scale_2,
+            a13_scale=a13_scale,
+            w2=w2,
+            w2_scale=w2_scale,
+            w2_scale_2=w2_scale_2,
+            a2_scale=a2_scale,
+            is_act_and_mul=is_act_and_mul,
+        )
+    elif nvfp4_backend == NvFp4MoeBackend.MARLIN:
+        a13_scale = None
+        a2_scale = None
+        (
+            w13,
+            w13_scale,
+            w13_scale_2,
+            w2,
+            w2_scale,
+            w2_scale_2,
+        ) = prepare_nvfp4_moe_layer_for_marlin(
+            layer=layer,
+            w13=w13,
+            w13_scale=w13_scale,
+            w13_scale_2=w13_scale_2,
+            w2=w2,
+            w2_scale=w2_scale,
+            w2_scale_2=w2_scale_2,
+        )
+    else:
+        raise ValueError(f"Unknown NvFp4 backend for MoE: {nvfp4_backend}")
+
+    return (
+        w13,
+        w13_scale,
+        w13_scale_2,
+        a13_scale,
+        w2,
+        w2_scale,
+        w2_scale_2,
+        a2_scale,
+    )
+
+
+def make_nvfp4_moe_quant_config(
+    backend: NvFp4MoeBackend,
+    w13_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w13_scale_2: torch.Tensor,
+    w2_scale_2: torch.Tensor,
+    a13_scale: torch.Tensor,
+    a2_scale: torch.Tensor,
+) -> FusedMoEQuantConfig | None:
+    UNSUPPORTED = [NvFp4MoeBackend.FLASHINFER_TRTLLM]
+    if backend in UNSUPPORTED:
+        return None
+
+    elif backend == NvFp4MoeBackend.MARLIN:
+        return nvfp4_w4a16_moe_quant_config(
+            g1_alphas=w13_scale_2,
+            g2_alphas=w2_scale_2,
+            w1_scale=w13_scale,
+            w2_scale=w2_scale,
+        )
+
+    g1_alphas = a13_scale * w13_scale_2
+    g2_alphas = a2_scale * w2_scale_2
+    return nvfp4_moe_quant_config(
+        g1_alphas=g1_alphas,
+        g2_alphas=g2_alphas,
+        a1_gscale=(1.0 / a13_scale),
+        a2_gscale=(1.0 / a2_scale),
+        w1_scale=w13_scale,
+        w2_scale=w2_scale,
+    )
+
+
+def make_nvfp4_moe_kernel(
+    backend: NvFp4MoeBackend,
+    quant_config: FusedMoEQuantConfig,
+    moe_config: FusedMoEConfig,
+) -> mk.FusedMoEModularKernel | None:
+    assert moe_config.dp_size == 1
+
+    UNSUPPORTED_BACKENDS = [
+        # TRTLLM does not use the modular kernl abstraction.
+        NvFp4MoeBackend.FLASHINFER_TRTLLM,
+        # CUTEDSL is used with BATCHED (masked) format only.
+        # TODO: add here once we support dp/ep via the oracle.
+        NvFp4MoeBackend.FLASHINFER_CUTEDSL,
+    ]
+
+    if backend in UNSUPPORTED_BACKENDS:
+        return None
+
+    elif backend == NvFp4MoeBackend.FLASHINFER_CUTLASS:
+        return mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            FlashInferExperts(
+                out_dtype=moe_config.in_dtype,
+                quant_config=quant_config,
+                ep_rank=moe_config.ep_rank,
+                ep_size=moe_config.ep_size,
+                tp_rank=moe_config.tp_rank,
+                tp_size=moe_config.tp_size,
+                use_dp=False,
+                use_deepseek_fp8_block_scale=False,
+            ),
+        )
+
+    elif backend == NvFp4MoeBackend.VLLM_CUTLASS:
+        return mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            CutlassExpertsFp4(
+                out_dtype=moe_config.in_dtype,
+                # TODO(rob): see what impact this has on expert map?
+                max_experts_per_worker=moe_config.num_experts,
+                quant_config=quant_config,
+            ),
+        )
+
+    elif backend == NvFp4MoeBackend.MARLIN:
+        return mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            MarlinExperts(quant_config=quant_config),
+        )
+
+    else:
+        raise ValueError(f"Unknown NvFp4 MoE backend: {backend}")
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
index e27e2eb32da0f8fc14750b68b7705196ca7708ae..5d806fa843a3cf8a2d16ac3367b7a46e62c4186a 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -13,6 +13,10 @@ from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 
 
 class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
+    def __init__(self, defer_input_quant: bool = False) -> None:
+        super().__init__()
+        self.defer_input_quant = defer_input_quant
+
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
@@ -48,6 +52,11 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
             # Note: do not use inplace for shared experts overlap
             a1 = a1 * topk_weights.to(a1.dtype)
 
+        # Defer input quant to moe kernel for backends (e.g. AITER, FI)
+        # which use a single kernel call for quant + experts.
+        if self.defer_input_quant:
+            return a1, None, None, None, None
+
         a1q, a1q_scale = moe_kernel_quantize_input(
             a1,
             quant_config.a1_scale,
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 882ad0a537cd5fbbd0253d8e14bee8bddc739a50..b78794c6bd83cee397c4582bd9c1c743814bde91 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -5,11 +5,15 @@ from functools import lru_cache
 
 import torch
 
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.layers.fused_moe.config import (
     FUSED_MOE_UNQUANTIZED_CONFIG,
     FusedMoEQuantConfig,
 )
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceNoOP,
+)
 
 
 class QuantMethod(IntEnum):
@@ -227,8 +231,7 @@ def rocm_aiter_fused_experts(
         # w8a8 block-scaled
         if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
             assert not apply_router_weight_on_input, (
-                "apply_router_weight_on_input is\
-                not supported for block scaled moe"
+                "apply_router_weight_on_input is not supported for block scaled moe"
             )
             assert quant_config.w1_scale is not None
             assert quant_config.w2_scale is not None
@@ -263,3 +266,82 @@ def rocm_aiter_fused_experts(
             a2_scale=quant_config.a2_scale,
             doweight_stage1=apply_router_weight_on_input,
         )
+
+
+class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute):
+    def __init__(self, quant_config):
+        super().__init__(quant_config)
+
+    @property
+    def activation_formats(
+        self,
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
+
+    def supports_expert_map(self):
+        return True
+
+    def supports_chunking(self):
+        return False
+
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        return TopKWeightAndReduceNoOP()
+
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        # Workspaces are managed internally by AITER.
+        workspace1 = (0,)
+        workspace2 = (0,)
+        output = (M, K)
+        return (workspace1, workspace2, output)
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        # TODO(rob): rocm_aiter_fused_experts uses self.quant_config's
+        # a_scales for static quantization. Update this to fit better
+        # with the interface once all quant integrations are complete.
+        assert a1q_scale is None
+        assert a2_scale == self.quant_config.a2_scale
+        assert expert_tokens_meta is None
+
+        result = rocm_aiter_fused_experts(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            activation=activation,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            expert_map=expert_map,
+            quant_config=self.quant_config,
+        )
+        assert result.shape == output.shape
+        output.copy_(result)
diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fd788ea571e55a3ff9886e7b70ab138577472a1
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
@@ -0,0 +1,324 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from
+# https://github.com/sgl-project/sglang/blob/bed301a5acaa9577c9aa706468bdf242f6a43051/python/sglang/srt/layers/moe/routed_experts_capturer.py
+
+from __future__ import annotations
+
+import fcntl
+import logging
+import os
+import tempfile
+from collections.abc import Generator
+from contextlib import contextmanager
+from multiprocessing import shared_memory
+from unittest.mock import patch
+
+import numpy as np
+import torch
+
+from vllm.config import ModelConfig
+from vllm.distributed import get_tensor_model_parallel_rank
+
+logger = logging.getLogger(__name__)
+
+# Constants
+_TMP_DIR = tempfile.gettempdir()
+_LOCK_FILE_PREFIX = os.path.join(_TMP_DIR, "vllm_routed_experts")
+_BUFFER_PREFIX = "vllm_routed_experts_buffer"
+
+# Global singleton instances
+_global_experts_capturer: RoutedExpertsCapturer | None = None
+_global_experts_reader: RoutedExpertsReader | None = None
+
+
+@contextmanager
+def _file_lock(lock_file: str, mode: str = "wb+") -> Generator[None, None, None]:
+    """Context manager for file-based locking."""
+    with open(lock_file, mode) as fp:
+        fcntl.flock(fp, fcntl.LOCK_EX)
+        try:
+            yield
+        finally:
+            fcntl.flock(fp, fcntl.LOCK_UN)
+
+
+def _create_or_attach_shared_memory(
+    name: str, size: int, lock_file: str
+) -> shared_memory.SharedMemory:
+    """Create or attach to shared memory with proper locking."""
+    # Ensure lock file exists before acquiring lock
+    with open(lock_file, "wb"):
+        pass
+
+    with _file_lock(lock_file):
+        try:
+            shm = shared_memory.SharedMemory(name=name, create=True, size=size)
+        except FileExistsError:
+            shm = shared_memory.SharedMemory(name=name, create=False, size=size)
+
+        if shm.size != size:
+            logger.warning(
+                "Shared memory %s size mismatch; recreating",
+                name,
+            )
+            shm.close()
+            shm.unlink()
+            try:
+                shm = shared_memory.SharedMemory(name=name, create=True, size=size)
+                logger.info("Created shared memory %s", name)
+            except FileExistsError:
+                shm = shared_memory.SharedMemory(name=name, create=False, size=size)
+                logger.info("Linked to existing shared memory %s", name)
+
+    return shm
+
+
+class RoutedExpertsCapturer:
+    """
+    Capturer for routed experts with device and optional shared memory buffer.
+
+    This class captures expert routing decisions during model forward passes
+    and optionally stores them in shared memory for cross-process access.
+    """
+
+    _instance: RoutedExpertsCapturer | None = None
+
+    def __init__(self) -> None:
+        self._device_buffer: torch.Tensor | None = None
+        self._shm: shared_memory.SharedMemory | None = None
+        self._host_buffer_view: np.ndarray | None = None
+        self._lock_file: str | None = None
+        self._shm_name: str | None = None
+
+    @classmethod
+    def create(cls) -> RoutedExpertsCapturer:
+        """Create a global singleton instance."""
+        global _global_experts_capturer
+        if _global_experts_capturer is not None:
+            raise RuntimeError("Experts capturer already created.")
+
+        _global_experts_capturer = cls()
+        return _global_experts_capturer
+
+    @staticmethod
+    def get_instance() -> RoutedExpertsCapturer | None:
+        """Get the global singleton instance."""
+        return _global_experts_capturer
+
+    def init_buffer(
+        self,
+        max_num_batched_tokens: int,
+        max_num_kv_tokens: int,
+        model_config: ModelConfig,
+        instance_id: str,
+    ) -> None:
+        """
+        Initialize the device buffer and optionally shared memory buffer.
+
+        Args:
+            max_num_batched_tokens: Maximum number of tokens in a batch.
+            max_num_kv_tokens: Maximum number of KV tokens for shared memory.
+            model_config: Model configuration containing layer and expert info.
+            instance_id: Unique identifier for the shared memory buffer.
+        """
+
+        if self._device_buffer is not None:
+            raise RuntimeError("Device buffer has already been initialized")
+
+        hf_config = model_config.hf_text_config
+        num_layers = hf_config.num_hidden_layers
+        num_experts_per_tok = hf_config.num_experts_per_tok
+
+        # Initialize device buffer
+        self._device_buffer = torch.zeros(
+            (max_num_batched_tokens, num_layers, num_experts_per_tok),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        if get_tensor_model_parallel_rank() != 0:
+            return
+
+        # Initialize shared memory
+        shape = (max_num_kv_tokens, num_layers, num_experts_per_tok)
+        buffer_size = int(np.prod(shape)) * np.dtype(np.int32).itemsize
+
+        self._lock_file = f"{_LOCK_FILE_PREFIX}_{instance_id}.lock"
+        self._shm_name = f"{_BUFFER_PREFIX}_{instance_id}"
+
+        self._shm = _create_or_attach_shared_memory(
+            self._shm_name, buffer_size, self._lock_file
+        )
+        self._host_buffer_view = np.ndarray(shape, dtype=np.int32, buffer=self._shm.buf)
+        self._host_buffer_view.fill(0)
+
+        logger.debug(
+            "Created shared memory buffer '%s' with shape %s",
+            self._shm.name,
+            shape,
+        )
+
+    def capture(self, layer_id: int, topk_ids: torch.Tensor) -> None:
+        """
+        Capture expert routing decisions for a specific layer.
+
+        Args:
+            layer_id: The layer index.
+            topk_ids: Tensor of shape (batch_size, num_routed_experts).
+        """
+        if self._device_buffer is None:
+            raise RuntimeError("Buffer not initialized. Call init_buffer() first.")
+
+        if layer_id >= self._device_buffer.shape[1]:
+            return
+
+        batch_size = topk_ids.shape[0]
+        self._device_buffer[:batch_size, layer_id, :] = topk_ids
+
+    def clear_buffer(self) -> None:
+        """Clear the device buffer."""
+        if self._device_buffer is not None:
+            self._device_buffer.zero_()
+
+    def save_captured_experts(self, indices: np.ndarray) -> None:
+        """
+        Save captured experts from device buffer to shared memory.
+
+        Args:
+            indices: Array of indices indicating where to store the data.
+        """
+        if get_tensor_model_parallel_rank() != 0:
+            return
+        if self._lock_file is None:
+            raise RuntimeError("Shared memory not initialized.")
+        if self._host_buffer_view is None:
+            return
+        if self._device_buffer is None:
+            raise RuntimeError("Device buffer not initialized.")
+
+        num_tokens = len(indices)
+        data = self._device_buffer[:num_tokens, :, :].cpu().numpy()
+
+        with _file_lock(self._lock_file):
+            self._host_buffer_view[indices, :, :] = data
+
+    def cleanup(self) -> None:
+        """Explicitly clean up shared memory resources."""
+        if self._shm is not None:
+            try:
+                self._shm.close()
+                self._shm.unlink()
+            except Exception:
+                logger.debug("Exception during cleanup for capturer", exc_info=True)
+            finally:
+                self._shm = None
+
+    def __del__(self) -> None:
+        """Clean up shared memory on destruction."""
+        self.cleanup()
+
+
+class RoutedExpertsReader:
+    """
+    Reader for routed experts from shared memory.
+
+    This class attaches to shared memory created by RoutedExpertsCapturer
+    and reads expert routing decisions.
+    """
+
+    _instance: RoutedExpertsReader | None = None
+
+    def __init__(self) -> None:
+        self._shm: shared_memory.SharedMemory | None = None
+        self._host_buffer_view: np.ndarray | None = None
+        self._lock_file: str | None = None
+
+    @classmethod
+    def create(cls) -> RoutedExpertsReader:
+        """Create a global singleton instance."""
+        global _global_experts_reader
+        if _global_experts_reader is not None:
+            raise RuntimeError("Experts reader already created.")
+
+        _global_experts_reader = cls()
+        return _global_experts_reader
+
+    @staticmethod
+    def get_instance() -> RoutedExpertsReader | None:
+        """Get the global singleton instance."""
+        if _global_experts_reader is None:
+            logger.info("Experts reader not initialized.")
+        return _global_experts_reader
+
+    def attach_buffer(
+        self,
+        max_num_kv_tokens: int,
+        model_config: ModelConfig,
+        instance_id: str,
+    ) -> None:
+        """
+        Attach to an existing shared memory buffer.
+
+        Args:
+            max_num_kv_tokens: Maximum number of KV tokens.
+            model_config: Model configuration.
+            instance_id: Unique identifier for the shared memory buffer.
+        """
+        if self._shm is not None:
+            logger.warning("Already attached to shared memory buffer.")
+            return  # Already attached
+
+        hf_config = model_config.hf_text_config
+        shape = (
+            max_num_kv_tokens,
+            hf_config.num_hidden_layers,
+            hf_config.num_experts_per_tok,
+        )
+
+        self._lock_file = f"{_LOCK_FILE_PREFIX}_{instance_id}.lock"
+        shm_name = f"{_BUFFER_PREFIX}_{instance_id}"
+
+        with _file_lock(self._lock_file, mode="rb+"):
+            # Avoid resource_tracker registering the shared memory
+            with patch(
+                "multiprocessing.resource_tracker.register",
+                lambda *args, **kwargs: None,
+            ):
+                self._shm = shared_memory.SharedMemory(name=shm_name)
+
+            self._host_buffer_view = np.ndarray(
+                shape, dtype=np.int32, buffer=self._shm.buf
+            )
+
+    def get_routed_experts(self, indices: np.ndarray) -> np.ndarray:
+        """
+        Read routed expert data from shared memory.
+
+        Args:
+            indices: Array of indices to read.
+
+        Returns:
+            Copy of the expert routing data for the given indices.
+        """
+        if self._host_buffer_view is None:
+            raise RuntimeError("Buffer not attached. Call attach_buffer() first.")
+        if self._lock_file is None:
+            raise RuntimeError("Lock file not initialized.")
+
+        with _file_lock(self._lock_file, mode="rb+"):
+            return self._host_buffer_view[indices, :, :].copy()
+
+    def cleanup(self) -> None:
+        """Explicitly clean up resources (close without unlink)."""
+        if self._shm is not None:
+            try:
+                self._shm.close()
+            except Exception:
+                logger.debug("Exception during cleanup for reader", exc_info=True)
+            finally:
+                self._shm = None
+
+    def __del__(self) -> None:
+        """Close shared memory on destruction (do not unlink)."""
+        self.cleanup()
diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
index 60aa1c088b4d865fc3c66062579dff2803b6f7ec..a143347b19f2cc7aa9b03fed37aee385cc82e526 100644
--- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
@@ -29,14 +29,14 @@ class SharedFusedMoE(FusedMoE):
         self._shared_experts = shared_experts
 
         # Disable shared expert overlap if:
-        #   - we are using eplb, because of correctness issues
-        #   - we are using flashinfer with DP, since there nothing to gain
+        #   - we are using eplb with non-default backend, because of correctness issues
+        #   - we are using flashinfer with DP, since there nothint to gain
         #   - we are using marlin kernels
+        backend = self.moe_parallel_config.all2all_backend
         self.use_overlapped = (
             use_overlapped
             and not (
-                # TODO(wentao): find the root cause and remove this condition
-                self.enable_eplb
+                (self.enable_eplb and backend != "allgather_reducescatter")
                 or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1)
             )
             and self._shared_experts is not None
diff --git a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..09d5e45c1ec2a00f9f175cc9d4e95be13f4fc5a6
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
+from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+from vllm.platforms import current_platform
+
+
+class TritonOrCutlassExperts(FallbackExperts):
+    """Cutlass with fallback to Triton for low latency shapes on SM100."""
+
+    def __init__(
+        self,
+        e: int,
+        n: int,
+        k: int,
+        out_dtype: torch.dtype | None,
+        quant_config: FusedMoEQuantConfig,
+        device: torch.dtype,
+    ):
+        self.is_sm100 = current_platform.has_device_capability(100)
+        super().__init__(
+            experts=CutlassExpertsFp8(e, n, k, out_dtype, quant_config, device),
+            fallback_experts=TritonExperts(quant_config),
+        )
+
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        # Small batch fallback for sm100.
+        if self.is_sm100 and M <= 8:
+            return self.fallback_experts.workspace_shapes(
+                M,
+                N,
+                K,
+                topk,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_meta,
+                activation,
+            )
+        else:
+            return self.experts.workspace_shapes(
+                M,
+                N,
+                K,
+                topk,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_meta,
+                activation,
+            )
+
+    def _select_experts_impl(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        # Small batch fallback for sm100.
+        if self.is_sm100 and hidden_states.shape[0] <= 8:
+            return self.fallback_experts
+        else:
+            return self.experts
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index b8e0837162ef63ecdf5e81ea810cdc5c8a76de23..55b1e1211b0a7a8b6ed6e7a4541b34fd00e16a4f 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -10,77 +10,21 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
     _valid_deep_gemm,
     _valid_deep_gemm_shape,
 )
+from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
 from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
 from vllm.utils.deep_gemm import (
-    get_mk_alignment_for_contiguous_layout,
     is_deep_gemm_e8m0_used,
 )
 
 
-class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
-    def __init__(
-        self,
-        quant_config: FusedMoEQuantConfig,
-        allow_deep_gemm: bool = False,
-    ):
-        super().__init__(quant_config)
-
-        self.triton_expert = TritonExperts(quant_config)
-
-        self.allow_deep_gemm = (
-            allow_deep_gemm
-            and self.quant_config.use_fp8_w8a8
-            and self.block_shape == get_mk_alignment_for_contiguous_layout()
-        )
+class TritonOrDeepGemmExperts(FallbackExperts):
+    """DeepGemm with fallback to Triton for low latency shapes."""
 
-        self.deep_gemm_expert = (
-            DeepGemmExperts(self.quant_config) if self.allow_deep_gemm else None
-        )
-
-    @property
-    def activation_formats(
-        self,
-    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        assert (
-            self.deep_gemm_expert is None
-            or self.triton_expert.activation_formats
-            == self.deep_gemm_expert.activation_formats
+    def __init__(self, quant_config: FusedMoEQuantConfig):
+        super().__init__(
+            experts=DeepGemmExperts(quant_config),
+            fallback_experts=TritonExperts(quant_config),
         )
-        return self.triton_expert.activation_formats
-
-    def supports_chunking(self) -> bool:
-        dge = self.deep_gemm_expert
-        te = self.triton_expert
-        return (dge is None or dge.supports_chunking()) and (
-            te is None or te.supports_chunking()
-        )
-
-    def supports_expert_map(self) -> bool:
-        dge = self.deep_gemm_expert
-        te = self.triton_expert
-        return (dge is None or dge.supports_expert_map()) and (
-            te is None or te.supports_expert_map()
-        )
-
-    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
-        dge = self.deep_gemm_expert
-        te = self.triton_expert
-        dge_war = dge.finalize_weight_and_reduce_impl() if dge else None
-        te_war = te.finalize_weight_and_reduce_impl() if te else None
-        is_dge_war = dge_war is not None
-        is_te_war = te_war is not None
-
-        if is_dge_war and is_te_war:
-            assert dge_war == te_war, (
-                "Both implementations should agree on WeightAndReduce impls. "
-                f"Got dge_war: {dge_war}, and te_war: {te_war}"
-            )
-
-        if dge_war is not None:
-            return dge_war
-
-        assert te_war is not None
-        return te_war
 
     def workspace_shapes(
         self,
@@ -91,15 +35,13 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
-        if self.allow_deep_gemm and (
-            is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)
-        ):
-            assert self.deep_gemm_expert is not None
-            return self.deep_gemm_expert.workspace_shapes(
+        if is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K):
+            return self.experts.workspace_shapes(
                 M,
                 N,
                 K,
@@ -107,9 +49,10 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
                 global_num_experts,
                 local_num_experts,
                 expert_tokens_meta,
+                activation,
             )
         else:
-            return self.triton_expert.workspace_shapes(
+            return self.fallback_experts.workspace_shapes(
                 M,
                 N,
                 K,
@@ -117,47 +60,16 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
                 global_num_experts,
                 local_num_experts,
                 expert_tokens_meta,
+                activation,
             )
 
-    def apply(
+    def _select_experts_impl(
         self,
-        output: torch.Tensor,
         hidden_states: torch.Tensor,
         w1: torch.Tensor,
         w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: str,
-        global_num_experts: int,
-        expert_map: torch.Tensor | None,
-        a1q_scale: torch.Tensor | None,
-        a2_scale: torch.Tensor | None,
-        workspace13: torch.Tensor,
-        workspace2: torch.Tensor,
-        expert_tokens_meta: mk.ExpertTokensMetadata | None,
-        apply_router_weight_on_input: bool,
-    ):
-        use_deep_gemm = self.allow_deep_gemm and (
-            is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2)
-        )
-
-        experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert
-        assert experts is not None
-
-        experts.apply(
-            output,
-            hidden_states,
-            w1,
-            w2,
-            topk_weights,
-            topk_ids,
-            activation,
-            global_num_experts,
-            expert_map,
-            a1q_scale,
-            a2_scale,
-            workspace13,
-            workspace2,
-            expert_tokens_meta,
-            apply_router_weight_on_input,
-        )
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        if is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2):
+            return self.experts
+        else:
+            return self.fallback_experts
diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
index 132d35e65aba85f7c6c47c70fb314b41882eadd2..c46f59564930641b1fb301920ec0b42761aa909e 100644
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
@@ -57,6 +57,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         local_num_experts: int,
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # The workspaces for this implementation are managed by flashinfer.
         workspace1 = (0,)
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index f3397d0c82af12a90ed7193b2f4b9d998c7ba8e7..06d03addd0ec27919740757dd0dff12c6ee70d89 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -6,6 +6,7 @@ import torch
 import torch.nn.functional as F
 
 import vllm.envs as envs
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
@@ -15,47 +16,53 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     biased_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+    FlashInferExperts,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEActivationFormat,
     FusedMoEPermuteExpertsUnpermute,
     FusedMoEPrepareAndFinalize,
 )
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
+from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+    AiterExperts,
+)
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    swap_w13_to_w31,
+)
+from vllm.model_executor.utils import replace_parameter, set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
 if current_platform.is_cuda_alike():
     from .fused_batched_moe import BatchedTritonExperts
-    from .fused_moe import TritonExperts, fused_experts
+    from .fused_moe import TritonExperts
 else:
-    fused_experts = None  # type: ignore
+    TritonExperts = None  # type: ignore
 
-if current_platform.is_tpu():
-    from .moe_pallas import fused_moe as fused_moe_pallas
-else:
-    fused_moe_pallas = None  # type: ignore
 
 logger = init_logger(__name__)
 
 
+# --8<-- [start:unquantized_fused_moe]
 @CustomOp.register("unquantized_fused_moe")
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""
 
+    # --8<-- [end:unquantized_fused_moe]
+
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
 
         self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
-        if self.rocm_aiter_moe_enabled:
-            from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
-
-            self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
-        else:
-            self.rocm_aiter_fused_experts = None  # type: ignore
 
         # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
         self.flashinfer_cutlass_moe_enabled = (
@@ -69,18 +76,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             logger.info_once(
                 "Enabling FlashInfer CUTLASS MoE for UnquantizedFusedMoEMethod"
             )
-            from functools import partial
-
-            from .flashinfer_cutlass_moe import flashinfer_cutlass_moe
-
-            self.flashinfer_cutlass_moe = partial(
-                flashinfer_cutlass_moe,
-                quant_config=FUSED_MOE_UNQUANTIZED_CONFIG,
-                tp_rank=self.moe.moe_parallel_config.tp_rank,
-                tp_size=self.moe.moe_parallel_config.tp_size,
-                ep_rank=self.moe.moe_parallel_config.ep_rank,
-                ep_size=self.moe.moe_parallel_config.ep_size,
-            )
         else:
             if (
                 self.moe.moe_parallel_config.use_ep
@@ -97,7 +92,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                     "FlashInfer CUTLASS MoE is currently not available for DP.",
                     scope="local",
                 )
-            self.flashinfer_cutlass_moe = None  # type: ignore
 
     @property
     def supports_eplb(self) -> bool:
@@ -210,20 +204,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
         layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
 
-        if self.rocm_aiter_moe_enabled:
-            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
-                layer.w13_weight.data, layer.w2_weight.data
-            )
-
-            layer.w13_weight.data = shuffled_w13
-            layer.w2_weight.data = shuffled_w2
-
-        if self.flashinfer_cutlass_moe_enabled:
-            # Swap halves to arrange as [w3; w1] (kernel expectation)
-            w1_w, w3_w = torch.chunk(layer.w13_weight.data, 2, dim=1)
-            w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)
-            layer.w13_weight.data = w13_weight_swapped.contiguous()
-
         if current_platform.is_xpu():
             import intel_extension_for_pytorch as ipex
 
@@ -265,16 +245,58 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                     layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
             else:
                 layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
+        elif current_platform.is_cuda_alike():
+            self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+            if self.rocm_aiter_moe_enabled:
+                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
+                    layer.w13_weight.data, layer.w2_weight.data
+                )
+                replace_parameter(layer, "w13_weight", shuffled_w13)
+                replace_parameter(layer, "w2_weight", shuffled_w2)
+
+                self.use_inplace = True
+                self.kernel = mk.FusedMoEModularKernel(
+                    MoEPrepareAndFinalizeNoEP(),
+                    AiterExperts(self.moe_quant_config),
+                    shared_experts=None,
+                )
+
+            elif self.flashinfer_cutlass_moe_enabled:
+                self.use_inplace = False
+                # Swap halves to arrange as [w3; w1] (kernel expectation)
+                w13_weight = swap_w13_to_w31(layer.w13_weight.data)
+                replace_parameter(layer, "w13_weight", w13_weight)
+
+                self.kernel = mk.FusedMoEModularKernel(
+                    MoEPrepareAndFinalizeNoEP(),
+                    FlashInferExperts(
+                        out_dtype=layer.params_dtype,
+                        quant_config=self.moe_quant_config,
+                        tp_rank=self.moe.moe_parallel_config.tp_rank,
+                        tp_size=self.moe.moe_parallel_config.tp_size,
+                        ep_rank=self.moe.moe_parallel_config.ep_rank,
+                        ep_size=self.moe.moe_parallel_config.ep_size,
+                    ),
+                )
+            else:
+                self.use_inplace = True
+                self.kernel = mk.FusedMoEModularKernel(
+                    MoEPrepareAndFinalizeNoEP(),
+                    TritonExperts(self.moe_quant_config),
+                    shared_experts=None,
+                )
 
     def apply(
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         use_nn_moe: bool | None = False,
         use_fused_gate: bool | None = False,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         return self.forward(
+            router=router,
             layer=layer,
             x=x,
             router_logits=router_logits,
@@ -282,9 +304,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_fused_gate=use_fused_gate,
         )
 
-    def get_fused_moe_quant_config(
-        self, layer: torch.nn.Module
-    ) -> FusedMoEQuantConfig | None:
+    def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
         if self.moe.has_bias:
             return biased_moe_quant_config(
                 layer.w13_bias,
@@ -296,65 +316,37 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     def forward_cuda(
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         use_nn_moe: bool | None = False,
         use_fused_gate: bool | None = False,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
             use_fused_gate=use_fused_gate,
         )
+        result = self.kernel(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=self.use_inplace,
+            activation=layer.activation,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
+            use_nn_moe=use_nn_moe,
+        )
 
-        if self.rocm_aiter_moe_enabled:
-            result = self.rocm_aiter_fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                expert_map=layer.expert_map,
-                activation=layer.activation,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-            )
-        elif self.flashinfer_cutlass_moe_enabled:
-            return self.flashinfer_cutlass_moe(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                activation=layer.activation,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-            )
-        else:
-            result = fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=layer.activation,
-                quant_config=self.moe_quant_config,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                use_nn_moe=use_nn_moe,
-            )
-
-        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
-            assert not isinstance(result, tuple), (
-                "Shared + zero experts are mutually exclusive not yet supported"
-            )
-            return result, zero_expert_result
-        else:
-            return result
+        return result
 
     def forward_cpu(
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
@@ -388,6 +380,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     def forward_xpu(
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
@@ -409,53 +402,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             custom_routing_function=layer.custom_routing_function,
         )
 
-    def forward_tpu(
-        self,
-        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert not layer.use_grouped_topk
-        assert layer.num_expert_group is None
-        assert layer.topk_group is None
-        assert layer.custom_routing_function is None
-        assert layer.apply_router_weight_on_input is False
-        if layer.scoring_func != "softmax":
-            raise NotImplementedError(
-                "Only softmax scoring function is supported for TPU."
-            )
-        if layer.e_score_correction_bias is not None:
-            raise NotImplementedError(
-                "Expert score correction bias is not supported for TPU."
-            )
-        assert layer.activation == "silu", (
-            f"{layer.activation} is not supported for TPU."
-        )
-        assert layer.routed_scaling_factor == 1.0, (
-            f"routed_scaling_factor {layer.routed_scaling_factor} is "
-            "not supported for TPU."
-        )
-        if (
-            layer.enable_eplb is not False
-            or layer.expert_load_view is not None
-            or layer.logical_to_physical_map is not None
-            or layer.logical_replica_count is not None
-        ):
-            raise NotImplementedError("Expert load balancing is not supported for TPU.")
-        return fused_moe_pallas(
-            hidden_states=x,
-            w1=layer.w13_weight,
-            w2=layer.w2_weight,
-            topk=layer.top_k,
-            gating_output=router_logits,
-            global_num_experts=layer.global_num_experts,
-            expert_map=layer.expert_map,
-            renormalize=layer.renormalize,
-        )
-
-    if current_platform.is_tpu():
-        forward_native = forward_tpu
-    elif current_platform.is_cpu():
+    if current_platform.is_cpu():
         forward_native = forward_cpu
     elif current_platform.is_xpu():
         forward_native = forward_xpu
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index d7cef66c971f1f76b7f1123af2614e089a6cf652..cff0277e77c7d790a757ab73b5738d53f6aca3dc 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -4,6 +4,7 @@ import functools
 from math import prod
 
 import torch
+import torch.nn.functional as F
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
@@ -33,6 +34,8 @@ from vllm.triton_utils import tl, triton
 from vllm.utils.flashinfer import flashinfer_fp4_quantize
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+import vllm.envs as envs
+from lightop import fuse_silu_and_mul
 
 
 @triton.jit
@@ -331,6 +334,58 @@ def activation_without_mul(activation: str) -> str:
     return activation + "_no_mul"
 
 
+RELU2_NO_MUL: str = activation_without_mul("relu2")
+SILU_NO_MUL: str = activation_without_mul("silu")
+GELU_NO_MUL: str = activation_without_mul("gelu")
+
+
+def apply_moe_activation(
+    activation: str,
+    output: torch.Tensor,
+    input: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Apply MoE activation function.
+
+    For *_and_mul activations (silu, gelu, swigluoai):
+        - Expects output.size(-1) * 2 == input.size(-1)
+
+    For *_no_mul activations (silu_no_mul, gelu_no_mul, relu2_no_mul):
+        - Expects output.size(-1) == input.size(-1)
+    """
+    is_no_mul = activation.endswith("_no_mul")
+    if is_no_mul:
+        assert output.size(-1) == input.size(-1), (
+            f"{activation} expects equal sizes: {output.size(-1)} vs {input.size(-1)}"
+        )
+    else:
+        assert output.size(-1) * 2 == input.size(-1), (
+            f"{activation} expects 2x ratio: {output.size(-1) * 2} vs {input.size(-1)}"
+        )
+
+    # Activations with gated multiplication (gate × activation(up))
+    if activation == "silu":
+        if envs.VLLM_USE_FUSE_SILU_AND_MUL and input.dtype == torch.float16:
+            fuse_silu_and_mul(output, input)
+        else:
+            torch.ops._C.silu_and_mul(output, input)
+    elif activation == "gelu":
+        torch.ops._C.gelu_and_mul(output, input)
+    elif activation == "swigluoai":
+        torch.ops._C.swigluoai_and_mul(output, input)
+    # Activations without gated multiplication
+    elif activation == SILU_NO_MUL:
+        output.copy_(F.silu(input))
+    elif activation == GELU_NO_MUL:
+        output.copy_(F.gelu(input))
+    elif activation == RELU2_NO_MUL:
+        torch.square(F.relu(input), out=output)
+    else:
+        raise ValueError(f"Unsupported FusedMoe activation: {activation}")
+
+    return output
+
+
 # Torch custom ops can't deal with outputs aliasing inputs so we need to
 # disable inplace for torch >= 2.9.
 # See https://github.com/vllm-project/vllm/issues/26378
diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..97d21767f4fc32ca7e66ce798ee7a1a80e51cde5
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py
@@ -0,0 +1,189 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from contextlib import contextmanager
+
+import torch
+from torch import nn
+
+from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+
+
+class ZeroExpertFusedMoE(FusedMoE):
+    """
+    A FusedMoE operation that also computes the results of zero experts.
+    Zero experts perform identity operations (scaled pass-through) instead
+    of full MLP computations.
+
+    This class uses memoization to avoid redundant routing computation:
+    routing is computed once and reused for both zero expert computation
+    and the main FusedMoE forward pass.
+    """
+
+    def __init__(
+        self,
+        zero_expert_num: int,
+        zero_expert_type: str,
+        router: nn.Module,
+        **kwargs,
+    ):
+        # ZeroExpertFusedMoE manages its own custom_routing_function for memoization
+        assert (
+            "custom_routing_function" not in kwargs
+            or kwargs.get("custom_routing_function") is None
+        ), (
+            "ZeroExpertFusedMoE does not support external custom_routing_function. "
+            "It manages its own for routing memoization."
+        )
+
+        # Automatically slice router's e_score_correction_bias to only include
+        # real experts (not zero_experts) for the base FusedMoE.
+        # The full bias will be used temporarily in forward() for routing.
+        if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs:
+            num_real_experts = kwargs["num_experts"]
+            router_bias = router.e_score_correction_bias
+            user_bias = kwargs.get("e_score_correction_bias")
+
+            # Use router's bias if:
+            # 1. User didn't provide bias, or
+            # 2. User provided full bias (same size as router)
+            if user_bias is None or user_bias.shape[0] == router_bias.shape[0]:
+                kwargs["e_score_correction_bias"] = router_bias[:num_real_experts]
+
+        # FusedMoE no longer accepts zero_expert_num/zero_expert_type.
+        # We handle zero experts ourselves in forward().
+        super().__init__(**kwargs)
+        # Store the actual zero_expert_num and zero_expert_type for our own use
+        self._actual_zero_expert_num = zero_expert_num
+        self._actual_zero_expert_type = zero_expert_type
+        self._router = router  # Full router (includes zero experts)
+
+        # Expose zero_expert_num and zero_expert_type as attributes for
+        # compatibility with quantization methods that check these attributes
+        self.zero_expert_num = 0
+        self.zero_expert_type = None
+
+        # Memoization state for routing results
+        self._memoized_topk_weights: torch.Tensor | None = None
+        self._memoized_topk_ids: torch.Tensor | None = None
+
+        # Create custom_routing_function to reuse memoized routing results
+        def custom_routing_function(hidden_states, gating_output, topk, renormalize):
+            """Return memoized `topk_weights` and `topk_ids`."""
+            if self._memoized_topk_weights is None or self._memoized_topk_ids is None:
+                raise RuntimeError(
+                    "ZeroExpertFusedMoE: routing results not memoized. "
+                    "Call select_experts first to compute routing."
+                )
+            return self._memoized_topk_weights, self._memoized_topk_ids
+
+        self.custom_routing_function = custom_routing_function
+
+    @contextmanager
+    def _temporarily_set_attrs(self, **attrs):
+        """
+        Temporarily set attributes using object.__setattr__ and restore them.
+
+        This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues.
+        When PyTorch Dynamo traces the forward pass, it cannot handle
+        nn.Module.__setattr__ calls (which include parameter registration logic),
+        resulting in "Unsupported" errors. Using object.__setattr__ directly
+        sets the attribute without triggering nn.Module's custom __setattr__,
+        allowing Dynamo to trace the code successfully.
+        """
+        originals = {key: getattr(self, key) for key in attrs}
+        try:
+            for key, value in attrs.items():
+                object.__setattr__(self, key, value)
+            yield
+        finally:
+            for key, value in originals.items():
+                object.__setattr__(self, key, value)
+
+    def _compute_zero_expert_result(
+        self,
+        hidden_states: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+    ) -> torch.Tensor | None:
+        """Compute zero expert results using pre-computed routing."""
+        if (
+            self._actual_zero_expert_num is None
+            or self._actual_zero_expert_num <= 0
+            or self._actual_zero_expert_type is None
+        ):
+            return None
+
+        return zero_experts_compute_triton(
+            expert_indices=topk_ids.clone(),
+            expert_scales=topk_weights.clone(),
+            num_experts=self.logical_num_experts,
+            zero_expert_type=self._actual_zero_expert_type,
+            hidden_states=hidden_states,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,  # Full logits including zero experts
+    ) -> torch.Tensor:
+        """
+        Forward pass with zero expert support and routing memoization.
+
+        Args:
+            hidden_states: Input hidden states
+            router_logits: Full router logits (including zero experts)
+
+        Returns:
+            Combined output from real experts and zero experts
+        """
+        # Prepare temporary attribute overrides for routing computation
+        temp_attrs = {
+            "custom_routing_function": None,  # Disable for first routing
+        }
+        if self._router is not None:
+            temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias
+
+        # Compute routing with temporary attributes
+        # Pass full router_logits (including zero experts) so that zero experts
+        # can be properly identified in topk_ids
+        with self._temporarily_set_attrs(**temp_attrs):
+            topk_weights, topk_ids = self.select_experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits,  # Full logits (includes zero experts)
+            )
+
+        # Compute zero expert result if needed
+        zero_expert_result = self._compute_zero_expert_result(
+            hidden_states=hidden_states,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+        )
+
+        # Memoize routing results for reuse in super().forward()
+        self._memoized_topk_weights = topk_weights
+        self._memoized_topk_ids = topk_ids
+
+        # Slice router_logits for real experts only
+        router_logits_sliced = router_logits[..., : self.logical_num_experts]
+
+        # Compute real expert results (will reuse memoized routing via
+        # custom_routing_function)
+        # zero_expert_num is already 0, so FusedMoE won't handle zero experts
+        fused_out = super().forward(
+            hidden_states=hidden_states,
+            router_logits=router_logits_sliced,
+        )
+
+        # Combine results
+        # Both zero_expert_result and fused_out are computed from the same
+        # hidden_states, so they should be on the same device.
+        if zero_expert_result is not None:
+            fused_out = fused_out + zero_expert_result
+
+        # Clear memoization after use
+        self._memoized_topk_weights = None
+        self._memoized_topk_ids = None
+
+        return fused_out
diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py
index 27cc3884517f9a99bd910deb62518ff795168d69..fde9ad36bcd3cc9961ea653871f24070f7d02d87 100644
--- a/vllm/model_executor/layers/kda.py
+++ b/vllm/model_executor/layers/kda.py
@@ -5,7 +5,6 @@ import torch
 from einops import rearrange
 from torch import nn
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed import (
     divide,
@@ -17,6 +16,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import sharded_weight_loader
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
 
 from .fla.ops.kda import (
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index d2257bce6581fe54de18ddbee48a481b5ebe1ece..c9cdee29ffe49f00fdcd6aa0c5096851c198be1b 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -25,7 +25,8 @@ def rms_norm(
     if vllm_is_batch_invariant():
         return rms_norm_batch_invariant(x, weight, variance_epsilon)
     out = torch.empty_like(x)
-    if envs.VLLM_USE_OPT_OP:
+    # if envs.VLLM_USE_OPT_OP:
+    if False:
         ops.rms_norm_opt(
             out,
             x,
@@ -54,7 +55,8 @@ def fused_add_rms_norm(
         return rms_norm_batch_invariant(
             x + residual, weight, variance_epsilon
         ), x + residual
-    if envs.VLLM_USE_OPT_OP:
+    # if envs.VLLM_USE_OPT_OP:
+    if False:
         ops.fused_add_rms_norm_opt(
             x,
             residual,
@@ -106,6 +108,7 @@ def dispatch_rocm_rmsnorm_func(
     return rms_norm
 
 
+# --8<-- [start:rms_norm]
 @CustomOp.register("rms_norm")
 class RMSNorm(CustomOp):
     """Root mean square normalization.
@@ -114,6 +117,8 @@ class RMSNorm(CustomOp):
     Refer to https://arxiv.org/abs/1910.07467
     """
 
+    # --8<-- [end:rms_norm]
+
     def __init__(
         self,
         hidden_size: int,
@@ -294,6 +299,7 @@ class RMSNorm(CustomOp):
         return s
 
 
+# --8<-- [start:gemma_rms_norm]
 @CustomOp.register("gemma_rms_norm")
 class GemmaRMSNorm(CustomOp):
     """RMS normalization for Gemma.
@@ -303,6 +309,8 @@ class GemmaRMSNorm(CustomOp):
         2. (x * w).to(orig_dtype) instead of x.to(orig_dtype) * w.
     """
 
+    # --8<-- [end:gemma_rms_norm]
+
     def __init__(
         self,
         hidden_size: int,
@@ -362,6 +370,7 @@ class GemmaRMSNorm(CustomOp):
         return self.forward_native(x, residual)
 
 
+# --8<-- [start:rms_norm_gated]
 @CustomOp.register("rms_norm_gated")
 class RMSNormGated(CustomOp):
     """RMS Normalization with optional gating.
@@ -372,6 +381,8 @@ class RMSNormGated(CustomOp):
     - Optional gating with SiLU activation
     """
 
+    # --8<-- [end:rms_norm_gated]
+
     def __init__(
         self,
         hidden_size: int,
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 23a2104646be8b1155a4c2091327da03260210dc..d5d8412a1d8bf801dff265a89d246576f22477c3 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -58,6 +58,8 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "GPTQLinearMethod",
     "FBGEMMFp8LinearMethod",
     "ModelOptFp8LinearMethod",
+    "ModelOptFp8PcPtLinearMethod",
+    "ModelOptFp8PbWoLinearMethod",
     "IPEXAWQLinearMethod",
     "IPEXGPTQLinearMethod",
     "HQQMarlinMethod",
@@ -84,6 +86,14 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
 
 
+def adjust_block_scale_shard(weight_block_size, shard_size, shard_offset):
+    assert weight_block_size is not None
+    block_n = weight_block_size[0]
+    shard_offset = (shard_offset + block_n - 1) // block_n
+    shard_size = (shard_size + block_n - 1) // block_n
+    return shard_size, shard_offset
+
+
 def adjust_bitsandbytes_4bit_shard(
     param: Parameter, shard_offsets: dict[str, tuple[int, int]], loaded_shard_id: str
 ) -> tuple[int, int]:
@@ -326,6 +336,7 @@ class LinearBase(CustomOp):
         self.params_dtype = params_dtype
         self.quant_config = quant_config
         self.prefix = prefix
+        self.allow_fp8_block_shape_mismatch = False
         if quant_config is None:
             self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod()
         else:
@@ -342,6 +353,7 @@ class LinearBase(CustomOp):
                 param.tp_size = self.tp_size
 
 
+# --8<-- [start:replicated_linear]
 @CustomOp.register("replicated_linear")
 class ReplicatedLinear(LinearBase):
     """Replicated linear layer.
@@ -359,6 +371,8 @@ class ReplicatedLinear(LinearBase):
         disable_tp: Take no effect for replicated linear layers.
     """
 
+    # --8<-- [end:replicated_linear]
+
     def __init__(
         self,
         input_size: int,
@@ -453,10 +467,10 @@ class ReplicatedLinear(LinearBase):
         assert self.quant_method is not None
 
         output = self.quant_method.apply(self, x, bias)
-        output_bias = self.bias if self.skip_bias_add else None
 
         if not self.return_bias:
             return output
+        output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
 
     def extra_repr(self) -> str:
@@ -466,6 +480,7 @@ class ReplicatedLinear(LinearBase):
         return s
 
 
+# --8<-- [start:column_parallel_linear]
 @CustomOp.register("column_parallel_linear")
 class ColumnParallelLinear(LinearBase):
     """Linear layer with column parallelism.
@@ -485,14 +500,14 @@ class ColumnParallelLinear(LinearBase):
                        skip adding bias but instead return it.
         params_dtype: Data type for the parameters.
         quant_config: Quantization configure.
-        output_sizes: list of output sizes packed into one output, like for QKV
-                       the list would be size 3.
         prefix: The name of the layer in the state dict, including all parents
                         (e.g. model.layers.0.qkv_proj)
         return_bias: If true, return bias together with outputs in forward pass.
         disable_tp: If true, weights matrix won't be sharded through tp rank.
     """
 
+    # --8<-- [end:column_parallel_linear]
+
     def __init__(
         self,
         input_size: int,
@@ -502,8 +517,6 @@ class ColumnParallelLinear(LinearBase):
         skip_bias_add: bool = False,
         params_dtype: torch.dtype | None = None,
         quant_config: QuantizationConfig | None = None,
-        output_sizes: list[int] | None = None,
-        eps: float | None = 1e-6,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -532,12 +545,9 @@ class ColumnParallelLinear(LinearBase):
             disable_tp=disable_tp,
         )
 
-        self.eps = eps
+        self._maybe_allow_fp8_block_shape_mismatch()
         self.gather_output = gather_output
 
-        if output_sizes is None:
-            output_sizes = [output_size]
-
         assert self.quant_method is not None
         self.quant_method.create_weights(
             layer=self,
@@ -568,6 +578,33 @@ class ColumnParallelLinear(LinearBase):
         self.update_param_tp_status()
         self.is_quantization = not isinstance(self.quant_method, UnquantizedLinearMethod)
 
+    def _maybe_allow_fp8_block_shape_mismatch(self) -> None:
+        quant_config = getattr(self, "quant_config", None)
+        weight_block = getattr(quant_config, "weight_block_size", None)
+        if (
+            weight_block is None
+            or len(weight_block) < 1
+            or len(self.output_partition_sizes) <= 1
+        ):
+            return
+
+        try:
+            block_n = int(weight_block[0])
+        except (ValueError, TypeError):
+            return
+
+        if block_n <= 0:
+            return
+
+        if any(size % block_n != 0 for size in self.output_partition_sizes):
+            self.allow_fp8_block_shape_mismatch = True
+            logger.debug(
+                "Allowing FP8 block shape mismatch for %s (block_n=%d, partitions=%s)",
+                getattr(self, "prefix", "<unknown>"),
+                block_n,
+                self.output_partition_sizes,
+            )
+
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         output_dim = getattr(param, "output_dim", None)
 
@@ -634,9 +671,10 @@ class ColumnParallelLinear(LinearBase):
             output = tensor_model_parallel_all_gather(output_parallel)
         else:
             output = output_parallel
-        output_bias = self.bias if self.skip_bias_add else None
+
         if not self.return_bias:
             return output
+        output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
 
     def extra_repr(self) -> str:
@@ -800,8 +838,18 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
 
         assert loaded_shard_id < len(self.output_sizes)
         if output_dim is not None:
-            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
-            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+            shard_offset = sum(self.output_sizes[:loaded_shard_id])
+            shard_size = self.output_sizes[loaded_shard_id]
+
+            if isinstance(param, BlockQuantScaleParameter):
+                weight_block_size = getattr(self, "weight_block_size", None)
+                shard_size, shard_offset = adjust_block_scale_shard(
+                    weight_block_size, shard_size, shard_offset
+                )
+
+            shard_offset //= self.tp_size
+            shard_size //= self.tp_size
+
             # Special case for quantization.
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
@@ -910,24 +958,17 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
 
         assert loaded_shard_id < len(self.output_sizes)
 
+        shard_offset = sum(self.output_sizes[:loaded_shard_id])
+        shard_size = self.output_sizes[loaded_shard_id]
+
         if isinstance(param, BlockQuantScaleParameter):
-            assert self.quant_method is not None
-            # Assume the weight block size has been set by quant method
-            assert hasattr(self, "weight_block_size")
-            weight_block_size = self.weight_block_size
-            assert weight_block_size is not None
-            block_n, _ = weight_block_size[0], weight_block_size[1]
-            shard_offset = (
-                (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n
-            ) // self.tp_size
-            shard_size = (
-                (self.output_sizes[loaded_shard_id] + block_n - 1)
-                // block_n
-                // self.tp_size
+            weight_block_size = getattr(self, "weight_block_size", None)
+            shard_size, shard_offset = adjust_block_scale_shard(
+                weight_block_size, shard_size, shard_offset
             )
-        else:
-            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
-            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+
+        shard_offset //= self.tp_size
+        shard_size //= self.tp_size
 
         param.load_merged_column_weight(
             loaded_weight=loaded_weight,
@@ -981,9 +1022,11 @@ class QKVParallelLinear(ColumnParallelLinear):
         *,
         return_bias: bool = True,
         disable_tp: bool = False,
+        v_head_size: int | None = None,
     ):
         self.hidden_size = hidden_size
         self.head_size = head_size
+        self.v_head_size = v_head_size if v_head_size is not None else head_size
         self.total_num_heads = total_num_heads
         if total_num_kv_heads is None:
             total_num_kv_heads = total_num_heads
@@ -999,12 +1042,14 @@ class QKVParallelLinear(ColumnParallelLinear):
             self.num_kv_head_replicas = 1
         input_size = self.hidden_size
         output_size = (
-            (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size
-        )
+            self.num_heads * self.head_size
+            + self.num_kv_heads * self.head_size
+            + self.num_kv_heads * self.v_head_size
+        ) * tp_size
         self.output_sizes = [
             self.num_heads * self.head_size * tp_size,  # q_proj
             self.num_kv_heads * self.head_size * tp_size,  # k_proj
-            self.num_kv_heads * self.head_size * tp_size,  # v_proj
+            self.num_kv_heads * self.v_head_size * tp_size,  # v_proj
         ]
 
         super().__init__(
@@ -1026,7 +1071,8 @@ class QKVParallelLinear(ColumnParallelLinear):
             "q": 0,
             "k": self.num_heads * self.head_size,
             "v": (self.num_heads + self.num_kv_heads) * self.head_size,
-            "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
+            "total": (self.num_heads + self.num_kv_heads) * self.head_size
+            + self.num_kv_heads * self.v_head_size,
         }
         return shard_offset_mapping.get(loaded_shard_id)
 
@@ -1034,7 +1080,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         shard_size_mapping = {
             "q": self.num_heads * self.head_size,
             "k": self.num_kv_heads * self.head_size,
-            "v": self.num_kv_heads * self.head_size,
+            "v": self.num_kv_heads * self.v_head_size,
         }
         return shard_size_mapping.get(loaded_shard_id)
 
@@ -1061,7 +1107,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             (
                 "v",
                 (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
-                self.total_num_kv_heads * self.head_size,
+                self.total_num_kv_heads * self.v_head_size,
             ),
         ]
 
@@ -1106,16 +1152,11 @@ class QKVParallelLinear(ColumnParallelLinear):
         shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
         shard_size = self._get_shard_size_mapping(loaded_shard_id)
 
-        # Note(simon): This is needed for Qwen3's fp8 quantization.
         if isinstance(param, BlockQuantScaleParameter):
-            assert self.quant_method is not None
-            # Assume the weight block size has been set by quant method
-            assert hasattr(self, "weight_block_size")
-            weight_block_size = self.weight_block_size
-            assert weight_block_size is not None
-            block_n, _ = weight_block_size[0], weight_block_size[1]
-            shard_offset = (shard_offset + block_n - 1) // block_n
-            shard_size = (shard_size + block_n - 1) // block_n
+            weight_block_size = getattr(self, "weight_block_size", None)
+            shard_size, shard_offset = adjust_block_scale_shard(
+                weight_block_size, shard_size, shard_offset
+            )
 
         param.load_qkv_weight(
             loaded_weight=loaded_weight,
@@ -1187,7 +1228,7 @@ class QKVParallelLinear(ColumnParallelLinear):
                 (
                     "v",
                     (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
-                    self.total_num_kv_heads * self.head_size,
+                    self.total_num_kv_heads * self.v_head_size,
                 ),
             ]
             use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
@@ -1216,11 +1257,12 @@ class QKVParallelLinear(ColumnParallelLinear):
                         "v": (
                             (self.total_num_heads + self.total_num_kv_heads)
                             * self.head_size,
-                            self.total_num_kv_heads * self.head_size,
+                            self.total_num_kv_heads * self.v_head_size,
                         ),
                         "total": (
-                            (self.total_num_heads + 2 * self.total_num_kv_heads)
-                            * self.head_size,
+                            (self.total_num_heads + self.total_num_kv_heads)
+                            * self.head_size
+                            + self.total_num_kv_heads * self.v_head_size,
                             0,
                         ),
                     }
@@ -1247,7 +1289,14 @@ class QKVParallelLinear(ColumnParallelLinear):
                 shard_size = self.num_kv_heads * self.head_size
             elif loaded_shard_id == "v":
                 shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size
-                shard_size = self.num_kv_heads * self.head_size
+                shard_size = self.num_kv_heads * self.v_head_size
+
+            if isinstance(param, BlockQuantScaleParameter):
+                weight_block_size = getattr(self, "weight_block_size", None)
+                shard_size, shard_offset = adjust_block_scale_shard(
+                    weight_block_size, shard_size, shard_offset
+                )
+
             # Special case for Quantized Weights.
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
@@ -1276,10 +1325,11 @@ class QKVParallelLinear(ColumnParallelLinear):
                     ),
                     "v": (
                         (self.num_heads + self.num_kv_heads) * self.head_size,
-                        self.num_kv_heads * self.head_size,
+                        self.num_kv_heads * self.v_head_size,
                     ),
                     "total": (
-                        (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
+                        (self.num_heads + self.num_kv_heads) * self.head_size
+                        + self.num_kv_heads * self.v_head_size,
                         0,
                     ),
                 }
@@ -1322,6 +1372,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         param_data.copy_(loaded_weight)
 
 
+# --8<-- [start:row_parallel_linear]
 @CustomOp.register("row_parallel_linear")
 class RowParallelLinear(LinearBase):
     """Linear layer with row parallelism.
@@ -1356,6 +1407,8 @@ class RowParallelLinear(LinearBase):
         disable_tp: If true, weights matrix won't be sharded through tp rank.
     """
 
+    # --8<-- [end:row_parallel_linear]
+
     def __init__(
         self,
         input_size: int,
@@ -1501,10 +1554,9 @@ class RowParallelLinear(LinearBase):
         else:
             output = output_parallel
 
-        output_bias = self.bias if self.skip_bias_add else None
-
         if not self.return_bias:
             return output
+        output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
 
     def extra_repr(self) -> str:
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index c8d57f597d1ca24636af3b245299afa4d08a92c1..38753b0fcc74842e5eecf270fae1dc7ad9ecaa94 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmb
 from vllm.platforms import current_platform
 
 
+# --8<-- [start:logits_processor]
 @CustomOp.register("logits_processor")
 class LogitsProcessor(CustomOp):
     """Process logits and apply logits processors from sampling metadata.
@@ -23,6 +24,8 @@ class LogitsProcessor(CustomOp):
     3. Apply logits processors (if any).
     """
 
+    # --8<-- [end:logits_processor]
+
     def __init__(
         self,
         vocab_size: int,
diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py
index 74f4383e9c2382b21683ca28ce8a1c5ffc68cfc6..4f45dd6caf35d6c58e0214d4af48ef9a32c7902c 100644
--- a/vllm/model_executor/layers/mamba/abstract.py
+++ b/vllm/model_executor/layers/mamba/abstract.py
@@ -5,10 +5,10 @@ from collections.abc import Iterable
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.selector import get_mamba_attn_backend
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.v1.attention.backend import AttentionBackend
+from vllm.v1.attention.selector import get_mamba_attn_backend
 from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec
 
 
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index 278713408c288bd4cf50e3e9269ae11596679b3d..8b5f80f54527786304e93c5094d26c6928a34df6 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -8,7 +8,6 @@ import torch.nn.functional as F
 from einops import rearrange
 from torch import nn
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import (
@@ -29,6 +28,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
 )
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
 
 
@@ -79,6 +79,28 @@ class MiniMaxText01RMSNormTP(CustomOp):
         assert residual is None, "RMSNorm does not support residual connection."
         return self._forward(x)
 
+    @staticmethod
+    def forward_qk(
+        q_norm: "MiniMaxText01RMSNormTP",
+        k_norm: "MiniMaxText01RMSNormTP",
+        q: torch.Tensor,
+        k: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        orig_dtype = q.dtype
+        q = q.to(torch.float32)
+        k = k.to(torch.float32)
+        q_var = q.pow(2).mean(dim=-1, keepdim=True)
+        k_var = k.pow(2).mean(dim=-1, keepdim=True)
+        if q_norm.tp_world > 1:
+            qk_var = torch.cat([q_var, k_var], dim=-1)
+            qk_var = tensor_model_parallel_all_reduce(qk_var) / q_norm.tp_world
+            q_var, k_var = qk_var.chunk(2, dim=-1)
+        q = q * torch.rsqrt(q_var + q_norm.variance_epsilon) * q_norm.weight
+        k = k * torch.rsqrt(k_var + k_norm.variance_epsilon) * k_norm.weight
+        q = q.to(orig_dtype)
+        k = k.to(orig_dtype)
+        return q, k
+
 
 class MiniMaxText01LinearKernel:
     @staticmethod
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 0b63acf2dc5a5159b16aba78bf5cb7a7f85ac44f..c22a309ce166a342edbab8eab6ad242a12b84bf5 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -34,11 +34,13 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
     selective_state_update,
 )
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata
 
 
 # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
+# --8<-- [start:mamba_mixer]
 @CustomOp.register("mamba_mixer")
 class MambaMixer(MambaBase, CustomOp):
     """
@@ -51,6 +53,8 @@ class MambaMixer(MambaBase, CustomOp):
     **selective** state spaces)
     """
 
+    # --8<-- [end:mamba_mixer]
+
     def __init__(
         self,
         hidden_size: int,
@@ -82,6 +86,7 @@ class MambaMixer(MambaBase, CustomOp):
             input_size=conv_kernel_size,
             output_size=intermediate_size,
             bias=use_conv_bias,
+            prefix=f"{prefix}.conv1d",
         )
         # unsqueeze to fit conv1d weights shape into the linear weights shape.
         # Can't do this in `weight_loader` since it already exists in
@@ -90,7 +95,10 @@ class MambaMixer(MambaBase, CustomOp):
         self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
 
         self.in_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2, bias=use_bias
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=use_bias,
+            prefix=f"{prefix}.in_proj",
         )
 
         # selective projection used to make dt, B and C input dependent
@@ -98,12 +106,17 @@ class MambaMixer(MambaBase, CustomOp):
             intermediate_size,
             time_step_rank + ssm_state_size * 2,
             bias=False,
+            prefix=f"{prefix}.x_proj",
         )
         # time step projection (discretization) -
         # In the forward we need to apply dt_proj without the bias,
         # as the bias is added in the selective scan kernel.
         self.dt_proj = ColumnParallelLinear(
-            time_step_rank, intermediate_size, bias=True, skip_bias_add=True
+            time_step_rank,
+            intermediate_size,
+            bias=True,
+            skip_bias_add=True,
+            prefix=f"{prefix}.dt_proj",
         )
 
         def weight_loader(param: Parameter, loaded_weight: torch.Tensor):
@@ -136,6 +149,7 @@ class MambaMixer(MambaBase, CustomOp):
             hidden_size,
             bias=use_bias,
             input_is_parallel=True,
+            prefix=f"{prefix}.out_proj",
         )
 
         self.dt_layernorm = (
@@ -182,11 +196,12 @@ class MambaMixer(MambaBase, CustomOp):
     def _ssm_transform(
         self, x: torch.Tensor
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        if self.is_lora_enabled:
-            #  Lora kernel requires contiguous tensor.
-            ssm_params = self.x_proj(x.contiguous())[0]
-        else:
-            ssm_params = self.x_proj(x)[0]
+        # LoRA kernel requires contiguous tensor.
+        # ROCm: Non-contiguous tensors cause incorrect GEMM
+        # results when batch > 1.
+        if self.is_lora_enabled or current_platform.is_rocm():
+            x = x.contiguous()
+        ssm_params = self.x_proj(x)[0]
         time_step, B, C = torch.split(
             ssm_params,
             [self.time_step_rank, self.ssm_state_size, self.ssm_state_size],
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 0ea5805305eda7ba8370d83139b39f7072fd6de6..74e4a34b4ae0ba9409396e095efe83ec33e3b442 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -5,7 +5,6 @@
 import torch
 from torch import nn
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed import (
     divide,
@@ -43,14 +42,18 @@ from vllm.model_executor.model_loader.weight_utils import (
 )
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
 
 # Added by the IBM Team, 2024
 
 
 # Adapted from transformers.models.mamba2.modeling_mamba2.MambaRMSNormGated
+# --8<-- [start:mixer2_gated_rms_norm]
 @CustomOp.register("mixer2_gated_rms_norm")
 class Mixer2RMSNormGated(CustomOp):
+    # --8<-- [end:mixer2_gated_rms_norm]
+
     def __init__(
         self,
         full_hidden_size: int,
@@ -214,6 +217,7 @@ def mamba_v2_sharded_weight_loader(
 
 
 # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
+# --8<-- [start:mamba_mixer2]
 @CustomOp.register("mamba_mixer2")
 class MambaMixer2(MambaBase, CustomOp):
     """
@@ -226,6 +230,8 @@ class MambaMixer2(MambaBase, CustomOp):
     **selective** state spaces)
     """
 
+    # --8<-- [end:mamba_mixer2]
+
     def __init__(
         self,
         hidden_size: int,
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index 83c2c5f11e187b262af36daabc159912aa6510da..157f9f34647abd41aca73089cd72b1fa6d45c6a7 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -8,8 +8,8 @@
 import numpy as np
 import torch
 
-from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.triton_utils import tl, triton
+from vllm.v1.attention.backends.utils import PAD_SLOT_ID
 
 
 @triton.jit()
diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
index 800f8bd8407928ca7045215f74b63791f020a29b..628ad970cf5454c3c27aed11be0e86f6f6bd9da0 100644
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@@ -8,8 +8,8 @@ import torch
 from packaging import version
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.triton_utils import HAS_TRITON, tl, triton
+from vllm.v1.attention.backends.utils import PAD_SLOT_ID
 
 TRITON3 = HAS_TRITON and (version.parse(triton.__version__) >= version.parse("3.0.0"))
 
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py
index 0bbad17d7ebc7f7cacc7846295aec1aaf8ce40d9..14e00bce2b1d0b7a231f8a7c8d7be2a17b94974b 100644
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -4,7 +4,6 @@
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
@@ -24,11 +23,15 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
     causal_conv1d_update,
 )
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata
 
 
+# --8<-- [start:short_conv]
 @CustomOp.register("short_conv")
 class ShortConv(MambaBase, CustomOp):
+    # --8<-- [end:short_conv]
+
     def __init__(
         self,
         config,
@@ -118,6 +121,7 @@ class ShortConv(MambaBase, CustomOp):
             conv_state = self_kv_cache[0].transpose(-1, -2)
             state_indices_tensor = attn_metadata.state_indices_tensor
             has_initial_states_p = attn_metadata.has_initial_states_p
+            query_start_loc_p = attn_metadata.query_start_loc_p
 
         BCx, _ = self.in_proj(hidden_states)
 
@@ -165,11 +169,6 @@ class ShortConv(MambaBase, CustomOp):
             [num_decodes, num_prefills],
             dim=0,
         )
-        query_start_loc_p = (
-            attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes
-            if has_prefill
-            else None
-        )
 
         conv_output_list = []
 
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index 1656f4deb671706018654f1eebe031d7cea944a9..65541d2a485a8e4d5e307d9f0d19c69d9783c0bc 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -29,6 +29,7 @@ class MLAModules:
     indexer_rotary_emb: torch.nn.Module | None = None
 
 
+# --8<-- [start:multi_head_latent_attention]
 @CustomOp.register("multi_head_latent_attention")
 class MultiHeadLatentAttentionWrapper(CustomOp):
     """MLA layer registered as CustomOp to allow OOT backends to add
@@ -47,6 +48,8 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
     3. Return the output tensor.
     """
 
+    # --8<-- [end:multi_head_latent_attention]
+
     def __init__(
         self,
         hidden_size: int,
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
deleted file mode 100644
index d1942689d7f5c54cb2d9bae05d5f339b92803afa..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/pooler.py
+++ /dev/null
@@ -1,830 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from abc import ABC, abstractmethod
-from collections.abc import Callable, Mapping, Set
-from dataclasses import dataclass
-from enum import IntEnum
-from itertools import groupby
-from typing import TypeVar
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from transformers import PretrainedConfig
-
-from vllm.config import ModelConfig, PoolerConfig, get_current_vllm_config
-from vllm.logger import init_logger
-from vllm.model_executor.models.adapters import _load_st_projector
-from vllm.pooling_params import PoolingParams
-from vllm.tasks import PoolingTask
-from vllm.utils.import_utils import resolve_obj_by_qualname
-from vllm.v1.outputs import PoolerOutput
-from vllm.v1.pool.metadata import PoolingCursor, PoolingMetadata
-
-logger = init_logger(__name__)
-
-PoolingFn = Callable[
-    [torch.Tensor | list[torch.Tensor], PoolingMetadata],
-    torch.Tensor | list[torch.Tensor],
-]
-ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
-
-
-class PoolingType(IntEnum):
-    """Enumeration for different types of pooling methods."""
-
-    LAST = 0
-    ALL = 1
-    CLS = 2
-    STEP = 3
-    MEAN = 4
-
-
-@dataclass(frozen=True)
-class ResolvedPoolingConfig:
-    pooling_type: PoolingType
-    task: PoolingTask
-
-    @classmethod
-    def from_config(
-        cls,
-        task: PoolingTask,
-        pooler_config: PoolerConfig,
-    ) -> "ResolvedPoolingConfig":
-        assert pooler_config.pooling_type is not None
-        return cls(task=task, pooling_type=PoolingType[pooler_config.pooling_type])
-
-
-@dataclass(frozen=True)
-class PoolingParamsUpdate:
-    requires_token_ids: bool = False
-    """Set this flag to enable `get_prompt_token_ids` for your pooler."""
-
-    def apply(self, params: PoolingParams) -> None:
-        params.requires_token_ids = self.requires_token_ids
-
-
-def get_classification_activation_function(config: PretrainedConfig):
-    # Implement alignment with transformers ForSequenceClassificationLoss
-    # https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92
-    problem_type = getattr(config, "problem_type", "")
-    if problem_type == "regression":
-        return PoolerIdentity()
-    if problem_type == "single_label_classification":
-        return PoolerClassify()
-    if problem_type == "multi_label_classification":
-        return PoolerMultiLabelClassify()
-    return PoolerClassify()
-
-
-def get_cross_encoder_activation_function(config: PretrainedConfig):
-    function_name: str | None = None
-    if (
-        hasattr(config, "sentence_transformers")
-        and "activation_fn" in config.sentence_transformers
-    ):
-        function_name = config.sentence_transformers["activation_fn"]
-    elif (
-        hasattr(config, "sbert_ce_default_activation_function")
-        and config.sbert_ce_default_activation_function is not None
-    ):
-        function_name = config.sbert_ce_default_activation_function
-
-    if function_name is not None:
-        assert function_name.startswith("torch.nn.modules."), (
-            "Loading of activation functions is restricted to "
-            "torch.nn.modules for security reasons"
-        )
-        fn = resolve_obj_by_qualname(function_name)()
-        return PoolerActivation.wraps(fn)
-
-    return PoolerClassify()
-
-
-class PoolingMethod(nn.Module, ABC):
-    @staticmethod
-    def from_pooling_type(pooling_type: PoolingType) -> "PoolingMethod":
-        if pooling_type == PoolingType.LAST:
-            return LastPool()
-        if pooling_type == PoolingType.ALL:
-            return AllPool()
-        if pooling_type == PoolingType.CLS:
-            return CLSPool()
-        if pooling_type == PoolingType.MEAN:
-            return MeanPool()
-
-        raise NotImplementedError(f"Unsupported method: {pooling_type}")
-
-    @abstractmethod
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        raise NotImplementedError
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return PoolingParamsUpdate()
-
-    @abstractmethod
-    def forward_all(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_cursor: PoolingCursor,
-    ) -> PoolerOutput:
-        raise NotImplementedError
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooling_cursor = pooling_metadata.pooling_cursor
-        return self.forward_all(hidden_states, pooling_cursor)
-
-
-class CLSPool(PoolingMethod):
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"token_embed", "token_classify", "embed", "classify", "score"}
-
-    def forward_all(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_cursor: PoolingCursor,
-    ) -> PoolerOutput:
-        assert not pooling_cursor.is_partial_prefill(), (
-            "partial prefill not supported with CLS pooling"
-        )
-
-        return hidden_states[pooling_cursor.first_token_indices_gpu]
-
-
-class LastPool(PoolingMethod):
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"token_embed", "token_classify", "embed", "classify", "score"}
-
-    def forward_all(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_cursor: PoolingCursor,
-    ) -> PoolerOutput:
-        return hidden_states[pooling_cursor.last_token_indices_gpu]
-
-
-class AllPool(PoolingMethod):
-    def __init__(self):
-        super().__init__()
-
-        vllm_config = get_current_vllm_config()
-        self.enable_chunked_prefill = (
-            vllm_config.scheduler_config.enable_chunked_prefill
-        )
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"token_embed", "token_classify"}
-
-    def forward_all(
-        self, hidden_states: torch.Tensor, pooling_cursor: PoolingCursor
-    ) -> PoolerOutput:
-        raise NotImplementedError(
-            "forward_all is not implemented for AllPool. Use forward instead."
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooling_cursor = pooling_metadata.pooling_cursor
-        is_finished = pooling_cursor.is_finished()
-        hidden_states_lst = list(
-            hidden_states.split(pooling_cursor.num_scheduled_tokens_cpu.tolist())
-        )
-        hidden_states_lst = [hidden_states_lst[i] for i in pooling_cursor.index]
-
-        if not self.enable_chunked_prefill:
-            return hidden_states_lst
-
-        pooling_states = pooling_metadata.pooling_states
-
-        # If chunked_prefill is enabled
-        # 1. first store the chunked hidden_states in pooling_states.hidden_states_cache
-        for p, hs_chunk in zip(pooling_states, hidden_states_lst):
-            p.hidden_states_cache.append(hs_chunk)
-
-        # 2. Once prefill is finished, send hidden_states_cache to PoolerHead
-        output_list: PoolerOutput = []
-        for p, finished in zip(pooling_states, is_finished):
-            if finished:
-                hidden_states_cache = p.hidden_states_cache
-                if len(hidden_states_cache) == 1:
-                    output_list.append(hidden_states_cache[0])
-                else:
-                    output_list.append(torch.concat(hidden_states_cache, dim=0))
-                p.clean()
-            else:
-                output_list.append(None)
-
-        return output_list
-
-
-class MeanPool(PoolingMethod):
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"token_embed", "token_classify", "embed", "classify", "score"}
-
-    def forward_all(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_cursor: PoolingCursor,
-    ) -> PoolerOutput:
-        assert not pooling_cursor.is_partial_prefill(), (
-            "partial prefill not supported with MEAN pooling"
-        )
-
-        prompt_lens = pooling_cursor.prompt_lens_cpu.to(
-            hidden_states.device, non_blocking=True
-        )
-
-        # Use float32 for torch.cumsum in MeanPool,
-        # otherwise precision will be lost significantly.
-        cumsum = torch.cumsum(hidden_states, dim=0, dtype=torch.float32)
-
-        start_indices = pooling_cursor.first_token_indices_gpu
-        end_indices = pooling_cursor.last_token_indices_gpu
-        return (
-            cumsum[end_indices] - cumsum[start_indices] + hidden_states[start_indices]
-        ) / prompt_lens.unsqueeze(1)
-
-
-_T = TypeVar("_T", torch.Tensor, list[torch.Tensor])
-
-
-class BasePoolerActivation(nn.Module, ABC):
-    @abstractmethod
-    def forward(self, pooled_data: _T) -> _T:
-        # shape:
-        # classify (& score) -> (batch_size, num_classes)
-        # embed -> (batch_size, embedding_dim) or list(embedding_dim)
-        #          (batch_size, dimensions) or list(dimensions) if using MRL
-        raise NotImplementedError
-
-
-class PoolerActivation(BasePoolerActivation):
-    @staticmethod
-    def wraps(module: nn.Module):
-        if isinstance(module, nn.Identity):
-            return PoolerIdentity()
-        if isinstance(module, (nn.Sigmoid, nn.Softmax)):
-            return PoolerClassify()
-
-        return LambdaPoolerActivation(module)
-
-    @abstractmethod
-    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        raise NotImplementedError
-
-    def forward(self, pooled_data: _T) -> _T:
-        if isinstance(pooled_data, list):
-            return [self.forward_chunk(data) for data in pooled_data]
-
-        return self.forward_chunk(pooled_data)
-
-
-class PoolerIdentity(PoolerActivation):
-    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        return pooled_data
-
-
-class PoolerNormalize(PoolerActivation):
-    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        return F.normalize(pooled_data, p=2, dim=-1)
-
-
-class PoolerMultiLabelClassify(PoolerActivation):
-    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        return F.sigmoid(pooled_data)
-
-
-class PoolerClassify(PoolerActivation):
-    def __init__(self, *, static_num_labels: bool = True) -> None:
-        super().__init__()
-
-        if static_num_labels:
-            vllm_config = get_current_vllm_config()
-            self.num_labels = getattr(
-                vllm_config.model_config.hf_config, "num_labels", 0
-            )
-            if self.num_labels == 0:
-                logger.warning(
-                    "num_labels should be > 0 for classification"
-                    "models, falling back to softmax. "
-                    "Please check if the configuration is correct."
-                )
-        else:
-            self.num_labels = None
-
-    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        num_labels = (
-            self.num_labels if self.num_labels is not None else pooled_data.shape[-1]
-        )
-
-        if num_labels < 2:
-            return F.sigmoid(pooled_data)
-
-        return F.softmax(pooled_data, dim=-1)
-
-
-class LambdaPoolerActivation(PoolerActivation):
-    def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]):
-        super().__init__()
-
-        self.fn = fn
-
-    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        return self.fn(pooled_data)
-
-
-class Pooler(nn.Module, ABC):
-    """The interface required for all poolers used in pooling models in vLLM."""
-
-    @staticmethod
-    def for_token_embed(pooler_config: PoolerConfig):
-        head = TokenEmbeddingPoolerHead()
-
-        if pooler_config.pooling_type == "STEP":
-            return StepPooler(head=head)
-
-        return AllPooler(head=head)
-
-    @staticmethod
-    def for_token_classify(
-        pooler_config: PoolerConfig,
-        classifier: ClassifierFn | None = None,
-        act_fn: PoolerActivation | str | None = None,
-    ):
-        head = TokenClassifierPoolerHead(classifier=classifier, act_fn=act_fn)
-
-        if pooler_config.pooling_type == "STEP":
-            return StepPooler(head=head)
-
-        return AllPooler(head=head)
-
-    @staticmethod
-    def for_embed(pooler_config: PoolerConfig):
-        resolved_config = ResolvedPoolingConfig.from_config(
-            task="embed",
-            pooler_config=pooler_config,
-        )
-
-        pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type)
-        head = EmbeddingPoolerHead()
-
-        return SimplePooler(pooling=pooling, head=head)
-
-    @staticmethod
-    def for_classify(
-        pooler_config: PoolerConfig,
-        classifier: ClassifierFn | None,
-        act_fn: PoolerActivation | str | None = None,
-    ):
-        resolved_config = ResolvedPoolingConfig.from_config(
-            task="classify",
-            pooler_config=pooler_config,
-        )
-
-        pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type)
-
-        return ClassifierPooler(
-            pooling=pooling,
-            classifier=classifier,
-            act_fn=act_fn,
-        )
-
-    @abstractmethod
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        """Determine which pooling tasks are supported."""
-        raise NotImplementedError
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        """
-        Construct the updated pooling parameters to use for a supported task.
-        """
-        return PoolingParamsUpdate()
-
-    @abstractmethod
-    def forward(
-        self,
-        hidden_states: list[torch.Tensor] | torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        raise NotImplementedError
-
-
-class DummyPooler(Pooler):
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"plugin", "score"}
-
-    def forward(
-        self,
-        hidden_states: list[torch.Tensor] | torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        return hidden_states
-
-
-class PoolerHead(nn.Module):
-    def __init__(self, activation: PoolerActivation) -> None:
-        super().__init__()
-        self.activation = activation
-
-    def forward(
-        self,
-        pooled_data: list[torch.Tensor] | torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        return self.activation(pooled_data)
-
-
-class EmbeddingPoolerHead(PoolerHead):
-    def __init__(self) -> None:
-        super().__init__(activation=PoolerNormalize())
-
-        # Load ST projector if available
-        vllm_config = get_current_vllm_config()
-        self.projector: nn.Module | None = (
-            _load_st_projector(vllm_config.model_config) if vllm_config else None
-        )
-        self.head_dtype = vllm_config.model_config.head_dtype
-
-    def forward(
-        self,
-        pooled_data: list[torch.Tensor] | torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        if isinstance(pooled_data, list):
-            pooled_data = torch.stack(pooled_data)
-        # pooled_data shape: [batchsize, hidden_dimension]
-
-        pooled_data = pooled_data.to(self.head_dtype)
-
-        # Apply ST projector
-        if self.projector is not None:
-            pooled_data = self.projector(pooled_data)
-        # pooled_data shape: [batchsize, embedding_dimension]
-
-        pooling_params = pooling_metadata.pooling_params
-
-        # for matryoshka representation
-        dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params]
-        if any(d is not None for d in dimensions_list):
-            # change the output dimension
-            assert len(pooled_data) == len(dimensions_list)
-            if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list):
-                # if all dimensions are the same
-                d = dimensions_list[0]
-                pooled_data = pooled_data[..., :d]
-            else:
-                pooled_data = [
-                    vecs if d is None else vecs[..., :d]
-                    for vecs, d in zip(pooled_data, dimensions_list)
-                ]
-
-        # for normalize
-        flags = [p.normalize for p in pooling_params]
-        if len(set(flags)) == 1:
-            if flags[0]:
-                pooled_data = self.activation(pooled_data)
-        else:
-            pooled_data = [
-                self.activation(vecs) if f else vecs
-                for vecs, f in zip(pooled_data, flags)
-            ]
-
-        # pooled_data shape: [batchsize, embedding_dimension]
-        return pooled_data
-
-
-class SimplePooler(Pooler):
-    """A layer that pools specific information from hidden states.
-
-    This layer does the following:
-    1. Extracts specific tokens or aggregates data based on pooling method.
-    2. Normalizes output if specified.
-    3. Returns structured results as `PoolerOutput`.
-    """
-
-    def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None:
-        super().__init__()
-
-        self.pooling = pooling
-        self.head = head
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return self.pooling.get_supported_tasks()
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return self.pooling.get_pooling_updates(task)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooled_data = self.pooling(hidden_states, pooling_metadata)
-        pooled_data = self.head(pooled_data, pooling_metadata)
-        return pooled_data
-
-
-class ClassifierPooler(Pooler):
-    """A pooling layer for classification tasks.
-
-    This layer does the following:
-    1. Applies a classification layer to the hidden states.
-    2. Optionally applies a pooler layer.
-    3. Applies an activation function to the output.
-    """
-
-    @staticmethod
-    def act_fn_for_seq_cls(model_config: ModelConfig):
-        return get_classification_activation_function(model_config.hf_config)
-
-    @staticmethod
-    def act_fn_for_cross_encoder(model_config: ModelConfig):
-        return get_cross_encoder_activation_function(model_config.hf_config)
-
-    @staticmethod
-    def resolve_act_fn(
-        model_config: ModelConfig,
-        static_num_labels: bool = True,
-        act_fn: PoolerActivation | str | None = None,
-    ):
-        if isinstance(act_fn, str):
-            if act_fn == "classify":
-                return ClassifierPooler.act_fn_for_seq_cls(model_config)
-            elif act_fn == "score":
-                return ClassifierPooler.act_fn_for_cross_encoder(model_config)
-            else:
-                raise ValueError(f"act_fn [{act_fn=}] not supported.")
-        elif act_fn is None:
-            return PoolerClassify(static_num_labels=static_num_labels)
-        else:
-            assert callable(act_fn)
-            return act_fn
-
-    def __init__(
-        self,
-        pooling: PoolingFn,
-        classifier: ClassifierFn | None,
-        act_fn: PoolerActivation | str | None = None,
-    ) -> None:
-        super().__init__()
-
-        vllm_config = get_current_vllm_config()
-        self.pooling = pooling
-        self.classifier = classifier
-        self.act_fn = self.resolve_act_fn(
-            vllm_config.model_config, static_num_labels=True, act_fn=act_fn
-        )
-        self.logit_bias: float | None = (
-            vllm_config.model_config.pooler_config.logit_bias
-        )
-        self.head_dtype = vllm_config.model_config.head_dtype
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"classify", "score"}
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooled_data = self.pooling(hidden_states, pooling_metadata)
-        if isinstance(pooled_data, list):
-            pooled_data = torch.stack(pooled_data)
-        # pooled_data shape: [batchsize, hidden_size]
-
-        pooled_data = pooled_data.to(self.head_dtype)
-
-        if self.classifier is not None:
-            pooled_data = self.classifier(pooled_data)
-        # pooled_data shape: [batchsize, num_labels]
-
-        if self.logit_bias is not None:
-            pooled_data -= self.logit_bias
-
-        pooling_params = pooling_metadata.pooling_params
-        flags = [p.use_activation for p in pooling_params]
-
-        if len(set(flags)) == 1:
-            scores = self.act_fn(pooled_data) if flags[0] else pooled_data
-        else:
-            scores = [
-                self.act_fn(vecs) if f else vecs for vecs, f in zip(pooled_data, flags)
-            ]
-
-        # scores shape: [batchsize, num_labels]
-        return scores
-
-
-class TokenEmbeddingPoolerHead(EmbeddingPoolerHead):
-    def forward(
-        self, pooled_data: torch.Tensor | None, pooling_param: PoolingParams
-    ) -> PoolerOutput:
-        # for unfinished chunked prefill
-        if pooled_data is None:
-            return None
-
-        pooled_data = pooled_data.to(self.head_dtype)
-        # pooled_data shape: [n_tokens, hidden_dimension]
-
-        # Apply ST projector
-        if self.projector is not None:
-            pooled_data = self.projector(pooled_data)
-        # pooled_data shape: [n_tokens, embedding_dimension]
-
-        # for matryoshka representation
-        pooled_data = pooled_data[..., : pooling_param.dimensions]
-
-        # for normalize
-        if pooling_param.normalize:
-            pooled_data = self.activation(pooled_data)
-
-        # pooled_data shape: [n_tokens, embedding_dimension]
-        return pooled_data
-
-
-class TokenClassifierPoolerHead(nn.Module):
-    def __init__(
-        self,
-        classifier: ClassifierFn | None,
-        act_fn: PoolerActivation | str | None = None,
-    ) -> None:
-        super().__init__()
-        vllm_config = get_current_vllm_config()
-
-        self.classifier = classifier
-        self.act_fn = ClassifierPooler.resolve_act_fn(
-            vllm_config.model_config, static_num_labels=False, act_fn=act_fn
-        )
-        self.logit_bias: float | None = (
-            vllm_config.model_config.pooler_config.logit_bias
-        )
-        self.head_dtype = vllm_config.model_config.head_dtype
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"token_classify"}
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor | None,
-        pooling_param: PoolingParams,
-    ) -> PoolerOutput:
-        # for unfinished chunked prefill
-        if hidden_states is None:
-            return None
-
-        hidden_states = hidden_states.to(self.head_dtype)
-        # hidden_states shape: [n_token, hidden_size]
-
-        if self.classifier is not None:
-            scores = self.classifier(hidden_states)
-        else:
-            scores = hidden_states
-        # scores shape: [n_token, num_labels]
-
-        if self.logit_bias is not None:
-            scores -= self.logit_bias
-
-        if pooling_param.use_activation:
-            scores = self.act_fn(scores)
-
-        # scores shape: [n_token, num_labels]
-        return scores
-
-
-class AllPooler(Pooler):
-    def __init__(self, head: nn.Module | PoolerHead) -> None:
-        super().__init__()
-
-        self.pooling = AllPool()
-        self.head = head
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"token_embed", "token_classify"}
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooled_data = self.pooling(hidden_states, pooling_metadata)
-        pooling_params = pooling_metadata.pooling_params
-        assert len(pooled_data) == len(pooling_params)
-
-        pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
-        return pooled_data
-
-
-class StepPooler(Pooler):
-    def __init__(self, head: nn.Module | PoolerHead) -> None:
-        super().__init__()
-
-        self.pooling = AllPool()
-        self.head = head
-
-    def extract_states(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
-        prompt_token_ids = pooling_metadata.get_prompt_token_ids()
-        pooling_params = pooling_metadata.pooling_params
-
-        pooled_data: PoolerOutput = []
-        for data, token_id, pooling_param in zip(
-            pooled_data_lst, prompt_token_ids, pooling_params
-        ):
-            # for unfinished chunked prefill
-            if data is None:
-                pooled_data.append(data)
-                continue
-
-            step_tag_id = pooling_param.step_tag_id
-            returned_token_ids = pooling_param.returned_token_ids
-
-            if returned_token_ids is not None and len(returned_token_ids) > 0:
-                data = data[:, returned_token_ids]
-
-            if step_tag_id is not None:
-                data = data[token_id == step_tag_id]
-            pooled_data.append(data)
-
-        return pooled_data
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"token_embed", "token_classify"}
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return PoolingParamsUpdate(requires_token_ids=True)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooled_data = self.extract_states(hidden_states, pooling_metadata)
-        pooling_params = pooling_metadata.pooling_params
-        assert len(pooled_data) == len(pooling_params)
-
-        pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
-        return pooled_data
-
-
-class DispatchPooler(Pooler):
-    """Dispatches calls to a sub-pooler based on the pooling task."""
-
-    def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None:
-        super().__init__()
-
-        for task, pooler in poolers_by_task.items():
-            if task not in pooler.get_supported_tasks():
-                raise ValueError(
-                    f"{pooler=} does not support {task=}. "
-                    f"Supported tasks: {pooler.get_supported_tasks()}"
-                )
-
-        self.poolers_by_task = poolers_by_task
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return set(self.poolers_by_task)
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return self.poolers_by_task[task].get_pooling_updates(task)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        poolers_by_task = self.poolers_by_task
-
-        outputs = list[torch.Tensor]()
-        offset = 0
-        for task, group in groupby(pooling_metadata.tasks):
-            if not (pooler := poolers_by_task.get(task)):
-                raise ValueError(
-                    f"Unsupported task: {task} "
-                    f"Supported tasks: {self.get_supported_tasks()}"
-                )
-
-            num_items = len(list(group))
-            group_output: PoolerOutput = pooler(
-                hidden_states,
-                pooling_metadata[offset : offset + num_items],
-            )
-
-            outputs.extend(group_output)
-            offset += num_items
-
-        return outputs
-
-    def extra_repr(self) -> str:
-        s = f"supported_task={self.get_supported_tasks()}"
-        return s
diff --git a/vllm/model_executor/layers/pooler/__init__.py b/vllm/model_executor/layers/pooler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2be3613385e023471e8aafaa2dffd5f100a6ffbc
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/__init__.py
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .abstract import *
+from .common import *
+from .special import *
diff --git a/vllm/model_executor/layers/pooler/abstract.py b/vllm/model_executor/layers/pooler/abstract.py
new file mode 100644
index 0000000000000000000000000000000000000000..82abef4f69adbfa1f357c319d7c0d0231d2ba390
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/abstract.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Set
+
+import torch
+import torch.nn as nn
+
+from vllm.tasks import PoolingTask
+from vllm.v1.outputs import PoolerOutput
+from vllm.v1.pool.metadata import PoolingMetadata
+
+from .common import PoolingParamsUpdate
+
+
+class Pooler(nn.Module, ABC):
+    """The interface required for all poolers used in pooling models in vLLM."""
+
+    @abstractmethod
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        """Determine which pooling tasks are supported."""
+        raise NotImplementedError
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        """
+        Construct the updated pooling parameters to use for a supported task.
+        """
+        return PoolingParamsUpdate()
+
+    @abstractmethod
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        raise NotImplementedError
+
+
+__all__ = ["Pooler"]
diff --git a/vllm/model_executor/layers/pooler/activations.py b/vllm/model_executor/layers/pooler/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..b57e6ba68b9413692c27f6fa5e680b94290c2444
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/activations.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from typing import TypeVar
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+
+from vllm.config import ModelConfig, get_current_vllm_config
+from vllm.logger import init_logger
+from vllm.utils.import_utils import resolve_obj_by_qualname
+
+logger = init_logger(__name__)
+
+
+def get_classification_act_fn(
+    config: PretrainedConfig,
+) -> "PoolerActivation":
+    # Implement alignment with transformers ForSequenceClassificationLoss
+    # https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92
+    problem_type = getattr(config, "problem_type", "")
+    if problem_type == "regression":
+        return PoolerIdentity()
+    if problem_type == "single_label_classification":
+        return PoolerClassify()
+    if problem_type == "multi_label_classification":
+        return PoolerMultiLabelClassify()
+
+    return PoolerClassify()
+
+
+def get_cross_encoder_act_fn(
+    config: PretrainedConfig,
+) -> "PoolerActivation":
+    function_name: str | None = None
+    if (
+        hasattr(config, "sentence_transformers")
+        and "activation_fn" in config.sentence_transformers
+    ):
+        function_name = config.sentence_transformers["activation_fn"]
+    elif (
+        hasattr(config, "sbert_ce_default_activation_function")
+        and config.sbert_ce_default_activation_function is not None
+    ):
+        function_name = config.sbert_ce_default_activation_function
+
+    if function_name is not None:
+        assert function_name.startswith("torch.nn.modules."), (
+            "Loading of activation functions is restricted to "
+            "torch.nn.modules for security reasons"
+        )
+        fn = resolve_obj_by_qualname(function_name)()
+        return PoolerActivation.wraps(fn)
+
+    return PoolerClassify()
+
+
+def resolve_classifier_act_fn(
+    model_config: ModelConfig,
+    static_num_labels: bool = True,
+    act_fn: "PoolerActivation | str | None" = None,
+):
+    if isinstance(act_fn, str):
+        if act_fn == "classify":
+            return get_classification_act_fn(model_config.hf_config)
+        if act_fn == "score":
+            return get_cross_encoder_act_fn(model_config.hf_config)
+
+        raise ValueError(f"act_fn [{act_fn=}] not supported.")
+
+    if act_fn is None:
+        return PoolerClassify(static_num_labels=static_num_labels)
+
+    assert callable(act_fn)
+    return act_fn
+
+
+_T = TypeVar("_T", torch.Tensor, list[torch.Tensor])
+
+
+class PoolerActivation(nn.Module, ABC):
+    @staticmethod
+    def wraps(module: nn.Module):
+        if isinstance(module, nn.Identity):
+            return PoolerIdentity()
+        if isinstance(module, (nn.Sigmoid, nn.Softmax)):
+            return PoolerClassify()
+
+        return LambdaPoolerActivation(module)
+
+    @abstractmethod
+    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+    def forward(self, pooled_data: _T) -> _T:
+        # shape:
+        # classify (& score) -> (batch_size, num_classes)
+        # embed -> (batch_size, embedding_dim) or list(embedding_dim)
+        #          (batch_size, dimensions) or list(dimensions) if using MRL
+        if isinstance(pooled_data, list):
+            return [self.forward_chunk(data) for data in pooled_data]
+
+        return self.forward_chunk(pooled_data)
+
+
+class PoolerIdentity(PoolerActivation):
+    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
+        return pooled_data
+
+
+class PoolerNormalize(PoolerActivation):
+    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
+        return F.normalize(pooled_data, p=2, dim=-1)
+
+
+class PoolerMultiLabelClassify(PoolerActivation):
+    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
+        return F.sigmoid(pooled_data)
+
+
+class PoolerClassify(PoolerActivation):
+    def __init__(self, *, static_num_labels: bool = True) -> None:
+        super().__init__()
+
+        if static_num_labels:
+            vllm_config = get_current_vllm_config()
+            model_config = vllm_config.model_config
+            num_labels = getattr(model_config.hf_config, "num_labels", 0)
+        else:
+            num_labels = None
+
+        if num_labels == 0:
+            logger.warning(
+                "num_labels should be > 0 for classification "
+                "models, falling back to softmax. "
+                "Please check if the configuration is correct."
+            )
+
+        self.num_labels = num_labels
+
+    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
+        num_labels = self.num_labels
+        if num_labels is None:
+            num_labels = pooled_data.shape[-1]
+
+        if num_labels < 2:
+            return F.sigmoid(pooled_data)
+
+        return F.softmax(pooled_data, dim=-1)
+
+
+class LambdaPoolerActivation(PoolerActivation):
+    def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]):
+        super().__init__()
+
+        self.fn = fn
+
+    def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
+        return self.fn(pooled_data)
diff --git a/vllm/model_executor/layers/pooler/common.py b/vllm/model_executor/layers/pooler/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8aa78e70cc6e083bdc7d6fe780cbcca3de077fd
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/common.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import TypeVar
+
+import torch
+
+from vllm.pooling_params import PoolingParams
+
+_T = TypeVar("_T", bound=torch.Tensor | list[torch.Tensor])
+
+ProjectorFn = Callable[[torch.Tensor], torch.Tensor]
+ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
+ActivationFn = Callable[[_T], _T]
+
+
+@dataclass(frozen=True)
+class PoolingParamsUpdate:
+    requires_token_ids: bool = False
+    """Set this flag to enable `get_prompt_token_ids` for your pooler."""
+
+    def __or__(self, other: "PoolingParamsUpdate") -> "PoolingParamsUpdate":
+        return PoolingParamsUpdate(
+            requires_token_ids=self.requires_token_ids or other.requires_token_ids,
+        )
+
+    def apply(self, params: PoolingParams) -> None:
+        params.requires_token_ids = self.requires_token_ids
+
+
+__all__ = ["ActivationFn", "ClassifierFn", "ProjectorFn", "PoolingParamsUpdate"]
diff --git a/vllm/model_executor/layers/pooler/seqwise/__init__.py b/vllm/model_executor/layers/pooler/seqwise/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1b0476a5ba2195b2adc13b105ccc7de3cf58de6
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/seqwise/__init__.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Poolers that produce an output aggregating all tokens in the sequence."""
+
+from .heads import (
+    ClassifierPoolerHead,
+    EmbeddingPoolerHead,
+    SequencePoolerHead,
+    SequencePoolerHeadOutput,
+)
+from .methods import (
+    CLSPool,
+    LastPool,
+    MeanPool,
+    SequencePoolingMethod,
+    SequencePoolingMethodOutput,
+    get_seq_pooling_method,
+)
+from .poolers import (
+    SequencePooler,
+    SequencePoolerOutput,
+    SequencePoolingFn,
+    SequencePoolingHeadFn,
+    pooler_for_classify,
+    pooler_for_embed,
+)
+
+__all__ = [
+    "SequencePoolerHead",
+    "SequencePoolerHeadOutput",
+    "ClassifierPoolerHead",
+    "EmbeddingPoolerHead",
+    "SequencePoolingMethod",
+    "SequencePoolingMethodOutput",
+    "CLSPool",
+    "LastPool",
+    "MeanPool",
+    "get_seq_pooling_method",
+    "SequencePooler",
+    "SequencePoolingFn",
+    "SequencePoolingHeadFn",
+    "SequencePoolerOutput",
+    "pooler_for_classify",
+    "pooler_for_embed",
+]
diff --git a/vllm/model_executor/layers/pooler/seqwise/heads.py b/vllm/model_executor/layers/pooler/seqwise/heads.py
new file mode 100644
index 0000000000000000000000000000000000000000..42059284e5cd59d8a3f6fa45a65d6d13c5bb24dd
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/seqwise/heads.py
@@ -0,0 +1,151 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Set
+from typing import TypeAlias
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.layers.pooler import ActivationFn, ClassifierFn, ProjectorFn
+from vllm.tasks import PoolingTask
+from vllm.v1.pool.metadata import PoolingMetadata
+
+from .methods import SequencePoolingMethodOutput
+
+SequencePoolerHeadOutput: TypeAlias = torch.Tensor | list[torch.Tensor]
+
+
+class SequencePoolerHead(nn.Module, ABC):
+    @abstractmethod
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def forward(
+        self,
+        pooled_data: SequencePoolingMethodOutput,
+        pooling_metadata: PoolingMetadata,
+    ) -> SequencePoolerHeadOutput:
+        raise NotImplementedError
+
+
+class EmbeddingPoolerHead(SequencePoolerHead):
+    def __init__(
+        self,
+        projector: ProjectorFn | None = None,
+        head_dtype: torch.dtype | str | None = None,
+        activation: ActivationFn | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.projector = projector
+        self.head_dtype = head_dtype
+        self.activation = activation
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"embed"}
+
+    def forward(
+        self,
+        pooled_data: SequencePoolingMethodOutput,
+        pooling_metadata: PoolingMetadata,
+    ) -> SequencePoolerHeadOutput:
+        pooling_params = pooling_metadata.pooling_params
+        assert len(pooled_data) == len(pooling_params)
+
+        if isinstance(pooled_data, list):
+            pooled_data = torch.stack(pooled_data)
+        # pooled_data shape: [batchsize, hidden_dimension]
+
+        if self.head_dtype is not None:
+            pooled_data = pooled_data.to(self.head_dtype)
+
+        # Apply ST projector
+        if self.projector is not None:
+            pooled_data = self.projector(pooled_data)
+        # pooled_data shape: [batchsize, embedding_dimension]
+
+        # for matryoshka representation
+        dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params]
+        if any(d is not None for d in dimensions_list):
+            # change the output dimension
+            assert len(pooled_data) == len(dimensions_list)
+            if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list):
+                # if all dimensions are the same
+                d = dimensions_list[0]
+                pooled_data = pooled_data[..., :d]
+            else:
+                pooled_data = [
+                    vecs if d is None else vecs[..., :d]
+                    for vecs, d in zip(pooled_data, dimensions_list)
+                ]
+
+        # for normalize
+        if self.activation is not None:
+            flags = [p.use_activation for p in pooling_params]
+            if len(set(flags)) == 1:
+                if flags[0]:
+                    pooled_data = self.activation(pooled_data)
+            else:
+                pooled_data = [
+                    self.activation(vecs) if f else vecs
+                    for vecs, f in zip(pooled_data, flags)
+                ]
+
+        # pooled_data shape: [batchsize, embedding_dimension]
+        return pooled_data
+
+
+class ClassifierPoolerHead(SequencePoolerHead):
+    def __init__(
+        self,
+        classifier: ClassifierFn | None = None,
+        logit_bias: float | None = None,
+        head_dtype: torch.dtype | str | None = None,
+        activation: ActivationFn | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.classifier = classifier
+        self.logit_bias = logit_bias
+        self.head_dtype = head_dtype
+        self.activation = activation
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"classify", "score"}
+
+    def forward(
+        self,
+        pooled_data: SequencePoolingMethodOutput,
+        pooling_metadata: PoolingMetadata,
+    ) -> SequencePoolerHeadOutput:
+        pooling_params = pooling_metadata.pooling_params
+        assert len(pooled_data) == len(pooling_params)
+
+        if isinstance(pooled_data, list):
+            pooled_data = torch.stack(pooled_data)
+        # pooled_data shape: [batchsize, hidden_size]
+
+        if self.head_dtype is not None:
+            pooled_data = pooled_data.to(self.head_dtype)
+
+        if self.classifier is not None:
+            pooled_data = self.classifier(pooled_data)
+        # pooled_data shape: [batchsize, num_labels]
+
+        if self.logit_bias is not None:
+            pooled_data -= self.logit_bias
+
+        if self.activation is not None:
+            flags = [p.use_activation for p in pooling_params]
+            if len(set(flags)) == 1:
+                pooled_data = self.activation(pooled_data) if flags[0] else pooled_data
+            else:
+                pooled_data = [
+                    self.activation(vecs) if f else vecs
+                    for vecs, f in zip(pooled_data, flags)
+                ]
+
+        # pooled_data shape: [batchsize, num_labels]
+        return pooled_data
diff --git a/vllm/model_executor/layers/pooler/seqwise/methods.py b/vllm/model_executor/layers/pooler/seqwise/methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d855109509681b216fa90f3e9ba9c1c9d61b7d4
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/seqwise/methods.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Set
+from typing import TypeAlias
+
+import torch
+import torch.nn as nn
+
+from vllm.config.pooler import SequencePoolingType
+from vllm.model_executor.layers.pooler import PoolingParamsUpdate
+from vllm.tasks import PoolingTask
+from vllm.v1.pool.metadata import PoolingMetadata
+
+SequencePoolingMethodOutput: TypeAlias = torch.Tensor | list[torch.Tensor]
+
+
+class SequencePoolingMethod(nn.Module, ABC):
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_embed", "token_classify", "embed", "classify", "score"}
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        return PoolingParamsUpdate()
+
+    @abstractmethod
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> SequencePoolingMethodOutput:
+        raise NotImplementedError
+
+
+class CLSPool(SequencePoolingMethod):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> SequencePoolingMethodOutput:
+        pooling_cursor = pooling_metadata.get_pooling_cursor()
+        assert not pooling_cursor.is_partial_prefill(), (
+            "partial prefill not supported with CLS pooling"
+        )
+
+        return hidden_states[pooling_cursor.first_token_indices_gpu]
+
+
+class LastPool(SequencePoolingMethod):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> SequencePoolingMethodOutput:
+        pooling_cursor = pooling_metadata.get_pooling_cursor()
+        return hidden_states[pooling_cursor.last_token_indices_gpu]
+
+
+class MeanPool(SequencePoolingMethod):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> SequencePoolingMethodOutput:
+        pooling_cursor = pooling_metadata.get_pooling_cursor()
+        assert not pooling_cursor.is_partial_prefill(), (
+            "partial prefill not supported with MEAN pooling"
+        )
+
+        prompt_lens = pooling_cursor.prompt_lens_cpu.to(
+            hidden_states.device, non_blocking=True
+        )
+
+        # Use float32 for torch.cumsum in MeanPool,
+        # otherwise precision will be lost significantly.
+        cumsum = torch.cumsum(hidden_states, dim=0, dtype=torch.float32)
+
+        start_indices = pooling_cursor.first_token_indices_gpu
+        end_indices = pooling_cursor.last_token_indices_gpu
+
+        return (
+            cumsum[end_indices] - cumsum[start_indices] + hidden_states[start_indices]
+        ) / prompt_lens.unsqueeze(1)
+
+
+def get_seq_pooling_method(pooling_type: SequencePoolingType | str):
+    if pooling_type == "CLS":
+        return CLSPool()
+    if pooling_type == "LAST":
+        return LastPool()
+    if pooling_type == "MEAN":
+        return MeanPool()
+
+    raise NotImplementedError(f"Unknown sequence pooling type: {pooling_type!r}")
diff --git a/vllm/model_executor/layers/pooler/seqwise/poolers.py b/vllm/model_executor/layers/pooler/seqwise/poolers.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bf3e25e66b6f90d70247264233a33fa36ec1a67
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/seqwise/poolers.py
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable, Set
+from typing import TypeAlias
+
+import torch
+
+from vllm.config import PoolerConfig, get_current_vllm_config
+from vllm.model_executor.layers.pooler import ClassifierFn, PoolingParamsUpdate
+from vllm.model_executor.layers.pooler.abstract import Pooler
+from vllm.model_executor.layers.pooler.activations import (
+    PoolerActivation,
+    PoolerNormalize,
+    resolve_classifier_act_fn,
+)
+from vllm.model_executor.models.adapters import _load_st_projector
+from vllm.tasks import POOLING_TASKS, PoolingTask
+from vllm.v1.pool.metadata import PoolingMetadata
+
+from .heads import (
+    ClassifierPoolerHead,
+    EmbeddingPoolerHead,
+    SequencePoolerHead,
+    SequencePoolerHeadOutput,
+)
+from .methods import (
+    SequencePoolingMethod,
+    SequencePoolingMethodOutput,
+    get_seq_pooling_method,
+)
+
+SequencePoolingFn: TypeAlias = Callable[
+    [torch.Tensor, PoolingMetadata],
+    SequencePoolingMethodOutput,
+]
+SequencePoolingHeadFn: TypeAlias = Callable[
+    [SequencePoolingMethodOutput, PoolingMetadata],
+    SequencePoolerHeadOutput,
+]
+
+SequencePoolerOutput: TypeAlias = torch.Tensor | list[torch.Tensor]
+
+
+class SequencePooler(Pooler):
+    """
+    A layer that pools specific information from hidden states.
+
+    This layer does the following:
+    1. Extracts specific tokens or aggregates data based on pooling method.
+    2. Postprocesses the output based on pooling head.
+    3. Returns structured results as `PoolerOutput`.
+    """
+
+    def __init__(
+        self,
+        pooling: SequencePoolingMethod | SequencePoolingFn,
+        head: SequencePoolerHead | SequencePoolingHeadFn,
+    ) -> None:
+        super().__init__()
+
+        self.pooling = pooling
+        self.head = head
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        tasks = set(POOLING_TASKS)
+
+        if isinstance(self.pooling, SequencePoolingMethod):
+            tasks &= self.pooling.get_supported_tasks()
+        if isinstance(self.head, SequencePoolerHead):
+            tasks &= self.head.get_supported_tasks()
+
+        return tasks
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        updates = PoolingParamsUpdate()
+
+        if isinstance(self.pooling, SequencePoolingMethod):
+            updates |= self.pooling.get_pooling_updates(task)
+
+        return updates
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> SequencePoolerOutput:
+        pooled_data = self.pooling(hidden_states, pooling_metadata)
+        pooled_data = self.head(pooled_data, pooling_metadata)
+        return pooled_data
+
+
+def pooler_for_embed(pooler_config: PoolerConfig):
+    pooling = get_seq_pooling_method(pooler_config.get_seq_pooling_type())
+
+    vllm_config = get_current_vllm_config()
+    model_config = vllm_config.model_config
+    head = EmbeddingPoolerHead(
+        head_dtype=model_config.head_dtype,
+        projector=_load_st_projector(model_config),
+        activation=PoolerNormalize(),
+    )
+
+    return SequencePooler(pooling=pooling, head=head)
+
+
+def pooler_for_classify(
+    pooler_config: PoolerConfig,
+    *,
+    pooling: SequencePoolingMethod | SequencePoolingFn | None = None,
+    classifier: ClassifierFn | None = None,
+    act_fn: PoolerActivation | str | None = None,
+):
+    if pooling is None:
+        pooling = get_seq_pooling_method(pooler_config.get_seq_pooling_type())
+
+    vllm_config = get_current_vllm_config()
+    model_config = vllm_config.model_config
+    head = ClassifierPoolerHead(
+        head_dtype=model_config.head_dtype,
+        classifier=classifier,
+        logit_bias=model_config.pooler_config.logit_bias,
+        activation=resolve_classifier_act_fn(
+            model_config, static_num_labels=True, act_fn=act_fn
+        ),
+    )
+
+    return SequencePooler(pooling=pooling, head=head)
diff --git a/vllm/model_executor/layers/pooler/special.py b/vllm/model_executor/layers/pooler/special.py
new file mode 100644
index 0000000000000000000000000000000000000000..425f61a98ff3048a05fe2e357ad1ed7a79b4bd5b
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/special.py
@@ -0,0 +1,128 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Mapping, Set
+from itertools import groupby
+
+import torch
+
+from vllm.config import PoolerConfig
+from vllm.model_executor.layers.pooler import PoolingParamsUpdate
+from vllm.tasks import PoolingTask
+from vllm.v1.pool.metadata import PoolingMetadata
+
+from .abstract import Pooler, PoolerOutput
+from .common import ClassifierFn
+from .seqwise import (
+    SequencePoolingFn,
+    SequencePoolingMethod,
+    pooler_for_classify,
+    pooler_for_embed,
+)
+from .tokwise import AllPool, pooler_for_token_classify, pooler_for_token_embed
+
+
+class DispatchPooler(Pooler):
+    """Dispatches calls to a sub-pooler based on the pooling task."""
+
+    @classmethod
+    def for_embedding(cls, pooler_config: PoolerConfig):
+        return cls(
+            {
+                "token_embed": pooler_for_token_embed(pooler_config),
+                "embed": pooler_for_embed(pooler_config),
+            },
+        )
+
+    @classmethod
+    def for_seq_cls(
+        cls,
+        pooler_config: PoolerConfig,
+        *,
+        pooling: SequencePoolingMethod | SequencePoolingFn | None = None,
+        classifier: ClassifierFn | None = None,
+    ):
+        return cls(
+            {
+                "token_classify": pooler_for_token_classify(
+                    pooler_config,
+                    pooling=AllPool(),
+                    classifier=classifier,
+                ),
+                "classify": pooler_for_classify(
+                    pooler_config,
+                    pooling=pooling,
+                    classifier=classifier,
+                    act_fn="classify",
+                ),
+                "score": pooler_for_classify(
+                    pooler_config,
+                    pooling=pooling,
+                    classifier=classifier,
+                    act_fn="score",
+                ),
+            }
+        )
+
+    def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None:
+        super().__init__()
+
+        for task, pooler in poolers_by_task.items():
+            if task not in pooler.get_supported_tasks():
+                raise ValueError(
+                    f"{pooler=} does not support {task=}. "
+                    f"Supported tasks: {pooler.get_supported_tasks()}"
+                )
+
+        self.poolers_by_task = poolers_by_task
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return set(self.poolers_by_task)
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        return self.poolers_by_task[task].get_pooling_updates(task)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        poolers_by_task = self.poolers_by_task
+
+        outputs = list[torch.Tensor | None]()
+        offset = 0
+        for task, group in groupby(pooling_metadata.tasks):
+            if not (pooler := poolers_by_task.get(task)):
+                raise ValueError(
+                    f"Unsupported task: {task!r} "
+                    f"Supported tasks: {self.get_supported_tasks()}"
+                )
+
+            num_items = len(list(group))
+            group_output: PoolerOutput = pooler(
+                hidden_states,
+                pooling_metadata[offset : offset + num_items],
+            )
+
+            outputs.extend(group_output)
+            offset += num_items
+
+        return outputs
+
+    def extra_repr(self) -> str:
+        s = f"supported_task={self.get_supported_tasks()}"
+        return s
+
+
+class IdentityPooler(Pooler):
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"plugin", "score"}
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        return hidden_states
+
+
+__all__ = ["DispatchPooler", "IdentityPooler"]
diff --git a/vllm/model_executor/layers/pooler/tokwise/__init__.py b/vllm/model_executor/layers/pooler/tokwise/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbc610c8556416a89f36ae99b24c50ada9f52333
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/tokwise/__init__.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Poolers that produce an output for each token in the sequence."""
+
+from .heads import (
+    TokenClassifierPoolerHead,
+    TokenEmbeddingPoolerHead,
+    TokenPoolerHead,
+    TokenPoolerHeadOutputItem,
+)
+from .methods import (
+    AllPool,
+    StepPool,
+    TokenPoolingMethod,
+    TokenPoolingMethodOutputItem,
+    get_tok_pooling_method,
+)
+from .poolers import (
+    TokenPooler,
+    TokenPoolerOutput,
+    pooler_for_token_classify,
+    pooler_for_token_embed,
+)
+
+__all__ = [
+    "TokenPoolerHead",
+    "TokenPoolerHeadOutputItem",
+    "TokenClassifierPoolerHead",
+    "TokenEmbeddingPoolerHead",
+    "TokenPoolingMethod",
+    "TokenPoolingMethodOutputItem",
+    "AllPool",
+    "StepPool",
+    "get_tok_pooling_method",
+    "TokenPooler",
+    "TokenPoolerOutput",
+    "pooler_for_token_classify",
+    "pooler_for_token_embed",
+]
diff --git a/vllm/model_executor/layers/pooler/tokwise/heads.py b/vllm/model_executor/layers/pooler/tokwise/heads.py
new file mode 100644
index 0000000000000000000000000000000000000000..4183f5b1ba25c1c4ed788f82ed4682d7bb9cc221
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/tokwise/heads.py
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Set
+from typing import TypeAlias
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.layers.pooler import ActivationFn, ClassifierFn, ProjectorFn
+from vllm.pooling_params import PoolingParams
+from vllm.tasks import PoolingTask
+from vllm.v1.pool.metadata import PoolingMetadata
+
+from .methods import TokenPoolingMethodOutputItem
+
+TokenPoolerHeadOutputItem: TypeAlias = torch.Tensor | None
+
+
+class TokenPoolerHead(nn.Module, ABC):
+    @abstractmethod
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def forward_chunk(
+        self,
+        pooled_data: TokenPoolingMethodOutputItem,
+        pooling_param: PoolingParams,
+    ) -> TokenPoolerHeadOutputItem:
+        raise NotImplementedError
+
+    def forward(
+        self,
+        pooled_data: list[TokenPoolingMethodOutputItem],
+        pooling_metadata: PoolingMetadata,
+    ) -> list[TokenPoolerHeadOutputItem]:
+        pooling_params = pooling_metadata.pooling_params
+        assert len(pooled_data) == len(pooling_params)
+
+        return [self.forward_chunk(d, p) for d, p in zip(pooled_data, pooling_params)]
+
+
+class TokenEmbeddingPoolerHead(TokenPoolerHead):
+    def __init__(
+        self,
+        head_dtype: torch.dtype | str | None = None,
+        projector: ProjectorFn | None = None,
+        activation: ActivationFn | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.head_dtype = head_dtype
+        self.projector = projector
+        self.activation = activation
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_embed"}
+
+    def forward_chunk(
+        self,
+        pooled_data: TokenPoolingMethodOutputItem,
+        pooling_param: PoolingParams,
+    ) -> TokenPoolerHeadOutputItem:
+        # for unfinished chunked prefill
+        if pooled_data is None:
+            return None
+
+        if self.head_dtype is not None:
+            pooled_data = pooled_data.to(self.head_dtype)
+        # pooled_data shape: [n_tokens, hidden_dimension]
+
+        # Apply ST projector
+        if self.projector is not None:
+            pooled_data = self.projector(pooled_data)
+        # pooled_data shape: [n_tokens, embedding_dimension]
+
+        # for matryoshka representation
+        pooled_data = pooled_data[..., : pooling_param.dimensions]
+
+        # for normalize
+        if self.activation is not None and pooling_param.use_activation:
+            pooled_data = self.activation(pooled_data)
+
+        # pooled_data shape: [n_tokens, embedding_dimension]
+        return pooled_data
+
+
+class TokenClassifierPoolerHead(TokenPoolerHead):
+    def __init__(
+        self,
+        classifier: ClassifierFn | None = None,
+        logit_bias: float | None = None,
+        head_dtype: torch.dtype | str | None = None,
+        activation: ActivationFn | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.classifier = classifier
+        self.logit_bias = logit_bias
+        self.head_dtype = head_dtype
+        self.activation = activation
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_classify"}
+
+    def forward_chunk(
+        self,
+        pooled_data: TokenPoolingMethodOutputItem,
+        pooling_param: PoolingParams,
+    ) -> TokenPoolerHeadOutputItem:
+        # for unfinished chunked prefill
+        if pooled_data is None:
+            return None
+
+        if self.head_dtype is not None:
+            pooled_data = pooled_data.to(self.head_dtype)
+        # hidden_states shape: [n_token, hidden_size]
+
+        if self.classifier is not None:
+            scores = self.classifier(pooled_data)
+        else:
+            scores = pooled_data
+        # scores shape: [n_token, num_labels]
+
+        if self.logit_bias is not None:
+            scores -= self.logit_bias
+
+        if self.activation is not None and pooling_param.use_activation:
+            scores = self.activation(scores)
+
+        # scores shape: [n_token, num_labels]
+        return scores
diff --git a/vllm/model_executor/layers/pooler/tokwise/methods.py b/vllm/model_executor/layers/pooler/tokwise/methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..baa9d4075dd8f0da18e14c1c84e4508b152e86e7
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/tokwise/methods.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Set
+from typing import TypeAlias
+
+import torch
+import torch.nn as nn
+
+from vllm.config import get_current_vllm_config
+from vllm.config.pooler import TokenPoolingType
+from vllm.model_executor.layers.pooler import PoolingParamsUpdate
+from vllm.tasks import PoolingTask
+from vllm.v1.pool.metadata import PoolingMetadata
+
+TokenPoolingMethodOutputItem: TypeAlias = torch.Tensor | None
+
+
+class TokenPoolingMethod(nn.Module, ABC):
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_embed", "token_classify"}
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        return PoolingParamsUpdate()
+
+    @abstractmethod
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> list[TokenPoolingMethodOutputItem]:
+        raise NotImplementedError
+
+
+class AllPool(TokenPoolingMethod):
+    def __init__(self):
+        super().__init__()
+
+        vllm_config = get_current_vllm_config()
+        scheduler_config = vllm_config.scheduler_config
+
+        self.enable_chunked_prefill = scheduler_config.enable_chunked_prefill
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> list[TokenPoolingMethodOutputItem]:
+        pooling_cursor = pooling_metadata.get_pooling_cursor()
+        hidden_states_all = hidden_states.split(
+            pooling_cursor.num_scheduled_tokens_cpu.tolist()
+        )
+        hidden_states_lst = [hidden_states_all[i] for i in pooling_cursor.index]
+
+        if not self.enable_chunked_prefill:
+            return hidden_states_lst
+
+        pooling_states = pooling_metadata.pooling_states
+
+        # If chunked_prefill is enabled
+        # 1. first store the chunked hidden_states in pooling_states.hidden_states_cache
+        for p, hs_chunk in zip(pooling_states, hidden_states_lst):
+            p.hidden_states_cache.append(hs_chunk)
+
+        # 2. Once prefill is finished, send hidden_states_cache to PoolerHead
+        output_list = list[TokenPoolingMethodOutputItem]()
+        for p, finished in zip(pooling_states, pooling_cursor.is_finished()):
+            if finished:
+                hidden_states_cache = p.hidden_states_cache
+                if len(hidden_states_cache) == 1:
+                    output_list.append(hidden_states_cache[0])
+                else:
+                    output_list.append(torch.concat(hidden_states_cache, dim=0))
+                p.clean()
+            else:
+                output_list.append(None)
+
+        return output_list
+
+
+class StepPool(AllPool):
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        return PoolingParamsUpdate(requires_token_ids=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> list[TokenPoolingMethodOutputItem]:
+        pooled_data_lst = super().forward(hidden_states, pooling_metadata)
+        prompt_token_ids = pooling_metadata.get_prompt_token_ids()
+        pooling_params = pooling_metadata.pooling_params
+
+        pooled_data = list[torch.Tensor | None]()
+        for data, token_id, pooling_param in zip(
+            pooled_data_lst, prompt_token_ids, pooling_params
+        ):
+            # for unfinished chunked prefill
+            if data is None:
+                pass
+            else:
+                step_tag_id = pooling_param.step_tag_id
+                returned_token_ids = pooling_param.returned_token_ids
+
+                if returned_token_ids is not None and len(returned_token_ids) > 0:
+                    data = data[:, returned_token_ids]
+
+                if step_tag_id is not None:
+                    data = data[token_id == step_tag_id]
+
+            pooled_data.append(data)
+
+        return pooled_data
+
+
+def get_tok_pooling_method(pooling_type: TokenPoolingType | str):
+    if pooling_type == "ALL":
+        return AllPool()
+    if pooling_type == "STEP":
+        return StepPool()
+
+    raise NotImplementedError(f"Unknown tokenwise pooling type: {pooling_type!r}")
diff --git a/vllm/model_executor/layers/pooler/tokwise/poolers.py b/vllm/model_executor/layers/pooler/tokwise/poolers.py
new file mode 100644
index 0000000000000000000000000000000000000000..20790eff6aecc310735d395335f8d2fb47555394
--- /dev/null
+++ b/vllm/model_executor/layers/pooler/tokwise/poolers.py
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable, Set
+from typing import TypeAlias
+
+import torch
+
+from vllm.config import PoolerConfig, get_current_vllm_config
+from vllm.model_executor.layers.pooler import ClassifierFn, PoolingParamsUpdate
+from vllm.model_executor.layers.pooler.abstract import Pooler
+from vllm.model_executor.layers.pooler.activations import (
+    PoolerActivation,
+    PoolerNormalize,
+    resolve_classifier_act_fn,
+)
+from vllm.model_executor.models.adapters import _load_st_projector
+from vllm.tasks import POOLING_TASKS, PoolingTask
+from vllm.v1.pool.metadata import PoolingMetadata
+
+from .heads import (
+    TokenClassifierPoolerHead,
+    TokenEmbeddingPoolerHead,
+    TokenPoolerHead,
+    TokenPoolerHeadOutputItem,
+)
+from .methods import (
+    TokenPoolingMethod,
+    TokenPoolingMethodOutputItem,
+    get_tok_pooling_method,
+)
+
+TokenPoolingFn: TypeAlias = Callable[
+    [torch.Tensor, PoolingMetadata],
+    list[TokenPoolingMethodOutputItem],
+]
+TokenPoolingHeadFn: TypeAlias = Callable[
+    [list[TokenPoolingMethodOutputItem], PoolingMetadata],
+    list[TokenPoolerHeadOutputItem],
+]
+
+TokenPoolerOutput: TypeAlias = list[torch.Tensor | None]
+
+
+class TokenPooler(Pooler):
+    """
+    A layer that pools specific information from hidden states.
+
+    This layer does the following:
+    1. Extracts specific tokens or aggregates data based on pooling method.
+    2. Postprocesses the output based on pooling head.
+    3. Returns structured results as `PoolerOutput`.
+    """
+
+    def __init__(
+        self,
+        pooling: TokenPoolingMethod | TokenPoolingFn,
+        head: TokenPoolerHead | TokenPoolingHeadFn,
+    ) -> None:
+        super().__init__()
+
+        self.pooling = pooling
+        self.head = head
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        tasks = set(POOLING_TASKS)
+
+        if isinstance(self.pooling, TokenPoolingMethod):
+            tasks &= self.pooling.get_supported_tasks()
+        if isinstance(self.head, TokenPoolerHead):
+            tasks &= self.head.get_supported_tasks()
+
+        return tasks
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        updates = PoolingParamsUpdate()
+
+        if isinstance(self.pooling, TokenPoolingMethod):
+            updates |= self.pooling.get_pooling_updates(task)
+
+        return updates
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> TokenPoolerOutput:
+        pooled_data = self.pooling(hidden_states, pooling_metadata)
+        pooled_data = self.head(pooled_data, pooling_metadata)
+        return pooled_data
+
+
+def pooler_for_token_embed(pooler_config: PoolerConfig):
+    pooling = get_tok_pooling_method(pooler_config.get_tok_pooling_type())
+
+    vllm_config = get_current_vllm_config()
+    model_config = vllm_config.model_config
+    head = TokenEmbeddingPoolerHead(
+        head_dtype=model_config.head_dtype,
+        projector=_load_st_projector(model_config),
+        activation=PoolerNormalize(),
+    )
+
+    return TokenPooler(pooling=pooling, head=head)
+
+
+def pooler_for_token_classify(
+    pooler_config: PoolerConfig,
+    *,
+    pooling: TokenPoolingMethod | TokenPoolingFn | None = None,
+    classifier: ClassifierFn | None = None,
+    act_fn: PoolerActivation | str | None = None,
+):
+    if pooling is None:
+        pooling = get_tok_pooling_method(pooler_config.get_tok_pooling_type())
+
+    vllm_config = get_current_vllm_config()
+    model_config = vllm_config.model_config
+    head = TokenClassifierPoolerHead(
+        head_dtype=model_config.head_dtype,
+        classifier=classifier,
+        logit_bias=model_config.pooler_config.logit_bias,
+        activation=resolve_classifier_act_fn(
+            model_config, static_num_labels=False, act_fn=act_fn
+        ),
+    )
+
+    return TokenPooler(pooling=pooling, head=head)
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 95821ad12bd030620e87a780bdaab03c2c8b5a89..9efffe7da553b8472487a2f82dd9f4c1bf766297 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -5,13 +5,13 @@ from typing import Literal, get_args
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
 QuantizationMethods = Literal[
     "awq",
     "deepspeedfp",
-    "tpu_int8",
     "fp8",
     "ptpc_fp8",
     "fbgemm_fp8",
@@ -38,7 +38,6 @@ QuantizationMethods = Literal[
     "inc",
     "mxfp4",
     "petit_nvfp4",
-    "cpu_gptq",
     "cpu_awq",
     "blockwise_int8",
     "slimquant_w4a8",
@@ -47,6 +46,23 @@ QuantizationMethods = Literal[
 ]
 QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
 
+DEPRECATED_QUANTIZATION_METHODS = [
+    "deepspeedfp",
+    "tpu_int8",
+    "ptpc_fp8",
+    "fbgemm_fp8",
+    "fp_quant",
+    "bitblas",
+    "gptq_marlin_24",
+    "gptq_bitblas",
+    "hqq",
+    "experts_int8",
+    "ipex",
+    "auto-round",
+    "rtn",
+    "petit_nvfp4",
+]
+
 # The customized quantization methods which will be added to this dict.
 _CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}
 
@@ -87,6 +103,9 @@ def register_quantization_config(quantization: str):
             )
         else:
             QUANTIZATION_METHODS.append(quantization)
+            # Automatically assume the custom quantization config is supported
+            if sq := current_platform.supported_quantization:
+                sq.append(quantization)
 
         if not issubclass(quant_config_cls, QuantizationConfig):
             raise ValueError(
@@ -115,7 +134,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     )
     from .compressed_tensors.compressed_tensors_marlin import (
         SlimQuantCompressedTensorsMarlinConfig)
-    from .cpu_wna16 import CPUAWQConfig, CPUGPTQConfig
+    from .cpu_wna16 import CPUAWQConfig
     from .deepspeedfp import DeepSpeedFPConfig
     from .experts_int8 import ExpertsInt8Config
     from .fbgemm_fp8 import FBGEMMFp8Config
@@ -136,7 +155,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .ptpc_fp8 import PTPCFp8Config
     from .rtn import RTNConfig
     from .torchao import TorchAOConfig
-    from .tpu_int8 import Int8TpuConfig
     from .blockwise_int8 import BlockInt8Config
     from .slimquant_w4a8 import SlimQuantW4A8Int8Config
     from .slimquant_w4a8_marlin import SlimQuantW4A8Int8MarlinConfig
@@ -144,7 +162,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     method_to_config: dict[str, type[QuantizationConfig]] = {
         "awq": AWQConfig,
         "deepspeedfp": DeepSpeedFPConfig,
-        "tpu_int8": Int8TpuConfig,
         "fp8": Fp8Config,
         "fbgemm_fp8": FBGEMMFp8Config,
         "fp_quant": FPQuantConfig,
@@ -171,7 +188,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "inc": INCConfig,
         "mxfp4": Mxfp4Config,
         "petit_nvfp4": PetitNvFp4Config,
-        "cpu_gptq": CPUGPTQConfig,
         "cpu_awq": CPUAWQConfig,
         "blockwise_int8": BlockInt8Config,
         "slimquant_w4a8":SlimQuantW4A8Int8Config,
diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
index 95e4382c89d7a311ed78954eb0564b81328d4b7e..5d77d1e3c7dd815bd2fd6e9da81ea684d4e24f3b 100644
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -56,22 +56,22 @@ class AutoRoundConfig(QuantizationConfig):
         if weight_bits not in self.SUPPORTED_BITS:
             raise ValueError(
                 f"Unsupported weight_bits: {weight_bits}, "
-                f"currently only support  {self.SUPPORTED_BITS}"
+                f"currently only support {self.SUPPORTED_BITS}."
             )
         if data_type not in self.SUPPORTED_DTYPES:
             raise ValueError(
-                f"Unsupported data_type: {data_type},"
-                f" currently only support  {self.SUPPORTED_DTYPES}"
+                f"Unsupported data_type: {data_type}, "
+                f"currently only support {self.SUPPORTED_DTYPES}."
             )
         if packing_format not in self.SUPPORTED_FORMATS:
             raise ValueError(
                 f"Unsupported packing_format: {packing_format}, "
-                f"currently only support  {self.SUPPORTED_FORMATS}"
+                f"currently only support {self.SUPPORTED_FORMATS}."
             )
         if backend not in self.SUPPORTED_BACKENDS:
             raise ValueError(
-                f"Unsupported backend: {backend},  "
-                f"currently only support  {self.SUPPORTED_BACKENDS}"
+                f"Unsupported backend: {backend}, "
+                f"currently only support {self.SUPPORTED_BACKENDS}."
             )
 
         self.weight_bits = weight_bits
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 2f479819b2410f463b7392ae9f41a2f6b7238ac4..49ec90fdd3215e74b2881610e1fceea8bff9b3aa 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoEMethodBase,
@@ -121,7 +122,7 @@ class AWQMarlinConfig(QuantizationConfig):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def get_config_filenames(cls) -> list[str]:
@@ -778,6 +779,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         use_nn_moe: bool | None = False,
@@ -785,7 +787,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert layer.activation == "silu", "Only SiLU activation is supported."
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
             use_fused_gate=use_fused_gate,
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 1fd959cb3423d975d92531c3766d3cb59b9dcf62..1d2334f3933a16dbd2941327e1073da0cf525995 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -10,7 +10,11 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE,
+    FusedMoEMethodBase,
+)
 from vllm.model_executor.layers.linear import (
     LinearBase,
     LinearMethodBase,
@@ -495,12 +499,13 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 3845da9fa8aab27074ba0e62787fb33749eb2856..bb7721e143036a8c67076011b41b7d65b0656746 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -43,6 +43,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsW4A8Fp8,
     CompressedTensorsW4A8Int,
     CompressedTensorsW4A16Fp4,
+    CompressedTensorsW4A16Mxfp4,
     CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8,
     CompressedTensorsW8A8Int8,
@@ -334,25 +335,17 @@ class CompressedTensorsConfig(QuantizationConfig):
             return False
 
     @staticmethod
-    def _is_fp4a4_nvfp4(weight_quant: QuantizationArgs, input_quant: QuantizationArgs):
-        if weight_quant is None or input_quant is None:
+    def _is_nvfp4_format(quant_args: QuantizationArgs):
+        if quant_args is None:
             return False
-
         is_tensor_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
-            and input_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
-        )
-        is_symmetric = weight_quant.symmetric and input_quant.symmetric
-
-        is_group_size_16 = (
-            weight_quant.group_size == 16 and input_quant.group_size == 16
-        )
-        is_float_type = (
-            weight_quant.type == QuantizationType.FLOAT
-            and input_quant.type == QuantizationType.FLOAT
+            quant_args.strategy == QuantizationStrategy.TENSOR_GROUP.value
         )
-        is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4
+        is_symmetric = quant_args.symmetric
 
+        is_group_size_16 = quant_args.group_size == 16
+        is_float_type = quant_args.type == QuantizationType.FLOAT
+        is_4_bits = quant_args.num_bits == 4
         return (
             is_tensor_group_quant
             and is_float_type
@@ -362,23 +355,21 @@ class CompressedTensorsConfig(QuantizationConfig):
         )
 
     @staticmethod
-    def _is_fp4a16_nvfp4(weight_quant: QuantizationArgs, input_quant: QuantizationArgs):
-        is_weight_only = weight_quant is not None and input_quant is None
-        is_tensor_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
-        )
-        is_symmetric = weight_quant.symmetric
+    def _is_mxfp4(quant_args: QuantizationArgs) -> bool:
+        if quant_args is None:
+            return False
 
-        is_group_size_16 = weight_quant.group_size == 16
-        is_float_type = weight_quant.type == QuantizationType.FLOAT
-        is_4_bits = weight_quant.num_bits == 4
+        is_group_quant = quant_args.strategy == QuantizationStrategy.GROUP.value
+        is_symmetric = quant_args.symmetric
+        is_group_size_32 = quant_args.group_size == 32
+        is_float_type = quant_args.type == QuantizationType.FLOAT
+        is_4_bits = quant_args.num_bits == 4
 
         return (
-            is_weight_only
-            and is_tensor_group_quant
+            is_group_quant
             and is_float_type
             and is_4_bits
-            and is_group_size_16
+            and is_group_size_32
             and is_symmetric
         )
 
@@ -579,9 +570,12 @@ class CompressedTensorsConfig(QuantizationConfig):
         format = format if format is not None else self.quant_format
 
         # Detect If Mixed Precision
-        if self._is_fp4a16_nvfp4(weight_quant, input_quant):
+        if self._is_nvfp4_format(weight_quant) and input_quant is None:
             return CompressedTensorsW4A16Fp4()
 
+        if self._is_mxfp4(weight_quant):
+            return CompressedTensorsW4A16Mxfp4()
+
         if self._is_fp8_w4a8_sm90(weight_quant, input_quant):
             return CompressedTensorsW4A8Fp8(
                 num_bits=weight_quant.num_bits,
@@ -617,7 +611,9 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         act_quant_format = is_activation_quantization_format(format)
         if act_quant_format:
-            if self._is_fp4a4_nvfp4(weight_quant, input_quant):
+            if self._is_nvfp4_format(weight_quant) and self._is_nvfp4_format(
+                input_quant
+            ):
                 if cutlass_fp4_supported() or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
                     return CompressedTensorsW4A4Fp4()
                 else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 76b8c27f8514aab9cb2e8306688db64934b31352..e3fc0a2957998998c865e32d38ed681b98bce216 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -11,12 +11,9 @@ from compressed_tensors.quantization import (
     QuantizationArgs,
     QuantizationStrategy,
 )
-from torch.nn.parameter import Parameter
 
-import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
-from vllm._aiter_ops import rocm_aiter_ops
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (
@@ -31,40 +28,47 @@ from vllm.model_executor.layers.fused_moe import (
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     fp8_w8a8_moe_quant_config,
+    fp8_w8a16_moe_quant_config,
     int4_w4a16_moe_quant_config,
     int4_w4afp8_moe_quant_config,
     int8_w8a8_moe_quant_config,
     int8_w8a16_moe_quant_config,
-    nvfp4_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.cpu_fused_moe import select_experts
-from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    is_valid_flashinfer_cutlass_fused_moe,
-)
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
     BatchedMarlinExperts,
     MarlinExperts,
     fused_marlin_moe,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe.oracle.fp8 import (
+    Fp8MoeBackend,
+    convert_to_fp8_moe_kernel_format,
+    make_fp8_moe_kernel,
+    select_fp8_moe_backend,
+)
+from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import (
+    FLASHINFER_NVFP4_MOE_BACKENDS,
+    NvFp4MoeBackend,
+    convert_to_nvfp4_moe_kernel_format,
+    is_global_sf_supported_for_nvfp4_backend,
+    make_nvfp4_moe_kernel,
+    make_nvfp4_moe_quant_config,
+    select_nvfp4_moe_backend,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
     WNA16_SUPPORTED_BITS,
     WNA16_SUPPORTED_TYPES_MAP,
 )
-from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
     build_flashinfer_fp4_cutlass_moe_prepare_finalize,
     flashinfer_trtllm_fp4_moe,
-    prepare_static_weights_for_trtllm_fp4_moe,
-    reorder_w1w3_to_w3w1,
+    flashinfer_trtllm_fp4_routed_moe,
     select_nvfp4_gemm_impl,
 )
-from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    FlashinferMoeBackend,
-    get_flashinfer_moe_backend,
-)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    expert_weight_is_col_major,
-    requant_weight_ue8m0_inplace,
+    process_fp8_input_tensor_strategy_moe,
+    process_fp8_weight_tensor_strategy_moe,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     check_moe_marlin_supports_layer,
@@ -74,32 +78,18 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     marlin_moe_permute_scales,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    prepare_moe_fp4_layer_for_marlin,
-)
-from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    prepare_moe_fp8_layer_for_marlin,
+    is_fp4_marlin_supported,
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     convert_bf16_scales_to_fp8,
     convert_packed_uint4b8_to_signed_int4_inplace,
-    swizzle_blockscale,
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    all_close_1d,
     normalize_e4m3fn_to_e4m3fnuz,
-    per_tensor_dequantize,
 )
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.utils import replace_parameter, set_weight_attrs
 from vllm.platforms import CpuArchEnum, current_platform
-from vllm.scalar_type import scalar_types
-from vllm.utils.deep_gemm import (
-    get_col_major_tma_aligned_tensor,
-    get_mk_alignment_for_contiguous_layout,
-    is_deep_gemm_e8m0_used,
-)
-
 from vllm.utils import W8a8GetCacheJSON
-from vllm.utils.import_utils import has_deep_gemm
 
 logger = init_logger(__name__)
 
@@ -196,8 +186,18 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
                 return CompressedTensorsWNA16MarlinMoEMethod(
                     weight_quant, input_quant, layer.moe_config
                 )
-        elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
-            return CompressedTensorsW4A4Nvfp4MoEMethod(layer.moe_config, layer_name)
+        elif quant_config._is_nvfp4_format(weight_quant):
+            _is_valid_nvfp4_activations = (
+                quant_config._is_nvfp4_format(input_quant) or input_quant is None
+            )
+            if not _is_valid_nvfp4_activations:
+                raise ValueError(
+                    "For NVFP4 weights, input quantization must also be NVFP4 format ",
+                    f"or None for NVFP4A16, found {input_quant}",
+                )
+            return CompressedTensorsW4A4Nvfp4MoEMethod(
+                layer.moe_config, layer_name, use_marlin=input_quant is None
+            )
         elif (
             quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
             or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
@@ -226,32 +226,34 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
 
 
 class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
-    def __init__(self, moe: FusedMoEConfig, layer_name: str | None = None):
-        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (  # noqa: E501
-            detect_nvfp4_moe_support,
-        )
+    def __init__(
+        self,
+        moe: FusedMoEConfig,
+        layer_name: str | None = None,
+        use_marlin: bool = False,
+    ):
+        if not moe.is_act_and_mul:
+            raise ValueError(
+                "CompressedTensorsW4A4Nvfp4MoEMethod does not yet "
+                "support non gated MoE models."
+            )
 
         super().__init__(moe)
-        _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
-        self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
-        self.allow_flashinfer = _nvfp4.allow_flashinfer
-        self.use_marlin = _nvfp4.use_marlin
         self.group_size = 16
-        self.layer_name = layer_name
-        self.marlin_input_dtype = (
-            get_marlin_input_dtype(layer_name) if self.use_marlin else None
-        )
-        self.flashinfer_moe_backend = None
-        if self.allow_flashinfer:
-            self.flashinfer_moe_backend = get_flashinfer_moe_backend()
-            logger.info_once(
-                f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
-                " for CompressedTensorsW4A4Nvfp4MoEMethod."
-            )
-        elif self.use_marlin:
-            logger.info_once("Using Marlin for CompressedTensorsW4A4Nvfp4MoEMethod.")
+        if use_marlin:
+            if is_fp4_marlin_supported():
+                self.nvfp4_backend = NvFp4MoeBackend.MARLIN
+            else:
+                raise ValueError(
+                    "Marlin FP4 MoE kernel requested but not ",
+                    "supported on current platform.",
+                )
         else:
-            logger.info_once("Using Cutlass for CompressedTensorsW4A4Nvfp4MoEMethod.")
+            self.nvfp4_backend = select_nvfp4_moe_backend()
+        self.use_global_sf = is_global_sf_supported_for_nvfp4_backend(
+            self.nvfp4_backend
+        )
+        self.kernel: mk.FusedMoEModularKernel | None = None
 
     def create_weights(
         self,
@@ -364,7 +366,13 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
         set_weight_attrs(w2_input_scale, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        # From packed to weight
+        """
+        Convert NVFP4 MoE weights into kernel format and setup the kernel.
+        """
+        # NOTE(rob): wN_weight_packed -> wN_weight is because ModularKernelMethod
+        # requires this naming convention. However, the name change breaks
+        # reloading because the state dict no longer matches disk. Once we
+        # remove MKM, we should revert this change to ensure compatibility.
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight_packed.data, requires_grad=False
         )
@@ -375,152 +383,79 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
         )
         delattr(layer, "w2_weight_packed")
 
-        # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
-        if self.allow_flashinfer:
-            w, s = reorder_w1w3_to_w3w1(
-                layer.w13_weight.data, layer.w13_weight_scale.data, dim=-2
-            )
-            layer.w13_weight = torch.nn.Parameter(w, requires_grad=False)
-            layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False)
-
-        if not torch.allclose(
+        # Use a single gscale for w13.
+        if self.moe.is_act_and_mul and not torch.allclose(
             layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1]
         ):
             logger.warning_once(
                 "w1_weight_global_scale must match w3_weight_global_scale. "
-                "Accuracy may be affected."
-            )
-
-        # Take inverse of global scale saved to disk
-        layer.w13_weight_scale_2 = torch.nn.Parameter(
-            1 / layer.w13_weight_global_scale[:, 0], requires_grad=False
-        )
-
-        layer.w2_weight_scale_2 = torch.nn.Parameter(
-            1 / layer.w2_weight_global_scale.data, requires_grad=False
-        )
-
-        if self.use_marlin:
-            prepare_moe_fp4_layer_for_marlin(layer, input_dtype=self.marlin_input_dtype)
-            return
-        # w13
-        if (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
-            w13_input_global_scale = (
-                layer.w13_input_global_scale.min()
-                .to(torch.float32)
-                .expand(layer.num_experts)
-            )
-        else:
-            w13_input_global_scale = layer.w13_input_global_scale.min(dim=1).values.to(
-                torch.float32
-            )
-        layer.g1_alphas = torch.nn.Parameter(
-            ((1 / w13_input_global_scale) * layer.w13_weight_scale_2),
-            requires_grad=False,
-        )
-
-        layer.w13_input_scale_quant = torch.nn.Parameter(
-            (w13_input_global_scale), requires_grad=False
-        )
-
-        # w2
-        if (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
-            w2_input_global_scale = (
-                layer.w2_input_global_scale.min()
-                .to(torch.float32)
-                .expand(layer.num_experts)
-            )
-        else:
-            w2_input_global_scale = layer.w2_input_global_scale
-
-        layer.g2_alphas = torch.nn.Parameter(
-            ((1 / w2_input_global_scale) * layer.w2_weight_scale_2).to(torch.float32),
-            requires_grad=False,
-        )
-
-        layer.w2_input_scale_quant = torch.nn.Parameter(
-            (w2_input_global_scale), requires_grad=False
-        )
-
-        # TensorRT-LLM specific processing
-        if (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
-            # Prepare static weights for TRT-LLM kernel
-            # alternate: prepare_static_weight_layouts_for_trtllm_moe
-            (
-                gemm1_weights_fp4_shuffled,
-                gemm1_scales_fp4_shuffled,
-                gemm2_weights_fp4_shuffled,
-                gemm2_scales_fp4_shuffled,
-            ) = prepare_static_weights_for_trtllm_fp4_moe(
-                layer.w13_weight,
-                layer.w2_weight,
-                layer.w13_weight_scale,
-                layer.w2_weight_scale,
-                layer.w2_weight.size(-2),  # hidden_size
-                layer.w13_weight.size(-2) // 2,  # intermediate_size
-                layer.w13_weight.size(0),  # num_experts
-            )
-            logger.debug_once("Finished shuffling weights for TRT-LLM MOE")
-
-            layer.gemm1_weights_fp4_shuffled = Parameter(
-                gemm1_weights_fp4_shuffled, requires_grad=False
-            )
-            layer.gemm2_weights_fp4_shuffled = Parameter(
-                gemm2_weights_fp4_shuffled, requires_grad=False
-            )
-            layer.gemm1_scales_fp4_shuffled = Parameter(
-                gemm1_scales_fp4_shuffled, requires_grad=False
-            )
-            layer.gemm2_scales_fp4_shuffled = Parameter(
-                gemm2_scales_fp4_shuffled, requires_grad=False
-            )
-
-            # Additional parameter needed for TRT-LLM
-            layer.g1_scale_c = Parameter(
-                (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32),
-                requires_grad=False,
-            )
-
-            # Clean up weights that won't be used by TRT-LLM
-            del layer.w2_weight
-            del layer.w2_weight_scale
-            del layer.w13_weight
-            del layer.w13_weight_scale
-        else:
-            # swizzle weight scales
-            layer.w13_weight_scale = torch.nn.Parameter(
-                swizzle_blockscale(layer.w13_weight_scale), requires_grad=False
-            )
-
-            layer.w2_weight_scale = torch.nn.Parameter(
-                swizzle_blockscale(layer.w2_weight_scale), requires_grad=False
+                "Accuracy may be affected.",
+            )
+        w13_weight_global_scale = layer.w13_weight_global_scale[:, 0].contiguous()
+
+        # Shuffle weights into the NvFp4 kernel format.
+        (
+            w13,
+            w13_scale,
+            w13_scale_2,
+            a13_scale,
+            w2,
+            w2_scale,
+            w2_scale_2,
+            a2_scale,
+        ) = convert_to_nvfp4_moe_kernel_format(
+            nvfp4_backend=self.nvfp4_backend,
+            layer=layer,
+            w13=layer.w13_weight,
+            w13_scale=layer.w13_weight_scale,
+            w13_scale_2=(1.0 / w13_weight_global_scale),
+            a13_scale=(1.0 / layer.w13_input_global_scale),
+            w2=layer.w2_weight,
+            w2_scale=layer.w2_weight_scale,
+            w2_scale_2=(1.0 / layer.w2_weight_global_scale),
+            a2_scale=(1.0 / layer.w2_input_global_scale),
+            is_act_and_mul=self.moe.is_act_and_mul,
+        )
+
+        replace_parameter(layer, "w13_weight", w13)
+        replace_parameter(layer, "w13_weight_scale", w13_scale)
+        replace_parameter(layer, "w2_weight", w2)
+        replace_parameter(layer, "w2_weight_scale", w2_scale)
+        layer.w13_weight_scale_2 = w13_scale_2
+        layer.w2_weight_scale_2 = w2_scale_2
+        layer.w13_input_scale = a13_scale
+        layer.w2_input_scale = a2_scale
+
+        # Initialize the kernel that will be called in apply().
+        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+        use_dp = self.moe.dp_size > 1
+        if self.moe_quant_config is not None and not use_dp:
+            self.kernel = make_nvfp4_moe_kernel(
+                backend=self.nvfp4_backend,
+                quant_config=self.moe_quant_config,
+                moe_config=self.moe,
             )
 
     def maybe_make_prepare_finalize(
         self,
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> mk.FusedMoEPrepareAndFinalize | None:
-        if self.use_marlin or (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
+        UNSUPPORTED = [NvFp4MoeBackend.MARLIN, NvFp4MoeBackend.FLASHINFER_TRTLLM]
+        if self.nvfp4_backend in UNSUPPORTED:
             return None
-        elif not self.allow_flashinfer:
+        elif self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_CUTLASS:
+            # TP case: avoid convert to ModularKernelMethod - to be refactored.
+            if self.moe.dp_size == 1:
+                return None
+            # For now, fp4 moe only works with the flashinfer dispatcher.
+            prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(
+                self.moe
+            )
+            logger.debug_once("%s", prepare_finalize.__class__.__name__)
+            return prepare_finalize
+        else:
             return super().maybe_make_prepare_finalize(routing_tables)
 
-        prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe)
-        logger.debug_once("%s", prepare_finalize.__class__.__name__)
-        return prepare_finalize
-
     def select_gemm_impl(
         self,
         prepare_finalize: mk.FusedMoEPrepareAndFinalize,
@@ -531,7 +466,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
         experts = select_nvfp4_gemm_impl(
             self.moe,
             self.moe_quant_config,
-            allow_flashinfer=self.allow_flashinfer,
+            allow_flashinfer=(self.nvfp4_backend in FLASHINFER_NVFP4_MOE_BACKENDS),
         )
         logger.debug_once("Using %s", experts.__class__.__name__)
         return experts
@@ -539,38 +474,29 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        if (
-            self.use_marlin
-            or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
-            return None
-
-        return nvfp4_moe_quant_config(
-            g1_alphas=layer.g1_alphas,
-            g2_alphas=layer.g2_alphas,
-            a1_gscale=layer.w13_input_scale_quant,
-            a2_gscale=layer.w2_input_scale_quant,
-            w1_scale=layer.w13_weight_scale,
+        return make_nvfp4_moe_quant_config(
+            backend=self.nvfp4_backend,
+            w13_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
+            w13_scale_2=layer.w13_weight_scale_2,
+            w2_scale_2=layer.w2_weight_scale_2,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
         )
 
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert layer.activation == "silu", "Only SiLU activation is supported."
 
         if (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
+            self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_TRTLLM
+            and not layer.enable_eplb
         ):
-            if layer.enable_eplb:
-                raise NotImplementedError(
-                    "EPLB not supported for `CompressedTensorsW4A4MoEMethod` yet."
-                )
-
             return flashinfer_trtllm_fp4_moe(
                 layer=layer,
                 x=x,
@@ -583,84 +509,41 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 e_score_correction_bias=layer.e_score_correction_bias,
             )
 
-        topk_weights, topk_ids, _ = layer.select_experts(
-            hidden_states=x,
+        # Hidden_states in select_experts is only used to extract metadata
+        if isinstance(x, tuple):
+            x_routing, _ = x
+        else:
+            x_routing = x
+        topk_weights, topk_ids = router.select_experts(
+            hidden_states=x_routing,
             router_logits=router_logits,
         )
 
-        if self.use_marlin:
-            return fused_marlin_moe(
+        # EPLB path
+        if self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_TRTLLM:
+            assert layer.enable_eplb
+            return flashinfer_trtllm_fp4_routed_moe(
+                layer=layer,
+                x=x,
+                topk_ids=topk_ids,
+                topk_weights=topk_weights,
+                top_k=layer.top_k,
+                global_num_experts=layer.global_num_experts,
+            )
+        else:
+            assert self.kernel is not None
+            return self.kernel(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
-                None,
-                None,
-                layer.w13_weight_scale,
-                layer.w2_weight_scale,
-                router_logits,
                 topk_weights,
                 topk_ids,
-                global_scale1=layer.w13_weight_scale_2,
-                global_scale2=layer.w2_weight_scale_2,
-                quant_type_id=scalar_types.float4_e2m1f.id,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                input_dtype=self.marlin_input_dtype,
-                workspace=layer.workspace,
-            )
-
-        # FlashInfer fused experts path
-        elif self.allow_flashinfer:
-            from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
-                flashinfer_cutlass_moe_fp4,
-            )
-
-            assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight
-            ), "Flashinfer CUTLASS Fused MoE not applicable!"
-
-            assert self.moe_quant_config is not None
-
-            return flashinfer_cutlass_moe_fp4(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                quant_config=self.moe_quant_config,
-                inplace=False,  # TODO(shuw): fix later, now output is high prec
+                inplace=False,
                 activation=layer.activation,
                 global_num_experts=layer.global_num_experts,
                 expert_map=layer.expert_map,
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
-        else:
-            from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
-
-            assert layer.expert_map is None, (
-                "Expert Parallelism / expert_map "
-                "is currently not supported for "
-                "CompressedTensorsW4A4Nvfp4MoEMethod."
-            )
-            assert self.moe_quant_config is not None
-
-            # Cutlass moe takes in activations in BF16/Half precision
-            # and fp4 quantized weights loaded from the checkpoint
-            return cutlass_moe_fp4(
-                a=x,
-                w1_fp4=layer.w13_weight,
-                w2_fp4=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                quant_config=self.moe_quant_config,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                # TODO(bnell): derive these from arguments
-                m=x.shape[0],
-                n=layer.w2_weight.shape[2] * 2,
-                k=x.shape[1],
-                e=layer.w13_weight.shape[0],
-            ).to(x.dtype)
 
 
 class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
@@ -671,10 +554,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         moe: FusedMoEConfig,
         layer_name: str | None = None,
     ):
-        from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-            CompressedTensorsConfig,
-        )
-
         super().__init__(moe)
         self.weight_quant = weight_quant
         self.input_quant = input_quant
@@ -701,36 +580,31 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 "For FP8 Fused MoE layer, we require either per tensor or "
                 "channelwise, dynamic per token quantization."
             )
-
-        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
-        # kernel for fast weight-only FP8 quantization
-        self.use_marlin = (
-            not current_platform.has_device_capability(89)
-            or envs.VLLM_TEST_FORCE_FP8_MARLIN
-            and not self.block_quant
+        self.fp8_backend = select_fp8_moe_backend(
+            block_quant=self.block_quant,
+            tp_size=moe.tp_size,
+            with_lora_support=moe.is_lora_enabled,
+            # TODO(rob): enable selecting this externally.
+            allow_vllm_cutlass=True,
         )
-        # Disable marlin for rocm
-        if current_platform.is_rocm():
-            self.use_marlin = False
-
-        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
-        self.tritonsingleton= W8a8GetCacheJSON()
-
-        # cutlass path
-        self.is_fp8_w8a8_sm100 = CompressedTensorsConfig._is_fp8_w8a8_sm100(
-            self.weight_quant, self.input_quant
-        )
-        self.use_cutlass = not self.block_quant and (
-            CompressedTensorsConfig._is_fp8_w8a8_sm90(
-                self.weight_quant, self.input_quant
+        if self.fp8_backend != Fp8MoeBackend.MARLIN:
+            per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
+            per_channel_quant = (
+                self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+            )
+            if per_act_token != per_channel_quant:
+                raise NotImplementedError(
+                    "For FP8 Fused MoE layers, per-token and per-channel must be "
+                    "used together."
+                )
+        # TODO(rob): hook this up in a follow up PR.
+        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
+            raise NotImplementedError(
+                "FlashInfer TRTLLM backend not supported for compressed-tensors yet."
             )
-            or self.is_fp8_w8a8_sm100
-        )
         self.disable_expert_map = False
-        self.layer_name = layer_name
-        self.marlin_input_dtype = (
-            get_marlin_input_dtype(layer_name) if self.use_marlin else None
-        )
+
+        self.kernel: mk.FusedMoEModularKernel | None = None
 
     def create_weights(
         self,
@@ -907,150 +781,75 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         pass   
     
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        # Fp8 moe kernels require a single activation scale.
-        # We take the max of all the scales in case they differ.
-        if self.static_input_scales:
-            assert self.input_quant.strategy == QuantizationStrategy.TENSOR
-            if layer.w13_input_scale is None or layer.w2_input_scale is None:
-                raise ValueError(
-                    "QuantConfig has static quantization, but found "
-                    "activation scales are None."
-                )
-            if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
-                layer.w2_input_scale
-            ):
-                logger.warning_once(
-                    "Found input_scales that are not equal for "
-                    "fp8 MoE layer. Using the maximum across experts "
-                    "for each layer."
-                )
-            layer.w13_input_scale = torch.nn.Parameter(
-                layer.w13_input_scale.max(), requires_grad=False
-            )
-            layer.w2_input_scale = torch.nn.Parameter(
-                layer.w2_input_scale.max(), requires_grad=False
-            )
-
+        # Allow for accessing weights and scales in standard way.
+        w13 = layer.w13_weight
+        w2 = layer.w2_weight
+        w13_scale = layer.w13_weight_scale
+        w2_scale = layer.w2_weight_scale
+        w13_input_scale = layer.w13_input_scale
+        w2_input_scale = layer.w2_input_scale
+
+        # MI300x and MI325x use FNUZ format for FP8. Convert if needed.
         if current_platform.is_fp8_fnuz():
-            # Normalize the weights and scales
-            w13_weight, w13_weight_scale, w13_input_scale = (
-                normalize_e4m3fn_to_e4m3fnuz(
-                    layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
-                )
-            )
-            w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
-                layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+            w13, w13_scale, w13_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                w13, w13_scale, w13_input_scale
             )
-            # Reset the parameter
-            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
-            layer.w13_weight_scale = torch.nn.Parameter(
-                w13_weight_scale, requires_grad=False
+            w2, w2_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                w2, w2_scale, w2_input_scale
             )
-            if w13_input_scale is not None:
-                layer.w13_input_scale = torch.nn.Parameter(
-                    w13_input_scale, requires_grad=False
-                )
-            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
-            layer.w2_weight_scale = torch.nn.Parameter(
-                w2_weight_scale, requires_grad=False
-            )
-            if w2_input_scale is not None:
-                layer.w2_input_scale = torch.nn.Parameter(
-                    w2_input_scale, requires_grad=False
-                )
-
-        # For Per-TENSOR case, Fp8 moe kernel needs single weight scale
-        # for w13 per expert. Use max then dequant and requant each expert.
-        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
-            assert layer.w13_weight_scale is not None
-            shard_size = layer.intermediate_size_per_partition
-            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
-            for expert_id in range(layer.local_num_experts):
-                start = 0
-                for shard_id in range(2):
-                    dq_weight = per_tensor_dequantize(
-                        layer.w13_weight[expert_id][start : start + shard_size, :],
-                        layer.w13_weight_scale[expert_id][shard_id],
-                    )
-                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
-                        ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
-                    )
-                    start += shard_size
-            layer.w13_weight_scale = torch.nn.Parameter(
-                max_w13_scales, requires_grad=False
-            )
-
-        # Property to determine if AITER is used
-        if self.rocm_aiter_moe_enabled:
-            # reshaping weights is required for aiter moe kernel.
-            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
-                layer.w13_weight.data, layer.w2_weight.data
-            )
-
-            layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
 
-        elif self.use_marlin:
-            prepare_moe_fp8_layer_for_marlin(
-                layer, False, input_dtype=self.marlin_input_dtype
-            )
-            # Activations not quantized for marlin.
-            del layer.w13_input_scale
-            del layer.w2_input_scale
-
-        if self.use_cutlass:
-            assert self.weight_quant.strategy != QuantizationStrategy.BLOCK
-            device = layer.w13_weight.device
-            # ab_strides1 and c_strides2 are the same
-            self.ab_strides1_c_strides2 = torch.full(
-                (layer.local_num_experts,),
-                layer.hidden_size,
-                device=device,
-                dtype=torch.int64,
-            )
-            self.ab_strides2 = torch.full(
-                (layer.local_num_experts,),
-                layer.intermediate_size_per_partition,
-                device=device,
-                dtype=torch.int64,
-            )
-            self.c_strides1 = torch.full(
-                (layer.local_num_experts,),
-                2 * layer.intermediate_size_per_partition,
-                device=device,
-                dtype=torch.int64,
+        # Per tensor kernels require single activation scale. Use the max.
+        if self.static_input_scales:
+            assert self.input_quant.strategy == QuantizationStrategy.TENSOR
+            assert w13_input_scale is not None and w2_input_scale is not None
+            w13_input_scale, w2_input_scale = process_fp8_input_tensor_strategy_moe(
+                w13_input_scale, w2_input_scale
             )
+            replace_parameter(layer, "w13_input_scale", w13_input_scale)
+            replace_parameter(layer, "w2_input_scale", w2_input_scale)
 
-        if is_deep_gemm_e8m0_used() and self.block_quant:
-            assert layer.weight_block_size is not None
-            # Re-quantise the expert weights so their scales are UE8M0.
-            block_sz = tuple(layer.weight_block_size)
-            requant_weight_ue8m0_inplace(
-                layer.w13_weight.data,
-                layer.w13_weight_scale.data,
-                block_sz,
-            )
-            requant_weight_ue8m0_inplace(
-                layer.w2_weight.data,
-                layer.w2_weight_scale.data,
-                block_sz,
+        # Per-tensor kernels use a single scale, for W13, but on disk there
+        # is a separate scale for W1 and W3. Requantize with the max scale.
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            process_fp8_weight_tensor_strategy_moe(
+                w13,
+                w13_scale,
+                shard_size=layer.intermediate_size_per_partition,
+                num_experts=layer.num_local_experts,
+            )
+
+        w13, w2, w13_scale, w2_scale = convert_to_fp8_moe_kernel_format(
+            fp8_backend=self.fp8_backend,
+            layer=layer,
+            w13=w13,
+            w2=w2,
+            w13_scale=w13_scale,
+            w2_scale=w2_scale,
+            w13_input_scale=w13_input_scale,
+            w2_input_scale=w2_input_scale,
+        )
+
+        # Replace parameters with updated versions. Note that this helper
+        # function ensures the replacement is compatible with RL weight reloads.
+        replace_parameter(layer, "w13_weight", w13)
+        replace_parameter(layer, "w2_weight", w2)
+        replace_parameter(layer, "w13_weight_scale", w13_scale)
+        replace_parameter(layer, "w2_weight_scale", w2_scale)
+
+        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+        if self.moe_quant_config:
+            self.kernel, self.use_inplace = make_fp8_moe_kernel(
+                layer=layer,
+                moe_quant_config=self.moe_quant_config,
+                moe_config=self.moe,
+                fp8_backend=self.fp8_backend,
             )
 
-            # Ensure column-major TMA alignment expected by DeepGEMM.
-            if expert_weight_is_col_major(layer.w13_weight_scale):
-                layer.w13_weight_scale = get_col_major_tma_aligned_tensor(
-                    layer.w13_weight_scale
-                )
-            if expert_weight_is_col_major(layer.w2_weight_scale):
-                layer.w2_weight_scale = get_col_major_tma_aligned_tensor(
-                    layer.w2_weight_scale
-                )
-
     def maybe_make_prepare_finalize(
         self,
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> mk.FusedMoEPrepareAndFinalize | None:
-        if self.use_marlin or self.rocm_aiter_moe_enabled:
+        if self.fp8_backend in [Fp8MoeBackend.MARLIN, Fp8MoeBackend.AITER]:
             return None
         else:
             return super().maybe_make_prepare_finalize(routing_tables)
@@ -1062,7 +861,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
     ) -> FusedMoEPermuteExpertsUnpermute:
         # cutlass path
         assert self.moe_quant_config is not None
-        if self.use_cutlass:
+        if self.fp8_backend == Fp8MoeBackend.VLLM_CUTLASS:
             from vllm.model_executor.layers.fused_moe import (
                 CutlassBatchedExpertsFp8,
                 CutlassExpertsFp8,
@@ -1078,26 +877,27 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             ):
                 logger.debug("CutlassBatchedExpertsFp8(%s)", self.__class__.__name__)
                 experts = CutlassBatchedExpertsFp8(
-                    self.moe.num_local_experts,
-                    num_dispatchers,
-                    self.moe.in_dtype,
-                    ab_strides1=self.ab_strides1_c_strides2,
-                    ab_strides2=self.ab_strides2,
-                    c_strides1=self.c_strides1,
-                    c_strides2=self.ab_strides1_c_strides2,
+                    max_experts_per_worker=self.moe.num_local_experts,
+                    num_dispatchers=num_dispatchers,
+                    out_dtype=self.moe.in_dtype,
+                    e=layer.local_num_experts,
+                    n=layer.intermediate_size_per_partition,
+                    k=layer.hidden_size,
+                    device=layer.w13_weight.device,
                     quant_config=self.moe_quant_config,
                 )
             else:
                 logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__)
                 experts = CutlassExpertsFp8(
-                    self.moe.in_dtype,
-                    ab_strides1=self.ab_strides1_c_strides2,
-                    ab_strides2=self.ab_strides2,
-                    c_strides1=self.c_strides1,
-                    c_strides2=self.ab_strides1_c_strides2,
+                    out_dtype=self.moe.in_dtype,
+                    e=layer.local_num_experts,
+                    n=layer.intermediate_size_per_partition,
+                    k=layer.hidden_size,
+                    device=layer.w13_weight.device,
                     quant_config=self.moe_quant_config,
                 )
 
+            # TODO(rob): investigate disable_expert_map
             self.disable_expert_map = (
                 num_dispatchers > 1 or not experts.supports_expert_map()
             )
@@ -1110,13 +910,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
             BatchedTritonExperts,
         )
+        from vllm.model_executor.layers.fused_moe.fused_moe import (
+            TritonExperts,
+        )
         from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
             TritonOrDeepGemmExperts,
         )
 
-        assert not self.rocm_aiter_moe_enabled and not self.use_marlin
-
-        use_deep_gemm = envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
+        assert self.fp8_backend not in [Fp8MoeBackend.AITER, Fp8MoeBackend.MARLIN]
 
         if (
             prepare_finalize.activation_format
@@ -1125,28 +926,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
             assert max_num_tokens_per_rank is not None
 
-            if use_deep_gemm and not has_deep_gemm():
-                raise RuntimeError(
-                    "DeepGEMM requested for MoE layer but not installed."
-                )
-
-            compatible_with_deep_gemm = (
-                self.moe_quant_config.use_fp8_w8a8
-                and self.moe_quant_config.block_shape
-                == get_mk_alignment_for_contiguous_layout()
-            )
-
-            # If this MoE layer is compatible with DeepGEMM, the proper env
-            # vars are set and DeepGEMM is not installed, throw an error.
-            if use_deep_gemm and compatible_with_deep_gemm and not has_deep_gemm():
-                raise RuntimeError(
-                    f"MoE layer incompatible with DeepGEMM, expected "
-                    f"fp8==True, got {self.moe_quant_config.use_fp8_w8a8}"
-                    f"or block_shape {self.moe_quant_config.block_shape}"
-                    f"=={get_mk_alignment_for_contiguous_layout()}."
-                )
-
-            if use_deep_gemm and compatible_with_deep_gemm and has_deep_gemm():
+            if self.fp8_backend == Fp8MoeBackend.DEEPGEMM:
                 logger.debug("BatchedDeepGemmExperts(%s)", self.__class__.__name__)
                 return BatchedDeepGemmExperts(
                     max_num_tokens=max_num_tokens_per_rank,
@@ -1162,17 +942,22 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 )
 
         else:
-            logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
-            return TritonOrDeepGemmExperts(
-                self.moe_quant_config,
-                allow_deep_gemm=use_deep_gemm,
-            )
+            if self.fp8_backend == Fp8MoeBackend.DEEPGEMM:
+                logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
+                return TritonOrDeepGemmExperts(self.moe_quant_config)
+            else:
+                logger.debug("TritonExperts(%s)", self.__class__.__name__)
+                return TritonExperts(self.moe_quant_config)
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        if self.use_marlin:
-            return None
+        if self.fp8_backend == Fp8MoeBackend.MARLIN:
+            return fp8_w8a16_moe_quant_config(
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                block_shape=self.weight_block_size,
+            )
 
         per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
         per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL
@@ -1190,128 +975,35 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         use_nn_moe: bool | None = False,
         use_fused_gate: bool | None = False,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
             use_fused_gate=use_fused_gate,
         )
 
-        per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
-        per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL
-
-        if self.use_marlin:
-            assert layer.activation == "silu", (
-                f"{layer.activation} not supported for Marlin MoE."
-            )
-            return fused_marlin_moe(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                None,
-                None,
-                layer.w13_weight_scale,
-                layer.w2_weight_scale,
-                router_logits,
-                topk_weights,
-                topk_ids,
-                quant_type_id=scalar_types.float8_e4m3fn.id,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                input_dtype=self.marlin_input_dtype,
-                workspace=layer.workspace,
-            )
-
-        elif self.rocm_aiter_moe_enabled:
-            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa E501
-                rocm_aiter_fused_experts,
-            )
-
-            assert per_act_token == per_channel_quant
-            assert self.moe_quant_config is not None
-            return rocm_aiter_fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                activation=layer.activation,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                expert_map=layer.expert_map,
-                quant_config=self.moe_quant_config,
-            )
-
-        # cutlass path
-        elif self.use_cutlass:
-            assert self.moe_quant_config is not None
-
-            # small-batch fallback on SM100
-            if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8:
-                from vllm.model_executor.layers.fused_moe import fused_experts
-
-                assert per_act_token == per_channel_quant
-                return fused_experts(
-                    hidden_states=x,
-                    w1=layer.w13_weight,
-                    w2=layer.w2_weight,
-                    topk_weights=topk_weights,
-                    topk_ids=topk_ids,
-                    inplace=True,
-                    activation=layer.activation,
-                    apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                    global_num_experts=layer.global_num_experts,
-                    expert_map=None
-                    if self.disable_expert_map
-                    else layer.expert_map,  # ???
-                    quant_config=self.moe_quant_config,
-                )
-            else:
-                from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-                    cutlass_moe_fp8,
-                )
-
-                assert per_act_token == per_channel_quant
-                assert self.moe_quant_config is not None
-                return cutlass_moe_fp8(
-                    x,
-                    layer.w13_weight,
-                    layer.w2_weight,
-                    topk_weights,
-                    topk_ids,
-                    quant_config=self.moe_quant_config,
-                    activation=layer.activation,
-                    global_num_experts=layer.global_num_experts,
-                    expert_map=None if self.disable_expert_map else layer.expert_map,
-                    ab_strides1=self.ab_strides1_c_strides2,
-                    ab_strides2=self.ab_strides2,
-                    c_strides1=self.c_strides1,
-                    c_strides2=self.ab_strides1_c_strides2,
-                )
-
-        else:
-            from vllm.model_executor.layers.fused_moe import fused_experts
+        assert self.kernel is not None
+        result = self.kernel(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights,
+            topk_ids,
+            inplace=self.use_inplace,
+            activation=layer.activation,
+            global_num_experts=layer.global_num_experts,
+            # TODO(rob): investigate the disable_expert_map introduced by:
+            # https://github.com/vllm-project/vllm/commit/84166fee9770e6fba71a96978b3e7d149392fb28 # noqa: E501
+            expert_map=None if self.disable_expert_map else layer.expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+        )
 
-            assert per_act_token == per_channel_quant
-            assert self.moe_quant_config is not None
-            return fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=layer.activation,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                quant_config=self.moe_quant_config,
-                use_nn_moe=False,
-            )
+        return result
 
     @property
     def supports_eplb(self) -> bool:
@@ -1444,6 +1136,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         use_nn_moe: bool | None = False,
@@ -1451,7 +1144,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -1808,6 +1501,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
@@ -1815,7 +1509,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             f"{layer.activation} not supported for Marlin MoE."
         )
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -2033,15 +1727,39 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             block_shape=[0, self.group_size],
         )
 
+    def select_gemm_impl(
+        self,
+        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
+        layer: torch.nn.Module,
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        if self.moe.is_lora_enabled:
+            assert self.moe_quant_config is not None
+            from vllm.triton_utils import HAS_TRITON
+
+            if HAS_TRITON:
+                from vllm.model_executor.layers.fused_moe import TritonWNA16Experts
+
+                layer.w13_weight = layer.w13_weight_packed
+                layer.w2_weight = layer.w2_weight_packed
+                return TritonWNA16Experts(quant_config=self.moe_quant_config)
+            else:
+                raise NotImplementedError(
+                    "TritonExperts requires Triton. "
+                    "Install triton or disable LoRA for MoE."
+                )
+
+        raise NotImplementedError
+
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -2338,6 +2056,7 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor:
@@ -2650,6 +2369,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ):
@@ -2658,7 +2378,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet."
             )
         assert self.moe_quant_config is not None
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index ca286675ebd0cace14ddb53c87f869982a9f104b..6d40685f05e9efcfd9ddceba084fac5ee4f24408 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -9,6 +9,7 @@ from .compressed_tensors_w4a16_24 import (
     W4A16SPARSE24_SUPPORTED_BITS,
     CompressedTensorsW4A16Sparse24,
 )
+from .compressed_tensors_w4a16_mxfp4 import CompressedTensorsW4A16Mxfp4
 from .compressed_tensors_w4a16_nvfp4 import CompressedTensorsW4A16Fp4
 from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
 from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
@@ -29,6 +30,7 @@ __all__ = [
     "W4A16SPARSE24_SUPPORTED_BITS",
     "CompressedTensors24",
     "CompressedTensorsW4A16Fp4",
+    "CompressedTensorsW4A16Mxfp4",
     "CompressedTensorsW4A4Fp4",
     "CompressedTensorsW4A8Int",
     "CompressedTensorsW4A8Fp8",
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c76adebebdad383810112108a59802f727df8e7
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py
@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
+    apply_fp4_marlin_linear,
+    prepare_fp4_layer_for_marlin,
+)
+from vllm.model_executor.parameter import (
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+)
+
+__all__ = ["CompressedTensorsW4A16Mxfp4"]
+
+
+class CompressedTensorsW4A16Mxfp4(CompressedTensorsScheme):
+    """
+    Compressed tensors scheme for MXFP4 weight-only quantization.
+
+    Supports models quantized with the compressed-tensors mxfp4-pack-quantized
+    format.
+
+    MXFP4 format:
+    - 4-bit float weights (E2M1) packed into uint8
+    - Per-group E8M0 scales with group_size=32
+    - No global scale (unlike NVFP4)
+    """
+
+    def __init__(self):
+        self.group_size = 32
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.params_dtype = params_dtype
+
+        # Packed FP4 weights (2 values per byte)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_packed", weight)
+
+        # Per-group E8M0 scales
+        weight_scale = GroupQuantScaleParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.group_size,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def process_weights_after_loading(self, layer) -> None:
+        # Rename weight_packed to weight that marlin expects
+        layer.weight = Parameter(layer.weight_packed.data, requires_grad=False)
+        del layer.weight_packed
+
+        prepare_fp4_layer_for_marlin(layer)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return apply_fp4_marlin_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            weight_scale_2=None,
+            workspace=layer.workspace,
+            size_n=layer.output_size_per_partition,
+            size_k=layer.input_size_per_partition,
+            bias=bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index ee99572f5f4999723bc33e3f3cd6d44fedfdeffc..0c303b335935cbe355cfa89e2691be402eef5682 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -61,7 +61,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
             )
 
         self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
-        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
+        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
 
         if self.weight_block_size is not None:
             assert not self.is_static_input_scheme
@@ -158,7 +158,10 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
             input_scale = None
 
         else:
-            raise ValueError(f"Unknown quantization strategy {self.strategy}")
+            raise ValueError(
+                f"Unknown quantization strategy {self.strategy}: "
+                f"should be one of {list(QuantizationStrategy)}"
+            )
 
         # required by torch.compile to be torch.nn.Parameter
         layer.weight = Parameter(weight.data, requires_grad=False)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 81454d8f2744649d8cc393b130cf07acfb2e42cd..69bb0644e7930a58609bf6905aa9fe6f4344d53f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -114,7 +114,7 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
             logger.info("Using %s for CompressedTensorsWNA16", kernel_type.__name__)
             self._kernel_backends_being_used.add(kernel_type.__name__)
 
-        if isinstance(kernel_type, MarlinLinearKernel):
+        if kernel_type is MarlinLinearKernel:
             input_dtype = get_marlin_input_dtype(self.layer_name)
             if input_dtype is not None:
                 mp_linear_kernel_config.act_type = input_dtype
diff --git a/vllm/model_executor/layers/quantization/cpu_wna16.py b/vllm/model_executor/layers/quantization/cpu_wna16.py
index bf643f55f1b9af977108e120f9f8a5ba7ebb04c9..dd985c2cefa6aa2d593206eedb0986a87ee14693 100644
--- a/vllm/model_executor/layers/quantization/cpu_wna16.py
+++ b/vllm/model_executor/layers/quantization/cpu_wna16.py
@@ -20,12 +20,6 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from vllm.model_executor.layers.quantization.utils.gptq_utils import (
-    get_linear_quant_method,
-)
-from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    marlin_repeat_scales_on_all_ranks,
-)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     is_layer_skipped,
     pack_cols,
@@ -34,335 +28,15 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.model_executor.parameter import (
-    ChannelQuantScaleParameter,
     GroupQuantScaleParameter,
-    PackedColumnParameter,
     PackedvLLMParameter,
-    RowvLLMParameter,
 )
-from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collection_utils import is_list_of
 
 logger = init_logger(__name__)
 
 
-class CPUGPTQConfig(QuantizationConfig):
-    """Config class for CPU GPTQ quant"""
-
-    def __init__(
-        self,
-        weight_bits: int,
-        group_size: int,
-        desc_act: bool,
-        is_sym: bool,
-        lm_head_quantized: bool,
-        dynamic: dict[str, dict[str, int | bool]],
-        full_config: dict[str, Any],
-        modules_in_block_to_quantize: list[str] | None = None,
-    ) -> None:
-        super().__init__()
-        if desc_act and group_size == -1:
-            # In this case, act_order == True is the same as act_order == False
-            # (since we have only one group per output channel)
-            desc_act = False
-
-        # GPTQModel use `dynamic` config property to allow per module
-        # quantization config so each module can be individually optimized.
-        # Format is dict[str, dict] where key is a regex string that can
-        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
-        # matching of a module.
-        # Default to positive match, override base quant config mode, if no
-        # prefix is used. Value is in dict format of field key and override
-        # value.
-        # Negative matching will skip quantization init for this module
-        # entirely:
-        # non-quantized inference. More details and quantization examples can be
-        # found at: https://github.com/ModelCloud/GPTQModel
-        # Example:
-        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
-        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
-        # dynamic = {
-        #  #`.*\.` matches the layers_node prefix
-        #  # positive match layer 10-15
-        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
-        #  # positive match layer 16-21
-        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
-        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
-        # }
-        assert weight_bits == 4
-        self.dynamic = dynamic
-        self.weight_bits = weight_bits
-        self.is_sym = is_sym
-        self.pack_factor = 32 // weight_bits  # packed into int32
-        self.group_size = group_size
-        self.desc_act = desc_act
-        self.lm_head_quantized = lm_head_quantized
-        self.full_config = full_config
-        self.modules_in_block_to_quantize = modules_in_block_to_quantize or []
-
-    def __repr__(self) -> str:
-        return (
-            f"CPUWNA16Config("
-            f"group_size={self.group_size}, "
-            f"desc_act={self.desc_act}, "
-            f"lm_head_quantized={self.lm_head_quantized}, "
-            f"dynamic={self.dynamic}, "
-            f"modules_in_block_to_quantize={self.modules_in_block_to_quantize})"
-        )
-
-    @classmethod
-    def get_name(cls) -> QuantizationMethods:
-        return "cpu_gptq"
-
-    @classmethod
-    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
-        return [torch.half, torch.bfloat16]
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        return -1
-
-    @classmethod
-    def get_config_filenames(cls) -> list[str]:
-        return ["quantize_config.json"]
-
-    @classmethod
-    def from_config(cls, config: dict[str, Any]) -> "CPUGPTQConfig":
-        weight_bits = cls.get_from_keys(config, ["bits"])
-        desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False)
-        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
-        group_size = cls.get_from_keys(config, ["group_size"])
-        is_sym = cls.get_from_keys(config, ["sym"])
-        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
-        modules_in_block_to_quantize = cls.get_from_keys_or(
-            config, ["modules_in_block_to_quantize"], default=None
-        )
-        return cls(
-            weight_bits,
-            group_size,
-            desc_act,
-            is_sym,
-            lm_head_quantized,
-            dynamic,
-            config,
-            modules_in_block_to_quantize,
-        )
-
-    @classmethod
-    def override_quantization_method(
-        cls, hf_quant_cfg, user_quant
-    ) -> QuantizationMethods | None:
-        quant_method = hf_quant_cfg.get("quant_method", "").lower()
-        if current_platform.is_cpu() and (quant_method == "gptq"):
-            return cls.get_name()
-        return None
-
-    def get_quant_method(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
-        return get_linear_quant_method(self, layer, prefix, CPUGPTQLinearMethod)  # type: ignore
-
-    def apply_vllm_mapper(self, hf_to_vllm_mapper):
-        if self.modules_in_block_to_quantize is not None:
-            self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list(
-                self.modules_in_block_to_quantize
-            )
-
-    def maybe_update_config(self, model_name: str, revision: str | None = None):
-        if self.modules_in_block_to_quantize:
-            if is_list_of(self.modules_in_block_to_quantize, list):
-                # original modules_in_block_to_quantize: list[list[str]]
-                # flatten original modules_in_block_to_quantize
-                self.modules_in_block_to_quantize = [
-                    item
-                    for sublist in self.modules_in_block_to_quantize
-                    for item in sublist
-                ]
-            return
-
-        unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-        metadata = get_safetensors_params_metadata(model_name, revision=revision)
-        quant_layers: set[str] = {
-            param_name.rsplit(".", 1)[0]
-            for param_name, info in metadata.items()
-            if (dtype := info.get("dtype", None))
-            and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes
-        }
-        self.modules_in_block_to_quantize = list(quant_layers)
-
-
-class CPUGPTQLinearMethod(LinearMethodBase):
-    """Linear method for GPTQ on CPU.
-
-    Args:
-        quant_config: The CPUWNA16 quantization config.
-    """
-
-    def __init__(self, quant_config: CPUGPTQConfig) -> None:
-        self.quant_config = quant_config
-        assert self.quant_config.is_sym, "GPTQ asym quant is not supported on CPU"
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: list[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ) -> None:
-        output_size_per_partition = sum(output_partition_sizes)
-        assert output_size_per_partition * self.quant_config.weight_bits % 32 == 0
-        assert output_size_per_partition % 32 == 0
-        assert input_size_per_partition % 32 == 0
-
-        is_row_parallel = input_size != input_size_per_partition
-        weight_loader = extra_weight_attrs.get("weight_loader")
-
-        # Normalize group_size
-        if self.quant_config.group_size != -1:
-            group_size = self.quant_config.group_size
-        else:
-            group_size = input_size
-
-        # Determine sharding
-        if marlin_repeat_scales_on_all_ranks(
-            self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel
-        ):
-            # By setting scale_dim == None, weight_loader will
-            # repeat the scales on each rank in TP>1 case.
-            scales_and_zp_input_dim = None
-            scales_and_zp_size = input_size // group_size
-        else:
-            # By setting scale_dim == 0, weight_loader will
-            # shard the scales in TP>1 case.
-            scales_and_zp_input_dim = 0
-            scales_and_zp_size = input_size_per_partition // group_size
-
-        # Quantized weights
-        qweight = PackedvLLMParameter(
-            data=torch.empty(
-                input_size_per_partition // self.quant_config.pack_factor,
-                output_size_per_partition,
-                dtype=torch.int32,
-            ),
-            input_dim=0,
-            output_dim=1,
-            packed_dim=0,
-            packed_factor=self.quant_config.pack_factor,
-            weight_loader=weight_loader,
-        )
-
-        # Activation order
-        g_idx = RowvLLMParameter(
-            data=torch.empty(
-                input_size_per_partition,
-                dtype=torch.int32,
-            ),
-            input_dim=0,
-            weight_loader=weight_loader,
-        )
-        set_weight_attrs(
-            g_idx,
-            {"ignore_warning": True},
-        )
-
-        qzeros_args = {
-            "data": torch.empty(
-                scales_and_zp_size,
-                output_size_per_partition // self.quant_config.pack_factor,
-                dtype=torch.int32,
-            ),
-            "weight_loader": weight_loader,
-        }
-        weight_scale_args = {
-            "data": torch.empty(
-                scales_and_zp_size,
-                output_size_per_partition,
-                dtype=params_dtype,
-            ),
-            "weight_loader": weight_loader,
-        }
-
-        if scales_and_zp_input_dim is None:
-            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
-            qzeros = PackedColumnParameter(
-                output_dim=1,
-                packed_dim=1,
-                packed_factor=self.quant_config.pack_factor,
-                **qzeros_args,
-            )
-
-        else:
-            scales = GroupQuantScaleParameter(
-                output_dim=1, input_dim=0, **weight_scale_args
-            )
-            qzeros = PackedvLLMParameter(
-                input_dim=0,
-                output_dim=1,
-                packed_dim=1,
-                packed_factor=self.quant_config.pack_factor,
-                **qzeros_args,
-            )
-
-        layer.register_parameter("qweight", qweight)
-        layer.register_parameter("g_idx", g_idx)
-        layer.register_parameter("scales", scales)
-        layer.register_parameter("qzeros", qzeros)
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        torch.set_printoptions(profile="full", linewidth=5000, sci_mode=False)
-        packed_weight = layer.qweight.data
-        bits = self.quant_config.weight_bits
-        pack_factor = int(self.quant_config.pack_factor)
-        p_w_k, p_w_n = packed_weight.size()
-        input_size = p_w_k * pack_factor
-        output_size = p_w_n
-        isa_hint = _get_isa_hint(layer.scales.dtype)
-        layer.isa_hint = isa_hint
-
-        layer.qzeros = None
-        if not self.quant_config.desc_act:
-            layer.g_idx = None
-
-        # convert input dim packed to output dim packed
-        weight = unpack_cols(packed_weight, bits, p_w_k, p_w_n * pack_factor).view(
-            p_w_k, p_w_n, pack_factor
-        )
-        weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous()
-        weight = pack_cols(weight, bits, input_size, output_size)
-        # make 16 output channel as a block and transpose to the make
-        # the block contigous
-        weight = (
-            weight.view(input_size, -1, 16 // pack_factor)
-            .permute(1, 0, 2)
-            .reshape(-1, input_size * 16 // pack_factor)
-            .contiguous()
-        )
-        layer.qweight.data = weight
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        x = cpu_gemm_wna16(
-            input=x,
-            q_weight=layer.qweight,
-            scales=layer.scales,
-            zeros=layer.qzeros,
-            g_idx=layer.g_idx,
-            bias=bias,
-            pack_factor=8,
-            isa_hint=layer.isa_hint,
-        )
-        return x
-
-
 class CPUAWQConfig(QuantizationConfig):
     """Config class for CPU AWQ"""
 
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 11097cf36f5ca4bf0221d192b225d08ab25de4d5..37e6020cb2a9097d2f3b060725cc481f73c26138 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     int8_w8a16_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
@@ -137,12 +138,13 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 362213994b3e0599c87e1a0a5c3502903b7d0e22..e2b04648e3255bd7a4cdd2c571df6f77921a6c78 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from enum import Enum
-from functools import partial
 from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 from torch.nn import Module
-from torch.nn.parameter import Parameter
+from torch.utils._python_dispatch import TorchDispatchMode
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
@@ -28,13 +26,18 @@ from vllm.model_executor.layers.fused_moe import (
     FusedMoeWeightScaleSupported,
 )
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEParallelConfig,
     FusedMoEQuantConfig,
     RoutingMethodType,
-    fp8_w8a8_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
+from vllm.model_executor.layers.fused_moe.oracle.fp8 import (
+    Fp8MoeBackend,
+    convert_to_fp8_moe_kernel_format,
+    make_fp8_moe_kernel,
+    make_fp8_moe_quant_config,
+    select_fp8_moe_backend,
+)
 from vllm.model_executor.layers.linear import (
     LinearBase,
     LinearMethodBase,
@@ -47,25 +50,20 @@ from vllm.model_executor.layers.quantization.base_config import (
 )
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    FlashinferMoeBackend,
-    apply_flashinfer_per_tensor_scale_fp8,
+    apply_fi_trtllm_fp8_per_tensor_moe,
     build_flashinfer_fp8_cutlass_moe_prepare_finalize,
-    flashinfer_cutlass_moe_fp8,
-    get_flashinfer_moe_backend,
-    register_moe_scaling_factors,
-    rotate_flashinfer_fp8_moe_weights,
     select_cutlass_fp8_gemm_impl,
-    swap_w13_to_w31,
 )
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     W8A8BlockFp8LinearOp,
     create_fp8_input_scale,
     create_fp8_scale_parameter,
     create_fp8_weight_parameter,
-    deepgemm_post_process_fp8_weight_block,
     maybe_post_process_fp8_weight_block,
+    process_fp8_input_tensor_strategy_moe,
     process_fp8_weight_block_strategy,
     process_fp8_weight_tensor_strategy,
+    process_fp8_weight_tensor_strategy_moe,
     validate_fp8_block_shape,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
@@ -74,7 +72,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear,
     prepare_fp8_layer_for_marlin,
-    prepare_moe_fp8_layer_for_marlin,
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
@@ -82,12 +79,10 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     Fp8LinearOp,
-    all_close_1d,
     cutlass_block_fp8_supported,
     cutlass_fp8_supported,
     maybe_create_device_identity,
     normalize_e4m3fn_to_e4m3fnuz,
-    per_tensor_dequantize,
 )
 from vllm.model_executor.parameter import (
     BlockQuantScaleParameter,
@@ -96,13 +91,9 @@ from vllm.model_executor.parameter import (
 )
 from vllm.model_executor.utils import replace_parameter, set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.scalar_type import scalar_types
 from vllm.utils.deep_gemm import (
-    is_deep_gemm_e8m0_used,
     is_deep_gemm_supported,
 )
-from vllm.utils.flashinfer import has_flashinfer_moe
-from vllm.utils.import_utils import has_deep_gemm
 
 if TYPE_CHECKING:
     from vllm.model_executor.models.utils import WeightsMapper
@@ -112,100 +103,6 @@ ACTIVATION_SCHEMES = ["static", "dynamic"]
 logger = init_logger(__name__)
 
 
-class Fp8MoeBackend(Enum):
-    NONE = 0
-    FLASHINFER_TRTLLM = 1
-    FLASHINFER_CUTLASS = 2
-    DEEPGEMM = 3
-    CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4
-    MARLIN = 5
-    TRITON = 6
-
-
-def get_fp8_moe_backend(
-    block_quant: bool,
-    moe_parallel_config: FusedMoEParallelConfig,
-    with_lora_support: bool,
-) -> Fp8MoeBackend:
-    """
-    Select the primary FP8 MoE backend
-    Note: Shape-specific fallbacks may still occur at runtime.
-    """
-    if with_lora_support:
-        return Fp8MoeBackend.TRITON
-    # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
-    if (
-        current_platform.is_cuda()
-        and (
-            current_platform.is_device_capability_family(100)
-            or current_platform.is_device_capability(90)
-        )
-        and envs.VLLM_USE_FLASHINFER_MOE_FP8
-        and has_flashinfer_moe()
-    ):
-        backend = get_flashinfer_moe_backend()
-        if backend == FlashinferMoeBackend.TENSORRT_LLM:
-            logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
-            return Fp8MoeBackend.FLASHINFER_TRTLLM
-        else:
-            if block_quant and current_platform.is_device_capability_family(100):
-                raise ValueError(
-                    "FlashInfer FP8 MoE throughput backend does not "
-                    "support block quantization. Please use "
-                    "VLLM_FLASHINFER_MOE_BACKEND=latency "
-                    "instead."
-                )
-            logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM90/SM100")
-            return Fp8MoeBackend.FLASHINFER_CUTLASS
-
-    # weight-only path for older GPUs without native FP8
-    use_marlin = (
-        not current_platform.has_device_capability(89)
-        or envs.VLLM_TEST_FORCE_FP8_MARLIN
-    )
-    if current_platform.is_rocm():
-        use_marlin = False
-    if use_marlin:
-        logger.info_once("Using Marlin backend for FP8 MoE")
-        return Fp8MoeBackend.MARLIN
-
-    # Determine if we should use DeepGEMM with block-quantized weights:
-    # - If explicitly set by user, respect their choice
-    # - If not explicitly set (default), disable when TP size is >= 8
-    moe_use_deep_gemm = envs.VLLM_MOE_USE_DEEP_GEMM
-    if not envs.is_set("VLLM_MOE_USE_DEEP_GEMM") and moe_parallel_config.tp_size >= 8:
-        moe_use_deep_gemm = False
-        logger.info_once(
-            "DeepGEMM MoE is disabled by default when TP size is >= 8. "
-            "Set VLLM_MOE_USE_DEEP_GEMM=1 to enable it.",
-            scope="local",
-        )
-
-    if envs.VLLM_USE_DEEP_GEMM and moe_use_deep_gemm and block_quant:
-        if not has_deep_gemm():
-            logger.warning_once(
-                "DeepGEMM backend requested but not available.", scope="local"
-            )
-        elif is_deep_gemm_supported():
-            logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
-            return Fp8MoeBackend.DEEPGEMM
-
-    # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
-    if (
-        current_platform.is_cuda()
-        and current_platform.is_device_capability_family(100)
-        and block_quant
-    ):
-        logger.info_once(
-            "Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
-        )
-        return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
-
-    # default to Triton
-    logger.info_once("Using Triton backend for FP8 MoE")
-    return Fp8MoeBackend.TRITON
-
-
 class Fp8Config(QuantizationConfig):
     """Config class for FP8."""
 
@@ -253,7 +150,7 @@ class Fp8Config(QuantizationConfig):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def get_config_filenames(cls) -> list[str]:
@@ -305,6 +202,13 @@ class Fp8Config(QuantizationConfig):
                 return UnquantizedLinearMethod()
             return XPUFp8LinearMethod(fp8_config)
         elif isinstance(layer, FusedMoE):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedFusedMoEMethod(layer.moe_config)
+
             return XPUFp8MoEMethod(fp8_config, layer)
         elif isinstance(layer, Attention):
             return Fp8KVCacheMethod(self)
@@ -336,7 +240,6 @@ class Fp8Config(QuantizationConfig):
                 moe_quant_method = Fp8MoEMethod(self, layer)
             else:
                 moe_quant_method = Fp8OnlineMoEMethod(self, layer)
-            moe_quant_method.marlin_input_dtype = get_marlin_input_dtype(prefix)
             return moe_quant_method
         elif isinstance(layer, Attention):
             return Fp8KVCacheMethod(self)
@@ -363,6 +266,26 @@ class Fp8Config(QuantizationConfig):
         return None
 
 
+class CopyNumelCounter(TorchDispatchMode):
+    """
+    Tracks total number of elements modified with `copy_`. Useful for keeping
+    track of weight loading where underlying weights can be arbitrarily
+    transformed (such as with `narrow`) before calling copy.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.copied_numel = 0
+
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        if kwargs is None:
+            kwargs = {}
+        out = func(*args, **kwargs)
+        if func == torch.ops.aten.copy_.default:
+            self.copied_numel += args[0].numel()
+        return out
+
+
 class Fp8LinearMethod(LinearMethodBase):
     """Linear method for FP8.
     Supports loading FP8 checkpoints with static weight scale and
@@ -399,7 +322,7 @@ class Fp8LinearMethod(LinearMethodBase):
         if vllm_is_batch_invariant():
             self.use_marlin = False
 
-        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
+        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
         self.use_deep_gemm = is_deep_gemm_supported()
 
         self.weight_block_size = self.quant_config.weight_block_size
@@ -469,13 +392,15 @@ class Fp8LinearMethod(LinearMethodBase):
         else:
 
             def patched_weight_loader(param, loaded_weight, *args, **kwargs):
-                # load the current weight chunk
-                res = weight_loader(param, loaded_weight, *args, **kwargs)  # type: ignore[misc]
-
                 # track how many elements we have updated
                 if not hasattr(layer, "_loaded_numel"):
                     layer._loaded_numel = 0
-                layer._loaded_numel += loaded_weight.numel()
+
+                # load the current weight chunk
+                copy_numel_counter = CopyNumelCounter()
+                with copy_numel_counter:
+                    res = weight_loader(param, loaded_weight, *args, **kwargs)  # type: ignore[misc]
+                layer._loaded_numel += copy_numel_counter.copied_numel
 
                 # if we have loaded all of the elements, call
                 # process_weights_after_loading
@@ -702,36 +627,42 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
     def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
         super().__init__(layer.moe_config)
-        self.layer = layer
         self.quant_config = quant_config
         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant: bool = self.weight_block_size is not None
-        self.fp8_backend = get_fp8_moe_backend(
-            self.block_quant, layer.moe_parallel_config, self.moe.is_lora_enabled
+        self.weight_scale_name = (
+            "weight_scale_inv" if self.block_quant else "weight_scale"
+        )
+        self.fp8_backend = select_fp8_moe_backend(
+            block_quant=self.block_quant,
+            tp_size=layer.moe_parallel_config.tp_size,
+            with_lora_support=self.moe.is_lora_enabled,
         )
 
-        self.marlin_input_dtype = None
-        self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
-        self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
-        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
-            self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM
-        elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
-            self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS
-            if self.block_quant:
-                assert self.weight_block_size == [128, 128], (
-                    f"Only support weight_block_size == [128, 128], "
-                    f"got {self.weight_block_size}"
+        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
+            if self.block_quant and self.weight_block_size != [128, 128]:
+                raise NotImplementedError(
+                    "FlashInfer CUTLASS FP8 MoE backend only supports block "
+                    "size [128, 128]."
                 )
-            self.flashinfer_moe_fn = partial(
-                flashinfer_cutlass_moe_fp8,
-                moe=self.moe,
-                use_deepseek_fp8_block_scale=self.block_quant,
+            if layer.activation != "silu":
+                raise NotImplementedError(
+                    "FlashInfer CUTLASS FP8 MoE backend only supports SiLU "
+                    "activation function, but got {layer.activation}."
+                )
+        dynamic_per_token = (
+            not self.block_quant and self.quant_config.activation_scheme != "static"
+        )
+        if dynamic_per_token and self.fp8_backend in [
+            Fp8MoeBackend.FLASHINFER_TRTLLM,
+            Fp8MoeBackend.FLASHINFER_CUTLASS,
+        ]:
+            raise NotImplementedError(
+                "FlashInfer FP8 MoE backend does not support dynamic per token "
+                "activation quantization."
             )
 
-        self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
-        self.allow_cutlass_block_scaled_grouped_gemm = (
-            self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
-        )
+        self.kernel: mk.FusedMoEModularKernel | None = None
 
     def create_weights(
         self,
@@ -804,38 +735,28 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
         # WEIGHT_SCALES
         if not self.block_quant:
-            # Allocate 2 scales for w1 and w3 respectively.
-            # They will be combined to a single scale after weight loading.
-            w13_weight_scale = torch.nn.Parameter(
-                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
-            )
-            w2_weight_scale = torch.nn.Parameter(
-                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
-            )
-            layer.register_parameter("w13_weight_scale", w13_weight_scale)
-            layer.register_parameter("w2_weight_scale", w2_weight_scale)
+            # For per-tensor quant, the scales are per expert and weight.
+            w13_scale_data = torch.ones(num_experts, 2, dtype=torch.float32)
+            w2_scale_data = torch.ones(num_experts, dtype=torch.float32)
         else:
-            w13_weight_scale = torch.nn.Parameter(
-                torch.ones(
-                    num_experts,
-                    2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
-                    (hidden_size + block_k - 1) // block_k,
-                    dtype=torch.float32,
-                ),
-                requires_grad=False,
+            # For block quant, the scales are per block (typically 128x128).
+            w13_scale_data = torch.ones(
+                num_experts,
+                2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
+                (hidden_size + block_k - 1) // block_k,
+                dtype=torch.float32,
             )
-            w2_weight_scale = torch.nn.Parameter(
-                torch.ones(
-                    num_experts,
-                    (hidden_size + block_n - 1) // block_n,
-                    (intermediate_size_per_partition + block_k - 1) // block_k,
-                    dtype=torch.float32,
-                ),
-                requires_grad=False,
+            w2_scale_data = torch.ones(
+                num_experts,
+                (hidden_size + block_n - 1) // block_n,
+                (intermediate_size_per_partition + block_k - 1) // block_k,
+                dtype=torch.float32,
             )
-            layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
-            layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
-            assert self.quant_config.activation_scheme == "dynamic"
+        w13_weight_scale = torch.nn.Parameter(w13_scale_data, requires_grad=False)
+        w2_weight_scale = torch.nn.Parameter(w2_scale_data, requires_grad=False)
+        # Note: name is weight_scale for tensor, weight_scale_inv for block.
+        layer.register_parameter(f"w13_{self.weight_scale_name}", w13_weight_scale)
+        layer.register_parameter(f"w2_{self.weight_scale_name}", w2_weight_scale)
 
         # Add the quantization method used (per tensor/grouped/channel)
         # to ensure the weight scales are loaded in properly
@@ -849,6 +770,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
         # INPUT_SCALES
         if self.quant_config.activation_scheme == "static":
+            assert not self.block_quant
             w13_input_scale = torch.nn.Parameter(
                 torch.ones(num_experts, dtype=torch.float32), requires_grad=False
             )
@@ -865,197 +787,111 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             layer.w13_input_scale = None
             layer.w2_input_scale = None
 
-        self.rocm_aiter_moe_enabled = False
+    def _setup_kernel(
+        self,
+        layer: Module,
+        w13: torch.Tensor,
+        w2: torch.Tensor,
+        w13_scale: torch.Tensor,
+        w2_scale: torch.Tensor,
+        w13_input_scale: torch.Tensor | None,
+        w2_input_scale: torch.Tensor | None,
+    ) -> None:
+        # Shuffle weights to runtime format.
+        w13, w2, w13_scale, w2_scale = convert_to_fp8_moe_kernel_format(
+            fp8_backend=self.fp8_backend,
+            layer=layer,
+            w13=w13,
+            w2=w2,
+            w13_scale=w13_scale,
+            w2_scale=w2_scale,
+            w13_input_scale=w13_input_scale,
+            w2_input_scale=w2_input_scale,
+        )
+
+        # Replace parameters with updated versions. Note that this helper
+        # function ensures the replacement is compatible with RL weight reloads.
+        replace_parameter(layer, "w13_weight", w13)
+        replace_parameter(layer, "w2_weight", w2)
+        replace_parameter(layer, f"w13_{self.weight_scale_name}", w13_scale)
+        replace_parameter(layer, f"w2_{self.weight_scale_name}", w2_scale)
+
+        # Setup modular kernel for TP case.
+        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+        if self.moe_quant_config:
+            self.kernel, self.use_inplace = make_fp8_moe_kernel(
+                layer=layer,
+                moe_quant_config=self.moe_quant_config,
+                moe_config=self.moe,
+                fp8_backend=self.fp8_backend,
+            )
 
     def process_weights_after_loading(self, layer: Module) -> None:
         if getattr(layer, "_already_called_process_weights_after_loading", False):
             return
 
-        # Lazy import to avoid importing triton too early.
-
-        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        # Allow for accessing weights and scales in standard way.
+        w13 = layer.w13_weight
+        w2 = layer.w2_weight
+        w13_scale = getattr(layer, f"w13_{self.weight_scale_name}")
+        w2_scale = getattr(layer, f"w2_{self.weight_scale_name}")
+        w13_input_scale = layer.w13_input_scale
+        w2_input_scale = layer.w2_input_scale
+
+        # MI300x and MI325x use FNUZ format for FP8. Convert if needed.
+        if current_platform.is_fp8_fnuz():
+            w13, w13_scale, w13_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                w13,
+                w13_scale,
+                w13_input_scale,
+            )
+            w2, w2_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                w2,
+                w2_scale,
+                w2_input_scale,
+            )
 
-        # TODO (rob): refactor block quant into separate class.
-        if self.block_quant:
-            assert self.quant_config.activation_scheme == "dynamic"
-            if current_platform.is_fp8_fnuz():
-                w13_weight, w13_weight_scale_inv, w13_input_scale = (
-                    normalize_e4m3fn_to_e4m3fnuz(
-                        layer.w13_weight,
-                        layer.w13_weight_scale_inv,
-                        layer.w13_input_scale,
-                    )
-                )
-                w2_weight, w2_weight_scale_inv, w2_input_scale = (
-                    normalize_e4m3fn_to_e4m3fnuz(
-                        layer.w2_weight, layer.w2_weight_scale_inv, layer.w2_input_scale
-                    )
-                )
-            elif self.flashinfer_moe_backend is not None:
-                # NOTE: weights have to be swapped since the activation is
-                # applied on different half for flashinfer vs vllm
-                w13_weight = swap_w13_to_w31(layer.w13_weight.data)
-                w13_weight_scale_inv = swap_w13_to_w31(layer.w13_weight_scale_inv.data)
-                w2_weight = layer.w2_weight.data
-                w2_weight_scale_inv = layer.w2_weight_scale_inv.data
-            else:
-                w13_weight = layer.w13_weight.data
-                w13_weight_scale_inv = layer.w13_weight_scale_inv.data
-                w2_weight = layer.w2_weight
-                w2_weight_scale_inv = layer.w2_weight_scale_inv
-
-            # torch.compile() cannot use Parameter subclasses.
-            replace_parameter(layer, "w13_weight", w13_weight)
-            replace_parameter(layer, "w13_weight_scale_inv", w13_weight_scale_inv)
-            replace_parameter(layer, "w2_weight", w2_weight)
-            replace_parameter(layer, "w2_weight_scale_inv", w2_weight_scale_inv)
-            if self.rocm_aiter_moe_enabled:
-                # reshaping weights is required for aiter moe kernel.
-                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
-                    layer.w13_weight.data, layer.w2_weight.data
-                )
+        # Per tensor kernels require single activation scale. Use the max.
+        if self.quant_config.activation_scheme == "static":
+            assert not self.block_quant
+            assert w13_input_scale is not None and w2_input_scale is not None
+            w13_input_scale, w2_input_scale = process_fp8_input_tensor_strategy_moe(
+                w13_input_scale, w2_input_scale
+            )
+            replace_parameter(layer, "w13_input_scale", w13_input_scale)
+            replace_parameter(layer, "w2_input_scale", w2_input_scale)
 
-                replace_parameter(layer, "w13_weight", shuffled_w13)
-                replace_parameter(layer, "w2_weight", shuffled_w2)
-
-            # DeepGemm scales need to be transposed and aligned. We try to do
-            # it ahead of time for performance reasons.
-            if self.allow_deep_gemm:
-                dg_w13_weight, dg_w13_weight_scale_inv = (
-                    deepgemm_post_process_fp8_weight_block(
-                        wq=layer.w13_weight.data,
-                        ws=layer.w13_weight_scale_inv.data,
-                        quant_block_shape=tuple(layer.weight_block_size),
-                        use_e8m0=is_deep_gemm_e8m0_used(),
-                    )
-                )
-                dg_w2_weight, dg_w2_weight_scale_inv = (
-                    deepgemm_post_process_fp8_weight_block(
-                        wq=layer.w2_weight.data,
-                        ws=layer.w2_weight_scale_inv.data,
-                        quant_block_shape=tuple(layer.weight_block_size),
-                        use_e8m0=is_deep_gemm_e8m0_used(),
-                    )
-                )
-                layer.w13_weight = Parameter(dg_w13_weight, requires_grad=False)
-                layer.w13_weight_scale_inv = Parameter(
-                    dg_w13_weight_scale_inv, requires_grad=False
-                )
-                layer.w2_weight = Parameter(dg_w2_weight, requires_grad=False)
-                layer.w2_weight_scale_inv = Parameter(
-                    dg_w2_weight_scale_inv, requires_grad=False
-                )
-        else:
-            # Fp8 moe kernels require a single activation scale.
-            # We take the max of all the scales in case they differ.
-            if self.quant_config.activation_scheme == "static":
-                if layer.w13_input_scale is None or layer.w2_input_scale is None:
-                    raise ValueError(
-                        "QuantConfig has static quantization, but found "
-                        "activation scales are None."
-                    )
-                if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
-                    layer.w2_input_scale
-                ):
-                    logger.warning_once(
-                        "Found input_scales that are not equal for "
-                        "fp8 MoE layer. Using the maximum across experts "
-                        "for each layer."
-                    )
-                replace_parameter(layer, "w13_input_scale", layer.w13_input_scale.max())
-                replace_parameter(layer, "w2_input_scale", layer.w2_input_scale.max())
-            if current_platform.is_fp8_fnuz():
-                # Normalize the weights and scales
-                w13_weight, w13_weight_scale, w13_input_scale = (
-                    normalize_e4m3fn_to_e4m3fnuz(
-                        layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
-                    )
-                )
-                w2_weight, w2_weight_scale, w2_input_scale = (
-                    normalize_e4m3fn_to_e4m3fnuz(
-                        layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
-                    )
-                )
-                # Reset the parameter
-                replace_parameter(layer, "w13_weight", w13_weight)
-                replace_parameter(layer, "w13_weight_scale", w13_weight_scale)
-                if w13_input_scale is not None:
-                    replace_parameter(layer, "w13_input_scale", w13_input_scale)
-                replace_parameter(layer, "w2_weight", w2_weight)
-                replace_parameter(layer, "w2_weight_scale", w2_weight_scale)
-                if w2_input_scale is not None:
-                    replace_parameter(layer, "w2_input_scale", w2_input_scale)
-
-            # Fp8 moe kernel needs single weight scale for w13 per expert.
-            # We take the max then dequant and requant each expert.
-            assert layer.w13_weight_scale is not None
+        # Per tensor kernels require single weight scale for w13 per expert, but
+        # on disk there is a scale for w1 and w3. Use the max to requantize.
+        if not self.block_quant:
             shard_size = layer.intermediate_size_per_partition
-            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
-            for expert_id in range(layer.local_num_experts):
-                start = 0
-                for shard_id in range(2):
-                    dq_weight = per_tensor_dequantize(
-                        layer.w13_weight[expert_id][start : start + shard_size, :],
-                        layer.w13_weight_scale[expert_id][shard_id],
-                    )
-                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
-                        ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
-                    )
-                    start += shard_size
-
-            if self.rocm_aiter_moe_enabled:
-                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
-                    layer.w13_weight, layer.w2_weight
-                )
-
-                replace_parameter(layer, "w13_weight", shuffled_w13)
-                replace_parameter(layer, "w2_weight", shuffled_w2)
-
-            replace_parameter(layer, "w13_weight_scale", max_w13_scales)
-
-            if self.flashinfer_moe_backend is not None:
-                # NOTE: weights have to be swapped since the activation is
-                # applied on different half for flashinfer vs vllm
-                assert not self.block_quant
-                register_moe_scaling_factors(layer)
-                w13_weight = swap_w13_to_w31(layer.w13_weight.data)
-                if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
-                    rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight)
-                layer.w13_weight.data = w13_weight.data
-
-        if self.use_marlin:
-            prepare_moe_fp8_layer_for_marlin(
-                layer, False, input_dtype=self.marlin_input_dtype
+            w13, w13_scale = process_fp8_weight_tensor_strategy_moe(
+                w13, w13_scale, shard_size, layer.local_num_experts
             )
-            # Activations not quantized for marlin.
-            del layer.w13_input_scale
-            del layer.w2_input_scale
+
+        # Shuffle weights to runtime format and setup kernel.
+        self._setup_kernel(
+            layer, w13, w2, w13_scale, w2_scale, w13_input_scale, w2_input_scale
+        )
 
     def maybe_make_prepare_finalize(
         self,
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> mk.FusedMoEPrepareAndFinalize | None:
-        if (
-            self.rocm_aiter_moe_enabled
-            or self.use_marlin
-            or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
+        if self.fp8_backend in [
+            Fp8MoeBackend.AITER,
+            Fp8MoeBackend.MARLIN,
+            Fp8MoeBackend.FLASHINFER_TRTLLM,
+        ]:
             return None
-        elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            if self.block_quant:
-                assert self.weight_block_size == [128, 128], (
-                    f"Only support weight_block_size == [128, 128], "
-                    f"got {self.weight_block_size}"
-                )
-            # Wire block-scale flag through prepare/finalize when using CUTLASS
+        elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
             prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize(
                 self.moe,
                 use_deepseek_fp8_block_scale=self.block_quant,
             )
             logger.debug_once("%s", prepare_finalize.__class__.__name__)
             return prepare_finalize
-        else:
-            return super().maybe_make_prepare_finalize(routing_tables)
+        return super().maybe_make_prepare_finalize(routing_tables)
 
     def select_gemm_impl(
         self,
@@ -1069,9 +905,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             TritonOrDeepGemmExperts,
         )
 
-        assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
-            "Marlin and ROCm AITER are not supported with all2all yet."
-        )
+        if self.fp8_backend in [Fp8MoeBackend.MARLIN, Fp8MoeBackend.AITER]:
+            raise NotImplementedError(
+                "Marlin and ROCm AITER are not supported with all2all yet."
+            )
 
         assert self.moe_quant_config is not None
 
@@ -1083,7 +920,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             assert max_num_tokens_per_rank is not None
 
             experts_impl = (
-                BatchedDeepGemmExperts if self.allow_deep_gemm else BatchedTritonExperts
+                BatchedDeepGemmExperts
+                if self.fp8_backend == Fp8MoeBackend.DEEPGEMM
+                else BatchedTritonExperts
             )
             logger.debug(
                 "%s(%s): max_tokens_per_rank=%s, block_size=%s, per_act_token=%s",
@@ -1100,7 +939,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             )
         elif self.moe.is_lora_enabled:
             return TritonExperts(quant_config=self.moe_quant_config)
-        elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
+        elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
             # Select GEMM experts with block-scale when weights are block-quantized
             experts = select_cutlass_fp8_gemm_impl(
                 self.moe,
@@ -1109,35 +948,42 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             )
             logger.debug_once("Using %s", experts.__class__.__name__)
             return experts
-        else:
+        elif self.fp8_backend == Fp8MoeBackend.DEEPGEMM:
             logger.debug(
                 "TritonOrDeepGemmExperts(%s): block_size=%s, per_act_token=%s",
                 self.__class__.__name__,
                 self.weight_block_size,
                 False,
             )
-            return TritonOrDeepGemmExperts(
-                quant_config=self.moe_quant_config,
-                allow_deep_gemm=self.allow_deep_gemm,
+            return TritonOrDeepGemmExperts(self.moe_quant_config)
+        else:
+            assert self.fp8_backend == Fp8MoeBackend.TRITON
+            logger.debug(
+                "TritonExperts(%s): block_size=%s, per_act_token=%s",
+                self.__class__.__name__,
+                self.weight_block_size,
+                False,
             )
+            return TritonExperts(self.moe_quant_config)
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        if self.use_marlin:
+        # TRTLLM does not use Modular Kernel.
+        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
             return None
 
-        return fp8_w8a8_moe_quant_config(
-            w1_scale=(
-                layer.w13_weight_scale_inv
-                if self.block_quant
-                else layer.w13_weight_scale
-            ),
-            w2_scale=(
-                layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
-            ),
-            a1_scale=layer.w13_input_scale,
-            a2_scale=layer.w2_input_scale,
+        w1_scale = getattr(layer, f"w13_{self.weight_scale_name}")
+        w2_scale = getattr(layer, f"w2_{self.weight_scale_name}")
+        a1_scale = layer.w13_input_scale
+        a2_scale = layer.w2_input_scale
+
+        return make_fp8_moe_quant_config(
+            fp8_backend=self.fp8_backend,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
             block_shape=self.weight_block_size,
         )
 
@@ -1152,10 +998,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,**_,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
+        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
+            # TODO(rob): convert this to MK.
             if layer.enable_eplb:
                 raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.")
             assert layer.activation == "silu", (
@@ -1193,10 +1041,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     routed_scaling=layer.routed_scaling_factor,
                 )
             else:
-                assert (
-                    not layer.renormalize and layer.custom_routing_function is not None
-                )
-                result = apply_flashinfer_per_tensor_scale_fp8(
+                result = apply_fi_trtllm_fp8_per_tensor_moe(
                     layer=layer,
                     hidden_states=x,
                     router_logits=router_logits,
@@ -1208,103 +1053,26 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 )
 
-        select_result = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
 
-        topk_weights, topk_ids, zero_expert_result = select_result
-
-        if self.rocm_aiter_moe_enabled:
-            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
-                rocm_aiter_fused_experts,
-            )
-
-            result = rocm_aiter_fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                activation=layer.activation,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                expert_map=layer.expert_map,
-                quant_config=self.moe_quant_config,
-            )
-        elif self.use_marlin:
-            assert layer.activation == "silu", (
-                f"{layer.activation} not supported for Marlin MoE."
-            )
-            result = fused_marlin_moe(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                None,
-                None,
-                layer.w13_weight_scale,
-                layer.w2_weight_scale,
-                router_logits,
-                topk_weights,
-                topk_ids,
-                quant_type_id=scalar_types.float8_e4m3fn.id,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                input_dtype=self.marlin_input_dtype,
-                workspace=layer.workspace,
-            )
-        elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            assert layer.activation == "silu", (
-                f"Expected 'silu' activation but got {layer.activation}"
-            )
-            if not self.block_quant:
-                assert (
-                    not layer.renormalize and layer.custom_routing_function is not None
-                )
-                assert layer.scoring_func == "sigmoid", (
-                    f"Expected 'sigmoid' scoring func but got {layer.scoring_func}"
-                )
-            # Delegate to CUTLASS FlashInfer path; function already bound with
-            # use_deepseek_fp8_block_scale for block-quant when applicable
-            result = self.flashinfer_moe_fn(
-                x,
-                layer,
-                topk_weights,
-                topk_ids,
-                inplace=False,
-                activation=layer.activation,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-            )
-        else:
-            from vllm.model_executor.layers.fused_moe import fused_experts
-
-            result = fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=layer.activation,
-                global_num_experts=layer.global_num_experts,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                expert_map=layer.expert_map,
-                quant_config=self.moe_quant_config,
-                allow_deep_gemm=self.allow_deep_gemm,
-                allow_cutlass_block_scaled_grouped_gemm=(
-                    self.allow_cutlass_block_scaled_grouped_gemm
-                ),
-            )
+        assert self.kernel is not None
+        result = self.kernel(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights,
+            topk_ids,
+            inplace=self.use_inplace,
+            activation=layer.activation,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+        )
 
-        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
-            assert not isinstance(result, tuple), (
-                "Shared + zero experts are mutually exclusive not yet supported"
-            )
-            return result, zero_expert_result
-        else:
-            return result
+        return result
 
 
 class Fp8OnlineMoEMethod(Fp8MoEMethod):
@@ -1322,7 +1090,6 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
         assert not quant_config.is_checkpoint_fp8_serialized
         assert quant_config.activation_scheme == "dynamic"
         assert quant_config.weight_block_size is None
-        assert self.flashinfer_moe_backend is None
 
     def create_weights(
         self,
@@ -1348,13 +1115,15 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
         new_extra_weight_attrs = extra_weight_attrs
 
         def patched_weight_loader(param, loaded_weight, *args, **kwargs):
-            # load the current weight chunk
-            res = weight_loader(param, loaded_weight, *args, **kwargs)  # type: ignore[misc]
-
             # add a counter to track how many elements we have updated
             if not hasattr(layer, "_loaded_numel"):
                 layer._loaded_numel = 0
-            layer._loaded_numel += loaded_weight.numel()
+
+            # load the current weight chunk
+            copy_numel_counter = CopyNumelCounter()
+            with copy_numel_counter:
+                res = weight_loader(param, loaded_weight, *args, **kwargs)  # type: ignore[misc]
+            layer._loaded_numel += copy_numel_counter.copied_numel
 
             # if we have loaded all of the elements, call
             # process_weights_after_loading
@@ -1409,47 +1178,41 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
         )
         layer.register_parameter("w13_weight_scale", w13_weight_scale)
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
         layer.w13_input_scale = None
         layer.w2_input_scale = None
 
-        self.rocm_aiter_moe_enabled = False
-
     def process_weights_after_loading(self, layer: Module) -> None:
         if getattr(layer, "_already_called_process_weights_after_loading", False):
             return
 
-        # Lazy import to avoid importing triton too early.
-        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
-
         # If checkpoint is fp16, quantize in place.
         fp8_dtype = current_platform.fp8_dtype()
-        w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
-        w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
+        w13 = torch.empty_like(layer.w13_weight, dtype=fp8_dtype)
+        w2 = torch.empty_like(layer.w2_weight, dtype=fp8_dtype)
+        w13_scale = layer.w13_weight_scale
+        w2_scale = layer.w2_weight_scale
 
         for expert in range(layer.local_num_experts):
-            w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
-                ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
+            w13[expert, :, :], w13_scale[expert] = ops.scaled_fp8_quant(
+                layer.w13_weight[expert, :, :]
             )
-            w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
-                ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
+            w2[expert, :, :], w2_scale[expert] = ops.scaled_fp8_quant(
+                layer.w2_weight[expert, :, :]
             )
-        replace_parameter(layer, "w13_weight", w13_weight)
-        replace_parameter(layer, "w2_weight", w2_weight)
 
-        # Reshuffle weights for AITER if needed.
-        if self.rocm_aiter_moe_enabled:
-            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
-                layer.w13_weight, layer.w2_weight
-            )
-            replace_parameter(layer, "w13_weight", shuffled_w13)
-            replace_parameter(layer, "w2_weight", shuffled_w2)
-
-        # Rushuffle weights for MARLIN if needed.
-        if self.use_marlin:
-            prepare_moe_fp8_layer_for_marlin(
-                layer, False, input_dtype=self.marlin_input_dtype
-            )
+        # Shuffle weights to runtime format and setup kernel.
+        self._setup_kernel(
+            layer,
+            w13,
+            w2,
+            w13_scale,
+            w2_scale,
+            layer.w13_input_scale,
+            layer.w2_input_scale,
+        )
 
 
 class Fp8KVCacheMethod(BaseKVCacheMethod):
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 9dd734f2fea6a40665105475d68f3cf385914887..1c03e5243a8566828fbcec4840b168f62a5ce844 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -16,7 +16,11 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE,
+    FusedMoEMethodBase,
+)
 from vllm.model_executor.layers.linear import (
     LinearBase,
     LinearMethodBase,
@@ -629,6 +633,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
@@ -639,7 +644,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
                 "fused GGUF MoE method."
             )
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 6e5dcfe59b2f909ef19a3f10261684be76c156bd..68a2c375e3539f7979e89a9616c05bf4f71caf52 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoEMethodBase,
@@ -181,7 +182,7 @@ class GPTQMarlinConfig(QuantizationConfig):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def get_config_filenames(cls) -> list[str]:
@@ -276,7 +277,7 @@ class GPTQMarlinConfig(QuantizationConfig):
         sym = quant_config.get("sym")
         desc_act = quant_config.get("desc_act")
 
-        if not current_platform.is_cuda():
+        if not (current_platform.is_cuda() or current_platform.is_cpu()):
             return False
 
         if quant_method != "gptq":
@@ -895,12 +896,13 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert layer.activation == "silu", "Only SiLU activation is supported."
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index a5db086fb47294f255fc22f69ac5677ddd90ac17..c1a901c37a0ba8f4ab294abbe4e8cd3e1eba7f43 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -7,25 +7,28 @@ import torch.nn.functional as F
 from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
+    get_fp8_min_max,
+    group_broadcast,
+)
 from vllm.platforms import current_platform
 
-# Using the default value (240.0) from pytorch will cause accuracy
-# issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm.
 _FP8_DTYPE = current_platform.fp8_dtype()
-_FP8_FINFO = torch.finfo(_FP8_DTYPE)
-_FP8_MAX = 224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.max
-_FP8_MIN = -224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.min
+_FP8_MIN, _FP8_MAX = get_fp8_min_max()
 _FP8_MIN_SCALING_FACTOR = 1.0 / (_FP8_MAX * 512.0)
 
 
+# --8<-- [start:quant_fp8]
 @CustomOp.register("quant_fp8")
 class QuantFP8(CustomOp):
     """
-    Quantize input tensor to FP8 (per-tensor, per-token, or per-group).
+    Quantize input tensor to FP8 (per-tensor, per-token, per-channel, or per-group).
     This CustomOp supports both static and dynamic quantization.
     """
 
+    # --8<-- [end:quant_fp8]
+
     def __init__(
         self,
         static: bool,
@@ -51,18 +54,18 @@ class QuantFP8(CustomOp):
         self.column_major_scales = column_major_scales
         self.use_ue8m0 = use_ue8m0
 
-        self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled()
+        self.use_aiter = rocm_aiter_ops.is_linear_fp8_enabled()
 
         self.is_group_quant = group_shape.is_per_group()
         if self.is_group_quant:
-            assert not static, "Group quantization only supports dynamic mode"
             self.group_size = group_shape.col
         else:
-            assert group_shape in {GroupShape.PER_TOKEN, GroupShape.PER_TENSOR}
-            assert not static or group_shape == GroupShape.PER_TENSOR, (
-                "Only per-tensor scales supported for static quantization."
-            )
             self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN
+            if not static:
+                assert group_shape in (GroupShape.PER_TOKEN, GroupShape.PER_TENSOR), (
+                    "Only per-token or per-tensor scales are supported for dynamic "
+                    "non-group quantization."
+                )
 
     def forward_cuda(
         self,
@@ -70,8 +73,8 @@ class QuantFP8(CustomOp):
         scale: torch.Tensor | None = None,
         scale_ub: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        if self.is_group_quant:
-            assert scale is None, "Group quantization is always dynamic"
+        if self.is_group_quant and not self.static:
+            assert scale is None, "Dynamic group quantization does not use scale"
             from vllm.model_executor.layers.quantization.utils import fp8_utils
 
             return fp8_utils.per_token_group_quant_fp8(
@@ -88,12 +91,16 @@ class QuantFP8(CustomOp):
             and self.group_shape == GroupShape.PER_TOKEN
             and scale_ub.numel() == 1
         )
+
         return ops.scaled_fp8_quant(
             x,
             scale,
             num_token_padding=self.num_token_padding,
             scale_ub=scale_ub,
             use_per_token_if_dynamic=self.use_per_token_if_dynamic,
+            group_shape=(self.group_shape.row, self.group_shape.col)
+            if self.static
+            else None,
         )
 
     def forward_hip(
@@ -129,8 +136,8 @@ class QuantFP8(CustomOp):
         scale: torch.Tensor | None = None,
         scale_ub: torch.Tensor | None = None,
     ):
-        if self.is_group_quant:
-            assert scale is None, "Group quantization is always dynamic"
+        if self.is_group_quant and not self.static:
+            assert scale is None, "Dynamic group quantization does not use scale"
             return self._quantize_group_native(x)
 
         assert (scale is not None) == self.static
@@ -153,7 +160,10 @@ class QuantFP8(CustomOp):
 
         # Even for dynamic per-token scales,
         # reciprocal performs slightly better than division
-        out = x.to(torch.float32) * scale.reciprocal()
+        out = (
+            x.to(torch.float32)
+            * group_broadcast(scale.to(torch.float32), x.shape[-2:]).reciprocal()
+        )
         out = out.clamp(_FP8_MIN, _FP8_MAX).to(_FP8_DTYPE)
 
         # This currently generates an extra Triton kernel in compilation.
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index f33ee43727f1926344311a24c8de83d4b0761f04..10cc1fa61e3248899901cbb3f411cbaa04b4722b 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -6,14 +6,12 @@ from typing import Any, Optional
 import torch
 from packaging import version
 from torch.nn import Module
-from torch.nn.parameter import Parameter
 
 from vllm._ipex_ops import ipex_ops as ops
-from vllm.model_executor.layers.fused_moe import (
-    FusedMoEMethodBase,
-    FusedMoeWeightScaleSupported,
-)
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.fused_moe_router import (
+    FusedMoERouter,
+)
 from vllm.model_executor.layers.linear import (
     LinearBase,
     LinearMethodBase,
@@ -24,14 +22,14 @@ from vllm.model_executor.layers.quantization import (
     QuantizationMethods,
 )
 from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
-from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
+from vllm.model_executor.layers.quantization.fp8 import (
+    Fp8Config,
+    Fp8LinearMethod,
+    Fp8OnlineMoEMethod,
+)
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    maybe_create_device_identity,
-)
-from vllm.model_executor.parameter import ModelWeightParameter
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.utils import replace_parameter
 from vllm.platforms import current_platform
 
 MIN_IPEX_VERSION = "2.6.0"
@@ -341,12 +339,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
         layer.register_parameter("weight", weight)
 
     def process_weights_after_loading(self, layer: Module) -> None:
+        if getattr(layer, "_already_called_process_weights_after_loading", False):
+            return
         # If checkpoint not serialized fp8, quantize the weights.
         if not self.quant_config.is_checkpoint_fp8_serialized:
             qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
             # Update the layer with the new values.
-            layer.weight = Parameter(qweight, requires_grad=False)
-            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+            replace_parameter(layer, "weight", qweight.data)
+            replace_parameter(layer, "weight_scale", weight_scale.data)
             layer.input_scale = None
 
     def apply(
@@ -363,69 +363,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
         return output
 
 
-class XPUFp8MoEMethod(FusedMoEMethodBase):
+class XPUFp8MoEMethod(Fp8OnlineMoEMethod):
     def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
-        super().__init__(layer.moe_config)
+        super().__init__(quant_config, layer)
         self.quant_config = quant_config
 
-    def create_weights(
-        self,
-        layer: Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        layer.intermediate_size_per_partition = intermediate_size_per_partition
-        layer.hidden_size = hidden_size
-        layer.num_experts = num_experts
-        layer.orig_dtype = params_dtype
-        layer.weight_block_size = None
-        # WEIGHTS
-        w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # Allocate 2 scales for w1 and w3 respectively.
-        # They will be combined to a single scale after weight loading.
-        w13_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
-        )
-        w2_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
-        )
-        # INPUT_SCALES
-        layer.w13_input_scale = None
-        layer.w2_input_scale = None
-
     def process_weights_after_loading(self, layer: Module) -> None:
+        if getattr(layer, "_already_called_process_weights_after_loading", False):
+            return
         if not self.quant_config.is_checkpoint_fp8_serialized:
             fp8_dtype = current_platform.fp8_dtype()
             w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
@@ -448,8 +393,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
                 w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
                     ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
                 )
-            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            replace_parameter(layer, "w13_weight", w13_weight)
+            replace_parameter(layer, "w2_weight", w2_weight)
+
         import intel_extension_for_pytorch as ipex
 
         ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
@@ -472,6 +418,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: torch.nn.Module,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor:
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
index c4160157cd628869219283ff2d4b6109dd457dea..b64218dce7f4f60f89f54622f4307bb8e8d85792 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -11,6 +11,9 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision.bitblas imp
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.conch import (  # noqa: E501
     ConchLinearKernel,
 )
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.cpu import (  # noqa: E501
+    CPUWNA16LinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.cutlass import (  # noqa: E501
     CutlassW4A8LinearKernel,
 )
@@ -46,6 +49,7 @@ _POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [
     ConchLinearKernel,
     ExllamaLinearKernel,
     XPUwNa16LinearKernel,
+    CPUWNA16LinearKernel,
 ]
 
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a9d7c3723eee368f283f14c45fd7ad19cf1dd03
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_quantized_values_into_int32,
+    unpack_quantized_values_into_int32,
+)
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+
+_CPUWNA16_SUPPORTED_QUANT_TYPES = (scalar_types.uint4, scalar_types.uint4b8)
+
+
+class CPUWNA16LinearKernel(MPLinearKernel):
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return -1
+
+    @classmethod
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
+        if not current_platform.is_cpu():
+            return False, "CPUWNA16 only supported on CPU"
+
+        if c.weight_type not in _CPUWNA16_SUPPORTED_QUANT_TYPES:
+            return (
+                False,
+                f"Quant type ({c.weight_type}) not supported by "
+                "CPUWNA16, supported types are: "
+                f"{_CPUWNA16_SUPPORTED_QUANT_TYPES}",
+            )
+
+        if c.group_size != -1 and c.group_size % 2 != 0:
+            return (
+                False,
+                f"Group size ({c.group_size}) not supported by "
+                "CPUWNA16, supported group sizes are multiples of 2",
+            )
+
+        if c.partition_weight_shape[0] % 32 != 0:
+            return (
+                False,
+                f"Input size ({c.partition_weight_shape[0]}) not supported by "
+                "CPUWNA16, supported sizes are multiples of 32",
+            )
+
+        if c.partition_weight_shape[1] % 32 != 0:
+            return (
+                False,
+                f"Output size ({c.partition_weight_shape[1]}) not supported by "
+                "CPUWNA16, supported sizes are multiples of 32",
+            )
+
+        return True, None
+
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
+    #  `weight_zp`     is: {input_dim = 0, output_dim = 1, packed_dim = 1}
+    def _process_gptq_weights(self, layer: torch.nn.Module):
+        packed_weight = layer.qweight.data
+        bits = self.config.weight_type.mantissa
+        pack_factor = 32 // bits
+        p_w_k, p_w_n = packed_weight.size()
+        input_size = p_w_k * pack_factor
+        output_size = p_w_n
+        isa_hint = _get_isa_hint(layer.scales.dtype)
+        layer.isa_hint = isa_hint
+
+        layer.qzeros = None
+        if not self.config.has_g_idx:
+            layer.g_idx = None
+
+        # convert input dim packed to output dim packed
+        weight = unpack_quantized_values_into_int32(
+            packed_weight, self.config.weight_type, 1
+        ).view(p_w_k, p_w_n, pack_factor)
+        weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous()
+        weight = pack_quantized_values_into_int32(weight, self.config.weight_type, 1)
+        # make 16 output channel as a block and transpose to the make
+        # the block contigous
+        weight = (
+            weight.view(input_size, -1, 16 // pack_factor)
+            .permute(1, 0, 2)
+            .reshape(-1, input_size * 16 // pack_factor)
+            .contiguous()
+        )
+        layer.qweight.data = weight
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        if not self.config.zero_points:
+            # GPTQ
+            self._process_gptq_weights(layer)
+        else:
+            # AWQ
+            raise NotImplementedError("AWQ is not supported in CPUWNA16LinearKernel")
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        x = ops.cpu_gemm_wna16(
+            input=x,
+            q_weight=layer.qweight,
+            scales=layer.scales,
+            zeros=layer.qzeros,
+            g_idx=layer.g_idx,
+            bias=bias,
+            pack_factor=8,  # 32 // 4
+            isa_hint=layer.isa_hint,
+        )
+        return x
+
+
+def _get_isa_hint(dtype: torch.dtype) -> str:
+    supports_amx = torch._C._cpu._is_amx_tile_supported()
+    if supports_amx and dtype in (torch.bfloat16,):
+        return "amx"
+    else:
+        return "vec"
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
index 9fba4aafb05a7c21f8277ecb73edd728cb0b852d..537a8e278a39f2fd18748a8ffd2d83e8f0e8a956 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
@@ -9,6 +9,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     pack_quantized_values_into_int32,
 )
 from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
+from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
 from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
@@ -25,6 +26,12 @@ class ExllamaLinearKernel(MPLinearKernel):
 
     @classmethod
     def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
+        if not current_platform.is_cuda_alike():
+            return (
+                False,
+                "Exllama is only supported on CUDA and ROCm",
+            )
+
         if c.has_g_idx and c.partition_weight_shape[0] != c.full_weight_shape[0]:
             return (
                 False,
@@ -104,7 +111,7 @@ class ExllamaLinearKernel(MPLinearKernel):
                 # indices
                 return torch.argsort(x).to(torch.int)
 
-            self._transform_param(layer, self.w_gidx_name, transform_w_g_idx)
+            self._transform_param(layer, self.w_gidx_name, transform_w_g_idx)  # type: ignore
         else:
             self.w_gidx_name = "g_idx"
             empty_g_idx = torch.nn.Parameter(
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
index faaa45b861de7a9d053e5d57ba822b4d6bb2fe05..eb14f9ec378c4612a3791de81b944dbe9486b18a 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
@@ -30,7 +30,7 @@ from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 class MarlinLinearKernel(MPLinearKernel):
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
index 20d050d387d49d1fc9bf30a8144343d7e7a64b7c..4ccc4182367a657e337690dbdece16f3467a36c3 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -19,9 +19,6 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKer
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
     TritonScaledMMLinearKernel,
 )
-from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
-    XLAScaledMMLinearKernel,
-)
 from vllm.platforms import PlatformEnum, current_platform
 
 # in priority/performance order (when available)
@@ -29,7 +26,6 @@ _POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = {
     PlatformEnum.CPU: [CPUScaledMMLinearKernel],
     PlatformEnum.CUDA: [CutlassScaledMMLinearKernel, TritonScaledMMLinearKernel],
     PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel],
-    PlatformEnum.TPU: [XLAScaledMMLinearKernel],
 }
 
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
index 760f1f7f79576b1b96ec0b2e3b8b90066e8f14c1..e4286f91bcbeccfea761b20c15951c07dd0703df 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
@@ -9,6 +9,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm
     triton_scaled_mm,
 )
 from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise,
+)
 from vllm.platforms import current_platform
 
 from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig
@@ -37,6 +40,20 @@ class TritonScaledMMLinearKernel(ScaledMMLinearKernel):
             torch.nn.Parameter(weight.t().data, requires_grad=False),
         )
 
+        # WEIGHT SCALE
+        # Triton kernel supports only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths)
+        replace_parameter(
+            layer,
+            self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False),
+        )
+
         # INPUT SCALE
         if self.config.is_static_input_scheme:
             input_scale = getattr(layer, self.i_s_name)
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
deleted file mode 100644
index 0be858c51993d2304b804b4469cbd61e7b3346ab..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-import torch
-from functorch.experimental.control_flow import cond  # noqa: F401
-
-from vllm.model_executor.layers.quantization.utils import replace_parameter
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    convert_to_channelwise,
-)
-from vllm.platforms import current_platform
-
-from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig
-
-
-class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
-    @classmethod
-    def is_supported(
-        cls, compute_capability: int | None = None
-    ) -> tuple[bool, str | None]:
-        if not current_platform.is_tpu():
-            return False, "Requires TPU."
-        return True, None
-
-    @classmethod
-    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
-        if not current_platform.is_tpu():
-            return False, "ScaledMMXLA requires running on TPU."
-
-        if c.is_static_input_scheme:
-            return False, "ScaledMMXLA requires dynamic activation scales."
-
-        if not c.input_symmetric:
-            return False, "ScaledMMXLA requires symmetric activation scales."
-
-        if not c.is_channelwise:
-            return False, "ScaledMMXLA requires channelwise weight scales"
-
-        return True, None
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        # WEIGHT
-        # [out, in] (different than cutlass_scaled_mm)
-        weight = getattr(layer, self.w_q_name)
-        replace_parameter(
-            layer, self.w_q_name, torch.nn.Parameter(weight.data, requires_grad=False)
-        )
-
-        # WEIGHT SCALE
-        # XLA kernels support only per-tensor and per-channel.
-        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
-        # scales being passed to the kernel), convert to the per-channel case.
-        is_fused_module = len(layer.logical_widths) > 1
-        weight_scale = getattr(layer, self.w_s_name)
-        if is_fused_module and not self.config.is_channelwise:
-            weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths)
-
-        # [out_channel,] (different than cutlass_scaled_mm)
-        weight_scale = weight_scale.squeeze(-1)
-        replace_parameter(
-            layer,
-            self.w_s_name,
-            torch.nn.Parameter(weight_scale.data, requires_grad=False),
-        )
-
-        # Only support symmetric dynamic activation quantization.
-        setattr(layer, self.i_s_name, None)
-        setattr(layer, self.i_zp_name, None)
-        setattr(layer, self.azp_adj_name, None)
-
-        # Filter warning for cond usage in apply_weights. It is okay
-        # to specialize the graph since bias is not dynamic.
-        warnings.filterwarnings(
-            "ignore",
-            message="Pred is a Python constant. When used with torch.cond, it specializes on one of the branches.",  # noqa: E501
-        )
-
-    def no_add_bias(self, x: torch.Tensor, bias: torch.Tensor | None):
-        return x
-
-    def add_bias(self, x: torch.Tensor, bias: torch.Tensor | None):
-        return x + bias
-
-    def apply_weights(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        w_q, w_s, _, _, _ = self._get_weight_params(layer)
-
-        # Required to register custom ops.
-        import torch_xla.experimental.custom_kernel  # noqa: F401
-
-        out = torch.ops.xla.quantized_matmul_int8(
-            x,
-            w_q,
-            w_s,
-            quantize_activation=True,
-        )
-
-        # Explicitly capture control flow to make dynamo happy.
-        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
-        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 030d85080a34d2ec4a55f65dd099b2bc56e691da..a646012ddd3aae85ee391fda5391cbd2d9abf350 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -14,16 +14,31 @@ from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention.layer import Attention
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
     FusedMoEQuantConfig,
-    fp8_w8a8_moe_quant_config,
-    nvfp4_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoEMethodBase,
     FusedMoeWeightScaleSupported,
 )
+from vllm.model_executor.layers.fused_moe.oracle.fp8 import (
+    Fp8MoeBackend,
+    convert_to_fp8_moe_kernel_format,
+    make_fp8_moe_kernel,
+    make_fp8_moe_quant_config,
+    select_fp8_moe_backend,
+)
+from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import (
+    FLASHINFER_NVFP4_MOE_BACKENDS,
+    NvFp4MoeBackend,
+    convert_to_nvfp4_moe_kernel_format,
+    is_global_sf_supported_for_nvfp4_backend,
+    make_nvfp4_moe_kernel,
+    make_nvfp4_moe_quant_config,
+    select_nvfp4_moe_backend,
+)
 from vllm.model_executor.layers.linear import (
     LinearBase,
     LinearMethodBase,
@@ -39,21 +54,17 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
     build_flashinfer_fp4_cutlass_moe_prepare_finalize,
     flashinfer_trtllm_fp4_moe,
     flashinfer_trtllm_fp4_routed_moe,
-    prepare_static_weights_for_trtllm_fp4_moe,
-    reorder_w1w3_to_w3w1,
     select_nvfp4_gemm_impl,
 )
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    FlashinferMoeBackend,
-    apply_flashinfer_per_tensor_scale_fp8,
+    apply_fi_trtllm_fp8_per_tensor_moe,
     build_flashinfer_fp8_cutlass_moe_prepare_finalize,
-    flashinfer_cutlass_moe_fp8,
-    get_flashinfer_moe_backend,
-    is_flashinfer_supporting_global_sf,
-    register_moe_scaling_factors,
-    rotate_flashinfer_fp8_moe_weights,
     select_cutlass_fp8_gemm_impl,
-    swap_w13_to_w31,
+)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    W8A8BlockFp8LinearOp,
+    process_fp8_input_tensor_strategy_moe,
+    process_fp8_weight_tensor_strategy_moe,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     get_marlin_input_dtype,
@@ -62,7 +73,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
     apply_fp4_marlin_linear,
     is_fp4_marlin_supported,
     prepare_fp4_layer_for_marlin,
-    prepare_moe_fp4_layer_for_marlin,
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
@@ -72,23 +82,36 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     Fp8LinearOp,
+    cutlass_block_fp8_supported,
     requantize_with_max_scale,
 )
-from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter
-from vllm.scalar_type import scalar_types
+from vllm.model_executor.parameter import (
+    BlockQuantScaleParameter,
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from vllm.model_executor.utils import replace_parameter
 from vllm.utils.flashinfer import (
     flashinfer_scaled_fp4_mm,
     has_flashinfer,
-    has_flashinfer_moe,
 )
-from vllm.utils.math_utils import round_up
 
 if TYPE_CHECKING:
     from vllm.model_executor.models.utils import WeightsMapper
 
 logger = init_logger(__name__)
 
-QUANT_ALGOS = ["FP8", "NVFP4"]
+QUANT_ALGOS = [
+    # FP8 (per-tensor weight + optional static activation scale).
+    "FP8",
+    # FP8 per-channel weight scale + per-token activation scale.
+    "FP8_PER_CHANNEL_PER_TOKEN",
+    # FP8 per-block weight-only (ModelOpt may emit this as lowercase).
+    "FP8_PB_WO",
+    # FP4
+    "NVFP4",
+]
 KV_CACHE_QUANT_ALGOS = ["FP8"]
 
 
@@ -179,7 +202,9 @@ class ModelOptQuantConfigBase(QuantizationConfig):
                 quant_method.marlin_input_dtype = get_marlin_input_dtype(prefix)
             return quant_method
         elif isinstance(layer, FusedMoE):
-            quant_method = self.FusedMoEMethodCls(quant_config=self, layer=layer)
+            quant_method = self.FusedMoEMethodCls(
+                quant_config=self, moe_config=layer.moe_config
+            )
             if getattr(quant_method, "backend", "") == "marlin":
                 quant_method.marlin_input_dtype = get_marlin_input_dtype(prefix)
             return quant_method
@@ -255,6 +280,9 @@ class ModelOptQuantConfigBase(QuantizationConfig):
         if not quant_method:
             raise ValueError("Missing 'quant_algo' in quantization config")
 
+        # Normalize quant_algo for robust matching (ModelOpt may emit lowercase).
+        quant_method = str(quant_method).upper()
+
         if kv_cache_quant_method is None:
             # No KV cache quantization, keep this branch just to have this comment
             pass
@@ -263,6 +291,8 @@ class ModelOptQuantConfigBase(QuantizationConfig):
                 f"kv_cache_quant_algo must be a string, got "
                 f"{type(kv_cache_quant_method)}"
             )
+        else:
+            kv_cache_quant_method = kv_cache_quant_method.upper()
 
         if not isinstance(exclude_modules, list):
             raise ValueError(
@@ -302,17 +332,34 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):
 
     def __init__(
         self,
+        quant_method: str,
         is_checkpoint_fp8_serialized: bool,
         kv_cache_quant_method: str | None,
         exclude_modules: list[str],
     ) -> None:
         super().__init__(exclude_modules)
+        self.quant_method = quant_method
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         self.kv_cache_quant_method = kv_cache_quant_method
         if is_checkpoint_fp8_serialized:
             logger.warning(
-                "Detected ModelOpt fp8 checkpoint. Please note that"
-                " the format is experimental and could change."
+                "Detected ModelOpt fp8 checkpoint (quant_algo=%s). Please note "
+                "that the format is experimental and could change.",
+                quant_method,
+            )
+
+        # Select LinearMethod implementation based on quant_algo.
+        if self.quant_method == "FP8":
+            self.LinearMethodCls = ModelOptFp8LinearMethod
+        elif self.quant_method == "FP8_PER_CHANNEL_PER_TOKEN":
+            self.LinearMethodCls = ModelOptFp8PcPtLinearMethod
+        elif self.quant_method == "FP8_PB_WO":
+            self.LinearMethodCls = ModelOptFp8PbWoLinearMethod
+        else:
+            raise ValueError(
+                "Unsupported ModelOpt FP8 quant_algo for vLLM: "
+                f"{self.quant_method}. Supported: FP8 / "
+                "FP8_PER_CHANNEL_PER_TOKEN / FP8_PB_WO."
             )
 
     def get_name(self) -> QuantizationMethods:
@@ -346,13 +393,13 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):
         if "quantization" in hf_quant_cfg:
             quant_config = hf_quant_cfg["quantization"]
             if isinstance(quant_config, dict):
-                quant_algo = quant_config.get("quant_algo", "")
-                if "FP8" in quant_algo:
+                quant_algo = str(quant_config.get("quant_algo", ""))
+                if "FP8" in quant_algo.upper():
                     return "modelopt"
         else:
             # Check for compressed-tensors style config with specific quant_algo
-            quant_algo = hf_quant_cfg.get("quant_algo", "")
-            if isinstance(quant_algo, str) and "FP8" in quant_algo:
+            quant_algo = str(hf_quant_cfg.get("quant_algo", ""))
+            if "FP8" in quant_algo.upper():
                 return "modelopt"
 
         return None
@@ -369,7 +416,12 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):
     ) -> "ModelOptFp8Config":
         is_checkpoint_fp8_serialized = "FP8" in quant_method
 
-        return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules)
+        return cls(
+            quant_method,
+            is_checkpoint_fp8_serialized,
+            kv_cache_quant_method,
+            exclude_modules,
+        )
 
 
 class ModelOptFp8LinearMethod(LinearMethodBase):
@@ -464,6 +516,203 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
         )
 
 
+class ModelOptFp8PcPtLinearMethod(LinearMethodBase):
+    """Linear method for ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoints.
+
+    Expected checkpoint structure (per Linear):
+    - weight: fp8-e4m3fn, shape [out, in]
+    - weight_scale: fp32, shape [out] (per-output-channel)
+    - no input_scale (activations are dynamically quantized per-token)
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config) -> None:
+        self.quant_config = quant_config
+        self.fp8_linear = Fp8LinearOp(
+            act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size
+
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            raise ValueError(
+                "FP8_PER_CHANNEL_PER_TOKEN currently only supports "
+                "FP8-serialized checkpoints."
+            )
+
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty(output_size_per_partition, dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        weight_scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        layer.weight = Parameter(layer.weight.t(), requires_grad=False)
+        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return self.fp8_linear.apply(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            bias=bias,
+        )
+
+
+class ModelOptFp8PbWoLinearMethod(LinearMethodBase):
+    """Linear method for ModelOpt FP8_PB_WO checkpoints.
+
+    ModelOpt exports `weight_scale` as a 4D tensor:
+      [out_blk, 1, in_blk, 1]
+    where block size is typically 128 for both dims.
+
+    vLLM executes it as FP8 GEMM with *dynamic per-token* activation quant.
+    """
+
+    _WEIGHT_BLOCK_SIZE: tuple[int, int] = (128, 128)
+
+    def __init__(self, quant_config: ModelOptFp8Config) -> None:
+        self.quant_config = quant_config
+        block_n, block_k = self._WEIGHT_BLOCK_SIZE
+        self.weight_block_size = list(self._WEIGHT_BLOCK_SIZE)
+        self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
+            weight_group_shape=GroupShape(block_n, block_k),
+            act_quant_group_shape=GroupShape(1, block_k),
+            cutlass_block_fp8_supported=cutlass_block_fp8_supported(),
+            use_aiter_and_is_supported=False,
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size
+
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            raise ValueError(
+                "FP8_PB_WO currently only supports FP8-serialized checkpoints."
+            )
+
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+
+        # Expose block size so the v2 weight loaders can translate offsets from
+        # element-space -> block-space for BlockQuantScaleParameter.
+        layer.weight_block_size = self.weight_block_size
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        block_n, block_k = self._WEIGHT_BLOCK_SIZE
+        if output_size_per_partition % block_n != 0:
+            raise ValueError(
+                "ModelOpt FP8_PB_WO requires out_features divisible by "
+                f"{block_n}, got {output_size_per_partition}."
+            )
+        if input_size_per_partition % block_k != 0:
+            raise ValueError(
+                "ModelOpt FP8_PB_WO requires in_features divisible by "
+                f"{block_k}, got {input_size_per_partition}."
+            )
+
+        out_blks = output_size_per_partition // block_n
+        in_blks = input_size_per_partition // block_k
+
+        # Match ModelOpt's exported shape so weight loading works without a
+        # custom loader: [out_blk, 1, in_blk, 1]
+        weight_scale = BlockQuantScaleParameter(
+            data=torch.empty((out_blks, 1, in_blks, 1), dtype=torch.float32),
+            input_dim=2,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        weight_scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # Keep weight in [out, in] layout for W8A8BlockFp8LinearOp.
+        layer.weight = Parameter(layer.weight.data, requires_grad=False)
+
+        scale = layer.weight_scale
+        if scale.dim() == 4:
+            # [out_blk, 1, in_blk, 1] -> [out_blk, in_blk]
+            scale = scale.squeeze(1).squeeze(-1)
+        elif scale.dim() != 2:
+            raise ValueError(
+                "Unexpected ModelOpt FP8_PB_WO weight_scale shape: "
+                f"{tuple(scale.shape)}."
+            )
+
+        layer.weight_scale = Parameter(scale.contiguous(), requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return self.w8a8_block_fp8_linear.apply(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            bias=bias,
+        )
+
+
 class ModelOptFp8MoEMethod(FusedMoEMethodBase):
     """MoE method for ModelOpt FP8.
     Supports loading FP8 checkpoints with static weight scale and
@@ -475,48 +724,37 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
     def __init__(
         self,
         quant_config: ModelOptFp8Config,
-        layer: FusedMoE,
+        moe_config: FusedMoEConfig,
     ) -> None:
-        super().__init__(layer.moe_config)
-        self.layer = layer
+        super().__init__(moe_config)
         self.quant_config = quant_config
-        from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-            cutlass_fp8_supported,
+        assert self.quant_config.is_checkpoint_fp8_serialized
+        self.fp8_backend = select_fp8_moe_backend(
+            block_quant=False,
+            tp_size=moe_config.moe_parallel_config.tp_size,
+            with_lora_support=self.moe.is_lora_enabled,
         )
-
-        self.cutlass_fp8_supported = cutlass_fp8_supported()
-        self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
-        if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe():
-            self.flashinfer_moe_backend = get_flashinfer_moe_backend()
-            if (
-                self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-                and not self.moe.is_act_and_mul
-            ):
-                logger.info_once(
-                    "Non-gated MoE is not supported for min-latency mode,"
-                    "falling back to high-throughput mode"
-                )
-                self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS
-
-            logger.info_once(
-                f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
-            )
+        self.kernel: mk.FusedMoEModularKernel | None = None
 
     def maybe_make_prepare_finalize(
         self,
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> mk.FusedMoEPrepareAndFinalize | None:
         # TRT LLM not supported with all2all yet.
-        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
+        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
             return None
-        elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
+        elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
+            # TP case: avoid convert to ModularKernelMethod - to be refactored.
+            if self.moe.dp_size == 1:
+                return None
+
             prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize(
-                self.moe
+                self.moe,
+                use_deepseek_fp8_block_scale=False,
             )
             logger.debug_once("%s", prepare_finalize.__class__.__name__)
             return prepare_finalize
-        else:
-            return super().maybe_make_prepare_finalize(routing_tables)
+        return super().maybe_make_prepare_finalize(routing_tables)
 
     def select_gemm_impl(
         self,
@@ -540,6 +778,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        layer.orig_dtype = params_dtype
+        layer.num_experts = num_experts
+
         # Use FP8 dtype if checkpoint is serialized
         weight_dtype = (
             torch.float8_e4m3fn
@@ -579,211 +820,141 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         )
         layer.register_parameter("w2_weight", w2_weight)
 
-        if self.quant_config.is_checkpoint_fp8_serialized:
-            # WEIGHT SCALES - Per-tensor scaling for ModelOpts
-            # For gated MoE, allocate 2 scales for w1 and w3 respectively.
-            # They will be combined to a single scale after weight loading.
-            # For non-gated MoE, allocate 1 scale for w13.
-            if self.moe.is_act_and_mul:
-                w13_weight_scale_shape = (num_experts, 2)
-            else:
-                w13_weight_scale_shape = (num_experts, 1)
-            w13_weight_scale = PerTensorScaleParameter(
-                data=torch.full(
-                    w13_weight_scale_shape,
-                    1.0,
-                    dtype=torch.float32,
-                ),
-                weight_loader=weight_loader,
-            )
-            w2_weight_scale = PerTensorScaleParameter(
-                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
-                weight_loader=weight_loader,
-            )
-            layer.register_parameter("w13_weight_scale", w13_weight_scale)
-            layer.register_parameter("w2_weight_scale", w2_weight_scale)
-
-            # Set weight loader attributes for scales
-            extra_weight_attrs.update(
-                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
-            )
-
-            # INPUT SCALES - Per-tensor scaling for ModelOpt
-            w13_input_scale = PerTensorScaleParameter(
-                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
-                weight_loader=weight_loader,
-            )
-            w2_input_scale = PerTensorScaleParameter(
-                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
-                weight_loader=weight_loader,
-            )
-            layer.register_parameter("w13_input_scale", w13_input_scale)
-            layer.register_parameter("w2_input_scale", w2_input_scale)
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        """Process FP8 MoE weights after loading from serialized checkpoint.
-        Only supports pre-quantized checkpoints with FP8 weights and scales.
-        """
-
-        if self.flashinfer_moe_backend is not None:
-            self._maybe_pad_intermediate_for_flashinfer(layer)
-
-        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
-        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+        # WEIGHT SCALES - Per-tensor scaling for ModelOpts
+        # For gated MoE, allocate 2 scales for w1 and w3 respectively.
+        # They will be combined to a single scale after weight loading.
+        # For non-gated MoE, allocate 1 scale for w13.
+        w13_weight_scale = PerTensorScaleParameter(
+            data=torch.full(
+                (num_experts, 2 if self.moe.is_act_and_mul else 1),
+                1.0,
+                dtype=torch.float32,
+            ),
+            weight_loader=weight_loader,
+        )
+        w2_weight_scale = PerTensorScaleParameter(
+            data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
 
-        from vllm._custom_ops import scaled_fp8_quant
-        from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-            per_tensor_dequantize,
+        # INPUT SCALES - Per-tensor scaling for ModelOpt
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+            weight_loader=weight_loader,
         )
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+        layer.register_parameter("w2_input_scale", w2_input_scale)
 
-        # Handle scale parameters
-        if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None:
-            # Fp8 moe kernel needs single weight scale for w13 per expert.
-            # We take the max of the w1 and w3 scales
-            # then dequant and requant each expert.
-            if (
-                layer.w13_weight_scale.dim() == 2
-                and layer.w13_weight_scale.shape[1] == 2
-            ):
-                assert self.moe.is_act_and_mul, (
-                    "w13_weight_scale should have 2 elements per expert "
-                    "only for gated MoE"
-                )
-                # Get the maximum scale across w1 and w3 for each expert
-                max_w13_scales = layer.w13_weight_scale.max(dim=1).values
-
-                # Requantize each expert's weights using the combined scale
-                # w13_weight (num_experts, 2 * intermediate_size, hidden_size)
-                # where the first intermediate_size rows are w1, the next are w3
-                intermediate_size = layer.w13_weight.shape[1] // 2
-                for expert_id in range(layer.w13_weight.shape[0]):
-                    start = 0
-                    for shard_id in range(2):  # w1 and w3
-                        # Dequantize using the original scale for this shard
-                        dq_weight = per_tensor_dequantize(
-                            layer.w13_weight[expert_id][
-                                start : start + intermediate_size, :
-                            ],
-                            layer.w13_weight_scale[expert_id][shard_id],
-                        )
-                        # Requantize using the combined max scale
-
-                        (
-                            layer.w13_weight[expert_id][
-                                start : start + intermediate_size, :
-                            ],
-                            _,
-                        ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
-
-                        start += intermediate_size
-
-                # Update the scale parameter to be per-expert
-                layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False)
-            else:
-                layer.w13_weight_scale = Parameter(
-                    layer.w13_weight_scale.data, requires_grad=False
-                )
+    def _setup_kernel(
+        self,
+        layer: torch.nn.Module,
+        w13: torch.Tensor,
+        w2: torch.Tensor,
+        w13_scale: torch.Tensor,
+        w2_scale: torch.Tensor,
+        w13_input_scale: torch.Tensor,
+        w2_input_scale: torch.Tensor,
+    ):
+        w13, w2, w13_scale, w2_scale = convert_to_fp8_moe_kernel_format(
+            fp8_backend=self.fp8_backend,
+            layer=layer,
+            w13=w13,
+            w2=w2,
+            w13_scale=w13_scale,
+            w2_scale=w2_scale,
+            w13_input_scale=w13_input_scale,
+            w2_input_scale=w2_input_scale,
+        )
 
-        if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None:
-            layer.w2_weight_scale = Parameter(
-                layer.w2_weight_scale.data, requires_grad=False
-            )
-        # Input scales must be equal for each expert in fp8 MoE layers.
-        if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None:
-            layer.w13_input_scale = Parameter(
-                layer.w13_input_scale.max(), requires_grad=False
-            )
-        if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None:
-            layer.w2_input_scale = Parameter(
-                layer.w2_input_scale.max(), requires_grad=False
+        # Replace parameters with updated versions. Note that this helper
+        # function ensures the replacement is compatible with RL weight reloads.
+        replace_parameter(layer, "w13_weight", w13)
+        replace_parameter(layer, "w2_weight", w2)
+        replace_parameter(layer, "w13_weight_scale", w13_scale)
+        replace_parameter(layer, "w2_weight_scale", w2_scale)
+
+        # Setup modular kernel for TP case.
+        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+        if self.moe_quant_config:
+            self.kernel, self.use_inplace = make_fp8_moe_kernel(
+                layer=layer,
+                moe_quant_config=self.moe_quant_config,
+                moe_config=self.moe,
+                fp8_backend=self.fp8_backend,
             )
 
-        if self.flashinfer_moe_backend is not None:
-            if self.moe.is_act_and_mul:
-                layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
-            if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
-                rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
-        register_moe_scaling_factors(layer)
-
-    def _maybe_pad_intermediate_for_flashinfer(self, layer: torch.nn.Module) -> None:
-        """Pad intermediate size so FlashInfer kernels' alignment constraints hold.
-
-        Some FlashInfer FP8 MoE kernels require the (gated) intermediate size
-        used for GEMM to be divisible by a small alignment value. When this is
-        not satisfied (e.g. with certain tensor-parallel sizes), we pad the
-        gate/up and down projection weights along the intermediate dim.
-        """
-        if not hasattr(layer, "w13_weight") or not hasattr(layer, "w2_weight"):
-            return
-
-        # Current local intermediate size (per partition) is the K dimension of
-        # the down projection.
-        num_experts, hidden_size, intermediate = layer.w2_weight.shape
-
-        min_alignment = 16
-        padded_intermediate = round_up(intermediate, min_alignment)
-
-        if padded_intermediate == intermediate:
-            return
-
-        logger.info(
-            "Padding intermediate size from %d to %d for up/down projection weights.",
-            intermediate,
-            padded_intermediate,
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        w13 = layer.w13_weight
+        w2 = layer.w2_weight
+        w13_scale = layer.w13_weight_scale
+        w2_scale = layer.w2_weight_scale
+        w13_input_scale = layer.w13_input_scale
+        w2_input_scale = layer.w2_input_scale
+
+        # Per tensor kernels require single activation scale. Use the max.
+        w13_input_scale, w2_input_scale = process_fp8_input_tensor_strategy_moe(
+            w13_input_scale, w2_input_scale
+        )
+        replace_parameter(layer, "w13_input_scale", w13_input_scale)
+        replace_parameter(layer, "w2_input_scale", w2_input_scale)
+
+        # Per tensor kernels require single weight scale for w13 per expert, but
+        # on disk there is a scale for w1 and w3. Use the max to requantize.
+        shard_size = layer.intermediate_size_per_partition
+        w13, w13_scale = process_fp8_weight_tensor_strategy_moe(
+            w13,
+            w13_scale,
+            shard_size,
+            num_experts=layer.w13_weight.shape[0],
+            is_act_and_mul=self.moe.is_act_and_mul,
         )
 
-        up_mult = 2 if self.moe.is_act_and_mul else 1
-        padded_gate_up_dim = up_mult * padded_intermediate
-
-        # Pad w13 and w12 along its intermediate dimension.
-        w13 = layer.w13_weight.data
-        padded_w13 = w13.new_zeros((num_experts, padded_gate_up_dim, hidden_size))
-        padded_w13[:, : w13.shape[1], :] = w13
-        layer.w13_weight.data = padded_w13
-
-        w2 = layer.w2_weight.data
-        padded_w2 = w2.new_zeros((num_experts, hidden_size, padded_intermediate))
-        padded_w2[:, :, :intermediate] = w2
-        layer.w2_weight.data = padded_w2
-
-        if hasattr(layer, "intermediate_size_per_partition"):
-            layer.intermediate_size_per_partition = padded_intermediate
+        # Shuffle weights to runtime format and setup kernel.
+        self._setup_kernel(
+            layer, w13, w2, w13_scale, w2_scale, w13_input_scale, w2_input_scale
+        )
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
-            return None
-
-        return fp8_w8a8_moe_quant_config(
-            w1_scale=layer.w13_weight_scale,
-            g1_alphas=layer.output1_scales_gate_scalar.squeeze(),
-            w2_scale=layer.w2_weight_scale,
-            g2_alphas=layer.output2_scales_scalar.squeeze(),
-            a1_scale=layer.w13_input_scale,
-            a1_gscale=layer.w13_input_scale,
-            a2_scale=layer.w2_input_scale,
-            a2_gscale=layer.w2_input_scale_inv,
-            per_act_token_quant=False,
+        w1_scale = layer.w13_weight_scale
+        w2_scale = layer.w2_weight_scale
+        a1_scale = layer.w13_input_scale
+        a2_scale = layer.w2_input_scale
+
+        return make_fp8_moe_quant_config(
+            fp8_backend=self.fp8_backend,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
         )
 
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
+        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
             if layer.enable_eplb:
                 raise NotImplementedError(
-                    "EPLB not supported for `ModelOptFp8MoEMethod` yet."
+                    "EPLB not supported for FlashInfer TRTLLM FP8 MoE Backend."
                 )
+            # TODO(rob): this validation should happen at kernel selection
+            # time in the oracle rather than here.
             assert layer.activation == "silu", (
                 f"Expected 'silu' activation but got {layer.activation}"
             )
-
             assert not layer.renormalize
-            return apply_flashinfer_per_tensor_scale_fp8(
+            return apply_fi_trtllm_fp8_per_tensor_moe(
                 layer=layer,
                 hidden_states=x,
                 router_logits=router_logits,
@@ -796,45 +967,34 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
             )
 
         # Expert selection
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
 
-        if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
+        # TODO(rob): this validation should happen at kernel selection
+        # time in the oracle rather than here.
+        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
             assert layer.activation in ("silu", "relu2_no_mul"), (
                 "Expected activation to be in ('silu', 'relu2_no_mul'),"
                 f"but got {layer.activation}"
             )
-            return flashinfer_cutlass_moe_fp8(
-                x,
-                layer,
-                topk_weights,
-                topk_ids,
-                inplace=False,
-                activation=layer.activation,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-            )
-        else:
-            from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 
-            assert self.moe_quant_config is not None
+        assert self.kernel is not None
+        result = self.kernel(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights,
+            topk_ids,
+            inplace=self.use_inplace,
+            activation=layer.activation,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+        )
 
-            return fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=layer.activation,
-                quant_config=self.moe_quant_config,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-            )
+        return result
 
 
 ModelOptFp8Config.LinearMethodCls = ModelOptFp8LinearMethod
@@ -871,7 +1031,7 @@ class ModelOptNvFp4Config(ModelOptQuantConfigBase):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def override_quantization_method(
@@ -1171,45 +1331,37 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     def __init__(
         self,
         quant_config: ModelOptNvFp4Config,
-        layer: FusedMoE,
+        moe_config: FusedMoEConfig,
     ) -> None:
-        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (
-            detect_nvfp4_moe_support,  # noqa: E501
-        )
-
-        super().__init__(layer.moe_config)
+        super().__init__(moe_config)
         self.quant_config = quant_config
-        self.layer = layer
-        _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
-        self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
-        self.allow_flashinfer = _nvfp4.allow_flashinfer
-        self.use_marlin = _nvfp4.use_marlin
-        self.marlin_input_dtype = None
-        self.flashinfer_moe_backend = None
-        if self.allow_flashinfer:
-            self.flashinfer_moe_backend = get_flashinfer_moe_backend()
-            logger.info_once(
-                f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
-                " for ModelOptNvFp4FusedMoE."
+        self.nvfp4_backend = select_nvfp4_moe_backend()
+        # TODO: move this type of check into the oracle.
+        if (
+            not self.moe.is_act_and_mul
+            and not self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_CUTLASS
+        ):
+            raise NotImplementedError(
+                "Non-gated activations are only supported by FlashInfer "
+                "CUTLASS NvFP4 MoE backend."
             )
-        elif self.use_marlin:
-            logger.info_once("Using Marlin for ModelOptNvFp4FusedMoE.")
-        else:
-            logger.info_once("Using Cutlass for ModelOptNvFp4FusedMoE.")
+
+        self.use_global_sf = is_global_sf_supported_for_nvfp4_backend(
+            self.nvfp4_backend
+        )
+        self.kernel: mk.FusedMoEModularKernel | None = None
 
     def maybe_make_prepare_finalize(
         self,
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> mk.FusedMoEPrepareAndFinalize | None:
-        if self.use_marlin or (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
+        UNSUPPORTED = [NvFp4MoeBackend.MARLIN, NvFp4MoeBackend.FLASHINFER_TRTLLM]
+        if self.nvfp4_backend in UNSUPPORTED:
             return None
-        elif (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
-        ):
+        elif self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_CUTLASS:
+            # TP case: avoid convert to ModularKernelMethod - to be refactored.
+            if self.moe.dp_size == 1:
+                return None
             # For now, fp4 moe only works with the flashinfer dispatcher.
             prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(
                 self.moe
@@ -1228,7 +1380,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         experts = select_nvfp4_gemm_impl(
             self.moe,
             self.moe_quant_config,
-            allow_flashinfer=self.allow_flashinfer,
+            allow_flashinfer=self.nvfp4_backend in FLASHINFER_NVFP4_MOE_BACKENDS,
         )
         logger.debug_once("Using %s", experts.__class__.__name__)
         return experts
@@ -1248,11 +1400,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
-        if not self.quant_config.is_checkpoint_nvfp4_serialized:
-            raise ValueError(
-                "NVFP4 quantization was selected, "
-                " dynamic quantization is not supported."
-            )
+        assert self.quant_config.is_checkpoint_nvfp4_serialized
 
         layer.num_experts = num_experts
         layer.params_dtype = params_dtype
@@ -1341,14 +1489,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
         )
 
-        use_global_sf = self.allow_flashinfer and is_flashinfer_supporting_global_sf(
-            self.flashinfer_moe_backend
+        global_sf_num_experts = (
+            global_num_experts if self.use_global_sf else num_experts
         )
-        global_scale_num_experts = global_num_experts if use_global_sf else num_experts
-
         w13_input_scale = PerTensorScaleParameter(
             data=torch.empty(
-                global_scale_num_experts,
+                global_sf_num_experts,
                 2 if self.moe.is_act_and_mul else 1,
                 dtype=torch.float32,
             ),
@@ -1357,32 +1503,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         layer.register_parameter("w13_input_scale", w13_input_scale)
 
         w2_input_scale = PerTensorScaleParameter(
-            data=torch.empty(global_scale_num_experts, dtype=torch.float32),
+            data=torch.empty(global_sf_num_experts, dtype=torch.float32),
             weight_loader=weight_loader,
         )
         layer.register_parameter("w2_input_scale", w2_input_scale)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        # GEMM 1 processing
-        gemm1_weight = layer.w13_weight.data
-        gemm1_weight_scale = layer.w13_weight_scale.data
-
-        if (
-            self.allow_flashinfer
-            and (
-                self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
-                or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-            )
-            and self.moe.is_act_and_mul
-        ):
-            gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1(
-                gemm1_weight, gemm1_weight_scale, dim=-2
-            )
-
-        layer.w13_weight = Parameter(gemm1_weight, requires_grad=False)
-        layer.w13_weight_scale = Parameter(gemm1_weight_scale, requires_grad=False)
+        """
+        Convert NVFP4 MoE weights into kernel format and setup the kernel.
+        """
 
-        # Common processing for w13_weight_scale_2
+        # Use a single gscale for w13.
         if self.moe.is_act_and_mul and not torch.allclose(
             layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]
         ):
@@ -1390,162 +1521,79 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 "w1_weight_scale_2 must match w3_weight_scale_2. "
                 "Accuracy may be affected."
             )
-
         w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0].contiguous()
-        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
-
-        # Common processing for input scales and alphas
-        use_global_sf = self.allow_flashinfer and is_flashinfer_supporting_global_sf(
-            self.flashinfer_moe_backend
-        )
-        if use_global_sf:
-            # For backends provide by Flashinfer, the input global scales are
-            # shared across all experts.
-            w13_input_scale = (
-                layer.w13_input_scale.max().to(torch.float32).expand(layer.num_experts)
-            )
-        else:
-            w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
-        layer.g1_alphas = Parameter(
-            (w13_input_scale * w13_weight_scale_2).to(torch.float32),
-            requires_grad=False,
-        )
-
-        # This is for quantization, so we need to invert it.
-        layer.w13_input_scale_quant = Parameter(
-            (1 / w13_input_scale).to(torch.float32), requires_grad=False
-        )
-
-        # GEMM 2 processing
-        if use_global_sf:
-            # For backends provide by Flashinfer, the input global scales are
-            # shared across all experts.
-            w2_input_scale = (
-                layer.w2_input_scale.max().to(torch.float32).expand(layer.num_experts)
-            )
-        else:
-            w2_input_scale = layer.w2_input_scale
-        layer.g2_alphas = Parameter(
-            (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
-            requires_grad=False,
-        )
 
-        # This is for quantization, so we need to invert it.
-        layer.w2_input_scale_quant = Parameter(
-            (1 / w2_input_scale).to(torch.float32), requires_grad=False
+        (
+            w13,
+            w13_scale,
+            w13_scale_2,
+            a13_scale,
+            w2,
+            w2_scale,
+            w2_scale_2,
+            a2_scale,
+        ) = convert_to_nvfp4_moe_kernel_format(
+            nvfp4_backend=self.nvfp4_backend,
+            layer=layer,
+            w13=layer.w13_weight,
+            w13_scale=layer.w13_weight_scale,
+            w13_scale_2=w13_weight_scale_2,
+            a13_scale=layer.w13_input_scale,
+            w2=layer.w2_weight,
+            w2_scale=layer.w2_weight_scale,
+            w2_scale_2=layer.w2_weight_scale_2,
+            a2_scale=layer.w2_input_scale,
+            is_act_and_mul=self.moe.is_act_and_mul,
         )
 
-        # TensorRT-LLM specific processing
-        if (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
-            # Prepare static weights for TRT-LLM kernel
-            # alternate: prepare_static_weight_layouts_for_trtllm_moe
-            (
-                gemm1_weights_fp4_shuffled,
-                gemm1_scales_fp4_shuffled,
-                gemm2_weights_fp4_shuffled,
-                gemm2_scales_fp4_shuffled,
-            ) = prepare_static_weights_for_trtllm_fp4_moe(
-                layer.w13_weight,
-                layer.w2_weight,
-                layer.w13_weight_scale,
-                layer.w2_weight_scale,
-                layer.w2_weight.size(-2),  # hidden_size
-                layer.w13_weight.size(-2) // 2,  # intermediate_size
-                layer.w13_weight.size(0),  # num_experts
-            )
-            logger.debug_once("Finished shuffling weights for TRT-LLM MOE")
-
-            layer.gemm1_weights_fp4_shuffled = Parameter(
-                gemm1_weights_fp4_shuffled, requires_grad=False
-            )
-            layer.gemm2_weights_fp4_shuffled = Parameter(
-                gemm2_weights_fp4_shuffled, requires_grad=False
-            )
-            layer.gemm1_scales_fp4_shuffled = Parameter(
-                gemm1_scales_fp4_shuffled, requires_grad=False
-            )
-            layer.gemm2_scales_fp4_shuffled = Parameter(
-                gemm2_scales_fp4_shuffled, requires_grad=False
-            )
-
-            # Additional parameter needed for TRT-LLM
-            layer.g1_scale_c = Parameter(
-                (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32),
-                requires_grad=False,
-            )
-
-            # Clean up weights that won't be used by TRT-LLM
-            del layer.w2_weight
-            del layer.w2_weight_scale
-            del layer.w13_weight
-            del layer.w13_weight_scale
-        elif self.use_marlin:
-            # Marlin processing
-            prepare_moe_fp4_layer_for_marlin(layer)
-            del layer.g1_alphas
-            del layer.g2_alphas
-            del layer.w13_input_scale_quant
-            del layer.w2_input_scale_quant
-        else:
-            # Non-TRT-LLM processing (Cutlass or non-flashinfer)
-            w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
-            layer.w13_weight_scale = Parameter(
-                w13_blockscale_swizzled, requires_grad=False
+        replace_parameter(layer, "w13_weight", w13)
+        replace_parameter(layer, "w13_weight_scale", w13_scale)
+        replace_parameter(layer, "w13_weight_scale_2", w13_scale_2)
+        replace_parameter(layer, "w13_input_scale", a13_scale)
+        replace_parameter(layer, "w2_weight", w2)
+        replace_parameter(layer, "w2_weight_scale", w2_scale)
+        replace_parameter(layer, "w2_weight_scale_2", w2_scale_2)
+        replace_parameter(layer, "w2_input_scale", a2_scale)
+
+        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+        use_dp = self.moe.dp_size > 1
+        if self.moe_quant_config is not None and not use_dp:
+            self.kernel = make_nvfp4_moe_kernel(
+                backend=self.nvfp4_backend,
+                quant_config=self.moe_quant_config,
+                moe_config=self.moe,
             )
 
-            w13_weight = layer.w13_weight
-            intermediate_size_pad = w13_blockscale_swizzled.size(1) - w13_weight.size(1)
-            if intermediate_size_pad:
-                # padding gated activations will require to split w1 and w3
-                # and pad them individually
-                assert not self.moe.is_act_and_mul, (
-                    "The intermediate size required padding, "
-                    "but padding is not implemented for gated activations"
-                )
-
-                layer.w13_weight = Parameter(
-                    torch.nn.functional.pad(
-                        w13_weight, (0, 0, 0, intermediate_size_pad)
-                    ),
-                    requires_grad=False,
-                )
-                layer.w2_weight = Parameter(
-                    torch.nn.functional.pad(
-                        layer.w2_weight, (0, intermediate_size_pad // 2, 0, 0)
-                    ),
-                    requires_grad=False,
-                )
-                layer.w2_weight_scale = Parameter(
-                    torch.nn.functional.pad(
-                        layer.w2_weight_scale, (0, intermediate_size_pad // 16)
-                    ),
-                    requires_grad=False,
-                )
+    def prepare_dp_allgather_tensor(
+        self,
+        layer: FusedMoE,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        """Optionally prepare extra tensors to carry through DP allgather/EP."""
+        import flashinfer
 
-            w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
-            layer.w2_weight_scale = Parameter(
-                w2_blockscale_swizzled, requires_grad=False
-            )
+        assert self.moe_quant_config is not None
+        a1_gscale = self.moe_quant_config.a1_gscale
+        hidden_states_fp4, hidden_states_sf = flashinfer.fp4_quantize(
+            hidden_states,
+            a1_gscale,
+            is_sf_swizzled_layout=False,
+        )
+        extra_tensors: list[torch.Tensor] = [hidden_states_sf]
+        return hidden_states_fp4, extra_tensors
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        if (
-            self.use_marlin
-            or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
-            return None
-
-        return nvfp4_moe_quant_config(
-            w1_scale=layer.w13_weight_scale,
+        return make_nvfp4_moe_quant_config(
+            backend=self.nvfp4_backend,
+            w13_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
-            g1_alphas=layer.g1_alphas,
-            g2_alphas=layer.g2_alphas,
-            a1_gscale=layer.w13_input_scale_quant,
-            a2_gscale=layer.w2_input_scale_quant,
+            w13_scale_2=layer.w13_weight_scale_2,
+            w2_scale_2=layer.w2_weight_scale_2,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
         )
 
     @property
@@ -1555,21 +1603,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if not self.moe.is_act_and_mul:
-            assert (
-                self.allow_flashinfer
-                and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
-            ), (
-                "Non-gated activations are only supported by the"
-                " flashinfer CUTLASS backend for modelopt checkpoints"
-            )
-
         if (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
+            self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_TRTLLM
             and not layer.enable_eplb
         ):
             return flashinfer_trtllm_fp4_moe(
@@ -1584,16 +1623,19 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 e_score_correction_bias=layer.e_score_correction_bias,
             )
 
-        topk_weights, topk_ids, _ = layer.select_experts(
-            hidden_states=x,
+        # Hidden_states in select_experts is only used to extract metadata
+        if isinstance(x, tuple):
+            x_routing, _ = x
+        else:
+            x_routing = x
+        topk_weights, topk_ids = router.select_experts(
+            hidden_states=x_routing,
             router_logits=router_logits,
         )
 
         # EPLB path
-        if (
-            self.allow_flashinfer
-            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-        ):
+        if self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_TRTLLM:
+            assert layer.enable_eplb
             return flashinfer_trtllm_fp4_routed_moe(
                 layer=layer,
                 x=x,
@@ -1602,81 +1644,20 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 top_k=layer.top_k,
                 global_num_experts=layer.global_num_experts,
             )
-
-        if self.use_marlin:
-            return fused_marlin_moe(
+        else:
+            assert self.kernel is not None
+            return self.kernel(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
-                None,
-                None,
-                layer.w13_weight_scale,
-                layer.w2_weight_scale,
-                router_logits,
                 topk_weights,
                 topk_ids,
-                global_scale1=layer.w13_weight_scale_2,
-                global_scale2=layer.w2_weight_scale_2,
-                quant_type_id=scalar_types.float4_e2m1f.id,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                input_dtype=self.marlin_input_dtype,
-            )
-
-        elif self.allow_flashinfer:
-            assert self.flashinfer_moe_backend in (
-                FlashinferMoeBackend.CUTLASS,
-                FlashinferMoeBackend.CUTEDSL,
-            )
-            if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-                from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
-                    flashinfer_cutlass_moe_fp4,
-                )
-
-                flashinfer_fn_moe_fp4 = flashinfer_cutlass_moe_fp4
-            else:
-                from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import (  # noqa: E501
-                    flashinfer_cutedsl_moe_fp4,
-                )
-
-                flashinfer_fn_moe_fp4 = flashinfer_cutedsl_moe_fp4
-
-            assert self.moe_quant_config is not None
-            return flashinfer_fn_moe_fp4(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                quant_config=self.moe_quant_config,
                 inplace=False,
                 activation=layer.activation,
                 global_num_experts=layer.global_num_experts,
                 expert_map=layer.expert_map,
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
-        else:
-            # If no modular kernel is provided, use cutlass_moe_fp4 for TP case
-            # only (no EP).
-            from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
-
-            assert self.moe_quant_config is not None
-            return cutlass_moe_fp4(
-                a=x,
-                w1_fp4=layer.w13_weight,
-                w2_fp4=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                quant_config=self.moe_quant_config,
-                expert_map=layer.expert_map,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-                # TODO: derive from arguments
-                m=x.shape[0],
-                n=layer.w2_weight.shape[2] * 2,
-                k=x.shape[1],
-                e=layer.w13_weight.shape[0],
-            )
 
 
 ModelOptNvFp4Config.LinearMethodCls = ModelOptNvFp4LinearMethod
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 2453c2b7f754c19f98e917c9112541af0501b204..f1c5568d994b9e3681030c24a372433deaa49428 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     int4_w4a16_moe_quant_config,
     int8_w8a16_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoEConfig,
@@ -382,6 +383,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         use_nn_moe: bool | None = False,
@@ -390,7 +392,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
         from vllm.model_executor.layers.fused_moe import fused_experts
 
         assert layer.activation == "silu", "Only SiLU activation is supported."
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
             use_fused_gate=use_fused_gate,
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index e96e87d15787d318cedf9239a2e3c1a8cb3db7f0..8e050b795f94284e8302dd624532a30b1165a8c2 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -27,6 +27,7 @@ from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
     MarlinExperts,
     fused_marlin_moe,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
     OAITritonExperts,
     UnfusedOAITritonExperts,
@@ -95,12 +96,12 @@ def get_mxfp4_backend_with_lora() -> Mxfp4Backend:
         # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498
         and (9, 0) <= current_platform.get_device_capability() < (11, 0)
     )
-    if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported:
-        logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend")
-        return Mxfp4Backend.MARLIN
+    if envs.VLLM_MXFP4_USE_MARLIN is False and triton_kernels_supported:
+        logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend")
+        return Mxfp4Backend.TRITON
 
-    logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend")
-    return Mxfp4Backend.TRITON
+    logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend")
+    return Mxfp4Backend.MARLIN
 
 
 def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
@@ -240,7 +241,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
 
         self.marlin_input_dtype = None
-        self.use_marlin = self.mxfp4_backend == Mxfp4Backend.MARLIN
         self.max_capture_size = (
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
         )
@@ -784,7 +784,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             layer.w13_weight = w13_weight
             layer.w2_weight = w2_weight
         else:
-            raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
+            raise ValueError(
+                f"Unsupported mxfp4_backend: {self.mxfp4_backend}: "
+                f"should be one of: {list(Mxfp4Backend)}."
+            )
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
@@ -889,6 +892,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
@@ -896,7 +900,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
         if self.mxfp4_backend == Mxfp4Backend.MARLIN:
-            topk_weights, topk_ids, _ = layer.select_experts(
+            topk_weights, topk_ids = router.select_experts(
                 hidden_states=x,
                 router_logits=router_logits,
             )
@@ -990,7 +994,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         ):
             from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
 
-            topk_weights, topk_ids, _ = layer.select_experts(
+            topk_weights, topk_ids = router.select_experts(
                 hidden_states=x,
                 router_logits=router_logits,
             )
@@ -1117,7 +1121,8 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
 
     def apply(
         self,
-        layer: torch.nn.Module,
+        layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor:
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index ed8a2c7fa0841f07c0913a782483a90f9aedbef1..80efc29de67e0fee248c94788b3d4b52da833f64 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -103,21 +103,25 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
         )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
-
-        assert layer.weight.data.dtype == torch.bfloat16, (
-            f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified."  # noqa: E501
-        )
-        # Quantize the weights.
-        qweight, weight_scale = ops.scaled_fp8_quant(
-            layer.weight, scale=None, use_per_token_if_dynamic=True
+        assert layer.weight.data.dtype not in (torch.float16, torch.float32), (
+            "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support "
+            f"output dtype of bfloat16. {layer.weight.data.dtype} is specified."
         )
 
-        # Update the layer with the new values.
-        layer.weight = Parameter(
-            qweight.t(), requires_grad=False
-        )  # Pretranspose the weight
-        layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+        if layer.weight.data.dtype == torch.bfloat16:
+            # Quantize the weights.
+            qweight, weight_scale = ops.scaled_fp8_quant(
+                layer.weight, scale=None, use_per_token_if_dynamic=True
+            )
+
+            # Update the layer with the new values.
+            layer.weight = Parameter(
+                qweight.t(), requires_grad=False
+            )  # Pretranspose the weight
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+        else:
+            assert layer.weight.data.dtype == current_platform.fp8_dtype()
+            assert getattr(layer, "weight_scale", None) is not None
         layer.input_scale = None
 
     def apply(
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 3640e5c452786176c7c6385ff93ce4c2bc0c136d..39bcd56bcd3dcd4077677976b6f568be43d1b227 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -218,6 +218,49 @@ class QuarkConfig(QuantizationConfig):
         else:
             return False
 
+    def _is_fp8_w4a8(
+        self,
+        weight_quant: list[dict[str, Any]] | None,
+        input_quant: dict[str, Any] | None,
+    ) -> bool:
+        # Confirm weights and input quantized.
+        if weight_quant is None or input_quant is None:
+            return False
+
+        if not isinstance(weight_quant, list) or len(weight_quant) != 2:
+            return False
+
+        # Confirm weight scheme is supported
+        is_w4a8_dtype = (
+            weight_quant[0].get("dtype") == "fp8_e4m3"
+            and weight_quant[1].get("dtype") == "int4"
+            and input_quant.get("dtype") == "fp8_e4m3"
+        )
+        is_static_weight = not weight_quant[0].get("is_dynamic") and not weight_quant[
+            1
+        ].get("is_dynamic")
+        is_per_tensor_fp8_and_per_channel_int4_weight = (
+            weight_quant[0].get("qscheme") == "per_tensor"
+            and weight_quant[1].get("qscheme") == "per_channel"
+            and weight_quant[1].get("symmetric") is True
+            and weight_quant[1].get("ch_axis") == 0
+        )
+
+        if not (
+            is_w4a8_dtype
+            and is_static_weight
+            and is_per_tensor_fp8_and_per_channel_int4_weight
+        ):
+            return False
+
+        # Dynamic quantization is always supported if weights supported.
+        if input_quant.get("is_dynamic"):
+            return True
+
+        # Confirm activation scheme is supported.
+        is_per_tensor_activation = input_quant.get("qscheme") == "per_tensor"
+        return is_per_tensor_activation
+
     def _is_fp8_w8a8(
         self,
         weight_quant: dict[str, Any] | None,
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index d84e22d1fa0f25c3a5b472319328d46593aef247..6b731314825a89d76366f54495baefe9e6cff359 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe import (
     FusedMoE,
     FusedMoEConfig,
     FusedMoEMethodBase,
+    FusedMoERouter,
     FusedMoeWeightScaleSupported,
 )
 from vllm.model_executor.layers.fused_moe.config import (
@@ -22,7 +23,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    prepare_moe_fp8_layer_for_marlin,
+    prepare_fp8_moe_layer_for_marlin,
 )
 from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
     OCP_MX_BLOCK_SIZE,
@@ -63,8 +64,9 @@ class QuarkMoEMethod(FusedMoEMethodBase):
             )
         weight_config = layer_quant_config.get("weight")
         input_config = layer_quant_config.get("input_tensors")
-
-        if quant_config._is_fp8_w8a8(weight_config, input_config):
+        if quant_config._is_fp8_w4a8(weight_config, input_config):
+            return QuarkW4A8Fp8MoEMethod(weight_config, input_config, module.moe_config)
+        elif quant_config._is_fp8_w8a8(weight_config, input_config):
             return QuarkW8A8Fp8MoEMethod(weight_config, input_config, module.moe_config)
         elif quant_config._is_ocp_mx(weight_config, input_config):
             return QuarkOCP_MX_MoEMethod(weight_config, input_config, module.moe_config)
@@ -314,10 +316,25 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
             layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
 
         elif self.use_marlin:
-            prepare_moe_fp8_layer_for_marlin(layer, False)
-            # Activations not quantized for marlin.
-            del layer.w13_input_scale
-            del layer.w2_input_scale
+            w13_weight, w2_weight, w13_weight_scale, w2_weight_scale = (
+                prepare_fp8_moe_layer_for_marlin(
+                    layer,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    layer.w13_weight_scale,
+                    layer.w2_weight_scale,
+                )
+            )
+            # TODO(rob): once we apply refactor to Quark, switch to using
+            # replace_parameter for compatibility with reloading in RL.
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                w13_weight_scale, requires_grad=False
+            )
+            layer.w2_weight_scale = torch.nn.Parameter(
+                w2_weight_scale, requires_grad=False
+            )
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
@@ -334,10 +351,11 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -396,6 +414,162 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
             )
 
 
+class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod):
+    def __init__(
+        self,
+        weight_config: dict[str, Any],
+        input_config: dict[str, Any],
+        moe: FusedMoEConfig,
+    ):
+        super().__init__(moe)
+        self.weight_quant = weight_config
+        self.input_quant = input_config
+
+        assert rocm_aiter_ops.is_fused_moe_enabled(), (
+            "W4A8 FP8 MoE requires ROCm AITER fused MoE support."
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        params_dtype = torch.uint32
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // 8,  # INT32 packing for W4
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // 8,  # INT32 packing for W4
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # Per-tensor fp8 weight scales
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # Per-channel int4 weight scales
+        w13_weight_scale_2 = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        w2_weight_scale_2 = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
+        layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+        set_weight_attrs(w13_weight_scale_2, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale_2, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
+            layer.w13_weight.data, layer.w2_weight.data
+        )
+        layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
+        layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
+
+        # INT4-FP8 : offset INT4 w13_weight_scale1 to single w13_weight_scale
+        # Fp8 moe kernel needs single fp8 w13_weight_scale for w13 per expert.
+        # We won't do requant each expert's fp8 weight (not direct available),
+        # instead we adjust half of INT4 w13_weight_scale1 numbers
+        shard_size = layer.intermediate_size_per_partition
+        max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+        assert torch.all(max_w13_scales != 0), "fp8 weight scale cannot be zero."
+        for expert_id in range(layer.local_num_experts):
+            start = 0
+            max_w13_scale_fp8 = max_w13_scales[expert_id]
+            for shard_id in range(2):
+                if layer.w13_weight_scale[expert_id][shard_id] != max_w13_scale_fp8:
+                    int4_rescale = (
+                        layer.w13_weight_scale[expert_id][shard_id] / max_w13_scale_fp8
+                    )
+                    layer.w13_weight_scale_2[expert_id][start : start + shard_size] *= (
+                        int4_rescale
+                    )
+                start += shard_size
+
+        layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False)
+
+        # special hack to asm_moe, which takes (weight_scale1 * weight_scale) as post
+        # GEMM scaling optimal design - shall apply per-column weight_scale1 before
+        # GEMM, and weight_scale post
+        for expert_id in range(layer.local_num_experts):
+            layer.w13_weight_scale_2[expert_id] *= max_w13_scales[expert_id]
+            layer.w2_weight_scale_2[expert_id] *= layer.w2_weight_scale[expert_id]
+
+    def get_fused_moe_quant_config(self, layer):
+        return fp8_w8a8_moe_quant_config(
+            w1_scale=layer.w13_weight_scale_2,
+            w2_scale=layer.w2_weight_scale_2,
+            per_out_ch_quant=True,
+        )
+
+    def apply(
+        self,
+        layer: FusedMoE,
+        router: FusedMoERouter,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        topk_weights, topk_ids = layer.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+        )
+
+        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+            rocm_aiter_fused_experts,
+        )
+
+        return rocm_aiter_fused_experts(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            activation=layer.activation,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            quant_config=self.moe_quant_config,
+            expert_map=layer.expert_map,
+        )
+
+
 class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
     def __init__(
         self,
@@ -579,10 +753,11 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index b2ecb0b175f81dc24b875bf39e953a7a9f6d105c..239adb384708e7cb1bb44b7418d89d1914d438e6 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -15,7 +15,11 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase
+from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE,
+    FusedMoEMethodBase,
+)
 from vllm.model_executor.layers.linear import (
     LinearBase,
     LinearMethodBase,
@@ -356,10 +360,11 @@ class RTNMoEMethod(FusedMoEMethodBase):
     def apply(
         self,
         layer: FusedMoE,
+        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py
deleted file mode 100644
index 64bfa8fb80eb21c73dde5aa856da2472c1be97ed..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/quantization/tpu_int8.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Any, Optional
-
-import torch
-from torch.nn import Module
-from torch.nn.parameter import Parameter
-
-from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
-from vllm.model_executor.layers.quantization import (
-    QuantizationConfig,
-    QuantizationMethods,
-)
-from vllm.model_executor.parameter import ModelWeightParameter
-
-ACTIVATION_SCHEMES = ["none", "dynamic"]
-
-
-class Int8TpuConfig(QuantizationConfig):
-    """Int8 Quantization Config class for TPU Backend."""
-
-    def __init__(
-        self,
-        activation_scheme: str = "none",
-    ) -> None:
-        super().__init__()
-        if activation_scheme not in ACTIVATION_SCHEMES:
-            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
-        self.activation_scheme = activation_scheme
-
-    def get_name(self) -> QuantizationMethods:
-        return "tpu_int8"
-
-    def get_supported_act_dtypes(self) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        raise NotImplementedError("This function should not be called with TPU Backend")
-
-    @staticmethod
-    def get_config_filenames() -> list[str]:
-        return []
-
-    @classmethod
-    def from_config(cls, config: dict[str, Any]) -> "Int8TpuConfig":
-        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
-        return cls(activation_scheme=activation_scheme)
-
-    def get_quant_method(
-        self, layer: Module, prefix: str
-    ) -> Optional["TPUInt8LinearMethod"]:
-        if isinstance(layer, LinearBase):
-            return TPUInt8LinearMethod(self)
-        return None
-
-
-class TPUInt8LinearMethod(LinearMethodBase):
-    """Int8 Linear method for TPU Quant."""
-
-    def __init__(self, quant_config: Int8TpuConfig):
-        self.quant_config = quant_config
-        self.quantize_activation = False
-        if self.quant_config.activation_scheme == "dynamic":
-            self.quantize_activation = True
-
-    def create_weights(
-        self,
-        layer: Module,
-        input_size_per_partition: int,
-        output_partition_sizes: list[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        weight_loader = extra_weight_attrs.get("weight_loader")
-        weight = ModelWeightParameter(
-            data=torch.empty(
-                sum(output_partition_sizes),
-                input_size_per_partition,
-                dtype=params_dtype,
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight", weight)
-
-    def _quantize_weight(
-        self, weight: torch.Tensor
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        weight_dtype = weight.dtype
-        weight = weight.cpu().to(torch.float32)
-        n_bit = 8
-        eps = 1e-5
-        max_int = 2 ** (n_bit - 1) - 1
-        min_int = -(2 ** (n_bit - 1))
-        max_val = weight.abs().amax(dim=-1, keepdim=True)
-        max_val = max_val.clamp(min=eps)
-        qscale = max_val / max_int
-        qweight = torch.clamp(
-            torch.round(weight * (1.0 / qscale)), min_int, max_int
-        ).to(torch.int8)
-        qscale = qscale.squeeze().to(weight_dtype)
-        return qweight, qscale
-
-    def process_weights_after_loading(self, layer: Module) -> None:
-        layer.weight = Parameter(layer.weight.data, requires_grad=False)
-        device = layer.weight.device
-        qweight, qscale = self._quantize_weight(layer.weight)
-        qweight = qweight.to(device)
-        qscale = qscale.to(device)
-        layer.weight = Parameter(qweight, requires_grad=False)
-        layer.scale = Parameter(qscale, requires_grad=False)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        try:
-            import torch_xla.experimental.custom_kernel  # noqa: F401
-        except ImportError as err:
-            raise ImportError(
-                "Please install torch_xla by following the instructions at "
-                "https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html "  # noqa: E501
-                "to run vLLM on TPU."
-            ) from err
-        weight = layer.weight
-        scale = layer.scale
-        out = torch.ops.xla.quantized_matmul_int8(
-            x, weight, scale, quantize_activation=self.quantize_activation
-        )
-        if bias is not None:
-            out = out + bias
-        return out
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index e424cd0e1ac99655c70283f55f92827f5fce40a4..912ff5a4a12aa19867c6bc6cafb06209a388d3e1 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -2,10 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility helpers for NVFP4 + FlashInfer fused-MoE path"""
 
+from typing import TYPE_CHECKING
+
 import torch
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
@@ -20,12 +23,23 @@ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
     create_flashinfer_prepare_finalize,
 )
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    swizzle_blockscale,
+)
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import (
     has_flashinfer_cutedsl_grouped_gemm_nt_masked,
     has_flashinfer_cutlass_fused_moe,
 )
 
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import (
+        NvFp4MoeBackend,
+    )
+
+logger = init_logger(__name__)
+
+
 __all__ = [
     "is_flashinfer_fp4_cutlass_moe_available",
     "is_flashinfer_fp4_cutedsl_moe_available",
@@ -238,7 +252,7 @@ def prepare_static_weights_for_trtllm_fp4_moe(
 
 def flashinfer_trtllm_fp4_moe(
     layer: torch.nn.Module,
-    x: torch.Tensor,
+    x: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
     router_logits: torch.Tensor,
     top_k: int,
     global_num_experts: int,
@@ -269,12 +283,15 @@ def flashinfer_trtllm_fp4_moe(
     from vllm.model_executor.models.llama4 import Llama4MoE
 
     # Quantize input to FP4
-    a1_gscale = layer.w13_input_scale_quant
-    (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
-        x,
-        a1_gscale,
-        is_sf_swizzled_layout=False,
-    )
+    if isinstance(x, tuple):
+        hidden_states_fp4, hidden_states_scale_linear_fp4 = x
+    else:
+        # hidden_states is the already quantized
+        (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
+            x,
+            layer.a1_gscale,
+            is_sf_swizzled_layout=False,
+        )
 
     # Determine routing method type
     use_llama4_routing = custom_routing_function is Llama4MoE.custom_routing_function
@@ -301,18 +318,14 @@ def flashinfer_trtllm_fp4_moe(
         hidden_states_scale=hidden_states_scale_linear_fp4.view(
             torch.float8_e4m3fn
         ).flatten(),
-        gemm1_weights=layer.gemm1_weights_fp4_shuffled.data,
-        gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view(
-            torch.float8_e4m3fn
-        ),
+        gemm1_weights=layer.w13_weight.data,
+        gemm1_weights_scale=layer.w13_weight_scale.data.view(torch.float8_e4m3fn),
         gemm1_bias=None,
         gemm1_alpha=None,
         gemm1_beta=None,
         gemm1_clamp_limit=None,
-        gemm2_weights=layer.gemm2_weights_fp4_shuffled.data,
-        gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view(
-            torch.float8_e4m3fn
-        ),
+        gemm2_weights=layer.w2_weight.data,
+        gemm2_weights_scale=layer.w2_weight_scale.data.view(torch.float8_e4m3fn),
         gemm2_bias=None,
         output1_scale_scalar=layer.g1_scale_c.data,
         output1_scale_gate_scalar=layer.g1_alphas.data,
@@ -364,13 +377,16 @@ def flashinfer_trtllm_fp4_routed_moe(
         torch.bfloat16
     ).view(torch.int16)
 
-    # Quantize input to FP4
-    a1_gscale = layer.w13_input_scale_quant
-    (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
-        x,
-        a1_gscale,
-        is_sf_swizzled_layout=False,
-    )
+    if isinstance(x, tuple):
+        # Hidden_states is the already quantized
+        hidden_states_fp4, hidden_states_scale_linear_fp4 = x
+    else:
+        # Quantize input to FP4
+        (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
+            x,
+            layer.a1_gscale,
+            is_sf_swizzled_layout=False,
+        )
 
     # Call TRT-LLM FP4 block-scale MoE kernel
     out = flashinfer.fused_moe.trtllm_fp4_block_scale_routed_moe(
@@ -380,18 +396,14 @@ def flashinfer_trtllm_fp4_routed_moe(
         hidden_states_scale=hidden_states_scale_linear_fp4.view(
             torch.float8_e4m3fn
         ).flatten(),
-        gemm1_weights=layer.gemm1_weights_fp4_shuffled.data,
-        gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view(
-            torch.float8_e4m3fn
-        ),
+        gemm1_weights=layer.w13_weight.data,
+        gemm1_weights_scale=layer.w13_weight_scale.data.view(torch.float8_e4m3fn),
         gemm1_bias=None,
         gemm1_alpha=None,
         gemm1_beta=None,
         gemm1_clamp_limit=None,
-        gemm2_weights=layer.gemm2_weights_fp4_shuffled.data,
-        gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view(
-            torch.float8_e4m3fn
-        ),
+        gemm2_weights=layer.w2_weight.data,
+        gemm2_weights_scale=layer.w2_weight_scale.data.view(torch.float8_e4m3fn),
         gemm2_bias=None,
         output1_scale_scalar=layer.g1_scale_c.data,
         output1_scale_gate_scalar=layer.g1_alphas.data,
@@ -410,3 +422,93 @@ def flashinfer_trtllm_fp4_routed_moe(
     )[0]
 
     return out
+
+
+def prepare_nvfp4_moe_layer_for_fi_or_cutlass(
+    backend: "NvFp4MoeBackend",
+    layer: torch.nn.Module,
+    w13: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w13_scale_2: torch.Tensor,
+    a13_scale: torch.Tensor,
+    w2: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w2_scale_2: torch.Tensor,
+    a2_scale: torch.Tensor,
+    is_act_and_mul: bool,
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
+    # Delayed import for circular dependency avoidance.
+    from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import (
+        NvFp4MoeBackend,
+        is_global_sf_supported_for_nvfp4_backend,
+    )
+
+    assert backend in [
+        NvFp4MoeBackend.VLLM_CUTLASS,
+        NvFp4MoeBackend.FLASHINFER_CUTLASS,
+        NvFp4MoeBackend.FLASHINFER_TRTLLM,
+        NvFp4MoeBackend.FLASHINFER_CUTEDSL,
+    ]
+
+    # Reorder [w1, w3] to [w3, w1] for FI NVFP4 MoE kernels.
+    if is_act_and_mul and backend in [
+        NvFp4MoeBackend.FLASHINFER_CUTLASS,
+        NvFp4MoeBackend.FLASHINFER_TRTLLM,
+    ]:
+        w13, w13_scale = reorder_w1w3_to_w3w1(w13, w13_scale)
+
+    # For some FI kernels, the input scales are shared by all experts.
+    if is_global_sf_supported_for_nvfp4_backend(backend):
+        num_experts = w13.shape[0]
+        a13_scale = a13_scale.max().to(torch.float32).expand(num_experts)
+        a2_scale = a2_scale.max().to(torch.float32).expand(num_experts)
+    else:
+        a13_scale = a13_scale.max(dim=1).values.to(torch.float32)
+
+    # Shuffle weights and scales for FI TRTLLM NVFP4 MoE kernels.
+    if backend == NvFp4MoeBackend.FLASHINFER_TRTLLM:
+        w13, w13_scale, w2, w2_scale = prepare_static_weights_for_trtllm_fp4_moe(
+            w13,
+            w2,
+            w13_scale,
+            w2_scale,
+            w2.size(-2),  # hidden_size
+            w13.size(-2) // 2,  # intermediate_size
+            w13.size(0),  # num_experts
+        )
+
+        # We do not need to make this a parameter, because
+        # it is not used during the weight (re)-loading process.
+        layer.g1_scale_c = a13_scale * w13_scale_2 / a2_scale
+        layer.a1_gscale = 1.0 / a13_scale
+        layer.g1_alphas = a13_scale * w13_scale_2
+        layer.g2_alphas = a2_scale * w2_scale_2
+    else:
+        # Swizzle the block scales for other FI NVFP4 MoE kernels.
+        w13_scale = swizzle_blockscale(w13_scale)
+
+        # Apply padding if needed.
+        pad_size = w13_scale.size(1) - w13.size(1)
+        if pad_size > 0:
+            if is_act_and_mul:
+                raise NotImplementedError(
+                    "Intermediate size padding for w1 and w3, for %s "
+                    "NvFp4 backend, but this is not currently supported",
+                    backend.value,
+                )
+            w13 = torch.nn.functional.pad(w13, (0, 0, 0, pad_size))
+            w2 = torch.nn.functional.pad(w2, (0, pad_size // 2, 0, 0))
+            w2_scale = torch.nn.functional.pad(w2_scale, (0, pad_size // 16))
+
+        w2_scale = swizzle_blockscale(w2_scale)
+
+    return w13, w13_scale, w13_scale_2, a13_scale, w2, w2_scale, w2_scale_2, a2_scale
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
index 3d6e9cda8766775e2bee50fd039d2493890f237e..799854479823157448fa8bf2da59b91fe62c021a 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -18,6 +18,7 @@ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize im
     create_flashinfer_prepare_finalize,
 )
 from vllm.platforms import current_platform
+from vllm.utils.math_utils import round_up
 
 logger = init_logger(__name__)
 
@@ -58,9 +59,10 @@ def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor:
     )
 
 
-def rotate_flashinfer_fp8_moe_weights(
+def rotate_weights_for_fi_trtllm_fp8_per_tensor_moe(
     gemm1_weights: torch.Tensor, gemm2_weights: torch.Tensor
 ):
+    """Shuffle weights for for FI TRT-LLM Format"""
     from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_a
 
     epilogue_tile_m = 128
@@ -103,7 +105,27 @@ def rotate_flashinfer_fp8_moe_weights(
     )
 
 
-def apply_flashinfer_per_tensor_scale_fp8(
+def register_scales_for_trtllm_fp8_per_tensor_moe(
+    layer: torch.nn.Module,
+    w13_scale: torch.Tensor,
+    w13_input_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w2_input_scale: torch.Tensor,
+) -> None:
+    """Register necessary scales for FlashInfer TRTLLM FP8 MoE kernel"""
+    g1_alphas, g2_alphas = make_fp8_moe_alpha_scales_for_fi(
+        w13_scale=w13_scale,
+        w13_input_scale=w13_input_scale,
+        w2_scale=w2_scale,
+        w2_input_scale=w2_input_scale,
+    )
+    layer.w2_input_scale_inv = 1.0 / w2_input_scale
+    layer.output1_scales_gate_scalar = g1_alphas
+    layer.output1_scales_scalar = g1_alphas * layer.w2_input_scale_inv
+    layer.output2_scales_scalar = g2_alphas
+
+
+def apply_fi_trtllm_fp8_per_tensor_moe(
     layer: torch.nn.Module,
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
@@ -117,23 +139,25 @@ def apply_flashinfer_per_tensor_scale_fp8(
     from flashinfer.fused_moe import RoutingMethodType
 
     import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe  # noqa: E501, F401
+    from vllm.model_executor.models.llama4 import Llama4MoE
 
-    assert layer.output1_scales_scalar is not None, (
-        "Expected output1_scales_scalar to be initialized"
-    )
-    assert layer.output1_scales_scalar is not None, (
-        "Expected output1_scales_gate_scalar to be initialized"
-    )
-    assert layer.output1_scales_scalar is not None, (
-        "Expected output2_scales_scalar to be initialized"
+    # Added to the layer by: register_scales_for_trtllm_fp8_per_tensor_moe
+    assert (
+        hasattr(layer, "output1_scales_scalar")
+        and hasattr(layer, "output1_scales_gate_scalar")
+        and hasattr(layer, "output2_scales_scalar")
     )
 
-    from vllm.model_executor.models.llama4 import Llama4MoE
-
-    assert layer.custom_routing_function == Llama4MoE.custom_routing_function, (
-        "FusedMoE flashinfer kernels are only supported for Llama4"
+    # Added to the layer by: register_scales_for_trtllm_fp8_per_tensor_moe
+    assert (
+        hasattr(layer, "output1_scales_scalar")
+        and hasattr(layer, "output1_scales_gate_scalar")
+        and hasattr(layer, "output2_scales_scalar")
     )
-    return torch.ops.vllm.flashinfer_fused_moe_per_tensor_scale_fp8(
+
+    is_llama4 = layer.custom_routing_function == Llama4MoE.custom_routing_function
+    assert is_llama4, "FusedMoE flashinfer kernels are only supported for Llama4"
+    return torch.ops.vllm.fi_trtllm_fp8_per_tensor_moe(
         routing_logits=router_logits,
         routing_bias=routing_bias,
         hidden_states=hidden_states,
@@ -155,40 +179,16 @@ def apply_flashinfer_per_tensor_scale_fp8(
     )
 
 
-def get_moe_scaling_factors(
-    input_scale: torch.Tensor,
-    gemm1_weights_scale: torch.Tensor,
-    activation_scale: torch.Tensor,
-    gemm2_weights_scale: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    output1_scales_scalar = gemm1_weights_scale * input_scale * (1.0 / activation_scale)
-    output1_scales_gate_scalar = gemm1_weights_scale * input_scale
-    output2_scales_scalar = activation_scale * gemm2_weights_scale
+def make_fp8_moe_alpha_scales_for_fi(
+    w13_scale: torch.Tensor,
+    w13_input_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w2_input_scale: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    g1_alphas = (w13_scale * w13_input_scale).squeeze()
+    g2_alphas = (w2_scale * w2_input_scale).squeeze()
 
-    return output1_scales_scalar, output1_scales_gate_scalar, output2_scales_scalar
-
-
-def register_moe_scaling_factors(layer: torch.nn.Module) -> None:
-    output1_scales, output1_gate_scales, output2_scales = get_moe_scaling_factors(
-        layer.w13_input_scale,
-        layer.w13_weight_scale,
-        layer.w2_input_scale,
-        layer.w2_weight_scale,
-    )
-    layer.register_parameter(
-        "output1_scales_scalar", torch.nn.Parameter(output1_scales, requires_grad=False)
-    )
-    layer.register_parameter(
-        "output1_scales_gate_scalar",
-        torch.nn.Parameter(output1_gate_scales, requires_grad=False),
-    )
-    layer.register_parameter(
-        "output2_scales_scalar", torch.nn.Parameter(output2_scales, requires_grad=False)
-    )
-    layer.register_parameter(
-        "w2_input_scale_inv",
-        torch.nn.Parameter(1.0 / layer.w2_input_scale, requires_grad=False),
-    )
+    return g1_alphas, g2_alphas
 
 
 def build_flashinfer_fp8_cutlass_moe_prepare_finalize(
@@ -230,50 +230,6 @@ def select_cutlass_fp8_gemm_impl(
     )
 
 
-def flashinfer_cutlass_moe_fp8(
-    hidden_states: torch.Tensor,
-    layer: torch.nn.Module,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    inplace: bool = False,
-    activation: str = "silu",
-    global_num_experts: int = -1,
-    expert_map: torch.Tensor | None = None,
-    apply_router_weight_on_input: bool = False,
-    use_deepseek_fp8_block_scale: bool = False,
-    moe: FusedMoEConfig | None = None,
-) -> torch.Tensor:
-    quant_config = layer.quant_method.get_fused_moe_quant_config(layer)
-    assert quant_config is not None
-
-    # Construct modular kernel with block-scale support when requested.
-    fused_experts = mk.FusedMoEModularKernel(
-        build_flashinfer_fp8_cutlass_moe_prepare_finalize(
-            moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
-        ),
-        select_cutlass_fp8_gemm_impl(
-            moe=moe,
-            quant_config=quant_config,
-            out_dtype=hidden_states.dtype,
-            use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
-        ),
-        moe_parallel_config=layer.moe_parallel_config,
-    )
-
-    return fused_experts(
-        hidden_states,
-        layer.w13_weight,
-        layer.w2_weight,
-        topk_weights,
-        topk_ids,
-        inplace=inplace,
-        activation=activation,
-        global_num_experts=global_num_experts,
-        expert_map=expert_map,
-        apply_router_weight_on_input=apply_router_weight_on_input,
-    )
-
-
 def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
     backend_map = {
         "throughput": FlashinferMoeBackend.CUTLASS,
@@ -308,5 +264,107 @@ def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) ->
     backends_supporting_global_sf = (
         FlashinferMoeBackend.CUTLASS,
         FlashinferMoeBackend.TENSORRT_LLM,
+        FlashinferMoeBackend.CUTEDSL,
     )
     return backend in backends_supporting_global_sf
+
+
+def align_fp8_moe_weights_for_fi(
+    w13: torch.Tensor, w2: torch.Tensor, is_act_and_mul: bool
+) -> tuple[torch.Tensor, torch.Tensor, int]:
+    """Pad intermediate size so FlashInfer kernels' alignment constraints hold.
+
+    Some FlashInfer FP8 MoE kernels require the (gated) intermediate size
+    used for GEMM to be divisible by a small alignment value. When this is
+    not satisfied (e.g. with certain tensor-parallel sizes), we pad the
+    gate/up and down projection weights along the intermediate dim.
+    """
+
+    # Current local intermediate size (per partition) is the K dimension of
+    # the down projection.
+    num_experts, hidden_size, intermediate = w2.shape
+
+    min_alignment = 16
+    padded_intermediate = round_up(intermediate, min_alignment)
+
+    if padded_intermediate == intermediate:
+        return w13, w2, intermediate
+
+    logger.info_once(
+        "Padding intermediate size from %d to %d for up/down projection weights.",
+        intermediate,
+        padded_intermediate,
+        scope="local",
+    )
+
+    up_mult = 2 if is_act_and_mul else 1
+    padded_gate_up_dim = up_mult * padded_intermediate
+
+    # Pad w13 and w2 along its intermediate dimension.
+    padded_w13 = w13.new_zeros((num_experts, padded_gate_up_dim, hidden_size))
+    padded_w13[:, : w13.shape[1], :] = w13
+
+    padded_w2 = w2.new_zeros((num_experts, hidden_size, padded_intermediate))
+    padded_w2[:, :, :intermediate] = w2
+
+    return padded_w13, padded_w2, padded_intermediate
+
+
+def prepare_fp8_moe_layer_for_fi(
+    layer: torch.nn.Module,
+    w13: torch.Tensor,
+    w2: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w13_input_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor,
+    w2_input_scale: torch.Tensor | None,
+    is_trtllm: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Convert Fp8 MoE weights to flashinfer kernel format
+
+    Note that for trtllm we update the model state dict
+    with the scale format needed for these kernels.
+
+    Note that for per-tensor, we update the layer's
+    intermediate size if the weights needed padding.
+    """
+
+    assert hasattr(layer.moe_config, "is_act_and_mul")
+    block_quant = (
+        hasattr(layer, "weight_block_size") and layer.weight_block_size is not None
+    )
+
+    # Some FI MoE kernels require internal alignment of 16
+    # for the gate-up proj. Pad the weights to respect this.
+    if not block_quant:
+        w13, w2, new_intermediate = align_fp8_moe_weights_for_fi(
+            w13,
+            w2,
+            layer.moe_config.is_act_and_mul,
+        )
+        layer.intermediate_size_per_partition = new_intermediate
+
+    # FI kernels require W31 layout rather than W13.
+    if layer.moe_config.is_act_and_mul:
+        w13 = swap_w13_to_w31(w13)
+        if block_quant:
+            w13_scale = swap_w13_to_w31(w13_scale)
+
+    # FI TRT-LLM FP8 per-tensor MoE kernel requires weight shuffle
+    # and registration of alpha scales. Note that we do not register
+    # as nn.Parameters since they are not needed for weight-reloading.
+    if is_trtllm and not block_quant:
+        assert w13_input_scale is not None
+        assert w2_input_scale is not None
+
+        rotate_weights_for_fi_trtllm_fp8_per_tensor_moe(w13, w2)
+        register_scales_for_trtllm_fp8_per_tensor_moe(
+            layer,
+            w13_scale=w13_scale,
+            w13_input_scale=w13_input_scale,
+            w2_scale=w2_scale,
+            w2_input_scale=w2_input_scale,
+        )
+
+    return w13, w2, w13_scale
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index ea68745585160629891955116acb43ca5077ac0e..ac82bbd59ae76f790d2c69bbe06f238b90d3d992 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -17,10 +17,12 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
-    group_broadcast,
+    get_fp8_min_max,
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     CUTLASS_BLOCK_FP8_SUPPORTED,
+    all_close_1d,
+    per_tensor_dequantize,
 )
 from vllm.model_executor.parameter import (
     BlockQuantScaleParameter,
@@ -31,12 +33,18 @@ from vllm.model_executor.utils import replace_parameter
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import (
+    DeepGemmQuantScaleFMT,
     fp8_gemm_nt,
     is_deep_gemm_e8m0_used,
     is_deep_gemm_supported,
     should_use_deepgemm_for_fp8_linear,
     transform_sf_into_required_layout,
 )
+from vllm.utils.flashinfer import (
+    flashinfer_fp8_blockscale_gemm,
+    is_flashinfer_fp8_blockscale_gemm_supported,
+    should_use_flashinfer_for_blockscale_fp8_gemm,
+)
 from vllm.utils.torch_utils import direct_register_custom_op
 
 logger = init_logger(__name__)
@@ -228,6 +236,112 @@ direct_register_custom_op(
 )
 
 
+def _flashinfer_fp8_blockscale_gemm_impl(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    group_size: int,
+    use_deep_gemm_e8m0: bool,
+) -> torch.Tensor:
+    """
+    Conditional FlashInfer FP8 blockscale GEMM with batch-size-dependent selection.
+
+    This function switches between two optimized kernels based on the input batch size:
+    - For small batches (M < 32): Uses FlashInfer's DeepGEMM swapAB optimization.
+    - For larger batches (M >= 32): Uses the official DeepGEMM kernel.
+
+    The conditional logic must use torch.cond() instead of a simple if-else statement
+    to maintain compatibility with torch.compile graph compilation.
+
+    This batch-size-dependent selection is essential for maintaining model accuracy.
+    Benchmarks on GSM8K show a significant accuracy gap (88% vs 95%) for DeepSeek-V3.1
+    when using FlashInfer's DeepGEMM on M>=32. The M < 32 strategy fixes the accurracy
+    drop.
+
+    Args:
+        input: Input tensor of shape (batch_size, input_dim) in FP8 format
+        weight: Weight tensor of shape (output_dim, input_dim) in FP8 format
+        weight_scale: Scale factors for weight quantization (per-group)
+        group_size: Quantization group size for the weight tensor
+        use_deep_gemm_e8m0: Whether to use the E8M0 format in DeepGEMM quantization
+
+    Returns:
+        Output tensor of shape (batch_size, output_dim) in bfloat16 format
+    """
+
+    def run_flashinfer_deepgemm_swapAB(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+    ) -> torch.Tensor:
+        return flashinfer_fp8_blockscale_gemm(
+            input=input,
+            weight=weight,
+            weight_scale=weight_scale,
+            out_dtype=torch.bfloat16,
+        )
+
+    def run_deepgemm(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+    ) -> torch.Tensor:
+        q_input, input_scale = per_token_group_quant_fp8(
+            input,
+            group_size=group_size,
+            column_major_scales=True,
+            use_ue8m0=use_deep_gemm_e8m0,
+        )
+        output = torch.empty(
+            (q_input.shape[0], weight.shape[0]),
+            dtype=torch.bfloat16,
+            device=q_input.device,
+        )
+        fp8_gemm_nt(
+            (q_input, input_scale),
+            (weight, weight_scale),
+            output,
+            is_deep_gemm_e8m0_used=use_deep_gemm_e8m0,
+        )
+        return output
+
+    condition = input.shape[0] < 32
+
+    # PyTorch's torch.compile cannot handle input-dependent control flow in standard
+    # Python conditionals. torch.cond() explicitly registers both code paths in the
+    # computation graph, allowing torch.compile to capture both branches.
+    # without torch.cond, the M < 32 condition won't be able to be captured by torch
+    # compile
+    return torch.cond(
+        condition,
+        run_flashinfer_deepgemm_swapAB,
+        run_deepgemm,
+        (input, weight, weight_scale),
+    )
+
+
+def _flashinfer_fp8_blockscale_gemm_fake(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    group_size: int,
+    use_deep_gemm_e8m0: bool,
+) -> torch.Tensor:
+    """
+    Required fake/meta implementation for torch.compile graph tracing.
+    """
+    return torch.empty(
+        input.shape[0], weight.shape[0], dtype=torch.bfloat16, device=input.device
+    )
+
+
+direct_register_custom_op(
+    "flashinfer_fp8_blockscale_gemm",
+    _flashinfer_fp8_blockscale_gemm_impl,
+    fake_impl=_flashinfer_fp8_blockscale_gemm_fake,
+)
+
+
 # TODO fix ROCm->Triton custom path:
 #  https://github.com/vllm-project/vllm/issues/14397
 class W8A8BlockFp8LinearOp:
@@ -247,8 +361,8 @@ class W8A8BlockFp8LinearOp:
         self.act_quant_group_shape = act_quant_group_shape
         self.is_deep_gemm_supported = is_deep_gemm_supported()
         self.is_hopper = current_platform.is_device_capability(90)
-        self.is_blackwell = current_platform.is_device_capability_family(100)
         self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
+        self.is_flashinfer_supported = is_flashinfer_fp8_blockscale_gemm_supported()
 
         # Get the correct blockscale mul and input quant operations.
         # We can't use _dispatch_w8a8_blockscale_op to figure out if we want
@@ -284,7 +398,14 @@ class W8A8BlockFp8LinearOp:
         output_shape = [*input.shape[:-1], weight.shape[0]]
         output_dtype = input.dtype
 
-        if should_use_deepgemm_for_fp8_linear(
+        if should_use_flashinfer_for_blockscale_fp8_gemm(
+            self.is_flashinfer_supported, output_dtype, input_2d, weight
+        ) and should_use_deepgemm_for_fp8_linear(
+            output_dtype, weight, self.is_deep_gemm_supported
+        ):
+            output = self._run_flashinfer(input_2d, weight, weight_scale)
+
+        elif should_use_deepgemm_for_fp8_linear(
             output_dtype, weight, self.is_deep_gemm_supported
         ):
             output = self._run_deepgemm(input_2d, weight, weight_scale)
@@ -303,7 +424,7 @@ class W8A8BlockFp8LinearOp:
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
     ) -> torch.Tensor:
-        if self.use_deep_gemm_e8m0 and self.is_blackwell:
+        if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
             q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
                 input_2d,
                 group_size=self.act_quant_group_shape.col,
@@ -412,6 +533,29 @@ class W8A8BlockFp8LinearOp:
             input_2d.dtype,
         )
 
+    def _run_flashinfer(
+        self,
+        input_2d: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Run FlashInfer FP8 block-scale GEMM.
+
+        This backend uses TensorRT-LLM's FP8 block-scale GEMM kernels
+        and supports FP8+FP8 (W8A8 full quantization) on SM90+ (Hopper).
+        """
+        # Now call FlashInfer with BF16 input + FP8 weight, input will be
+        # quantized with FlashInfer kernel (W8A8)
+        output = torch.ops.vllm.flashinfer_fp8_blockscale_gemm(
+            input=input_2d,  # BF16 input
+            weight=weight,  # FP8 weight
+            weight_scale=weight_scale,  # Weight scales
+            group_size=self.act_quant_group_shape.col,
+            use_deep_gemm_e8m0=self.use_deep_gemm_e8m0,
+        )
+        return output
+
     def _dispatch_w8a8_blockscale_op(
         self,
         use_cutlass: bool,
@@ -463,21 +607,6 @@ def input_to_float8(
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
-def block_quant_to_tensor_quant(
-    x_q_block: torch.Tensor,
-    x_s: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """This function converts block-wise quantization to tensor-wise
-    quantization. The inputs are block-wise quantization tensor `x_q_block`,
-    block-wise quantization scale and the block size.
-    The outputs are tensor-wise quantization tensor and tensor-wise
-    quantization scale. Note only float8 is supported for now.
-    """
-    x_dq_block = group_broadcast(x_q_block, x_s)
-    x_q_tensor, scale = input_to_float8(x_dq_block, dtype=x_q_block.dtype)
-    return x_q_tensor, scale
-
-
 @triton.jit
 def _per_token_group_quant_fp8(
     # Pointers to inputs and output
@@ -625,8 +754,9 @@ def silu_mul_per_token_group_quant_fp8_colmajor(
     M, N = input.size()
     N_2 = N // 2
 
+    fp8_dtype = current_platform.fp8_dtype()
     if output is None:
-        output = torch.empty((M, N_2), dtype=torch.float8_e4m3fn, device=input.device)
+        output = torch.empty((M, N_2), dtype=fp8_dtype, device=input.device)
 
     output_scales = torch.empty(
         ((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device
@@ -637,9 +767,12 @@ def silu_mul_per_token_group_quant_fp8_colmajor(
     assert M % BLOCK_M == 0
     assert N_2 % BLOCK_N == 0
 
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    fp8_min = finfo.min
-    fp8_max = finfo.max
+    # Using the default value (240.0) from pytorch will cause accuracy
+    # issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm
+    # platforms that use the torch.float8_e4m3fnuz dtype.
+    finfo = torch.finfo(fp8_dtype)
+    fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min
+    fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max
 
     # Force even division so we can avoid edgecases within the kernel.
     assert M % BLOCK_M == 0
@@ -762,12 +895,7 @@ def per_token_group_quant_fp8(
     )
     assert x.stride(-1) == 1, "`x` groups must be contiguous"
 
-    # Using the default value (240.0) from pytorch will cause accuracy
-    # issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm
-    # platforms that use the torch.float8_e4mefnuz dtype.
-    finfo = torch.finfo(dtype)
-    fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min
-    fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max
+    fp8_min, fp8_max = get_fp8_min_max()
 
     assert out_q is None or out_q.shape == x.shape
     x_q = out_q
@@ -1224,6 +1352,29 @@ def deepgemm_post_process_fp8_weight_block(
     return wq, dg_ws
 
 
+def prepare_fp8_moe_layer_for_deepgemm(
+    w13: torch.Tensor,
+    w2: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    block_shape: tuple[int],
+):
+    w13, w13_scale = deepgemm_post_process_fp8_weight_block(
+        wq=w13,
+        ws=w13_scale,
+        quant_block_shape=block_shape,
+        use_e8m0=is_deep_gemm_e8m0_used(),
+    )
+    w2, w2_scale = deepgemm_post_process_fp8_weight_block(
+        wq=w2,
+        ws=w2_scale,
+        quant_block_shape=block_shape,
+        use_e8m0=is_deep_gemm_e8m0_used(),
+    )
+
+    return w13, w2, w13_scale, w2_scale
+
+
 def _maybe_pad_fp8_weight(weight: torch.Tensor) -> torch.Tensor:
     """Pad the weight tensor. This is an optimization on ROCm platform, which
     can benefit from tensors located far enough from one another in memory"""
@@ -1252,6 +1403,14 @@ def validate_fp8_block_shape(
     """Validate block quantization shapes for tensor parallelism."""
     from vllm.distributed import get_tensor_model_parallel_world_size
 
+    if getattr(layer, "allow_fp8_block_shape_mismatch", False):
+        logger.debug(
+            "Skipping FP8 block shape validation for layer %s due to detected"
+            " mismatch allowance.",
+            getattr(layer, "prefix", "<unknown>"),
+        )
+        return
+
     tp_size = getattr(layer, "tp_size", get_tensor_model_parallel_world_size())
     block_n, block_k = block_size[0], block_size[1]
 
@@ -1437,17 +1596,63 @@ def maybe_post_process_fp8_weight_block(layer: torch.nn.Module):
         layer.orig_dtype, layer.weight
     )
     if should_use_deepgemm:
+        scale_attr = (
+            "weight_scale_inv" if hasattr(layer, "weight_scale_inv") else "weight_scale"
+        )
         dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block(
             wq=layer.weight.data,
-            ws=layer.weight_scale_inv.data,
+            ws=getattr(layer, scale_attr).data,
             quant_block_shape=tuple(layer.weight_block_size),
             use_e8m0=is_deep_gemm_e8m0_used(),
         )
         replace_parameter(layer, "weight", dg_weight)
-        replace_parameter(layer, "weight_scale_inv", dg_weight_scale)
+        replace_parameter(layer, scale_attr, dg_weight_scale)
+
+
+def process_fp8_weight_tensor_strategy_moe(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    shard_size: int,
+    num_experts: int,
+    is_act_and_mul: bool = True,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Process moe weights for tensor-wise quantization strategy."""
+    max_scales = weight_scales.max(dim=1).values
+
+    # For w1 case (i.e. not w13): there is already just one scale per expert.
+    if not is_act_and_mul:
+        assert weight_scales.shape[1] == 1
+        # One scale per expert
+        assert max_scales.shape == (num_experts,)
+        return weight, max_scales
+
+    # For w13 case (common): require single scale for w13 per expert, but
+    # on disk there is a scale for w1 and w3. Use the max to requantize.
+    for expert_id in range(num_experts):
+        start = 0
+        for shard_id in range(2):
+            dq_weight = per_tensor_dequantize(
+                weight[expert_id][start : start + shard_size, :],
+                weight_scales[expert_id][shard_id],
+            )
+            weight[expert_id][start : start + shard_size, :], _ = ops.scaled_fp8_quant(
+                dq_weight, max_scales[expert_id]
+            )
+            start += shard_size
+    return weight, max_scales
+
 
+def process_fp8_input_tensor_strategy_moe(
+    w13_input_scale: torch.Tensor,
+    w2_input_scale: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Process moe input scales for tensor-wise quantization strategy."""
+
+    if not all_close_1d(w13_input_scale) or not all_close_1d(w2_input_scale):
+        logger.info_once(
+            "Found input_scales that are not equal for "
+            "fp8 MoE layer. Using the maximum across experts "
+            "for each layer."
+        )
 
-def expert_weight_is_col_major(x: torch.Tensor) -> bool:
-    assert x.dim() == 3
-    b, m, n = x.shape
-    return x.stride(0) == m * n and x.stride(1) == 1 and x.stride(2) == m
+    return w13_input_scale.max(), w2_input_scale.max()
diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py
index 32192225f61e26cd08ec16a81903eeb7cdcf7afa..020098dffc3993d7b69492fa426737bfb7f18e1e 100644
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@@ -122,15 +122,17 @@ def _per_token_quant_int8(
 
 
 def per_token_quant_int8(x):
+    original_shape = x.shape
+    if x.dim() > 2:
+        x = x.view(-1, original_shape[-1])
     M = x.numel() // x.shape[-1]
     N = x.shape[-1]
-    x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
-    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=torch.float32)
+    x_q = torch.empty((M, N), device=x.device, dtype=torch.int8)
+    scales = torch.empty((M, 1), device=x.device, dtype=torch.float32)
     BLOCK = triton.next_power_of_2(N)
     # heuristics for number of warps
     num_warps = min(max(BLOCK // 256, 1), 8)
-
-    assert x.is_contiguous()
+    x = x.contiguous()
     _per_token_quant_int8[(M,)](
         x,
         x_q,
@@ -142,7 +144,8 @@ def per_token_quant_int8(x):
         num_warps=num_warps,
         num_stages=1,
     )
-
+    x_q = x_q.view(*original_shape)
+    scales = scales.view(*original_shape[:-1], 1)
     return x_q, scales
 
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 89c6043180011ed6b4f873910c69e55fb1e667bd..a397eafd7fd358d9a8575249fa3c22428330a521 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -42,13 +42,16 @@ def query_marlin_supported_quant_types(
     include_fp_type: bool = True,
     device_capability: int | None = None,
 ):
+    if current_platform.is_cpu():
+        return _query_cpu_marlin_supported_quant_types(has_zp, include_fp_type)
+
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
         device_capability = (
             -1 if capability_tuple is None else capability_tuple.to_int()
         )
 
-    if device_capability < 80:
+    if device_capability < 75:
         return []
 
     # - has_zp is True: return quant_types that has zero points
@@ -74,6 +77,33 @@ def query_marlin_supported_quant_types(
         return res
 
 
+def _query_cpu_marlin_supported_quant_types(
+    has_zp: bool | None = None,
+    include_fp_type: bool = True,
+):
+    # - has_zp is True: return quant_types that has zero points
+    # - has_zp is False: return quant_types that has not zero points
+    # - has_zp is None: both
+    if has_zp is None:
+        types0 = _query_cpu_marlin_supported_quant_types(
+            False,
+            include_fp_type,
+        )
+        types1 = _query_cpu_marlin_supported_quant_types(
+            True,
+            include_fp_type,
+        )
+        return types0 + types1
+
+    if has_zp:
+        # AWQ style, unsigned + runtime zero-point
+        return [scalar_types.uint4]
+    else:
+        # GPTQ style, unsigned + symmetric bias, only supports 4-bits for now
+        res = [scalar_types.uint4b8]
+        return res
+
+
 def _check_marlin_supported(
     quant_type: ScalarType,
     group_size: int | None,
@@ -488,7 +518,7 @@ def get__quant_fp8_method() -> QuantFP8:
     return _quant_fp8_method
 
 
-def get_marlin_input_dtype(prefix):
+def get_marlin_input_dtype(prefix: str | None = None):
     if envs.VLLM_MARLIN_INPUT_DTYPE is None:
         return
     elif envs.VLLM_MARLIN_INPUT_DTYPE.lower() == "int8":
@@ -616,9 +646,15 @@ def apply_awq_marlin_linear(
 
     a_scales = None
     if input_dtype == torch.int8:
+        assert quant_type == scalar_types.uint4, (
+            "W8A8-INT8 is not supported by marlin kernel."
+        )
         reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
         a_scales = a_scales * input_global_scale
     elif input_dtype == torch.float8_e4m3fn:
+        assert quant_type == scalar_types.uint4, (
+            "INT8 weight + FP8 activation is not supported."
+        )
         reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
 
     output = ops.gptq_marlin_gemm(
@@ -671,9 +707,15 @@ def apply_rtn_marlin_linear(
 
     a_scales = None
     if input_dtype == torch.int8:
+        assert quant_type == scalar_types.uint4b8, (
+            "W8A8-INT8 is not supported by marlin kernel."
+        )
         reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
         a_scales = a_scales * input_global_scale
     elif input_dtype == torch.float8_e4m3fn:
+        assert quant_type == scalar_types.uint4b8, (
+            "INT8 weight + FP8 activation is not supported."
+        )
         reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
 
     output = ops.gptq_marlin_gemm(
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index b94d5bbf36540565638ba9d331c45fd07520701d..2ced41ef886ad6adb769fd4a3cafafdd281f7515 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -8,6 +8,7 @@ import vllm._custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     USE_FP32_REDUCE_DEFAULT,
+    get_marlin_input_dtype,
     marlin_make_workspace_new,
     marlin_permute_bias,
     marlin_permute_scales,
@@ -23,7 +24,7 @@ logger = init_logger(__name__)
 
 
 def is_fp4_marlin_supported():
-    return current_platform.has_device_capability(80)
+    return current_platform.has_device_capability(75)
 
 
 def nvfp4_marlin_process_scales(marlin_scales):
@@ -154,6 +155,12 @@ def prepare_fp4_layer_for_marlin(
     )
 
     is_nvfp4 = hasattr(layer, "weight_scale_2")
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        if is_nvfp4:
+            raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
+        elif input_dtype != torch.float8_e4m3fn:
+            raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
+
     group_size = 16 if is_nvfp4 else 32
 
     part_size_n = layer.output_size_per_partition
@@ -220,6 +227,106 @@ def prepare_fp4_layer_for_marlin(
     return
 
 
+def prepare_nvfp4_moe_layer_for_marlin(
+    layer: torch.nn.Module,
+    w13: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w13_scale_2: torch.Tensor,
+    w2: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w2_scale_2: torch.Tensor,
+) -> tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor
+]:
+    logger.warning_once(
+        "Your GPU does not have native support for FP4 computation but "
+        "FP4 quantization is being used. Weight-only FP4 compression will "
+        "be used leveraging the Marlin kernel. This may degrade "
+        "performance for compute-heavy workloads."
+    )
+
+    input_dtype = get_marlin_input_dtype(prefix="")
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
+
+    GROUP_SIZE = 16
+    E = layer.num_experts
+    K = layer.hidden_size
+    N = layer.intermediate_size_per_partition
+
+    device = w13.device
+    param_dtype = layer.params_dtype
+    is_a_8bit = input_dtype is not None and input_dtype.itemsize == 1
+
+    # WORKSPACE
+    layer.workspace = marlin_make_workspace_new(device, 4)
+    perm = torch.empty(0, dtype=torch.int, device=device)
+
+    # WEIGHT
+    # Repack weights to marlin format
+    def repack_weight(weight: torch.Tensor, name: str) -> torch.Tensor:
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = N * 2, K
+        else:
+            size_n, size_k = K, N
+
+        assert weight.shape == (E, size_n, size_k // 2)
+
+        for i in range(E):
+            qweight = weight[i].view(torch.int32).T.contiguous()
+
+            marlin_qweight = ops.gptq_marlin_repack(
+                b_q_weight=qweight,
+                perm=perm,
+                size_k=size_k,
+                size_n=size_n,
+                num_bits=4,
+                is_a_8bit=is_a_8bit,
+            )
+            tensor_list.append(marlin_qweight)
+
+        return torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+
+    w13 = repack_weight(w13, "w13")
+    w2 = repack_weight(w2, "w2")
+
+    # WEIGHT SCALES
+    # Permute scales
+    def premute_scales(
+        scales: torch.Tensor, g_scales: torch.Tensor, name: str
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        scales = scales.to(param_dtype)
+        g_scales = g_scales.to(param_dtype)
+
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = N * 2, K
+        else:
+            size_n, size_k = K, N
+
+        for i in range(E):
+            scale = scales[i].T
+            marlin_scales = marlin_permute_scales(
+                s=scale,
+                size_k=size_k,
+                size_n=size_n,
+                group_size=GROUP_SIZE,
+                is_a_8bit=is_a_8bit,
+            )
+            marlin_scales = nvfp4_marlin_process_scales(marlin_scales)
+            tensor_list.append(marlin_scales)
+
+        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        g_scales = nvfp4_marlin_process_global_scale(g_scales)
+        return scales, g_scales
+
+    w13_scale, w13_scale_2 = premute_scales(w13_scale, w13_scale_2, "w13")
+    w2_scale, w2_scale_2 = premute_scales(w2_scale, w2_scale_2, "w2")
+
+    return w13, w13_scale, w13_scale_2, w2, w2_scale, w2_scale_2
+
+
 def prepare_moe_fp4_layer_for_marlin(
     layer: torch.nn.Module, input_dtype: torch.dtype | None = None
 ) -> None:
@@ -231,6 +338,12 @@ def prepare_moe_fp4_layer_for_marlin(
     )
 
     is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        if is_nvfp4:
+            raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
+        elif input_dtype != torch.float8_e4m3fn:
+            raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
+
     group_size = 16 if is_nvfp4 else 32
 
     e = layer.num_experts
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index c67e4f437cf0c16d068f8ae2e7d804f8aa7667ef..91b93c76cb323f1783bdce3c3063b384d3ff11fc 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -8,10 +8,10 @@ import vllm._custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     USE_FP32_REDUCE_DEFAULT,
+    get_marlin_input_dtype,
     marlin_make_workspace_new,
     marlin_permute_bias,
     marlin_permute_scales,
-    marlin_quant_input,
     should_use_atomic_add_reduce,
 )
 from vllm.model_executor.utils import replace_parameter
@@ -22,7 +22,7 @@ logger = init_logger(__name__)
 
 
 def is_fp8_marlin_supported():
-    return current_platform.has_device_capability(80)
+    return current_platform.has_device_capability(75)
 
 
 def fp8_fused_exponent_bias_into_scales(scales):
@@ -63,13 +63,11 @@ def apply_fp8_marlin_linear(
     inputs = reshaped_x
     a_scales = None
     if input_dtype is not None and input_dtype.itemsize == 1:
-        if input_dtype != torch.float8_e4m3fn:
-            raise RuntimeError("FP8 weight + INT8 activation is not supported.")
-
-        inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn)
+        # inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn)
+        raise RuntimeError("Marlin W8A8 is not supported.")
 
     output = ops.gptq_marlin_gemm(
-        a=reshaped_x,
+        a=inputs,
         c=None,
         b_q_weight=weight,
         b_bias=bias,
@@ -102,6 +100,8 @@ def prepare_fp8_layer_for_marlin(
         "be used leveraging the Marlin kernel. This may degrade "
         "performance for compute-heavy workloads."
     )
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        raise RuntimeError("Marlin W8A8 is not supported.")
 
     part_size_n = layer.output_size_per_partition
     part_size_k = layer.input_size_per_partition
@@ -145,10 +145,20 @@ def prepare_fp8_layer_for_marlin(
     # marlin kernel only support channel-wise and group-wise quantization
     # we need to convert the scales
     if weight_block_size is None:
+        logical_widths = getattr(layer, "logical_widths", [])
         if scales.nelement() == 1:
             # tensor-wise quantization -> channel-wise quantization
             # (1, 1) =>(repeat)=> (1, size_n)
             scales = scales.view(1, 1).repeat_interleave(part_size_n, 1)
+        elif scales.nelement() == len(logical_widths):
+            # tensor-wise quantization with logical_widths ->
+            #    channel-wise quantization
+            assert sum(logical_widths) == part_size_n, (
+                f"Sum of logical_widths ({sum(logical_widths)}) must be equal "
+                f"to part_size_n ({part_size_n})"
+            )
+            lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64)
+            scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1)
         elif scales.nelement() > 1 and scales.nelement() != part_size_n:
             assert part_size_n % scales.nelement() == 0
             s_size = scales.nelement()
@@ -188,17 +198,30 @@ def prepare_fp8_layer_for_marlin(
         replace_parameter(layer, "bias", bias)
 
 
-def prepare_moe_fp8_layer_for_marlin(
+def prepare_fp8_moe_layer_for_marlin(
     layer: torch.nn.Module,
-    size_k_first: bool = True,
-    input_dtype: torch.dtype | None = None,
-) -> None:
+    w13_weight: torch.Tensor,
+    w2_weight: torch.Tensor,
+    w13_weight_scale: torch.Tensor,
+    w2_weight_scale: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Shuffle weights and scales into marlin format.
+
+    Note that this function has the side effect of adding a `workspace`
+    attribute to the layer. This `workspace` does not need to be
+    registered as a Parameter as it is not used during weight reloading.
+    """
+
     logger.warning_once(
         "Your GPU does not have native support for FP8 computation but "
         "FP8 quantization is being used. Weight-only FP8 compression will "
         "be used leveraging the Marlin kernel. This may degrade "
         "performance for compute-heavy workloads."
     )
+    input_dtype = get_marlin_input_dtype()
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        raise NotImplementedError("Marlin W8A8 is not supported.")
 
     e = layer.num_experts
     k = layer.hidden_size
@@ -207,53 +230,42 @@ def prepare_moe_fp8_layer_for_marlin(
 
     # WORKSPACE
     device = layer.w13_weight.device
+    # NOTE(rob): we do not need to register the workspace as a param
+    # because it is not used as part of the weight reloading process.
     layer.workspace = marlin_make_workspace_new(device, 4)
     perm = torch.empty(0, dtype=torch.int, device=device)
 
     # WEIGHT
     # Repack weights to marlin format
-    for name in ["w13_weight", "w2_weight"]:
-        weight = getattr(layer, name)
+    def repack_weight(name: str, weight: torch.Tensor) -> torch.Tensor:
         tensor_list = []
         if "w13" in name:
             size_n, size_k = n * 2, k
         else:
             size_n, size_k = k, n
 
-        if size_k_first:
-            assert weight.shape == (e, size_k, size_n)
-        else:
-            assert weight.shape == (e, size_n, size_k)
+        assert weight.shape == (e, size_n, size_k)
 
         for i in range(e):
-            qweight = pack_fp8_to_int32(weight[i], size_k_first)
-            if not size_k_first:
-                qweight = qweight.T.contiguous()
+            qweight = pack_fp8_to_int32(weight[i], size_k_first=False)
+            qweight = qweight.T.contiguous()
 
             marlin_qweight = ops.gptq_marlin_repack(
                 b_q_weight=qweight, perm=perm, size_k=size_k, size_n=size_n, num_bits=8
             )
             tensor_list.append(marlin_qweight)
 
-        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
-        weight = torch.nn.Parameter(weight, requires_grad=False)
+        return torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
 
-        setattr(layer, name, weight)
+    w13_weight = repack_weight("w13", w13_weight)
+    w2_weight = repack_weight("w2", w2_weight)
 
     # WEIGHT SCALES
     # Permute scales
     group_size = -1 if weight_block_size is None else weight_block_size[1]
 
-    for name in ["w13", "w2"]:
-        if name + "_weight_scale" in dir(layer):
-            new_name = name + "_weight_scale"
-            scales = getattr(layer, new_name).to(layer.orig_dtype)
-            delattr(layer, new_name)
-        elif name + "_weight_scale_inv" in dir(layer):
-            new_name = name + "_weight_scale_inv"
-            scales = getattr(layer, new_name).to(layer.orig_dtype)
-            delattr(layer, new_name)
-
+    def permute_scales(scales: torch.Tensor, name: str) -> torch.Tensor:
+        scales = scales.to(layer.orig_dtype)
         tensor_list = []
         if "w13" in name:
             size_n, size_k = n * 2, k
@@ -283,8 +295,7 @@ def prepare_moe_fp8_layer_for_marlin(
             # block-wise quantization -> group-wise quantization
             # (e, size_k // block_size[1], ceil(size_n / block_size[0]))
             #  =>(repeat)=> (e, size_k // block_size[1], size_n)
-            if not size_k_first:
-                scales = scales.permute(0, 2, 1)
+            scales = scales.permute(0, 2, 1)
             block_n = weight_block_size[0]
             scales = scales.repeat_interleave(block_n, 2)
             # size_n may not divisible by block_size[0]
@@ -299,26 +310,12 @@ def prepare_moe_fp8_layer_for_marlin(
         scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
         if input_dtype != torch.float8_e4m3fn:
             scales = fp8_fused_exponent_bias_into_scales(scales)
-        scales = torch.nn.Parameter(scales, requires_grad=False)
-
-        setattr(layer, name + "_weight_scale", scales)
-
-    # BIAS
-    # Permute bias
-    for name in ["w13_bias", "w2_bias"]:
-        if not hasattr(layer, name):
-            continue
-        bias = getattr(layer, name).to(layer.orig_dtype)
-
-        tensor_list = []
-        for i in range(e):
-            expert_bias = bias[i]
+        return scales
 
-            tensor_list.append(marlin_permute_bias(expert_bias))
+    w13_weight_scale = permute_scales(w13_weight_scale, "w13")
+    w2_weight_scale = permute_scales(w2_weight_scale, "w2")
 
-        bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
-        bias = torch.nn.Parameter(bias, requires_grad=False)
-        setattr(layer, name, bias)
+    return w13_weight, w2_weight, w13_weight_scale, w2_weight_scale
 
 
 def pack_fp8_to_int32(
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index d01263f82007dc3c890d1899b1bd386b40cd817e..a6c4c702e67604a9a004ccfe5b1c1fe71f0f5f3a 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -19,6 +19,17 @@ FP8_DTYPE = current_platform.fp8_dtype()
 FP4_DTYPE = torch.uint8
 
 
+def get_fp8_min_max() -> tuple[float, float]:
+    """Get the min and max values for FP8 quantization."""
+    # Using the default value (240.0) from pytorch will cause accuracy
+    # issue on dynamic quantization models on ROCm. Here, use 224.0 for fnuz
+    # on ROCm platforms that use the torch.float8_e4m3fnuz dtype.
+    if current_platform.is_fp8_fnuz():
+        return -224.0, 224.0
+    finfo = torch.finfo(current_platform.fp8_dtype())
+    return finfo.min, finfo.max
+
+
 # Use proxy as NamedTuple direct subclasses cannot have static members
 class _GroupShape(NamedTuple):
     row: int
@@ -147,11 +158,14 @@ def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape):
 # with an extent of 1, since this can be done implicitly by pytorch
 def group_broadcast(t, shape):
     for i, s in enumerate(shape):
-        if t.shape[i] != s and t.shape[i] != 1:
-            assert s % t.shape[i] == 0
+        # If tensor has fewer dimensions than target shape, treat missing
+        # dimensions as size 1 (standard PyTorch broadcasting behavior)
+        t_dim_size = t.shape[i] if i < t.ndim else 1
+        if t_dim_size != s and t_dim_size != 1:
+            assert s % t_dim_size == 0
             t = (
                 t.unsqueeze(i + 1)
-                .expand(*t.shape[: i + 1], s // t.shape[i], *t.shape[i + 1 :])
+                .expand(*t.shape[: i + 1], s // t_dim_size, *t.shape[i + 1 :])
                 .flatten(i, i + 1)
             )
     return t
@@ -169,7 +183,16 @@ def scaled_quantize(
     x: torch.Tensor,
     group_shape: GroupShape,
     quant_dtype: torch.dtype,
+    compute_dtype: torch.dtype | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Args:
+        x: Input tensor to quantize
+        group_shape: Shape of quantization groups
+        quant_dtype: Target quantized dtype (e.g., torch.float8_e4m3fn)
+        compute_dtype: Optional dtype for intermediate computations.
+            If None, uses input dtype. Use torch.float32 for higher precision.
+    """
     group_shape = _normalize_quant_group_shape(x, group_shape)
     assert quant_dtype.is_floating_point, (
         "currently `scaled_quantize` only supports floating point dtypes "
@@ -178,11 +201,14 @@ def scaled_quantize(
 
     finfo = torch.finfo(quant_dtype)
 
+    # Convert to compute dtype if specified
+    x_compute = x if compute_dtype is None else x.to(compute_dtype)
+
     # Reshape (M, N) into (BLK_M, BLOCK_SIZE_M, BLK_N, BLOCK_SIZE_N)
     assert x.ndim == 2
     assert x.shape[0] % group_shape[0] == 0 and x.shape[1] % group_shape[1] == 0
     blk_m, blk_n = x.shape[0] // group_shape[0], x.shape[1] // group_shape[1]
-    x_blkd = x.reshape(blk_m, group_shape[0], blk_n, group_shape[1])
+    x_blkd = x_compute.reshape(blk_m, group_shape[0], blk_n, group_shape[1])
 
     # Permute to (BLK_M, BLK_N, BLOCK_SIZE_M, BLOCK_SIZE_N)
     x_blkd_permd = x_blkd.permute(0, 2, 1, 3)
diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py
index e2ce32cc290f7fff34c7e1cfd26a315f5cd82032..3eef02369d3246a515ed36dc036176a2b7d97172 100644
--- a/vllm/model_executor/layers/rotary_embedding/__init__.py
+++ b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -20,7 +20,9 @@ from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding
 from .xdrope import XDRotaryEmbedding
 from .yarn_scaling_rope import YaRNScalingRotaryEmbedding
 
-_ROPE_DICT: dict[tuple, RotaryEmbedding] = {}
+_ROPE_DICT: dict[tuple[Any, ...], RotaryEmbedding] = {}
+
+__all__ = ["RotaryEmbedding"]
 
 
 def get_rope(
diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py
index afa69324c4e2e2dff12980d05816adef13d34353..d63367af5fe68c3777185a4653a752955cf9efa9 100644
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -10,10 +10,13 @@ from vllm.model_executor.custom_op import CustomOp
 from .common import ApplyRotaryEmb
 
 
+# --8<-- [start:rotary_embedding]
 @CustomOp.register("rotary_embedding")
 class RotaryEmbeddingBase(CustomOp):
     """Original rotary positional embedding."""
 
+    # --8<-- [end:rotary_embedding]
+
     def __init__(
         self,
         head_size: int,
@@ -38,7 +41,10 @@ class RotaryEmbeddingBase(CustomOp):
         #                        and current_platform.is_cuda()
         #                        and has_flashinfer()
         #                        and self.head_size in [64, 128, 256, 512])
-        self.use_flashinfer = False
+
+        # Check if use_flashinfer is already set
+        if not hasattr(self, "use_flashinfer"):
+            self.use_flashinfer = False
 
         cache = self._compute_cos_sin_cache()
         if not self.use_flashinfer:
@@ -247,6 +253,28 @@ class RotaryEmbedding(RotaryEmbeddingBase):
             )
         return query, key
 
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        from vllm import _custom_ops as ops
+
+        self._match_cos_sin_cache_dtype(query)
+
+        # ops.rotary_embedding() is an in-place operation
+        # that updates the query and key tensors.
+        ops.rotary_embedding(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.cos_sin_cache,
+            self.is_neox_style,
+        )
+        return query, key
+
     def extra_repr(self) -> str:
         s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
         s += f", max_position_embeddings={self.max_position_embeddings}"
diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py
index 50660c6ecc223f0c0dea1a56ea1a32c19c3b7728..34de1da561f500568b67f095a52294597727ab85 100644
--- a/vllm/model_executor/layers/rotary_embedding/common.py
+++ b/vllm/model_executor/layers/rotary_embedding/common.py
@@ -118,8 +118,11 @@ direct_register_custom_op(
 )
 
 
+# --8<-- [start:apply_rotary_emb]
 @CustomOp.register("apply_rotary_emb")
 class ApplyRotaryEmb(CustomOp):
+    # --8<-- [end:apply_rotary_emb]
+
     def __init__(
         self,
         enforce_enable: bool = False,
@@ -178,6 +181,37 @@ class ApplyRotaryEmb(CustomOp):
             output = output.to(origin_dtype)
         return output
 
+    def _pre_process(
+        self,
+        x: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Size, torch.dtype]:
+        origin_shape = x.shape
+        if len(origin_shape) == 3:
+            # x: [seq_len, num_heads, head_size]
+            x = x.unsqueeze(0)
+
+        origin_dtype = x.dtype
+        if self.enable_fp32_compute:
+            x = x.float()
+            cos = cos.float()
+            sin = sin.float()
+
+        return x, cos, sin, origin_shape, origin_dtype
+
+    def _post_process(
+        self,
+        output: torch.Tensor,
+        origin_shape: torch.Size,
+        origin_dtype: torch.dtype,
+    ) -> torch.Tensor:
+        if len(origin_shape) == 3:
+            output = output.squeeze(0)
+        if self.enable_fp32_compute:
+            output = output.to(origin_dtype)
+        return output
+
     def forward_native(
         self,
         x: torch.Tensor,
@@ -197,16 +231,7 @@ class ApplyRotaryEmb(CustomOp):
     ) -> torch.Tensor:
         from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
 
-        origin_dtype = x.dtype
-        if self.enable_fp32_compute:
-            x = x.float()
-            cos = cos.float()
-            sin = sin.float()
-
-        origin_shape = x.shape
-        if len(origin_shape) == 3:
-            # x: [seq_len, num_heads, head_size]
-            x = x.unsqueeze(0)
+        x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)
 
         """
         Arguments of apply_rotary_emb() in vllm_flash_attn:
@@ -218,10 +243,7 @@ class ApplyRotaryEmb(CustomOp):
         interleaved = not self.is_neox_style
         output = apply_rotary_emb(x, cos, sin, interleaved)
 
-        if len(origin_shape) == 3:
-            output = output.squeeze(0)
-        if self.enable_fp32_compute:
-            output = output.to(origin_dtype)
+        output = self._post_process(output, origin_shape, origin_dtype)
         return output
 
     def forward_hip(
@@ -231,16 +253,7 @@ class ApplyRotaryEmb(CustomOp):
         sin: torch.Tensor,
     ) -> torch.Tensor:
         if self.apply_rotary_emb_flash_attn is not None:
-            origin_dtype = x.dtype
-            if self.enable_fp32_compute:
-                x = x.float()
-                cos = cos.float()
-                sin = sin.float()
-
-            origin_shape = x.shape
-            if len(origin_shape) == 3:
-                # x: [seq_len, num_heads, head_size]
-                x = x.unsqueeze(0)
+            x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)
 
             """
             Arguments of apply_rotary() in flash_attn:
@@ -254,10 +267,7 @@ class ApplyRotaryEmb(CustomOp):
                 x, cos, sin, interleaved=interleaved
             ).type_as(x)
 
-            if len(origin_shape) == 3:
-                output = output.squeeze(0)
-            if self.enable_fp32_compute:
-                output = output.to(origin_dtype)
+            output = self._post_process(output, origin_shape, origin_dtype)
         else:
             # Falling back to PyTorch native implementation.
             output = self.forward_native(x, cos, sin)
@@ -275,5 +285,5 @@ class ApplyRotaryEmb(CustomOp):
 
     def extra_repr(self) -> str:
         s = f"is_neox_style={self.is_neox_style}"
-        s += f"enable_fp32_compute={self.enable_fp32_compute}"
+        s += f", enable_fp32_compute={self.enable_fp32_compute}"
         return s
diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index e72834e473c1559866eb6daaa6407c2743ed9c4f..8402b65efcc04150f953c610d56a0183c9f49d1a 100644
--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -6,6 +6,7 @@ import math
 import torch
 
 from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer
 
 from .base import RotaryEmbeddingBase
 from .common import (
@@ -56,6 +57,13 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase):
             / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim))
             * attn_factor
         )
+        self.use_flashinfer = (
+            self.enabled()
+            and dtype in (torch.float16, torch.bfloat16)
+            and current_platform.is_cuda()
+            and has_flashinfer()
+            and head_size in [64, 128, 256, 512]
+        )
         super().__init__(
             head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
         )
@@ -162,4 +170,15 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase):
         key: torch.Tensor | None = None,
         offsets: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
-        return self.forward_native(positions, query, key, offsets)
+        if self.use_flashinfer:
+            torch.ops.vllm.flashinfer_rotary_embedding(
+                torch.add(positions, offsets) if offsets is not None else positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
+            return query, key
+        else:
+            return self.forward_native(positions, query, key, offsets)
diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
index b5dd94cc7f53197ebb081f5e12aad7ca5ff58951..e5dabe035b34eefd31b456afcd16e36510d6dcb2 100644
--- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
@@ -9,10 +9,13 @@ from vllm.model_executor.custom_op import CustomOp
 from .common import rotate_gptj, rotate_neox
 
 
+# --8<-- [start:dual_chunk_rotary_embedding]
 @CustomOp.register("dual_chunk_rotary_embedding")
 class DualChunkRotaryEmbedding(CustomOp):
     """Rotary positional embedding for Dual Chunk Attention."""
 
+    # --8<-- [end:dual_chunk_rotary_embedding]
+
     def __init__(
         self,
         head_size: int,
diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
index 9fdac309df7ee39cd56963d6afe5683f2c760c9b..f51429cd75c1ee04b554903701897d2767bcfdee 100644
--- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
@@ -60,14 +60,17 @@ class Llama4VisionRotaryEmbedding(RotaryEmbeddingBase):
         assert key is not None
         # self.cos_sin_cache here is complex tensor so we cannot cast into
         # query's dtype directly with self._match_cos_sin_cache_dtype
-        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
+
+        # NOTE: by not storing cos_sin_cache in self, we can avoid
+        # memory buffer update which is costly to runtime
+        cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
         query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
         key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
         broadcast_shape = [
             d if i == 1 or i == (query_.ndim - 1) else 1
             for i, d in enumerate(query_.shape)
         ]
-        freqs_ci = self.cos_sin_cache.view(*broadcast_shape)
+        freqs_ci = cos_sin_cache.view(*broadcast_shape)
         query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
         key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
         return query_out.type_as(query), key_out.type_as(key)
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 338148adfa2ba4fa2b560d1bcd62c2826aed39b7..d0aa67c896016e7002ef780cee5178d4be2d0fb7 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -210,6 +210,7 @@ def get_masked_input_and_mask(
     return input_, ~vocab_mask
 
 
+# --8<-- [start:vocab_parallel_embedding]
 @CustomOp.register("vocab_parallel_embedding")
 class VocabParallelEmbedding(CustomOp):
     """Embedding parallelized in the vocabulary dimension.
@@ -250,6 +251,8 @@ class VocabParallelEmbedding(CustomOp):
         prefix: full name of the layer in the state dict
     """  # noqa: E501
 
+    # --8<-- [end:vocab_parallel_embedding]
+
     def __init__(
         self,
         num_embeddings: int,
@@ -525,6 +528,7 @@ class VocabParallelEmbedding(CustomOp):
         return s
 
 
+# --8<-- [start:parallel_lm_head]
 @CustomOp.register("parallel_lm_head")
 class ParallelLMHead(VocabParallelEmbedding):
     """Parallelized LM head.
@@ -542,6 +546,8 @@ class ParallelLMHead(VocabParallelEmbedding):
         padding_size: padding size for the vocabulary.
     """
 
+    # --8<-- [end:parallel_lm_head]
+
     def __init__(
         self,
         num_embeddings: int,
diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py
index 94dfa478245d627607e395a086db8d8eabdd2a32..2238b0cfe38a4867a85bc9651aeee61c2abb432b 100644
--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
 import torch
 import torch.nn as nn
 
+import vllm.envs as envs
 from vllm.config import ModelConfig, VllmConfig
 from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
@@ -50,8 +51,21 @@ class BaseModelLoader(ABC):
                     vllm_config=vllm_config, model_config=model_config
                 )
 
+            log_model_inspection(model)
+
             logger.debug("Loading weights on %s ...", load_device)
             # Quantization does not happen in `load_weights` but after it
             self.load_weights(model, model_config)
             process_weights_after_loading(model, model_config, target_device)
+
         return model.eval()
+
+
+def log_model_inspection(model: nn.Module) -> None:
+    """Log model structure if VLLM_LOG_MODEL_INSPECTION=1."""
+    if not envs.VLLM_LOG_MODEL_INSPECTION:
+        return
+
+    from vllm.model_inspection import format_model_inspection
+
+    logger.info("vLLM model structure:\n%s", format_model_inspection(model))
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 97c7a20bc4d5a842d99d4954e16cbe48970e568f..aa020645021ea8cc200f57a89c6b362e48845273 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: SIM117
 import fnmatch
 import glob
 import itertools
@@ -59,7 +58,7 @@ def is_moe_model(model: torch.nn.Module) -> bool:
 
 
 class BitsAndBytesModelLoader(BaseModelLoader):
-    """Model loader to load model weights with BitAndBytes quantization."""
+    """Model loader to load model weights with BitsAndBytes quantization."""
 
     possible_config_file_names = ["adapter_config.json"]
 
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 30b526ed23734074cbc83dd52a892d3f712ddec5..b8d571e9e4260f4d1b99c16f3e90b6cd6e3d8b3f 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -30,7 +30,6 @@ from vllm.model_executor.model_loader.weight_utils import (
     pt_weights_iterator,
     safetensors_weights_iterator,
 )
-from vllm.platforms import current_platform
 from vllm.transformers_utils.repo_utils import list_filtered_repo_files
 
 logger = init_logger(__name__)
@@ -241,22 +240,6 @@ class DefaultModelLoader(BaseModelLoader):
                     self.load_config.pt_load_map_location,
                 )
 
-        if current_platform.is_tpu():
-            from vllm.platforms.tpu import USE_TPU_INFERENCE
-
-            if not USE_TPU_INFERENCE:
-                # In PyTorch XLA, we should call `torch_xla.sync`
-                # frequently so that not too many ops are accumulated
-                # in the XLA program.
-                import torch_xla
-
-                def _xla_weights_iterator(iterator: Generator):
-                    for weights in iterator:
-                        yield weights
-                        torch_xla.sync(wait=False)
-
-                weights_iterator = _xla_weights_iterator(weights_iterator)
-
         if self.counter_before_loading_weights == 0.0:
             self.counter_before_loading_weights = time.perf_counter()
         # Apply the prefix.
diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py
index 93da07c5501952fa36a4c3a34b3a7f6e5c9abcc5..9d3ade4cd97e356c59ff74ed2e9ae9aced59ccd4 100644
--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: SIM117
 import os
 from collections.abc import Generator
 
@@ -109,8 +108,8 @@ class RunaiModelStreamerLoader(BaseModelLoader):
     def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         """Load weights into a model."""
         model_weights = model_config.model
-        if hasattr(model_config, "model_weights"):
-            model_weights = model_config.model_weights
+        if model_weights_override := model_config.model_weights:
+            model_weights = model_weights_override
         model.load_weights(
             self._get_weights_iterator(model_weights, model_config.revision)
         )
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index 1538f0c2af6552c3f127140b9f718f4895b8d4db..e27cedd991c20976faa2861a68f7b15d0d4b2542 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -110,8 +110,8 @@ class ShardedStateLoader(BaseModelLoader):
         from vllm.distributed import get_tensor_model_parallel_rank
 
         model_weights = model_config.model
-        if hasattr(model_config, "model_weights"):
-            model_weights = model_config.model_weights
+        if model_weights_override := model_config.model_weights:
+            model_weights = model_weights_override
         local_model_path = model_weights
 
         rank = get_tensor_model_parallel_rank()
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index e4e530f0cea88827fa5af3e484066d38cf9672f7..5160d4842ab972a033f83bb9fe6aacf1406ba2fe 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -762,9 +762,12 @@ def tensorize_lora_adapter(lora_path: str, tensorizer_config: TensorizerConfig):
     if tensor_path.endswith(".safetensors"):
         tensors = safetensors.torch.load_file(tensor_path)
     elif tensor_path.endswith(".bin"):
-        tensors = torch.load(tensor_path)
+        tensors = torch.load(tensor_path, weights_only=True)
     else:
-        raise ValueError("Unsupported file: %s", tensor_path)
+        raise ValueError(
+            f"Unsupported adapter model file: {tensor_path}. "
+            f"Must be a .safetensors or .bin file."
+        )
 
     with open(config_path) as f:
         config = json.load(f)
diff --git a/vllm/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py
deleted file mode 100644
index fc142f1f07fae40e58c3c0d1918bb54e4539ca6d..0000000000000000000000000000000000000000
--- a/vllm/model_executor/model_loader/tpu.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
-
-import torch
-import torch.nn as nn
-import torch_xla.core.xla_model as xm
-import torch_xla.distributed.spmd as xs
-
-from vllm.config import ModelConfig, VllmConfig
-from vllm.distributed.tpu_distributed_utils import get_fqn, shard_model
-from vllm.logger import init_logger
-from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model,
-    process_weights_after_loading,
-)
-from vllm.utils.torch_utils import set_default_torch_dtype
-
-logger = init_logger(__name__)
-
-
-class TPUModelLoader(DefaultModelLoader):
-    """
-    A TPU model loader for model loading under SPMD mode.
-    """
-
-    def load_model(
-        self,
-        vllm_config: VllmConfig,
-        model_config: ModelConfig,
-        mesh: xs.Mesh | None = None,
-    ) -> nn.Module:
-        # Initialize model and load weights on CPU. Then, during SPMD partition,
-        # weights are sharded and transferred to TPUs.
-        self.counter_before_loading_weights = time.perf_counter()
-        model_config = vllm_config.model_config
-        assert model_config.quantization is None, "Quantization not supported"
-        target_device = torch.device("cpu")
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config)
-
-            load_format = vllm_config.load_config.load_format
-            if load_format != "dummy":
-                weights_to_load = {name for name, _ in model.named_parameters()}
-                all_weights = self.get_all_weights(model_config, model)
-                loaded_weights = model.load_weights(all_weights)
-                self.counter_after_loading_weights = time.perf_counter()
-                logger.info(
-                    "Loading weights took %.2f seconds",
-                    self.counter_after_loading_weights
-                    - self.counter_before_loading_weights,
-                )
-                # We only enable strict check for non-quantized models
-                # that have loaded weights tracking currently.
-                if model_config.quantization is None and loaded_weights is not None:
-                    weights_not_loaded = weights_to_load - loaded_weights
-                    if weights_not_loaded:
-                        raise ValueError(
-                            "Following weights were not initialized from "
-                            f"checkpoint: {weights_not_loaded}"
-                        )
-            else:
-                logger.info("Use dummy weight during weight loading.")
-
-            process_weights_after_loading(model, model_config, target_device)
-
-        counter_before_partition = time.perf_counter()
-        model = model.eval()
-        model = model.to("xla")
-        shard_model(model, mesh)
-        counter_after_partition = time.perf_counter()
-        logger.info(
-            "Partition model took %.2f seconds",
-            counter_after_partition - counter_before_partition,
-        )
-
-        # Ensure the model is properly loaded.
-        self._check_model_is_loaded(mesh, model)
-
-        # Need to torch compile after model sharding are done. Because the
-        # compiler hints ('xs.mark_sharding') are torch ops.
-        if not model_config.is_multimodal_model:
-            model.model = torch.compile(model.model, backend="openxla")
-        else:
-            model.language_model.model = torch.compile(
-                model.language_model.model, backend="openxla"
-            )
-        return model
-
-    def _check_model_is_loaded(self, mesh: xs.Mesh | None, model: nn.Module) -> None:
-        """
-        Ensure the model is properly loaded.
-        1. All model parameters and buffers are on XLA device.
-        2. Non-SPMD friendly layers are replaced as expected.
-        """
-        device = xm.xla_device()
-        device_type = str(device.type)
-
-        # Check parameters
-        for name, param in model.named_parameters():
-            assert param.device.type == device_type, (
-                f"Parameter {name} is on {param.device.type} instead of {device_type}"
-            )
-
-        # Check buffers
-        for name, buffer in model.named_buffers():
-            assert buffer.device.type == device_type, (
-                f"Buffer {name} is on {buffer.device.type} instead of {device_type}"
-            )
-
-        for module in model.modules():
-            if (mesh is not None) and (get_fqn(module) == "QKVParallelLinear"):
-                raise AssertionError(
-                    "QKVParallelLinear should be replaced by \
-                            XlaQKVParallelLinear under SPMD mode."
-                )
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 5a37a7061a9f40267f8786d95050626da54fba66..5ff4159b2138f0557b6414f32c89a60aba9cbb85 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -284,7 +284,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
             )
 
     convert_type = model_config.convert_type
-    if convert_type != "none" and supports_multimodal(model_cls):
+    if convert_type not in ["none", "mm_encoder_only"] and supports_multimodal(
+        model_cls
+    ):
         logger.debug_once("Detected conversion of Multi Modal model.")
         converted = try_create_mm_pooling_model_cls(model_cls)
         if converted is not None:
@@ -295,6 +297,11 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
 
     if convert_type == "none":
         pass
+    elif convert_type == "mm_encoder_only":
+        logger.debug_once("Converting to mm encoder only model.")
+        from vllm.model_executor.models.adapters import as_mm_encoder_only_model
+
+        model_cls = as_mm_encoder_only_model(model_cls)
     elif convert_type == "embed":
         logger.debug_once("Converting to embedding model.")
         model_cls = as_embedding_model(model_cls)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index ba4d7e5a2678796b0c41038133fd11a8e7480535..c06d51677601b9e77dd6c65fba2b21fd3dc367a8 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -23,6 +23,7 @@ import torch
 from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
 from safetensors.torch import load, load_file, safe_open, save_file
 from tqdm.auto import tqdm
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
 from vllm import envs
 from vllm.config import ModelConfig
@@ -448,12 +449,31 @@ def download_weights_from_hf(
             fs = HfFileSystem()
             file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
 
-            # Use the first pattern found in the HF repo's files.
-            for pattern in allow_patterns:
-                matching = fnmatch.filter(file_list, pattern)
-                if len(matching) > 0:
-                    allow_patterns = [pattern]
-                break
+            # If downloading safetensors and an index file exists, use the
+            # specific file names from the index to avoid downloading
+            # unnecessary files (e.g., from subdirectories like "original/").
+            index_file = f"{model_name_or_path}/{SAFE_WEIGHTS_INDEX_NAME}"
+            if "*.safetensors" in allow_patterns and index_file in file_list:
+                index_path = hf_hub_download(
+                    repo_id=model_name_or_path,
+                    filename=SAFE_WEIGHTS_INDEX_NAME,
+                    cache_dir=cache_dir,
+                    revision=revision,
+                )
+                with open(index_path) as f:
+                    weight_map = json.load(f)["weight_map"]
+                if weight_map:
+                    # Extra [] so that weight_map files are treated as a
+                    # single allow_pattern in the loop below
+                    allow_patterns = [list(set(weight_map.values()))]  # type: ignore[list-item]
+                else:
+                    allow_patterns = ["*.safetensors"]
+            else:
+                # Use the first pattern found in the HF repo's files.
+                for pattern in allow_patterns:
+                    if fnmatch.filter(file_list, pattern):
+                        allow_patterns = [pattern]
+                        break
         except Exception as e:
             logger.warning(
                 "Failed to get file list for '%s'. Trying each pattern in "
@@ -480,6 +500,9 @@ def download_weights_from_hf(
             )
             # If we have downloaded weights for this allow_pattern,
             # we don't need to check the rest.
+            # allow_pattern can be a list (from weight_map) or str (glob)
+            if isinstance(allow_pattern, list):
+                break
             if any(Path(hf_folder).glob(allow_pattern)):
                 break
         time_taken = time.perf_counter() - start_time
@@ -664,8 +687,8 @@ def safetensors_weights_iterator(
             # instead we reconstruct the subclasses here before returning
             if not torchao_version_at_least("0.15.0"):
                 raise ValueError(
-                    "Please use torchao version >= 0.15.0 \
-                        to load torchao safetensors checkpoint"
+                    "Please use torchao version >= 0.15.0 "
+                    "to load torchao safetensors checkpoint"
                 )
             from torchao.prototype.safetensors.safetensors_support import (
                 unflatten_tensor_state_dict,
@@ -831,28 +854,41 @@ def pt_weights_iterator(
     pt_load_map_location: str | dict[str, str] = "cpu",
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """Iterate over the weights in the model bin/pt files."""
-    total_count = 0
-    for bin_file in hf_weights_files:
-        state = torch.load(bin_file, map_location=pt_load_map_location, weights_only=True)
-        total_count += len(state)
-        del state
-        
-    current_count = 0
-    for bin_file in tqdm(
-        hf_weights_files,
-        desc="Loading pt checkpoint shards",
-        disable=not enable_tqdm(use_tqdm_on_load),
-        bar_format=_BAR_FORMAT,
-    ):
-        state = torch.load(
-            bin_file, map_location=pt_load_map_location, weights_only=True
-        )
-        for name, param in state.items():
-            current_count += 1
-            param.current_count = current_count
-            param.total_count = total_count
-            yield name, param
-        del state
+    if os.environ.get('LLAMA_NN') == '1':
+        total_count = 0
+        for bin_file in hf_weights_files:
+            state = torch.load(bin_file, map_location=pt_load_map_location, weights_only=True)
+            total_count += len(state)
+            del state
+            
+        current_count = 0
+        for bin_file in tqdm(
+            hf_weights_files,
+            desc="Loading pt checkpoint shards",
+            disable=not enable_tqdm(use_tqdm_on_load),
+            bar_format=_BAR_FORMAT,
+        ):
+            state = torch.load(
+                bin_file, map_location=pt_load_map_location, weights_only=True
+            )
+            for name, param in state.items():
+                current_count += 1
+                param.current_count = current_count
+                param.total_count = total_count
+                yield name, param
+            del state
+    else:
+        for bin_file in tqdm(
+            hf_weights_files,
+            desc="Loading pt checkpoint shards",
+            disable=not enable_tqdm(use_tqdm_on_load),
+            bar_format=_BAR_FORMAT,
+        ):
+            state = torch.load(
+                bin_file, map_location=pt_load_map_location, weights_only=True
+            )
+            yield from state.items()
+            del state
 
 
 def multi_thread_pt_weights_iterator(
@@ -1150,6 +1186,9 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
         # Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale ->
         # .self_attn.attn.{k,v}_scale
         (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"),
+        # NemotronH format: .mixer.{k,v}_proj.{k,v}_scale ->
+        # .mixer.attn.{k,v}_scale
+        (r"\.mixer\.[kv]_proj\.([kv])_scale$", r".mixer.attn.\1_scale"),
         # Default format: .{k,v}_scale -> .attn.{k,v}_scale
         (r"\.([kv])_scale$", r".attn.\1_scale"),
     ]
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 504de9fe108710c94ee724e0b67f3411f21db5d2..6f94bffe8131d60bab649efe248e90885c5d7717 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -252,19 +252,14 @@ def as_embedding_model(cls: _T) -> _T:
         return cls
 
     # Lazy import
-    from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+    from vllm.model_executor.layers.pooler import DispatchPooler
 
     class ModelForEmbedding(_create_pooling_model_cls(cls)):
         def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
             pooler_config = vllm_config.model_config.pooler_config
             assert pooler_config is not None
 
-            self.pooler = DispatchPooler(
-                {
-                    "token_embed": Pooler.for_token_embed(pooler_config),
-                    "embed": Pooler.for_embed(pooler_config),
-                },
-            )
+            self.pooler = DispatchPooler.for_embedding(pooler_config)
 
     ModelForEmbedding.__name__ = _get_pooling_model_name(cls.__name__, "ForEmbedding")
 
@@ -289,10 +284,7 @@ def as_seq_cls_model(cls: _T) -> _T:
 
     # Lazy import
     from vllm.model_executor.layers.linear import ReplicatedLinear
-    from vllm.model_executor.layers.pooler import (
-        DispatchPooler,
-        Pooler,
-    )
+    from vllm.model_executor.layers.pooler import DispatchPooler
     from vllm.model_executor.models.interfaces import SupportsCrossEncoding
 
     from .utils import maybe_prefix
@@ -318,24 +310,19 @@ def as_seq_cls_model(cls: _T) -> _T:
             pooler_config = vllm_config.model_config.pooler_config
             assert pooler_config is not None
 
-            self.pooler = DispatchPooler(
-                {
-                    "token_classify": Pooler.for_token_classify(
-                        pooler_config, classifier=self.score
-                    ),
-                    "classify": Pooler.for_classify(
-                        pooler_config, classifier=self.score, act_fn="classify"
-                    ),
-                    "score": Pooler.for_classify(
-                        pooler_config, classifier=self.score, act_fn="score"
-                    ),
-                }
+            self.pooler = DispatchPooler.for_seq_cls(
+                pooler_config, classifier=self.score
             )
 
         def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-            text_config = self.config.get_text_config()
-            tokens = getattr(text_config, "classifier_from_token", None)
-            method = getattr(text_config, "method", None)
+            hf_config = self.config
+            text_config = hf_config.get_text_config()
+            tokens = getattr(
+                hf_config,
+                "classifier_from_token",
+                getattr(text_config, "classifier_from_token", None),
+            )
+            method = getattr(hf_config, "method", getattr(text_config, "method", None))
 
             def auto_set_score_bias(weights):
                 for name, weight in weights:
@@ -366,9 +353,14 @@ def as_seq_cls_model(cls: _T) -> _T:
 class SequenceClassificationConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        text_config = vllm_config.model_config.hf_config.get_text_config()
-        method = getattr(text_config, "method", None)
-        tokens = getattr(text_config, "classifier_from_token", None)
+        hf_config = vllm_config.model_config.hf_config
+        text_config = hf_config.get_text_config()
+        method = getattr(hf_config, "method", getattr(text_config, "method", None))
+        tokens = getattr(
+            hf_config,
+            "classifier_from_token",
+            getattr(text_config, "classifier_from_token", None),
+        )
 
         if method is None:
             return
@@ -378,13 +370,15 @@ class SequenceClassificationConfig(VerifyAndUpdateConfig):
 
         if method == "from_2_way_softmax":
             assert len(tokens) == 2
+            hf_config.num_labels = 1
             text_config.num_labels = 1
         else:
+            hf_config.num_labels = len(tokens)
             text_config.num_labels = len(tokens)
 
-        # `llm as reranker` defaults to not using pad_token
-        use_pad_token = getattr(text_config, "use_pad_token", False)
-        text_config.use_pad_token = use_pad_token
+        # `llm as reranker` defaults to not using separating token.
+        use_sep_token = getattr(text_config, "use_sep_token", False)
+        text_config.use_sep_token = use_sep_token
 
 
 def load_weights_using_from_2_way_softmax(
@@ -396,25 +390,34 @@ def load_weights_using_from_2_way_softmax(
 
     model_config = model.vllm_config.model_config
     quant_config = model.vllm_config.quant_config
-    text_config = model.config.get_text_config()
+    hf_config = model.config
+    text_config = hf_config.get_text_config()
 
-    tokens = getattr(text_config, "classifier_from_token", [])
+    tokens = getattr(
+        hf_config,
+        "classifier_from_token",
+        getattr(text_config, "classifier_from_token", []),
+    )
     tokens = cast(list[int], tokens)
     assert len(tokens) == 2
 
-    model.lm_head = ParallelLMHead(
+    language_model = (
+        model.get_language_model() if hasattr(model, "get_language_model") else model
+    )
+    language_model.lm_head = ParallelLMHead(
         text_config.vocab_size, text_config.hidden_size, quant_config=quant_config
     )
     if text_config.tie_word_embeddings:
         # embed_tokens is the assumed name for input embeddings. If the model does not
         # have this attribute, we fall back to get_input_embeddings(), which is used by
         # the Transformers modeling backend.
+        text_backbone = language_model.model
         embed_tokens = (
-            model.model.embed_tokens
-            if hasattr(model.model, "embed_tokens")
-            else model.model.get_input_embeddings()
+            text_backbone.embed_tokens
+            if hasattr(text_backbone, "embed_tokens")
+            else text_backbone.get_input_embeddings()
         )
-        model.lm_head = model.lm_head.tie_weights(embed_tokens)
+        language_model.lm_head = language_model.lm_head.tie_weights(embed_tokens)
 
     # ModelForPooling is dynamically defined inside the _create_pooling_model_cls
     # function, so we need use this hacky method to obtain it.
@@ -434,17 +437,22 @@ def load_weights_using_from_2_way_softmax(
 
     false_id = tokenizer.convert_tokens_to_ids(tokens[0])
     true_id = tokenizer.convert_tokens_to_ids(tokens[1])
-    score_weight = model.lm_head.weight.data[[true_id]].to(
+    lm_head_weight = language_model.lm_head.weight
+    score_weight = lm_head_weight.data[[true_id]].to(
         torch.float32
-    ) - model.lm_head.weight.data[[false_id]].to(torch.float32)
+    ) - lm_head_weight.data[[false_id]].to(torch.float32)
 
     param = model.score.weight
     weight_loader = getattr(param, "weight_loader", default_weight_loader)
     weight_loader(param, score_weight)
 
-    del model.lm_head
+    del language_model.lm_head
     loaded_weights.add("score.weight")
-    loaded_weights.discard("lm_head.weight")
+
+    lm_head_name = "lm_head.weight"
+    if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
+        lm_head_name = hf_to_vllm_mapper._map_name(lm_head_name)
+    loaded_weights.discard(lm_head_name)
     return loaded_weights
 
 
@@ -516,7 +524,69 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]):
     #   - GemmaForCausalLM
     #     - bge-reranker-v2-gemma
 
-    text_config = model.vllm_config.model_config.hf_config.get_text_config()
-    method = getattr(text_config, "method", None)
+    hf_config = model.vllm_config.model_config.hf_config
+    text_config = hf_config.get_text_config()
+    method = getattr(hf_config, "method", getattr(text_config, "method", None))
     assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported"
     return SEQ_CLS_LOAD_METHODS[method](model, weights)
+
+
+def as_mm_encoder_only_model(cls: _T) -> _T:
+    """
+    Subclass an existing vLLM vl model to support mm encoder only for
+    EPD encoder instances.
+    """
+    if not hasattr(cls, "embed_multimodal"):
+        # Submodel case: return the original class.
+        return cls
+
+    if not hasattr(cls, "get_language_model_spec"):
+        raise TypeError(f"{cls} need to implement `get_language_model_spec` method.")
+
+    lm_model_cls, lm_attr = cls.get_language_model_spec()
+
+    if lm_model_cls is None or lm_attr is None:
+        raise TypeError(
+            f"{cls}.get_language_model_spec() must return (lm_model_cls, lm_attr)"
+        )
+
+    class DummyLM(nn.Module):
+        def __init__(self, *args, **kwargs):
+            self.make_empty_intermediate_tensors = None
+
+    class ModelForMMEncoderOnly(cls):
+        def __init__(
+            self,
+            *,
+            vllm_config: "VllmConfig",
+            prefix: str = "",
+            **kwargs: Any,
+        ) -> None:
+            self.is_mm_encoder_only_model = True
+            origin_init = lm_model_cls.__init__
+            try:
+                lm_model_cls.__init__ = DummyLM.__init__
+                super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
+
+                if hasattr(self, lm_attr):
+                    delattr(self, lm_attr)
+            finally:
+                lm_model_cls.__init__ = origin_init
+
+        def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+            from .utils import AutoWeightsLoader
+
+            origin_init_ = AutoWeightsLoader.__init__
+
+            def _new_init_(self, *args, **kwargs):
+                origin_init_(self, *args, **kwargs)
+                self.skip_prefixes = (self.skip_prefixes or []) + [f"{lm_attr}."]
+
+            try:
+                AutoWeightsLoader.__init__ = _new_init_
+                result = super().load_weights(weights)
+            finally:
+                AutoWeightsLoader.__init__ = origin_init_
+            return result
+
+    return ModelForMMEncoderOnly  # type: ignore
diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py
index f5dfe43067414a1d8f481c29519a9b53b5f08bed..ef6f59e447d257ced23dcad9a5d176f18fffa9ff 100644
--- a/vllm/model_executor/models/afmoe.py
+++ b/vllm/model_executor/models/afmoe.py
@@ -9,7 +9,6 @@ from itertools import islice
 import torch
 from torch import nn
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
@@ -50,6 +49,7 @@ from vllm.model_executor.models.utils import (
     maybe_prefix,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
 
 logger = init_logger(__name__)
 
@@ -475,6 +475,7 @@ class AfmoeModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index 3d000f3ac3ab5b9039dcfb3580356b67912018fb..b802bb0ee35bed02c6b7192a086369cc00995e7f 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -8,10 +8,10 @@ from collections.abc import Iterable
 import torch
 import torch.nn as nn
 
-from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.utils import divide
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -126,7 +126,7 @@ class AIMv2Attention(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = MultiHeadAttention(
+        self.attn = MMEncoderAttention(
             self.num_heads_per_partition, self.head_dim, self.scale
         )
 
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index e3f97a718b0f4e3b1ba52df20fc27b32ac4db780..7d43735c0053c7de26a56747cc7d46165c083499 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -32,13 +32,14 @@ import torch
 from torch import nn
 from transformers import ApertusConfig
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import XIELU
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -57,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index c6d7f19cbe90ddbba81a06bb7b75692c79b9adde..c7f44762f3936577fdb665ef6286da7e5ab9efba 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -127,11 +127,16 @@ class AriaProjectorMLP(nn.Module):
         in_features: int,
         hidden_features: int,
         output_dim: int,
+        prefix: str = "",
     ) -> None:
         super().__init__()
 
-        self.linear_in = ColumnParallelLinear(in_features, hidden_features, bias=False)
-        self.linear_out = RowParallelLinear(hidden_features, output_dim, bias=False)
+        self.linear_in = ColumnParallelLinear(
+            in_features, hidden_features, bias=False, prefix=f"{prefix}.linear_in"
+        )
+        self.linear_out = RowParallelLinear(
+            hidden_features, output_dim, bias=False, prefix=f"{prefix}.linear_out"
+        )
         self.act = get_act_fn("gelu_new")
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -154,7 +159,7 @@ class AriaProjector(nn.Module):
         A tensor with the shape of (batch_size, query_number, output_dim)
     """
 
-    def __init__(self, config: AriaConfig) -> None:
+    def __init__(self, config: AriaConfig, prefix: str = "") -> None:
         super().__init__()
 
         self.patch_to_query_dict = config.projector_patch_to_query_dict
@@ -174,7 +179,10 @@ class AriaProjector(nn.Module):
 
         self.layer_norm = nn.LayerNorm(self.in_features)
         self.feed_forward = AriaProjectorMLP(
-            self.in_features, self.hidden_features, self.output_dim
+            self.in_features,
+            self.hidden_features,
+            self.output_dim,
+            prefix=f"{prefix}.feed_forward",
         )
 
     def forward(
@@ -536,7 +544,9 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
             quant_config=quant_config,
             prefix=f"{prefix}.vision_tower",
         )
-        self.multi_modal_projector = AriaProjector(config)
+        self.multi_modal_projector = AriaProjector(
+            config, prefix=maybe_prefix(prefix, "multi_modal_projector")
+        )
         self.vocab_size = config.text_config.vocab_size
         self.language_model = AriaTextModel(
             vllm_config=vllm_config.with_hf_config(config.text_config),
diff --git a/vllm/model_executor/models/audioflamingo3.py b/vllm/model_executor/models/audioflamingo3.py
index 0ca5f2c4e0a754b12b23cb52a8cb69d167e0f9ba..3609cc26a4c6bbe0742edbfa97d3ceeb687e90f4 100644
--- a/vllm/model_executor/models/audioflamingo3.py
+++ b/vllm/model_executor/models/audioflamingo3.py
@@ -111,7 +111,7 @@ class AudioFlamingo3EmbeddingInputs(TensorSchema):
 
     audio_embeds: Annotated[
         list[torch.Tensor],
-        TensorShape("bn", "naf", "hs"),
+        TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
     ]
 
 
diff --git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py
index 98229c6d4ca1b9dade1f24d2e42df9ada1b3aa45..08bb13e95299edc4e71f4173d1b1dc38dfbc6b3b 100644
--- a/vllm/model_executor/models/bagel.py
+++ b/vllm/model_executor/models/bagel.py
@@ -346,6 +346,13 @@ class BagelForConditionalGeneration(
         }
     )
 
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<|image_pad|>"
+
+        raise ValueError("Only image modality is supported")
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -487,7 +494,7 @@ class BagelForConditionalGeneration(
         # Split by image
         return tuple(vision_embeds)
 
-    def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         """Get multimodal embeddings from input."""
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index 4bccee752174988bdd52b5ad3d7c4868fc8e36c7..e1e675bd5a057283ec5da04db2db2bbcba4903c7 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -476,6 +476,7 @@ class BailingMoeModel(nn.Module):
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index e774cd647ea8c02c7faa9ebcc546c6f147013e30..532175e7294c72d4976642816a158b1ea8d39b47 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -7,23 +7,33 @@ import torch
 from torch import nn
 from transformers import BertConfig
 
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, PoolerConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, PoolerConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
     RowParallelLinear,
 )
 from vllm.model_executor.layers.pooler import (
-    ClassifierPooler,
     DispatchPooler,
     Pooler,
-    PoolingMethod,
     PoolingParamsUpdate,
-    PoolingType,
+)
+from vllm.model_executor.layers.pooler.activations import LambdaPoolerActivation
+from vllm.model_executor.layers.pooler.seqwise import (
+    EmbeddingPoolerHead,
+    SequencePooler,
+    SequencePoolerOutput,
+    get_seq_pooling_method,
+)
+from vllm.model_executor.layers.pooler.tokwise import (
+    pooler_for_token_classify,
+    pooler_for_token_embed,
 )
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -55,7 +65,9 @@ class BertEmbedding(nn.Module):
             "position_ids",
             torch.arange(config.max_position_embeddings).unsqueeze(0),
         )
-        self.position_embedding_type = config.position_embedding_type
+        self.position_embedding_type = getattr(
+            config, "position_embedding_type", "absolute"
+        )
         if self.position_embedding_type != "absolute":
             raise ValueError(
                 "Only 'absolute' position_embedding_type" + " is supported"
@@ -81,38 +93,33 @@ class BertEmbedding(nn.Module):
         return embeddings
 
 
-class BertPooler(Pooler):
-    def __init__(self, config: BertConfig):
-        super().__init__()
-
-        self.pooling = PoolingMethod.from_pooling_type(PoolingType.CLS)
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.activation = nn.Tanh()
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return self.pooling.get_supported_tasks()
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return self.pooling.get_pooling_updates(task)
+class BertPooler(SequencePooler):
+    def __init__(self, model_config: ModelConfig):
+        pooler_config = model_config.pooler_config
+        assert pooler_config is not None
 
-    def _head(self, pooled_output: torch.Tensor):
-        pooled_output = self.dense(pooled_output)
-        pooled_output = self.activation(pooled_output)
-        return pooled_output
+        config: BertConfig = model_config.hf_config
 
-    def forward(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> torch.Tensor | list[torch.Tensor]:
-        pooled_output = self.pooling(hidden_states, pooling_metadata)
+        super().__init__(
+            pooling=get_seq_pooling_method(pooler_config.seq_pooling_type),
+            # We set this dummy to avoid adding parameters to nn.Module too early
+            head=nn.Identity(),
+        )
 
-        if isinstance(pooled_output, list):
-            pooled_output = [self._head(output) for output in pooled_output]
-        else:
-            pooled_output = self._head(pooled_output)
+        head_dtype = model_config.head_dtype
+        self.dense = nn.Linear(
+            config.hidden_size,
+            config.hidden_size,
+            dtype=head_dtype,
+        )
+        self.act_fn = nn.Tanh()
 
-        return pooled_output
+        # Use lambdas so that weights are not registered under `self.head`
+        self.head = EmbeddingPoolerHead(
+            head_dtype=head_dtype,
+            projector=lambda x: self.dense(x),
+            activation=LambdaPoolerActivation(self.act_fn),
+        )
 
 
 class BertEncoder(nn.Module):
@@ -356,7 +363,7 @@ class BertOutput(nn.Module):
 
 
 @support_torch_compile
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class BertModel(nn.Module, SupportsQuant):
     is_pooling_model = True
 
@@ -448,8 +455,7 @@ class BertPoolingModel(BertModel):
             embedding_class=embedding_class,
         )
 
-        config = vllm_config.model_config.hf_config
-        self.pooler = BertPooler(config)
+        self.pooler = BertPooler(vllm_config.model_config)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         other_weights, loaded_stacked_params = self._load_weights(weights)
@@ -460,7 +466,7 @@ class BertPoolingModel(BertModel):
         return loaded_params
 
 
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class BertEmbeddingModel(nn.Module, SupportsQuant):
     """A model that uses Bert to provide embedding functionalities.
 
@@ -518,12 +524,7 @@ class BertEmbeddingModel(nn.Module, SupportsQuant):
         )
 
     def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler:
-        return DispatchPooler(
-            {
-                "token_embed": Pooler.for_token_embed(pooler_config),
-                "embed": Pooler.for_embed(pooler_config),
-            }
-        )
+        return DispatchPooler.for_embedding(pooler_config)
 
 
 # Here we encode the token type ids together with the input ids.
@@ -614,6 +615,7 @@ class SPLADESparsePooler(Pooler):
         remove_cls_sep: bool = True,
     ):
         super().__init__()
+
         assert pooling in ("max", "sum")
         self.mlm_head = mlm_head
         self.cls_token_id = cls_token_id
@@ -631,10 +633,8 @@ class SPLADESparsePooler(Pooler):
         self,
         hidden_states: torch.Tensor,
         pooling_metadata: PoolingMetadata,
-    ) -> torch.Tensor:
-        assert isinstance(hidden_states, torch.Tensor) and hidden_states.dim() == 2
-
-        lens_tensor: torch.Tensor = pooling_metadata.prompt_lens
+    ) -> SequencePoolerOutput:
+        lens_tensor = pooling_metadata.prompt_lens
         lens: list[int] = lens_tensor.tolist()
         B: int = len(lens)
 
@@ -680,7 +680,7 @@ class SPLADESparsePooler(Pooler):
         return torch.stack(pooled_list, dim=0).contiguous()
 
 
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class BertSpladeSparseEmbeddingModel(BertEmbeddingModel):
     """
     BertEmbeddingModel + SPLADE sparse embedding.
@@ -716,6 +716,8 @@ class BertSpladeSparseEmbeddingModel(BertEmbeddingModel):
                 layer_norm_eps=getattr(cfg, "layer_norm_eps", 1e-12),
             )
 
+        # None of vLLM's built-in sequence pooling types are
+        # applicable so it is overwritten by SPLADESparsePooler
         pooling_mode = getattr(self, "_splade_pooling", "max")
 
         cls_id = getattr(cfg, "cls_token_id", None)
@@ -723,7 +725,7 @@ class BertSpladeSparseEmbeddingModel(BertEmbeddingModel):
 
         return DispatchPooler(
             {
-                "token_embed": Pooler.for_token_embed(pooler_config),
+                "token_embed": pooler_for_token_embed(pooler_config),
                 "embed": SPLADESparsePooler(
                     mlm_head=self.mlm_head,
                     cls_token_id=cls_id,
@@ -785,7 +787,7 @@ class BertSpladeSparseEmbeddingModel(BertEmbeddingModel):
         return loaded
 
 
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQuant):
     """A model that uses Bert to provide embedding functionalities.
 
@@ -818,20 +820,10 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQu
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config, classifier=self.classifier
-                ),
-                "classify": ClassifierPooler(
-                    pooling=self.bert.pooler,
-                    classifier=self.classifier,
-                    act_fn="classify",
-                ),
-                "score": ClassifierPooler(
-                    pooling=self.bert.pooler, classifier=self.classifier, act_fn="score"
-                ),
-            }
+        self.pooler = DispatchPooler.for_seq_cls(
+            pooler_config,
+            pooling=self.bert.pooler,
+            classifier=self.classifier,
         )
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
@@ -864,7 +856,7 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQu
 
 
 @attn_type("encoder_only")
-@default_pooling_type("ALL")
+@default_pooling_type(tok_pooling_type="ALL")
 class BertForTokenClassification(nn.Module):
     is_pooling_model = True
 
@@ -885,13 +877,7 @@ class BertForTokenClassification(nn.Module):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config=pooler_config
-                ),
-            }
-        )
+        self.pooler = pooler_for_token_classify(pooler_config)
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.bert.embed_input_ids(input_ids)
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index 131cb68914cf35613c217d265de49d86275d5df8..02950dc9e1d806bbe3daf5c9e372c30f31240634 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -6,7 +6,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -16,6 +15,9 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import get_act_and_mul_fn, get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.fused_moe import activation_without_mul, fused_topk
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -24,6 +26,7 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -37,7 +40,6 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
-from ..layers.pooler import ClassifierPooler, DispatchPooler, Pooler
 from .bert import BertPooler
 from .interfaces import SupportsCrossEncoding, SupportsQuant
 from .interfaces_base import default_pooling_type
@@ -439,7 +441,7 @@ class BertWithRopeEncoder(nn.Module):
 
 
 @support_torch_compile
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class BertWithRope(nn.Module, SupportsQuant):
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
 
@@ -451,6 +453,7 @@ class BertWithRope(nn.Module, SupportsQuant):
         add_pooling_layer: bool = False,
     ):
         super().__init__()
+
         self.vllm_config = vllm_config
         self.add_pooling_layer = add_pooling_layer
         self.config = vllm_config.model_config.hf_config
@@ -461,7 +464,11 @@ class BertWithRope(nn.Module, SupportsQuant):
             rotary_kwargs=self.config.rotary_kwargs,
             prefix=f"{prefix}.encoder",
         )
-        self.pooler = BertPooler(self.config) if add_pooling_layer else None
+
+        if add_pooling_layer:
+            self.pooler = BertPooler(vllm_config.model_config)
+        else:
+            self.pooler = None
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embeddings(input_ids)
@@ -668,7 +675,7 @@ class JinaRobertaModel(BertWithRope):
         return super().load_weights(weights)
 
 
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
     is_pooling_model = True
 
@@ -693,20 +700,10 @@ class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config, classifier=self.classifier
-                ),
-                "classify": ClassifierPooler(
-                    pooling=self.new.pooler,
-                    classifier=self.classifier,
-                    act_fn="classify",
-                ),
-                "score": ClassifierPooler(
-                    pooling=self.new.pooler, classifier=self.classifier, act_fn="score"
-                ),
-            }
+        self.pooler = DispatchPooler.for_seq_cls(
+            pooler_config,
+            pooling=self.new.pooler,
+            classifier=self.classifier,
         )
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index f31f99c0592b2c8c81d4a554c6efc76bbd5d5566..9279cccd596dea82424cc46a8f18c5472f98fecd 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -9,9 +9,9 @@ import torch
 import torch.nn as nn
 from transformers import Blip2VisionConfig, BlipVisionConfig
 
-from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -122,7 +122,7 @@ class BlipAttention(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = MultiHeadAttention(
+        self.attn = MMEncoderAttention(
             self.num_heads_per_partition, self.head_dim, self.scale
         )
 
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 1244f97a1bd6888fa8b4bdf20e2f56370747f210..2bd1dd1aef3b15f3cabcde8a7ee7d25ac2c49584 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -35,13 +35,15 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .blip import BlipVisionModel
+from .blip import BlipVisionModel, get_blip_num_patches
 from .interfaces import (
     MultiModalEmbeddings,
+    SupportsLoRA,
     SupportsMultiModal,
     SupportsPP,
     SupportsQuant,
 )
+from .module_mapping import MultiModelKeys
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
 
@@ -521,7 +523,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
     dummy_inputs=Blip2DummyInputsBuilder,
 )
 class Blip2ForConditionalGeneration(
-    nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
+    nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
@@ -538,9 +540,17 @@ class Blip2ForConditionalGeneration(
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
         self.multimodal_config = multimodal_config
+        vision_config = config.vision_config
+        self._vision_tokens_per_image = (
+            get_blip_num_patches(
+                image_size=vision_config.image_size,
+                patch_size=vision_config.patch_size,
+            )
+            + 1  # include class token
+        )
 
         # TODO: Optionally initializes this for supporting embeddings.
-        self.vision_model = BlipVisionModel(config.vision_config, quant_config)
+        self.vision_model = BlipVisionModel(vision_config, quant_config)
 
         self.query_tokens = nn.Parameter(
             torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)
@@ -691,3 +701,36 @@ class Blip2ForConditionalGeneration(
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector=["qformer", "language_projection"],
+            tower_model="vision_model",
+        )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        if num_image_tokens <= 0:
+            return 0
+        assert num_image_tokens % self.config.num_query_tokens == 0, (
+            "The number of image tokens must be a multiple of "
+            "the number of query tokens."
+        )
+        num_images = num_image_tokens / self.config.num_query_tokens
+        return num_images * self._vision_tokens_per_image
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        if num_vision_tokens <= 0:
+            return 0
+        assert num_vision_tokens % self._vision_tokens_per_image == 0, (
+            "The number of vision tokens must be a multiple of "
+            "the number of tokens per image."
+        )
+        num_images = num_vision_tokens / self._vision_tokens_per_image
+        return num_images * self.config.num_query_tokens
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 22f3ecad748e69756473d2225812fe9e27942359..1a4811ef8f995ab9b7aecfc1342859fd7af03bc5 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -14,18 +14,19 @@ from transformers import (
     CLIPVisionConfig,
 )
 
-from vllm.attention.layer import Attention, MultiHeadAttention
+from vllm.attention.layer import Attention
 from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -144,7 +145,7 @@ class CLIPProcessingInfo(BaseProcessingInfo):
                 image_width=image_width,
                 image_height=image_height,
             ),
-            _get_vision_feature_select_strategy(pooler_config.pooling_type),
+            _get_vision_feature_select_strategy(pooler_config.seq_pooling_type),
         )
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -352,9 +353,10 @@ class CLIPAttention(nn.Module):
         self,
         config: CLIPTextConfig | CLIPVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         prefix: str = "",
-        attn_cls: type[Attention] | type[MultiHeadAttention],
+        attn_cls: type[Attention] | type[MMEncoderAttention],
     ) -> None:
         super().__init__()
 
@@ -364,18 +366,24 @@ class CLIPAttention(nn.Module):
         self.head_dim = self.embed_dim // self.num_heads
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
-                "embed_dim must be divisible by num_heads "
-                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads})."
+                f"embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and "
+                f"`num_heads`: {self.num_heads})."
             )
         self.scale = self.head_dim**-0.5
 
+        use_data_parallel = (
+            multimodal_config.mm_encoder_tp_mode == "data"
+            if multimodal_config
+            else False
+        )
         self.qkv_proj = QKVParallelLinear(
             hidden_size=self.embed_dim,
             head_size=self.head_dim,
             total_num_heads=self.num_heads,
             quant_config=quant_config,
             prefix=f"{prefix}.qkv_proj",
+            disable_tp=use_data_parallel,
         )
 
         self.out_proj = RowParallelLinear(
@@ -383,17 +391,29 @@ class CLIPAttention(nn.Module):
             output_size=self.embed_dim,
             quant_config=quant_config,
             prefix=f"{prefix}.out_proj",
+            disable_tp=use_data_parallel,
         )
 
-        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = attn_cls(
-            self.num_heads_per_partition,
-            self.head_dim,
-            self.scale,
-            prefix=f"{prefix}.attn",
-        )
+        if attn_cls == MMEncoderAttention:
+            self.attn = attn_cls(
+                self.num_heads_per_partition,
+                self.head_dim,
+                self.scale,
+                prefix=f"{prefix}.attn",
+                multimodal_config=multimodal_config,
+            )
+        else:
+            self.attn = attn_cls(
+                self.num_heads_per_partition,
+                self.head_dim,
+                self.scale,
+                prefix=f"{prefix}.attn",
+            )
 
     def forward(
         self,
@@ -414,17 +434,26 @@ class CLIPMLP(nn.Module):
         self,
         config: CLIPTextConfig | CLIPVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
+
         self.config = config
+        use_data_parallel = (
+            multimodal_config.mm_encoder_tp_mode == "data"
+            if multimodal_config
+            else False
+        )
         self.activation_fn = get_act_fn(config.hidden_act)
+
         self.fc1 = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
             bias=True,
             quant_config=quant_config,
             prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
         )
         self.fc2 = RowParallelLinear(
             config.intermediate_size,
@@ -432,6 +461,7 @@ class CLIPMLP(nn.Module):
             bias=True,
             quant_config=quant_config,
             prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -447,19 +477,27 @@ class CLIPEncoderLayer(nn.Module):
         self,
         config: CLIPTextConfig | CLIPVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         prefix: str = "",
-        attn_cls: type[Attention] | type[MultiHeadAttention],
+        attn_cls: type[Attention] | type[MMEncoderAttention],
     ) -> None:
         super().__init__()
+
         self.self_attn = CLIPAttention(
             config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=f"{prefix}.self_attn",
             attn_cls=attn_cls,
         )
         self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.mlp = CLIPMLP(config, quant_config=quant_config, prefix=f"{prefix}.mlp")
+        self.mlp = CLIPMLP(
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.mlp",
+        )
         self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -490,10 +528,11 @@ class CLIPEncoder(nn.Module):
         self,
         config: CLIPTextConfig | CLIPVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         num_hidden_layers_override: int | None = None,
         *,
         prefix: str = "",
-        attn_cls: type[Attention] | type[MultiHeadAttention],
+        attn_cls: type[Attention] | type[MMEncoderAttention],
     ) -> None:
         super().__init__()
 
@@ -503,11 +542,13 @@ class CLIPEncoder(nn.Module):
             num_hidden_layers = config.num_hidden_layers
         else:
             num_hidden_layers = num_hidden_layers_override
+
         self.layers = nn.ModuleList(
             [
                 CLIPEncoderLayer(
                     config=config,
                     quant_config=quant_config,
+                    multimodal_config=multimodal_config,
                     prefix=f"{prefix}.layers.{layer_idx}",
                     attn_cls=attn_cls,
                 )
@@ -617,6 +658,7 @@ class CLIPVisionTransformer(nn.Module):
         self,
         config: CLIPVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         num_hidden_layers_override: int | None = None,
         require_post_norm: bool | None = None,
@@ -636,9 +678,10 @@ class CLIPVisionTransformer(nn.Module):
         self.encoder = CLIPEncoder(
             config=config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_override,
             prefix=f"{prefix}.encoder",
-            attn_cls=MultiHeadAttention,
+            attn_cls=MMEncoderAttention,
         )
 
         num_hidden_layers = config.num_hidden_layers
@@ -737,6 +780,7 @@ class CLIPVisionModel(nn.Module):
         self,
         config: CLIPVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         num_hidden_layers_override: int | None = None,
         require_post_norm: bool | None = None,
@@ -747,6 +791,7 @@ class CLIPVisionModel(nn.Module):
         self.vision_model = CLIPVisionTransformer(
             config=config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_override,
             require_post_norm=require_post_norm,
             prefix=f"{prefix}.vision_model",
@@ -774,7 +819,7 @@ class CLIPVisionModel(nn.Module):
 
 
 # Assume EOS token corresponds to LAST token in text model
-@default_pooling_type("LAST")
+@default_pooling_type(seq_pooling_type="LAST")
 @MULTIMODAL_REGISTRY.register_processor(
     CLIPMultiModalProcessor,
     info=CLIPProcessingInfo,
@@ -816,6 +861,7 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
         self.vision_model = CLIPVisionTransformer(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=maybe_prefix(prefix, "vision_model"),
         )
 
@@ -834,12 +880,7 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
         assert pooler_config is not None
         self.pooler_config = pooler_config
 
-        self.pooler = DispatchPooler(
-            {
-                "token_embed": Pooler.for_token_embed(pooler_config),
-                "embed": Pooler.for_embed(pooler_config),
-            }
-        )
+        self.pooler = DispatchPooler.for_embedding(pooler_config)
 
         # Assumes that self.forward is called after self.embed_input_ids
         self._is_text_input = True
@@ -867,7 +908,7 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     ) -> torch.Tensor:
         if feature_select_strategy is None:
             feature_select_strategy = _get_vision_feature_select_strategy(
-                self.pooler_config.pooling_type
+                self.pooler_config.seq_pooling_type
             )
 
         pooled_output = self.vision_model(
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 4b08472538db43a7bd6e711e145bb6bfff8c7cf3..e51a110ce0b32ee5bf16b39e764173feb1683160 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -4,16 +4,16 @@ from copy import deepcopy
 from math import lcm
 from typing import TYPE_CHECKING
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
 
 if TYPE_CHECKING:
-    from vllm.config import VllmConfig
+    from vllm.config import ModelConfig, VllmConfig
 
 logger = init_logger(__name__)
 
@@ -21,20 +21,24 @@ logger = init_logger(__name__)
 class VerifyAndUpdateConfig:
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        raise NotImplementedError
+        return
+
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        return
 
 
-class Gemma3TextModelConfig:
+class Gemma3TextModelConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        hf_config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        hf_config = model_config.hf_config
         hf_config.is_causal = not hf_config.use_bidirectional_attention
 
 
 class GteNewModelConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
 
         assert config.__class__.__name__ == "NewConfig"
         assert config.hidden_act == "gelu"
@@ -53,16 +57,15 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
 
 class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        pooler_config = vllm_config.model_config.pooler_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        pooler_config = model_config.pooler_config
         if pooler_config.use_activation is None:
             pooler_config.use_activation = False
 
 
 class JinaRobertaModelConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        model_config = vllm_config.model_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
         config = model_config.hf_config
 
         if config.position_embedding_type == "rotary":
@@ -88,10 +91,31 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
             }
 
 
+class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        from vllm.config.pooler import SequencePoolingType
+
+        hf_config = model_config.hf_config
+        hf_config.is_causal = False
+
+        pooling_type_map: dict[str, SequencePoolingType] = {
+            "avg": "MEAN",
+            "cls": "CLS",
+            "last": "LAST",
+        }
+
+        pooling_type = pooling_type_map.get(hf_config.pooling, None)
+        if pooling_type is None:
+            raise ValueError(f"pool_type {hf_config.pooling!r} not supported")
+
+        model_config.pooler_config.seq_pooling_type = pooling_type
+
+
 class NomicBertModelConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
 
         assert config.__class__.__name__ == "NomicBertConfig"
         assert config.activation_function in ["swiglu", "gelu"]
@@ -114,6 +138,10 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
         config.intermediate_size = config.n_inner
         config.hidden_size = config.n_embd
         config.num_hidden_layers = config.n_layer
+        model_config.model_arch_config.hidden_size = config.hidden_size
+        model_config.model_arch_config.total_num_hidden_layers = (
+            config.num_hidden_layers
+        )
 
         head_dim = config.hidden_size // config.num_attention_heads
         max_trained_positions = getattr(config, "max_trained_positions", 2048)
@@ -130,42 +158,43 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
         # The context extension uses vllm style rope_theta and rope_parameters.
         # See #17785 #18755
         if (
-            not vllm_config.model_config.hf_overrides
-            and vllm_config.model_config.original_max_model_len is None
+            not model_config.hf_overrides
+            and model_config.original_max_model_len is None
         ):
             # Default
             # Reset max_model_len to max_trained_positions.
             # nomic-embed-text-v2-moe the length is set to 512
             # by sentence_bert_config.json.
-            max_model_len_before = vllm_config.model_config.max_model_len
-            max_model_len = min(
-                vllm_config.model_config.max_model_len, max_trained_positions
-            )
+            max_model_len_before = model_config.max_model_len
+            max_model_len = min(model_config.max_model_len, max_trained_positions)
 
-            vllm_config.recalculate_max_model_len(max_model_len)
-            logger.warning(
-                "Nomic context extension is disabled. "
-                "Changing max_model_len from %s to %s. "
-                "To enable context extension, see: "
-                "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html",
-                max_model_len_before,
-                vllm_config.model_config.max_model_len,
+            model_config.max_model_len = model_config.get_and_verify_max_len(
+                max_model_len
             )
+
+            if model_config.max_model_len != max_model_len_before:
+                logger.warning(
+                    "Nomic context extension is disabled. "
+                    "Changing max_model_len from %s to %s. "
+                    "To enable context extension, see: "
+                    "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html",
+                    max_model_len_before,
+                    model_config.max_model_len,
+                )
         else:
             # We need to re-verify max_model_len to avoid lengths
             # greater than position_embedding.
-            model_config = vllm_config.model_config
             hf_text_config = model_config.hf_text_config
 
             if isinstance(model_config.hf_overrides, dict):
                 # hf_overrides_kw
                 max_model_len = model_config.hf_overrides.get(
-                    "max_model_len", vllm_config.model_config.max_model_len
+                    "max_model_len", model_config.max_model_len
                 )
             else:
                 # hf_overrides_fn
                 # This might be overridden by sentence_bert_config.json.
-                max_model_len = vllm_config.model_config.max_model_len
+                max_model_len = model_config.max_model_len
 
             # reset hf_text_config for recalculate_max_model_len.
             if hasattr(hf_text_config, "max_model_len"):
@@ -173,19 +202,27 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
             hf_text_config.max_position_embeddings = max_trained_positions
             hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"]
 
+            # Update the cached derived_max_model_len to enforce the limit
+            model_config.model_arch_config.derived_max_model_len_and_key = (
+                float(max_trained_positions),
+                "max_position_embeddings",
+            )
+
             # The priority of sentence_bert_config.json is higher
             # than max_position_embeddings
             encoder_config = deepcopy(model_config.encoder_config)
             encoder_config.pop("max_seq_length", None)
             model_config.encoder_config = encoder_config
 
-            vllm_config.recalculate_max_model_len(max_model_len)
+            model_config.max_model_len = model_config.get_and_verify_max_len(
+                max_model_len
+            )
 
 
 class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        pooler_config = vllm_config.model_config.pooler_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        pooler_config = model_config.pooler_config
 
         if pooler_config.step_tag_id is None:
             pooler_config.step_tag_id = 151651
@@ -193,8 +230,8 @@ class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
 
 class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        pooler_config = vllm_config.model_config.pooler_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        pooler_config = model_config.pooler_config
 
         if pooler_config.softmax is None:
             pooler_config.softmax = False
@@ -202,8 +239,8 @@ class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
 
 class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
 
         is_original_qwen3_reranker = getattr(
             config, "is_original_qwen3_reranker", False
@@ -215,25 +252,31 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
         tokens = getattr(config, "classifier_from_token", None)
         assert tokens is not None and len(tokens) == 2, (
             "Try loading the original Qwen3 Reranker?, see: "
-            "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py"
+            "https://github.com/vllm-project/vllm/tree/main/examples/pooling/score/qwen3_reranker_offline.py"
         )
-        vllm_config.model_config.hf_config.method = "from_2_way_softmax"
+        text_config = config.get_text_config()
+        text_config.method = "from_2_way_softmax"
+        text_config.classifier_from_token = tokens
+
+
+class Qwen3VLForSequenceClassificationConfig(Qwen3ForSequenceClassificationConfig):
+    pass
 
 
 class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
         config.num_labels = 1
-        pooler_config = vllm_config.model_config.pooler_config
+        pooler_config = model_config.pooler_config
         if pooler_config.logit_bias is None:
             pooler_config.logit_bias = 2.65
 
 
 class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
     @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
 
         assert config.__class__.__name__ == "GteConfig"
         assert config.hidden_act == "gelu"
@@ -308,12 +351,6 @@ class MambaModelConfig(VerifyAndUpdateConfig):
         if cache_config.mamba_block_size is None:
             cache_config.mamba_block_size = model_config.max_model_len
 
-        # TODO(tdoublep): remove once cascade attention is supported
-        logger.info(
-            "Disabling cascade attention since it is not supported for hybrid models."
-        )
-        model_config.disable_cascade_attn = True
-
 
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
     @classmethod
@@ -407,7 +444,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
             # of attention tokens that would fit mamba_page_size:
             # e.g. for mamba page size = 788kB
             #          attn_1_token = 2kB -> fits ~394 tokens
-            #      then round up to a mulitple of 256 -> 512 tokens
+            #      then round up to a multiple of 256 -> 512 tokens
             # End result:
             #  attn_block_size = 512
             #  mamba_block_size = 512 (aligned to a multiple of chunk_size)
@@ -515,10 +552,13 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteNewModel": GteNewModelConfig,
     "GteNewForSequenceClassification": GteNewModelConfig,
     "Gemma3TextModel": Gemma3TextModelConfig,
+    "LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig,
+    "LlamaBidirectionalModel": LlamaBidirectionalConfig,
     "NomicBertModel": NomicBertModelConfig,
     "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
     "Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
     "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
+    "Qwen3VLForSequenceClassification": Qwen3VLForSequenceClassificationConfig,
     "XLMRobertaModel": JinaRobertaModelConfig,
     "JinaVLForRanking": JinaVLForSequenceClassificationConfig,
     "JambaForSequenceClassification": JambaForSequenceClassificationConfig,
diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py
index 8f1660891fcbf9895c2485743df93986001766b1..b3e5d920e03aa47baaff86053494f8f647b79abc 100644
--- a/vllm/model_executor/models/deepencoder.py
+++ b/vllm/model_executor/models/deepencoder.py
@@ -18,7 +18,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import CLIPVisionConfig
 
-from vllm.attention.layer import MultiHeadAttention
+from vllm.config import MultiModalConfig
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -608,6 +609,7 @@ class DeepCLIPVisionTransformer(nn.Module):
         self,
         config: CLIPVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         num_hidden_layers_override: int | None = None,
         prefix: str = "",
@@ -626,9 +628,10 @@ class DeepCLIPVisionTransformer(nn.Module):
         self.transformer = CLIPEncoder(
             config=config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_override,
             prefix=f"{prefix}.encoder",
-            attn_cls=MultiHeadAttention,
+            attn_cls=MMEncoderAttention,
         )
 
         num_hidden_layers = config.num_hidden_layers
diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py
index 8f6b4a4b021f25113d1480e87e44f04a411260e8..5c439cdf486d238267839fd5851150078b1a8eea 100644
--- a/vllm/model_executor/models/deepseek_eagle.py
+++ b/vllm/model_executor/models/deepseek_eagle.py
@@ -106,6 +106,7 @@ class DeepseekV2Model(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index d6b2c5bd402f5e4fbbb2685bd72deffe7c3862a4..bdd249a10569a9554baa78f43f9b67d2bc1bb798 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -36,7 +36,6 @@ from .deepseek_v2 import (
     DeepseekV2MoE,
     get_spec_layer_idx_from_weight_name,
 )
-from .interfaces import SupportsPP
 from .utils import maybe_prefix
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.blockwise_int8 import BlockInt8Config
@@ -147,6 +146,7 @@ class DeepSeekMultiTokenPredictor(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
@@ -186,7 +186,7 @@ class DeepSeekMultiTokenPredictor(nn.Module):
 
 
 @support_torch_compile
-class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
+class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
@@ -262,6 +262,7 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
         ]
 
         expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index 1f07381c0cbd0de3cff2c80041adcd8b485b186b..f396897bfa5165881825d0c335f65be37679e953 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -14,9 +14,11 @@ from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.models.interfaces import (
     MultiModalEmbeddings,
+    SupportsLoRA,
     SupportsMultiModal,
     SupportsPP,
 )
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import (
     AutoWeightsLoader,
     WeightsMapper,
@@ -343,7 +345,7 @@ class DeepseekOCRMultiModalProcessor(
     info=DeepseekOCRProcessingInfo,
     dummy_inputs=DeepseekOCRDummyInputsBuilder,
 )
-class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # map prefix for language backbone
@@ -395,6 +397,7 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.vision_model = DeepCLIPVisionTransformer(
             config=clip_vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=maybe_prefix(prefix, "vision_model"),
         )
 
@@ -435,19 +438,16 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         if pixel_values is None or torch.sum(pixel_values).item() == 0:
             return None
 
-        if pixel_values is not None:
-            base_size = self.vision_config.image_size
-            return DeepseekOCRImagePixelInputs(
-                type="pixel_values",
-                data=pixel_values,
-                images_crop=images_crop,
-                images_spatial_crop=images_spatial_crop,
-                resolve_bindings={
-                    "base_size": base_size,
-                },
-            )
-
-        raise AssertionError("This line should be unreachable.")
+        base_size = self.vision_config.image_size
+        return DeepseekOCRImagePixelInputs(
+            type="pixel_values",
+            data=pixel_values,
+            images_crop=images_crop,
+            images_spatial_crop=images_spatial_crop,
+            resolve_bindings={
+                "base_size": base_size,
+            },
+        )
 
     def _encode_global_features(self, image_tensor: torch.Tensor) -> torch.Tensor:
         global_features_1 = self.sam_model(image_tensor)
@@ -589,3 +589,13 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         loader = AutoWeightsLoader(self)
         autoloaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
         return autoloaded_weights
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="projector",
+            tower_model=["sam_model", "vision_model"],
+        )
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 36357a7c913d725a70e4df79ed3ee7870421be7c..0162caba9a323dd9b2925188431a947776c020d8 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -36,9 +36,7 @@ from torch import nn
 from transformers import DeepseekV2Config, DeepseekV3Config
 
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -80,10 +78,12 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.mla.indexer import (
     DeepseekV32IndexerBackend,
     DeepseekV32IndexerMetadata,
 )
+from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton
 from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec
 from vllm.v1.worker.workspace import current_workspace_manager
 
@@ -701,7 +701,9 @@ def sparse_attn_indexer(
                 )
             fp8_mqa_logits_func = fp8_mqa_logits
             if current_platform.is_rocm():
-                # from vllm.attention.ops.rocm_aiter_mla_sparse import rocm_fp8_mqa_logits
+                # from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
+                #     rocm_fp8_mqa_logits,
+                # )
 
                 # fp8_mqa_logits_func = rocm_fp8_mqa_logits
                 fp8_mqa_logits_func = op.mqa_logits
@@ -758,7 +760,7 @@ def sparse_attn_indexer(
         num_padded_tokens = batch_size * next_n
         fp8_paged_mqa_logits_func = fp8_paged_mqa_logits
         if current_platform.is_rocm():
-            # from vllm.attention.ops.rocm_aiter_mla_sparse import (
+            # from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
             #     rocm_fp8_paged_mqa_logits,
             # )
 
@@ -865,7 +867,11 @@ class Indexer(nn.Module):
         )
         self.k_norm = LayerNorm(self.head_dim, eps=1e-6)
         self.weights_proj = ReplicatedLinear(
-            hidden_size, self.n_head, quant_config=None, prefix=f"{prefix}.weights_proj"
+            hidden_size,
+            self.n_head,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.weights_proj",
         )
         self.softmax_scale = self.head_dim**-0.5
 
@@ -904,8 +910,14 @@ class Indexer(nn.Module):
         )
 
         q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
-        q = torch.cat([q_pe.squeeze(0), q_nope], dim=-1)
-        k = torch.cat([k_pe.squeeze((0, 2)), k_nope], dim=-1)
+        # Note: RoPE (NeoX) can introduce extra leading dimensions during compilation
+        # so we need to reshape back to token-flattened shapes
+        q_pe = q_pe.reshape(-1, self.n_head, self.rope_dim)
+        k_pe = k_pe.reshape(-1, 1, self.rope_dim)
+
+        q = torch.cat([q_pe, q_nope], dim=-1)
+        # `k_pe` is [num_tokens, 1, rope_dim] (MQA).
+        k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1)
 
         # we only quant q here since k quant is fused with cache insertion
         if not current_platform.is_rocm() or torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938":
@@ -1525,6 +1537,7 @@ class DeepseekV2ForCausalLM(
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -1558,6 +1571,7 @@ class DeepseekV2ForCausalLM(
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -1637,7 +1651,11 @@ class DeepseekV2ForCausalLM(
                     # Determine split axis based on op type
                     # gate/up: ColumnParallel → split along dim 0
                     # down: RowParallel → split along dim 1
-                    split_dim = 1 if "down_proj.weight" in name else 0
+                    split_dim = (
+                        1
+                        if ("down_proj.weight" in name and loaded_weight.ndim > 1)
+                        else 0
+                    )
                     total = loaded_weight.shape[split_dim]
                     assert total % num_chunks == 0, (
                         f"Shared expert weight dim {total} "
@@ -1650,14 +1668,13 @@ class DeepseekV2ForCausalLM(
                     weight_to_load = loaded_weight
 
                     if is_fusion_moe_shared_experts_layer:
-                        if split_dim == 0:
-                            weight_to_load = loaded_weight[
-                                j * chunk_size : (j + 1) * chunk_size, :
-                            ]
+                        chunk_slice = slice(j * chunk_size, (j + 1) * chunk_size)
+                        if loaded_weight.ndim == 1:
+                            weight_to_load = loaded_weight[chunk_slice]
+                        elif split_dim == 0:
+                            weight_to_load = loaded_weight[chunk_slice, :]
                         else:
-                            weight_to_load = loaded_weight[
-                                :, j * chunk_size : (j + 1) * chunk_size
-                            ]
+                            weight_to_load = loaded_weight[:, chunk_slice]
                         # Synthesize an expert-style name so expert mapping
                         # can route it
                         chunk_name = name.replace(
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index 870a37039f151bc1f11a882220782993694d3d4d..b64f163761c86c0a6470bef448cdff6a11ebb150 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -424,6 +424,7 @@ class Dots1Model(nn.Module):
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index 6d8dbec9236c9a66725ccf72a25f37758c793291..ac9ad3b67d65bb40fb5c713a13b97e580acf64d6 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -8,10 +8,6 @@ import torch.nn as nn
 from torch.nn import LayerNorm
 from transformers.models.qwen2_vl import Qwen2VLProcessor
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import utils as dist_utils
@@ -20,6 +16,9 @@ from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -59,6 +58,7 @@ from vllm.multimodal.inputs import MultiModalDataDict
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionConfig
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .vision import run_dp_sharded_mrope_vision_model
 
@@ -271,6 +271,7 @@ class DotsVisionAttention(nn.Module):
         self.attn = MMEncoderAttention(
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
             multimodal_config=multimodal_config,
             prefix=f"{prefix}.attn",
         )
@@ -765,6 +766,14 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        merge_size = self.vision_tower.spatial_merge_size
+        return num_image_tokens * (merge_size**2)
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        merge_size = self.vision_tower.spatial_merge_size
+        return num_vision_tokens // (merge_size**2)
+
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index 215f5cca4a125367f092deec91a2ad9a1f074a5a..a07b488678407b38a473c2cf748e6d6f833025da 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -497,6 +497,7 @@ class Ernie4_5_MoeModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 61cf78fdb5a6742d724ed9f12404aceb9b901434..a382cb5b61fe011067e03950d8347f1740d22a3e 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -21,7 +21,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only Erine VL model compatible with HuggingFace weights."""
+"""Inference-only Ernie VL model compatible with HuggingFace weights."""
 
 import itertools
 import math
@@ -36,16 +36,15 @@ import torch.nn.functional as F
 from einops import rearrange
 from transformers import BatchFeature
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -64,7 +63,7 @@ from vllm.multimodal.inputs import (
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
-from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems, MultiModalDataParser
 from vllm.multimodal.processing import (
     BaseMultiModalProcessor,
     BaseProcessingInfo,
@@ -74,6 +73,7 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM
 from .interfaces import (
@@ -152,6 +152,7 @@ class Ernie4_5_VisionAttention(nn.Module):
         self.attn = MMEncoderAttention(
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
             multimodal_config=multimodal_config,
             prefix=f"{prefix}.attn",
         )
@@ -599,7 +600,11 @@ def smart_resize(
         w_bar = ceil_by_factor(width * beta, factor)
 
     if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
-        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
+        raise ValueError(
+            f"Invalid h_bar={h_bar}, w_bar={w_bar}: "
+            f"h_bar * w_bar must be >= min_pixels ({min_pixels}) "
+            f"and <= max_pixels ({max_pixels})."
+        )
 
     return h_bar, w_bar
 
@@ -952,6 +957,11 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
 
 
 class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessingInfo]):
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(
+            video_needs_metadata=True,
+        )
+
     def _pixel_values_norm(
         self,
         pixel_values: torch.Tensor,
@@ -1010,8 +1020,25 @@ class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessin
             mm_data["images"] = []
         if "videos" not in mm_data:
             mm_data["videos"] = []
+
+        # Check if HF processor supports video metadata
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
+        supports_video_metadata = getattr(
+            hf_processor, "supports_video_metadata", False
+        )
+
+        if mm_data["videos"] and not supports_video_metadata:
+            # Old HF processor, unwrap tuple to pure frames
+            logger.warning_once(
+                "HF processor doesn't support video metadata. "
+                "Timestamps will NOT be rendered. Please upgrade the model."
+            )
+            mm_data["videos"] = [
+                v[0] if isinstance(v, tuple) else v for v in mm_data["videos"]
+            ]
+
         processor_output = self.info.ctx.call_hf_processor(
-            self.info.get_hf_processor(**mm_kwargs),
+            hf_processor,
             dict(text=[prompt], images=mm_data["images"], videos=mm_data["videos"]),
             dict(**mm_kwargs, **tok_kwargs),
         )
@@ -1163,6 +1190,60 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
             ),
         }
 
+    def _get_dummy_videos(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_frames: int,
+        num_videos: int,
+        overrides: VideoDummyOptions | None = None,
+    ):
+        if overrides:
+            if overrides.num_frames:
+                if overrides.num_frames > num_frames:
+                    logger.warning(
+                        "video.num_frames override (%d) exceeds model's "
+                        "maximum number of frames (%d), will be ignored",
+                        overrides.num_frames,
+                        num_frames,
+                    )
+                num_frames = min(num_frames, overrides.num_frames)
+            if overrides.width:
+                if overrides.width > width:
+                    logger.warning(
+                        "video.width override (%d) exceeds model's "
+                        "maximum width (%d), will be ignored",
+                        overrides.width,
+                        width,
+                    )
+                width = min(width, overrides.width)
+            if overrides.height:
+                if overrides.height > height:
+                    logger.warning(
+                        "video.height override (%d) exceeds model's "
+                        "maximum height (%d), will be ignored",
+                        overrides.height,
+                        height,
+                    )
+                height = min(height, overrides.height)
+        num_frames = max(num_frames, 2)  # ernie4.5-vl requires at least 2 frames
+
+        video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
+        video_items = []
+        for i in range(num_videos):
+            video_metadata = {
+                "fps": 2.0,
+                "duration": num_frames / 2.0,
+                "total_num_frames": num_frames,
+                "frames_indices": [i for i in range(num_frames)],
+                "video_backend": "opencv",
+                "do_sample_frames": False,
+            }
+            video_item = (video.copy(), video_metadata)
+            video_items.append(video_item)
+        return video_items
+
 
 @MULTIMODAL_REGISTRY.register_processor(
     Ernie4_5VLMultiModalProcessor,
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 72f9957fc88286c80451c583e564ce6d6cb4914f..75be587eedb270dd2ce5dbc9b6a7fccf67c3ba6a 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -675,6 +675,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py
index 1b9abc3572a3b60b2afce236c42683a0593e2917..05c4277b1b84b3e62013e14fc327005836016e03 100644
--- a/vllm/model_executor/models/ernie_mtp.py
+++ b/vllm/model_executor/models/ernie_mtp.py
@@ -39,7 +39,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsPP
 from .llama import LlamaDecoderLayer
 from .utils import is_pp_missing_parameter, maybe_prefix
 
@@ -143,7 +142,7 @@ class ErnieMultiTokenPredictor(nn.Module):
         return logits
 
 
-class ErnieMTP(nn.Module, SupportsPP):
+class ErnieMTP(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index b4b7a798fd050538e34ad48bb08694d54310c8a2..cff82396ab30f816f8c705e72f14a3eaf0c9af11 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -72,6 +72,7 @@ class Exaone4GatedMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = True,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -88,6 +89,7 @@ class Exaone4GatedMLP(nn.Module):
             output_size=hidden_size,
             bias=bias,
             quant_config=quant_config,
+            reduce_results=reduce_results,
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccff419a424aead3f600abb494e9244ebb1453e7
--- /dev/null
+++ b/vllm/model_executor/models/exaone_moe.py
@@ -0,0 +1,579 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only K-EXAONE-236B-A22B model compatible with HuggingFace weights."""
+
+import typing
+from collections.abc import Callable, Iterable
+from itertools import islice
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.sequence import IntermediateTensors
+
+from .exaone4 import Exaone4Attention as ExaoneMoeAttention
+from .exaone4 import Exaone4GatedMLP as ExaoneMoeGatedMLP
+from .interfaces import SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+
+class ExaoneMoe(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts = config.num_experts
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty(config.num_experts, dtype=torch.float32)
+        )
+
+        # Load balancing settings.
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+
+        self.n_logical_experts = self.n_routed_experts
+        eplb_config.num_redundant_experts = (
+            eplb_config.num_redundant_experts
+            if eplb_config.num_redundant_experts is not None
+            else 0
+        )
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        self.experts = FusedMoE(
+            num_experts=self.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func="sigmoid",
+            routed_scaling_factor=self.routed_scaling_factor,
+            e_score_correction_bias=self.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+        )
+
+        if getattr(config, "num_shared_experts", 0) > 0:
+            intermediate_size = config.moe_intermediate_size * config.num_shared_experts
+            self.shared_experts = ExaoneMoeGatedMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=self.experts.must_reduce_shared_expert_outputs(),
+                prefix=f"{prefix}.shared_experts",
+            )
+        else:
+            self.shared_experts = None
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        if self.shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+            final_hidden_states = final_hidden_states + shared_output
+
+        if self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(orig_shape)
+
+
+class ExaoneMoeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        mtp_layer: bool = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        layer_idx = extract_layer_index(prefix)
+        self.hidden_size = config.hidden_size
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support abacusai/Smaug-72B-v0.1 with attention_bias
+        # Support internlm/internlm-7b with bias
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+
+        self.self_attn = ExaoneMoeAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=attention_bias,
+            cache_config=cache_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        if config.is_moe_layer[layer_idx] and not mtp_layer:
+            self.mlp = ExaoneMoe(
+                config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+            )
+        else:
+            self.mlp = ExaoneMoeGatedMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                bias=getattr(config, "mlp_bias", False),
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        # Self Attention
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class ExaoneMoeModel(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+        self.num_redundant_experts = (
+            vllm_config.parallel_config.eplb_config.num_redundant_experts
+        )
+
+        self.config = config
+        self.quant_config = quant_config
+        lora_vocab = (
+            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+            if lora_config
+            else 0
+        )
+        self.vocab_size = config.vocab_size + lora_vocab
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
+            self.embed_tokens = VocabParallelEmbedding(
+                self.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: ExaoneMoeDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+            )
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+            num_redundant_experts=self.num_redundant_experts,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        # Skip loading extra parameters for GPTQ/modelopt models.
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
+        for name, loaded_weight in weights:
+            if name.startswith("mtp."):
+                continue
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                is_expert_weight = False
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Anyway, this is an expert weight and should not be
+                    # attempted to load as other weights later
+                    is_expert_weight = True
+
+                    # Do not modify `name` since the loop may continue here
+                    # Instead, create a new variable
+                    name_mapped = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name_mapped, self):
+                        continue
+
+                    # Skip loading extra parameters for GPTQ/modelopt models.
+                    if (
+                        name_mapped.endswith(ignore_suffixes)
+                        and name_mapped not in params_dict
+                    ):
+                        continue
+
+                    param = params_dict[name_mapped]
+                    # We should ask the weight loader to return success or not
+                    # here since otherwise we may skip experts with other
+                    # available replicas.
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
+                    if success:
+                        name = name_mapped
+                        break
+                else:
+                    if is_expert_weight:
+                        continue
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # Skip loading extra parameters for GPTQ/modelopt models.
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class ExaoneMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config.get_text_config()
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        self.lora_config = lora_config
+        self.quant_config = quant_config
+
+        self.model = ExaoneMoeModel(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
+        if get_pp_group().is_last_rank:
+            self.unpadded_vocab_size = config.vocab_size
+            if lora_config:
+                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                padding_size=DEFAULT_VOCAB_PADDING_SIZE
+                # We need bigger padding if using lora for kernel
+                # compatibility
+                if not lora_config
+                else lora_config.lora_vocab_padding_size,
+                quant_config=quant_config,
+            )
+            if config.tie_word_embeddings:
+                self.lm_head.weight = self.model.embed_tokens.weight
+
+            logit_scale = getattr(config, "logit_scale", 1.0)
+            self.logits_processor = LogitsProcessor(
+                self.unpadded_vocab_size, config.vocab_size, logit_scale
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return model_output
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            # With tie_word_embeddings, we can skip lm_head.weight
+            # The weight might appear unnecessarily in the files if the model is
+            # processed with quantization, LoRA, fine-tuning, etc.
+            skip_prefixes=(
+                ["lm_head.", "mtp."] if self.config.tie_word_embeddings else ["mtp."]
+            ),
+        )
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/exaone_moe_mtp.py b/vllm/model_executor/models/exaone_moe_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3c71e6aef6ea2f2617ea85561c5d594b401aa0c
--- /dev/null
+++ b/vllm/model_executor/models/exaone_moe_mtp.py
@@ -0,0 +1,255 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only ExaoneMoe MTP model."""
+
+from collections.abc import Iterable
+
+import torch
+from torch import nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.exaone_moe import ExaoneMoeDecoderLayer
+from vllm.sequence import IntermediateTensors
+
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+KVCache = tuple[torch.Tensor, torch.Tensor]
+
+
+@support_torch_compile
+class ExaoneMoeMultiTokenPredictor(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        model_config = vllm_config.model_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+        config = model_config.hf_config
+
+        self.config = config
+        lora_vocab = (
+            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+            if lora_config
+            else 0
+        )
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+
+        self.mtp_start_layer_idx = config.num_hidden_layers
+        self.num_mtp_layers = getattr(config, "num_nextn_predict_layers", 1)
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+
+        self.fc = ColumnParallelLinear(
+            self.config.hidden_size * 2,
+            self.config.hidden_size,
+            gather_output=True,
+            bias=False,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc",
+        )
+        self.layers = nn.ModuleList(
+            ExaoneMoeDecoderLayer(
+                vllm_config.model_config.hf_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.layers.{idx}",
+                mtp_layer=True,
+            )
+            for idx in range(self.num_mtp_layers)
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_fc_norm_hidden = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_fc_norm_embedding = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is None:
+                inputs_embeds = self.get_input_embeddings(input_ids)
+            assert hidden_states.shape[-1] == inputs_embeds.shape[-1]
+            inputs_embeds = self.pre_fc_norm_embedding(inputs_embeds)
+            hidden_states = self.pre_fc_norm_hidden(hidden_states)
+            hidden_states = torch.cat([inputs_embeds, hidden_states], dim=-1)
+            hidden_states = self.fc(hidden_states)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        current_step_idx = spec_step_idx % self.num_mtp_layers
+        hidden_states, residual = self.layers[current_step_idx](
+            positions=positions,
+            hidden_states=hidden_states,
+            residual=residual,
+        )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name, self):
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+@support_torch_compile
+class ExaoneMoeMTP(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        self.vllm_config = vllm_config
+        cache_config = vllm_config.cache_config
+        assert not cache_config.enable_prefix_caching, (
+            "ExaoneMoeMTP currently does not support prefix caching"
+        )
+
+        self.quant_config = vllm_config.quant_config
+
+        super().__init__()
+        self.config = config
+        self.model = ExaoneMoeMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "mtp")
+        )
+        self.unpadded_vocab_size = config.vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            # padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(
+            self.unpadded_vocab_size, config.vocab_size
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+        **kwargs: object,
+    ):
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            hidden_states,
+            intermediate_tensors,
+            inputs_embeds,
+            spec_step_idx,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor | None:
+        return self.logits_processor(self.lm_head, hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        shared_weight_names = ["embed_tokens", "lm_head"]
+
+        def remap_weight_names(weights):
+            for name, weight in weights:
+                if name.startswith("mtp."):
+                    name = name.replace("mtp.", "model.")
+                elif not any(key in name for key in shared_weight_names):
+                    continue
+                yield name, weight
+
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(remap_weight_names(weights))
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 3e01239e2c3cb11f765f2844ffb39379982aa01f..c8a0ba8c9d3bcda1c94ee481e65d8ee7b6112f21 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -22,13 +22,15 @@ import torch
 from torch import nn
 from transformers import Gemma3TextConfig
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -38,14 +40,17 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader,
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
 
-from ...attention.layers.encoder_only_attention import EncoderOnlyAttention
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
@@ -463,12 +468,20 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
 
         super().__init__()
         self.config = config
-        # currently all existing Gemma models have `tie_word_embeddings` enabled
-        assert config.tie_word_embeddings
         self.quant_config = quant_config
         self.model = Gemma3Model(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
+
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
+
         self.logits_processor = LogitsProcessor(
             config.vocab_size, soft_cap=config.final_logit_softcapping
         )
@@ -496,7 +509,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor | None:
-        logits = self.logits_processor(self.model.embed_tokens, hidden_states)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/gemma3n_audio_utils.py b/vllm/model_executor/models/gemma3n_audio_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bef9bb9a0d4e225eb21e9603889a47933490e346
--- /dev/null
+++ b/vllm/model_executor/models/gemma3n_audio_utils.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Lightweight utility functions for Gemma3n audio processing.
+
+This module is separate from gemma3n_mm.py to avoid heavy CUDA dependencies,
+making it testable without a full vLLM build.
+"""
+
+import torch
+
+
+def adjust_audio_features_to_expected_length(
+    audio_features: torch.Tensor,
+    expected_tokens: int,
+    audio_padding_embs: torch.Tensor,
+) -> tuple[torch.Tensor, int]:
+    """Adjust audio features to expected token length via padding or truncation.
+
+    The Gemma3nProcessor expects all audio will be ~30s in length and inserts
+    a fixed number of audio soft tokens into the text. However, the audio
+    preprocessing and encoder do not guarantee they will produce exactly that
+    many soft tokens; they may produce fewer tokens (for shorter audio) or more
+    tokens (for longer audio or due to BOA/EOA special tokens).
+
+    This function handles both cases:
+    - If fewer tokens: pad with the provided padding embeddings
+    - If more tokens: truncate to the expected count
+
+    Args:
+        audio_features: Audio embeddings tensor of shape
+            (batch_size, seq_len, embed_dim)
+        expected_tokens: The expected number of audio tokens (e.g., 188)
+        audio_padding_embs: Padding embeddings tensor of shape (1, 1, embed_dim)
+
+    Returns:
+        Tuple of:
+        - adjusted_features: Audio features adjusted to expected_tokens length
+        - tokens_truncated: Number of tokens truncated (0 if padding was applied)
+    """
+    audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape
+    tokens_truncated = 0
+
+    if audio_seq_len < expected_tokens:
+        # Pad to expected length with padding embeddings
+        extra_padding_tokens = expected_tokens - audio_seq_len
+        extra_padding_features = audio_padding_embs.expand(
+            audio_batch_size, extra_padding_tokens, audio_embed_dim
+        )
+        audio_features = torch.cat((audio_features, extra_padding_features), dim=1)
+    elif audio_seq_len > expected_tokens:
+        # Truncate to expected length (audio encoder produced more tokens
+        # than expected, e.g., due to longer audio or placeholder mismatch)
+        tokens_truncated = audio_seq_len - expected_tokens
+        audio_features = audio_features[:, :expected_tokens, :]
+
+    return audio_features, tokens_truncated
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 7036118ada0845e87e9694588e17944503be3a0f..acb0d739902ba54d5f46e80a8b96a994aeeca0b8 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -1,11 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union, cast
+from typing import Annotated, Any, Literal, cast
 
 import numpy as np
 import torch
-
 from torch import nn
 from transformers import AutoModel, BatchFeature
 from transformers.models.gemma3n import (
@@ -26,6 +25,9 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import RowParallelLinear
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
+from vllm.model_executor.models.gemma3n_audio_utils import (
+    adjust_audio_features_to_expected_length,
+)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -105,12 +107,12 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(Gemma3nProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "audio": None}
 
     def get_max_tokens_per_item(
         self, seq_len: int, mm_counts: Mapping[str, int]
-    ) -> Optional[Mapping[str, int]]:
+    ) -> Mapping[str, int] | None:
         return {"image": TOKENS_PER_IMAGE, "audio": TOKENS_PER_AUDIO}
 
     def get_image_repl(
@@ -118,7 +120,7 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Gemma3nProcessor],
+        processor: Gemma3nProcessor | None,
     ) -> str:
         """
         Get the replacement text for image tokens.
@@ -136,7 +138,7 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
     def get_audio_repl(
         self,
         *,
-        processor: Optional[Gemma3nProcessor],
+        processor: Gemma3nProcessor | None,
     ) -> str:
         """
         Get the replacement text for audio tokens.
@@ -168,7 +170,7 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_audios = mm_counts.get("audio", 0)
@@ -387,7 +389,7 @@ class Gemma3nMultimodalEmbedder(nn.Module):
 
     def __init__(
         self,
-        multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig],
+        multimodal_config: Gemma3nAudioConfig | Gemma3nVisionConfig,
         text_config: Gemma3nTextConfig,
     ):
         super().__init__()
@@ -427,8 +429,8 @@ class Gemma3nMultimodalEmbedder(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        input_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Embeds token ids or soft tokens for multimodal content into language model space.
 
@@ -529,7 +531,7 @@ class Gemma3nForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Gemma3nImageInputs]:
+    ) -> Gemma3nImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         # TODO is this the case?
@@ -541,7 +543,7 @@ class Gemma3nForConditionalGeneration(
 
     def _parse_and_validate_audio_input(
         self, **kwargs: object
-    ) -> Optional[Gemma3nAudioInputs]:
+    ) -> Gemma3nAudioInputs | None:
         input_features_padded = kwargs.pop("input_features_padded", None)
         if input_features_padded is None:
             return None
@@ -616,12 +618,15 @@ class Gemma3nForConditionalGeneration(
         )
         audio_features = self.embed_audio(inputs_embeds=audio_outputs)
 
-        # ruff: noqa
-        # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the
-        # text to account for this. However, the audio preprocessing and encoder do not guarantee they will
-        # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens
-        # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad
-        # the audio feature out to 188 soft tokens with the embedding of the last token in the embed_audio vocab.
+        # The Gemma3nProcessor expects all audio will be 30s in length and
+        # inserts 188 audio soft tokens into the text to account for this.
+        # However, the audio preprocessing and encoder do not guarantee they
+        # will produce exactly 188 soft tokens; they may produce fewer tokens
+        # (for shorter audio) or more tokens (for longer audio or due to
+        # BOA/EOA special tokens in the placeholder sequence).
+        # We handle both cases:
+        # - If fewer tokens: pad with the embedding of the last vocab token
+        # - If more tokens: truncate to the expected count
         # TODO precompute and cache padding
         audio_padding_toks = torch.tensor(
             [[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device
@@ -631,13 +636,18 @@ class Gemma3nForConditionalGeneration(
             audio_mask.unsqueeze(-1), audio_padding_embs, audio_features
         )
 
-        audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape
-        extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len  # noqa: E501
-        extra_padding_features = audio_padding_embs.expand(
-            audio_batch_size, extra_padding_tokens, audio_embed_dim
+        expected_tokens = self.config.audio_soft_tokens_per_image
+        audio_features, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, audio_padding_embs
         )
+        if tokens_truncated > 0:
+            logger.warning(
+                "Gemma3n audio encoder produced %d extra tokens. "
+                "Truncating to match placeholder count of %d.",
+                tokens_truncated,
+                expected_tokens,
+            )
 
-        audio_features = torch.cat((audio_features, extra_padding_features), dim=1)
         # Return a list of embeddings instead of a batched tensor
         return audio_features.unbind(0)
 
@@ -666,9 +676,9 @@ class Gemma3nForConditionalGeneration(
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache
@@ -701,8 +711,8 @@ class Gemma3nForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
@@ -729,7 +739,7 @@ class Gemma3nForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
@@ -747,7 +757,7 @@ class Gemma3nForConditionalGeneration(
         )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality == "image":
             return "<image_soft_token>"
         elif modality == "audio":
@@ -761,10 +771,10 @@ class Gemma3nForConditionalGeneration(
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
         model_config: ModelConfig,
-        language: Optional[str],
+        language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
-        to_language: Optional[str],
+        to_language: str | None,
     ) -> PromptType:
         """
         Gemma3n supports "free-form" transcription.
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 145f7f501d0df816317e197b642c430721dd774b..2f8eca2147a6fa92a7cc42fcdbaf65e3ed928d16 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -31,7 +31,6 @@ from torch import nn
 from transformers import Glm4Config
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -43,6 +42,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .llama import LlamaMLP as Glm4MLP
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 9904d5663d0799f5fd1206f80c58c6d57e642bcc..7d465bedc330b34ff2d67b84bd094d17de60e8ca 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -47,15 +47,14 @@ from transformers.models.glm4v.image_processing_glm4v import (
 from transformers.models.glm4v.video_processing_glm4v import Glm4vVideoProcessor
 from transformers.video_utils import VideoMetadata
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -90,6 +89,7 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from ..layers.activation import SiluAndMul
 from .interfaces import (
@@ -305,6 +305,7 @@ class Glm4vVisionAttention(nn.Module):
         self.attn = MMEncoderAttention(
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
             multimodal_config=multimodal_config,
         )
 
@@ -1815,6 +1816,20 @@ class Glm4vForConditionalGeneration(
             tower_model="visual.",
         )
 
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        merge_size = self.config.vision_config.spatial_merge_size
+        return num_image_tokens * (merge_size**2)
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        merge_size = self.config.vision_config.spatial_merge_size
+        return num_vision_tokens // (merge_size**2)
+
 
 @MULTIMODAL_REGISTRY.register_processor(
     Glm4vMultiModalProcessor,
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 541d3b2beff83219410680219bb18963df49dbac..efa6c1cfe93ca42bd0574fc7ccff9565d3861bf0 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -21,7 +21,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only GLM-4.5, GLM-4.6 model compatible with HuggingFace weights."""
+"""Inference-only GLM-4.5, GLM-4.6, GLM-4.7 model
+compatible with HuggingFace weights."""
 
 import typing
 from collections.abc import Callable, Iterable
@@ -196,6 +197,7 @@ class Glm4MoE(nn.Module):
             e_score_correction_bias=self.gate.e_score_correction_bias,
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
+            router_logits_dtype=torch.float32,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -476,24 +478,11 @@ class Glm4MoeModel(nn.Module):
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def make_empty_intermediate_tensors(
-        self, batch_size: int, dtype: torch.dtype, device: torch.device
-    ) -> IntermediateTensors:
-        return IntermediateTensors(
-            {
-                "hidden_states": torch.zeros(
-                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
-                ),
-                "residual": torch.zeros(
-                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
-                ),
-            }
-        )
-
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py
index e34ae6c85a4f8f156ecbadc76e5969f715141aec..25a57c8301326e678540496d4fc5420afce55755 100644
--- a/vllm/model_executor/models/glm4_moe_mtp.py
+++ b/vllm/model_executor/models/glm4_moe_mtp.py
@@ -47,7 +47,6 @@ from .glm4_moe import (
     Glm4MoeDecoderLayer,
     get_spec_layer_idx_from_weight_name,
 )
-from .interfaces import SupportsPP
 from .utils import maybe_prefix
 
 
@@ -106,7 +105,7 @@ class Glm4MoeMultiTokenPredictorLayer(nn.Module):
     ) -> torch.Tensor:
         assert inputs_embeds is not None
         # masking inputs at position 0, as not needed by MTP
-        inputs_embeds[positions == 0] = 0
+        inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)
         inputs_embeds = self.enorm(inputs_embeds)
         previous_hidden_states = self.hnorm(previous_hidden_states)
 
@@ -184,7 +183,7 @@ class Glm4MoeMultiTokenPredictor(nn.Module):
         return logits
 
 
-class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
+class Glm4MoeMTP(nn.Module, Glm4MixtureOfExperts):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
@@ -248,6 +247,7 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -263,10 +263,16 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
             elif name == "model.embed_tokens.weight":
                 spec_layer = self.model.mtp_start_layer_idx
             else:
-                spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
-                if spec_layer is None:
-                    continue
-                name = self._rewrite_spec_layer_name(spec_layer, name)
+                if name == "lm_head.weight":
+                    spec_layer = self.model.mtp_start_layer_idx
+                    name = f"model.layers.{spec_layer}.shared_head.head.weight"
+                elif name == "model.embed_tokens.weight":
+                    spec_layer = self.model.mtp_start_layer_idx
+                else:
+                    spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+                    if spec_layer is None:
+                        continue
+                    name = self._rewrite_spec_layer_name(spec_layer, name)
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
@@ -309,6 +315,12 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
                         continue
+                    # Some checkpoints include weight scale tensors for the
+                    # LM head even when the quantized head isn't built. Skip
+                    # them if the model does not expose a matching parameter
+                    # to avoid KeyError during load.
+                    if name.endswith(".weight_scale") and name not in params_dict:
+                        continue
 
                     # According to DeepSeek-V3 Technical Report, MTP modules
                     # shares embedding layer. We only load the first weights.
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index ec5af94e297c1ca6abcb09a26bb8b4ac55359b3b..297237fd196ad4c589d47b83accf069533c8ed42 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -19,11 +19,11 @@ from transformers import BatchFeature, PreTrainedTokenizer, TensorType
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 
-from vllm.attention.layer import MultiHeadAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -135,7 +135,7 @@ class EVA2CLIPAttention(nn.Module):
             prefix=f"{prefix}.dense",
         )
 
-        self.attn = MultiHeadAttention(
+        self.attn = MMEncoderAttention(
             self.num_heads_per_rank, self.head_dim, self.scale
         )
         self.output_dropout = torch.nn.Dropout(config.dropout_prob)
diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9a1221934568f5e920fdc122a88d4dfd9b7ba
--- /dev/null
+++ b/vllm/model_executor/models/glmasr.py
@@ -0,0 +1,1154 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Annotated, Any, Literal, TypeAlias, cast
+
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import BatchFeature
+from transformers.models.glmasr import GlmAsrConfig, GlmAsrProcessor
+from transformers.models.whisper import WhisperFeatureExtractor
+
+from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
+from vllm.inputs.data import PromptType
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding.common import ApplyRotaryEmb
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    DictEmbeddingItems,
+    ModalityData,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import cached_tokenizer_from_config
+from vllm.transformers_utils.processor import cached_processor_from_config
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .glmasr_utils import (
+    DEFAULT_CONV_PARAMS,
+    DEFAULT_MAX_AUDIO_LEN_S,
+    DEFAULT_MERGE_FACTOR,
+    _flatten_audio_features_by_length,
+    _get_audio_output_lengths_for_tower,
+    _group_audio_embeddings,
+    _normalize_chunk_counts,
+)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsTranscription,
+)
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
+from .whisper import ISO639_1_SUPPORTED_LANGS
+
+
+class GlmAsrEncoderRotaryEmbedding(nn.Module):
+    """
+    Rotary Position Embedding for GLM-ASR encoder.
+
+    Computes rotary position embeddings on-demand for efficiency.
+    Only caches inv_freq as a buffer; cos/sin are computed during forward
+    to avoid wasted computation during initialization and ensure correct
+    device placement.
+    """
+
+    def __init__(self, config) -> None:
+        super().__init__()
+
+        # Compute inverse frequencies following transformers implementation
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+
+        # Handle rope_parameters if present (for compatibility with transformers config)
+        if hasattr(config, "rope_parameters") and config.rope_parameters:
+            base = config.rope_parameters.get("rope_theta", 10000.0)
+            partial_rotary_factor = config.rope_parameters.get(
+                "partial_rotary_factor", 1.0
+            )
+            dim = int(head_dim * partial_rotary_factor)
+            self.attention_scaling = config.rope_parameters.get(
+                "attention_scaling", 1.0
+            )
+        else:
+            base = getattr(config, "rope_theta", 10000.0)
+            dim = head_dim
+            self.attention_scaling = 1.0
+
+        self.dim = dim
+        self.head_dim = head_dim
+
+        # Only cache inv_freq; cos/sin computed on-demand in correct device
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seq_len: int) -> torch.Tensor:
+        """
+        Compute rotary position frequencies for given sequence length.
+
+        Args:
+            seq_len: The sequence length to compute embeddings for.
+
+        Returns:
+            Frequency tensor with shape [seq_len, dim/2]. Use .cos() and
+            .sin() to get the rotary embedding components.
+        """
+        # Compute on the same device as inv_freq (automatically correct after .to())
+        seq = torch.arange(
+            seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+        )
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs * self.attention_scaling
+
+
+class GlmAsrEncoderAttention(nn.Module):
+    """
+    Optimized Multi-headed Grouped Query Attention for GLM-ASR encoder.
+
+    Uses vLLM's QKVParallelLinear for fused projections, ApplyRotaryEmb for
+    rotary position embeddings, and MMEncoderAttention for hardware-optimized
+    attention computation with automatic backend selection.
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.num_kv_heads = getattr(
+            config, "num_key_value_heads", config.num_attention_heads
+        )
+        self.head_dim = self.hidden_size // self.num_heads
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_per_rank = self.num_heads // self.tp_size
+        self.num_kv_heads_per_rank = max(1, self.num_kv_heads // self.tp_size)
+
+        # Use QKVParallelLinear for fused QKV projection
+        # Note: GLM-ASR uses bias on Q and V, but not K
+        # For simplicity with QKVParallelLinear, we use bias=True for all
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # Use vLLM's ApplyRotaryEmb CustomOp
+        # enforce_enable=True ensures the op is always enabled (important for ViT)
+        self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)
+
+        # Use vLLM's MMEncoderAttention for hardware-optimized attention
+        # Automatically selects Flash Attention, SDPA, or Pallas based on device
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_heads_per_rank,
+            head_size=self.head_dim,
+            scale=self.head_dim**-0.5,
+            num_kv_heads=self.num_kv_heads_per_rank,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: [batch_size, seq_len, hidden_size]
+            rotary_pos_emb_cos: [seq_len, rotary_dim/2] - cosine of rotary embeddings
+            rotary_pos_emb_sin: [seq_len, rotary_dim/2] - sine of rotary embeddings
+
+        Returns:
+            [batch_size, seq_len, hidden_size]
+        """
+        batch_size, seq_len, _ = hidden_states.shape
+
+        # QKV projection - fused for efficiency
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        # Split into q, k, v
+        q_size = self.num_heads_per_rank * self.head_dim
+        kv_size = self.num_kv_heads_per_rank * self.head_dim
+        q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
+
+        # Reshape to [batch, seq, num_heads, head_dim] for ApplyRotaryEmb
+        q = q.view(batch_size, seq_len, self.num_heads_per_rank, self.head_dim)
+        k = k.view(batch_size, seq_len, self.num_kv_heads_per_rank, self.head_dim)
+        v = v.view(batch_size, seq_len, self.num_kv_heads_per_rank, self.head_dim)
+
+        # Apply rotary position embeddings using vLLM's ApplyRotaryEmb
+        # ApplyRotaryEmb expects x: [batch, seq, heads, head_dim]
+        # cos/sin: [seq_len, rotary_dim/2]
+        q = self.apply_rotary_emb(q, rotary_pos_emb_cos, rotary_pos_emb_sin)
+        k = self.apply_rotary_emb(k, rotary_pos_emb_cos, rotary_pos_emb_sin)
+
+        # MMEncoderAttention expects [batch, seq, num_heads, head_dim]
+        # It handles GQA internally via repeat_interleave
+        attn_output = self.attn(q, k, v)
+
+        # Reshape back to [batch, seq, hidden_size]
+        attn_output = attn_output.view(batch_size, seq_len, -1)
+
+        # Output projection
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class GlmAsrEncoderMLP(nn.Module):
+    """
+    Optimized MLP for GLM-ASR encoder.
+    Uses vLLM's parallel linear layers for better performance.
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        self.fc1 = ColumnParallelLinear(
+            self.hidden_size,
+            self.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+
+        self.act_fn = get_act_fn(config.hidden_act)
+
+        self.fc2 = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.act_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class GlmAsrEncoderLayer(nn.Module):
+    """
+    Optimized Transformer encoder layer for GLM-ASR.
+    Combines attention and MLP with residual connections and layer norms.
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = GlmAsrEncoderAttention(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        self.mlp = GlmAsrEncoderMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+        layer_norm_eps = getattr(config, "layer_norm_eps", 1e-5)
+        self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=layer_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(
+            self.hidden_size, eps=layer_norm_eps
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: [batch_size, seq_len, hidden_size]
+            rotary_pos_emb_cos: [seq_len, rotary_dim/2] - cosine of rotary embeddings
+            rotary_pos_emb_sin: [seq_len, rotary_dim/2] - sine of rotary embeddings
+
+        Returns:
+            [batch_size, seq_len, hidden_size]
+        """
+        # Self-attention with residual
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            rotary_pos_emb_cos=rotary_pos_emb_cos,
+            rotary_pos_emb_sin=rotary_pos_emb_sin,
+        )
+        hidden_states = residual + hidden_states
+
+        # MLP with residual
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class _GlmAsrEncoderOutput:
+    """
+    Simple output container compatible with transformers' BaseModelOutput.
+
+    This lightweight container holds the encoder output and is compatible
+    with the transformers library's output format while being more efficient
+    than a full dataclass.
+
+    Attributes:
+        last_hidden_state: Final layer hidden states from the encoder.
+            Shape: [batch_size, seq_len, hidden_size]
+    """
+
+    __slots__ = ("last_hidden_state",)
+
+    def __init__(self, last_hidden_state: torch.Tensor):
+        self.last_hidden_state = last_hidden_state
+
+
+class GlmAsrEncoder(nn.Module):
+    """
+    Optimized GLM-ASR Audio Encoder with vLLM native implementation.
+
+    This encoder processes audio features through convolutional layers
+    followed by transformer layers with rotary position embeddings.
+    Optimized for performance with:
+    - QKVParallelLinear for fused attention projections
+    - Tensor parallelism support via ColumnParallelLinear/RowParallelLinear
+    - Quantization support
+    - Flash Attention (SDPA)
+    """
+
+    # Mapping for weight loading: transformers uses separate q/k/v, we use fused qkv
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
+
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        # Convolutional feature extraction layers
+        self.conv1 = nn.Conv1d(
+            config.num_mel_bins,
+            config.hidden_size,
+            kernel_size=3,
+            padding=1,
+        )
+        self.conv2 = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        )
+
+        # Transformer encoder layers
+        self.layers = nn.ModuleList(
+            [
+                GlmAsrEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+        # Final layer norm
+        layer_norm_eps = getattr(config, "layer_norm_eps", 1e-5)
+        self.norm = nn.LayerNorm(config.hidden_size, eps=layer_norm_eps)
+
+        # Rotary position embeddings
+        self.rotary_emb = GlmAsrEncoderRotaryEmbedding(config)
+
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Compute the output length after convolutions.
+
+        Args:
+            input_lengths: Input sequence lengths [batch_size]
+
+        Returns:
+            Tuple of (output after conv1, output after conv2)
+        """
+        # Conv1: kernel=3, stride=1, padding=1
+        output_lengths_conv1 = (input_lengths + 2 * 1 - 3) // 1 + 1
+
+        # Conv2: kernel=3, stride=2, padding=1
+        output_lengths_conv2 = (output_lengths_conv1 + 2 * 1 - 3) // 2 + 1
+
+        return output_lengths_conv1, output_lengths_conv2
+
+    def forward(self, input_features: torch.Tensor) -> _GlmAsrEncoderOutput:
+        """
+        Forward pass through the encoder.
+
+        Args:
+            input_features: [batch_size, num_mel_bins, seq_len]
+
+        Returns:
+            _GlmAsrEncoderOutput: Object with .last_hidden_state attribute \
+                containing [batch_size, seq_len', hidden_size] where seq_len' \
+                is the sequence length after convolutions
+        """
+        # Apply convolutional layers with GELU activation
+        hidden_states = torch.nn.functional.gelu(self.conv1(input_features))
+        hidden_states = torch.nn.functional.gelu(self.conv2(hidden_states))
+
+        # Transpose to [batch_size, seq_len, hidden_size]
+        hidden_states = hidden_states.transpose(1, 2)
+        output_seq_len = hidden_states.shape[1]
+
+        # Compute rotary position embeddings on-demand
+        rotary_pos_emb = self.rotary_emb(output_seq_len)
+        rotary_pos_emb_cos = rotary_pos_emb.cos().to(dtype=hidden_states.dtype)
+        rotary_pos_emb_sin = rotary_pos_emb.sin().to(dtype=hidden_states.dtype)
+
+        # Apply transformer layers
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states, rotary_pos_emb_cos, rotary_pos_emb_sin
+            )
+
+        # Final layer norm
+        hidden_states = self.norm(hidden_states)
+
+        # Return in a format compatible with transformers' BaseModelOutput
+        return _GlmAsrEncoderOutput(last_hidden_state=hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Custom weight loading to handle q_proj/k_proj/v_proj -> qkv_proj mapping."""
+        from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Default weight loading for non-stacked params
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class GlmAsrFeatureInputs(TensorSchema):
+    """
+    Dimensions:
+        - num_chunks: Number of audio chunks (flattened)
+        - nmb: Number of mel bins
+        - num_audios: Number of original audio files
+    """
+
+    type: Literal["audio_features"]
+    input_features: Annotated[
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("num_chunks", "nmb", "chunk_length", dynamic_dims={"chunk_length"}),
+    ]
+    feature_attention_mask: Annotated[
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("num_chunks", "chunk_length", dynamic_dims={"chunk_length"}),
+    ]
+    chunk_counts: Annotated[
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("num_audios"),
+    ]
+
+
+class GlmAsrEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size
+        - naf: Number of audio features
+        - hs: Hidden size (must match the hidden size of language model
+          backbone)
+    """
+
+    type: Literal["audio_embeds"] = "audio_embeds"
+    audio_embeds: Annotated[
+        list[torch.Tensor],
+        TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
+    ]
+
+
+GlmAsrInputs: TypeAlias = GlmAsrFeatureInputs | GlmAsrEmbeddingInputs
+
+
+class GlmAsrMultiModalProjector(nn.Module):
+    """
+    Projects audio encoder outputs to language model hidden space.
+
+    This projector uses a two-layer MLP to map audio features from the
+    encoder's intermediate size to the language model's hidden size.
+    Uses vLLM's parallel linear layers for tensor parallelism support.
+
+    Architecture:
+        - Linear layer: intermediate_size -> hidden_size * 2
+        - Activation function (e.g., GELU)
+        - Linear layer: hidden_size * 2 -> hidden_size
+    """
+
+    def __init__(
+        self,
+        config: GlmAsrConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.linear_1 = ColumnParallelLinear(
+            input_size=config.audio_config.intermediate_size,
+            output_size=config.text_config.hidden_size * 2,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_1",
+        )
+        self.act = get_act_fn(config.projector_hidden_act)
+        self.linear_2 = RowParallelLinear(
+            input_size=config.text_config.hidden_size * 2,
+            output_size=config.text_config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_2",
+        )
+
+    def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.linear_1(audio_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class GlmAsrProcessingInfo(BaseProcessingInfo):
+    """
+    Processing information provider for GLM-ASR model.
+
+    Provides access to model configuration, processor, and feature extractor
+    needed for audio preprocessing and multimodal integration.
+    """
+
+    def get_hf_config(self) -> GlmAsrConfig:
+        return self.ctx.get_hf_config(GlmAsrConfig)
+
+    def get_hf_processor(self, **kwargs: object) -> GlmAsrProcessor:
+        return self.ctx.get_hf_processor(GlmAsrProcessor, **kwargs)
+
+    def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor:
+        return self.get_hf_processor(**kwargs).feature_extractor
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"audio": None}
+
+
+class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]):
+    """
+    Builder for dummy inputs used in profiling and testing.
+
+    Generates dummy text prompts and audio data that match the expected
+    format for GLM-ASR model inputs. Used for memory profiling and
+    performance benchmarking.
+    """
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_audios = mm_counts.get("audio", 0)
+        hf_processor = self.info.get_hf_processor()
+        return hf_processor.audio_token * num_audios
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        feature_extractor = self.info.get_feature_extractor()
+        sampling_rate = feature_extractor.sampling_rate
+        num_audios = mm_counts.get("audio", 0)
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
+        max_audio_len = getattr(
+            self.info.get_hf_processor(), "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S
+        )
+        audio_len = int(max_audio_len * sampling_rate)
+
+        return {
+            "audio": self._get_dummy_audios(
+                length=audio_len, num_audios=num_audios, overrides=audio_overrides
+            )
+        }
+
+
+def _glmasr_field_config(
+    hf_inputs: Mapping[str, torch.Tensor],
+) -> dict[str, MultiModalFieldConfig]:
+    """
+    Configure multimodal field batching strategy for GLM-ASR.
+
+    Determines how to batch audio inputs based on whether chunking is used.
+    When chunk_counts is present, features are flattened across chunks;
+    otherwise, they are batched normally.
+
+    Args:
+        hf_inputs: Dictionary of preprocessed inputs from HuggingFace processor.
+
+    Returns:
+        Dictionary mapping field names to MultiModalFieldConfig objects \
+            that specify batching behavior.
+    """
+    chunk_counts = hf_inputs.get("chunk_counts")
+    if chunk_counts is not None:
+        return dict(
+            audio_embeds=MultiModalFieldConfig.batched("audio"),
+            input_features=MultiModalFieldConfig.flat_from_sizes(
+                "audio", chunk_counts, dim=0
+            ),
+            feature_attention_mask=MultiModalFieldConfig.flat_from_sizes(
+                "audio", chunk_counts, dim=0
+            ),
+            chunk_counts=MultiModalFieldConfig.batched("audio"),
+        )
+    return dict(
+        audio_embeds=MultiModalFieldConfig.batched("audio"),
+        input_features=MultiModalFieldConfig.batched("audio"),
+        feature_attention_mask=MultiModalFieldConfig.batched("audio"),
+        chunk_counts=MultiModalFieldConfig.batched("audio"),
+    )
+
+
+class GlmAsrMultiModalDataParser(MultiModalDataParser):
+    """
+    Custom parser for GLM-ASR multimodal data.
+
+    Extends the base parser to handle GLM-ASR specific audio data formats,
+    including both pre-computed audio embeddings and raw audio features.
+    """
+
+    def _parse_audio_data(
+        self,
+        data: dict[str, torch.Tensor] | ModalityData[Any],
+    ) -> ModalityDataItems[Any, Any] | None:
+        if isinstance(data, dict):
+            return DictEmbeddingItems(
+                data,
+                modality="audio",
+                required_fields={"audio_embeds"},
+                fields_factory=_glmasr_field_config,
+            )
+        return super()._parse_audio_data(data)
+
+
+class GlmAsrMultiModalProcessor(BaseMultiModalProcessor["GlmAsrProcessingInfo"]):
+    """
+    GLM-ASR processor that inherits directly from BaseMultiModalProcessor
+    for better performance and cleaner implementation.
+    """
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        feature_extractor = self.info.get_feature_extractor()
+        return GlmAsrMultiModalDataParser(target_sr=feature_extractor.sampling_rate)
+
+    def _calculate_chunk_counts(
+        self,
+        audio_list: list[Any],
+        feature_extractor: WhisperFeatureExtractor,
+        processor: GlmAsrProcessor,
+    ) -> list[int]:
+        sampling_rate = feature_extractor.sampling_rate
+        chunk_length = feature_extractor.chunk_length
+        max_audio_len = getattr(processor, "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S)
+        window_size = int(sampling_rate * chunk_length)
+        max_windows = int(max_audio_len // chunk_length)
+
+        chunk_counts = []
+        for audio in audio_list:
+            n_samples = len(audio) if isinstance(audio, list) else audio.shape[0]
+            n_chunks = max(1, (n_samples + window_size - 1) // window_size)
+            chunk_counts.append(min(n_chunks, max_windows))
+        return chunk_counts
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: dict[str, object],
+        mm_kwargs: Mapping[str, Any],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        # Normalize input: handle deprecated key and list conversion.
+        if "audios" in mm_data:
+            mm_data["audio"] = mm_data.pop("audios")
+
+        audio = mm_data.get("audio", [])
+        audio_list = [audio] if audio and not isinstance(audio, list) else audio
+
+        # Early return for text-only.
+        if not audio_list:
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        # Handle sampling_rate
+        feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
+        mm_kwargs = dict(
+            **mm_kwargs,
+            sampling_rate=feature_extractor.sampling_rate,
+        )
+
+        # Call parent method
+        outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+        # Postprocess: rename mask and add chunk counts
+        # Handle different key names from different transformers versions
+        if "input_feature_mask" in outputs:
+            outputs["feature_attention_mask"] = outputs.pop("input_feature_mask")
+        elif "feature_attention_mask" not in outputs and "input_features" in outputs:
+            # If no mask is provided, create one from input_features
+            input_features = outputs["input_features"]
+            if isinstance(input_features, torch.Tensor):
+                # Create a mask of all ones matching the sequence length
+                mask = torch.ones(
+                    input_features.shape[0],
+                    input_features.shape[-1],
+                    dtype=torch.long,
+                )
+                outputs["feature_attention_mask"] = mask
+
+        # Get processor for chunk counts calculation
+        processor = self.info.get_hf_processor(**mm_kwargs)
+
+        # Override chunk counts calculation with GLM-ASR specific logic
+        chunk_counts = self._calculate_chunk_counts(
+            audio_list, processor.feature_extractor, processor
+        )
+        outputs["chunk_counts"] = torch.tensor(chunk_counts, dtype=torch.long)
+
+        return outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return _glmasr_field_config(hf_inputs)
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+        config = self.info.get_hf_config()
+
+        audio_token = getattr(processor, "audio_token", "<|pad|>")
+        audio_token_id = vocab.get(audio_token)
+        if audio_token_id is None:
+            audio_token_id = processor.audio_token_id
+
+        merge_factor = getattr(config, "merge_factor", DEFAULT_MERGE_FACTOR)
+        conv_params = getattr(config, "conv_params", DEFAULT_CONV_PARAMS)
+        out_mm_data = out_mm_kwargs.get_data()
+        feature_attention_mask = out_mm_data.get("feature_attention_mask")
+        chunk_counts = out_mm_data.get("chunk_counts")
+
+        # Pre-compute audio output lengths if feature_attention_mask is available
+        audio_output_lengths: list[int] = []
+        if feature_attention_mask is not None:
+            # Compute output lengths for all audio items
+            from .glmasr_utils import (
+                _as_list_chunk_counts,
+                _get_audio_output_lengths_from_mask,
+            )
+
+            if chunk_counts is not None:
+                start_idx = 0
+                for count in _as_list_chunk_counts(chunk_counts):
+                    end_idx = start_idx + count
+                    mask = feature_attention_mask[start_idx:end_idx]
+                    if isinstance(mask, list):
+                        mask = torch.stack(mask)
+
+                    lengths = _get_audio_output_lengths_from_mask(
+                        mask, merge_factor, conv_params
+                    )
+                    audio_output_lengths.append(int(lengths.sum().item()))
+                    start_idx = end_idx
+            else:
+                # Single chunk per audio
+                for idx in range(len(feature_attention_mask)):
+                    mask = feature_attention_mask[idx : idx + 1]
+                    if isinstance(mask, list):
+                        mask = torch.tensor(mask).unsqueeze(0)
+                    lengths = _get_audio_output_lengths_from_mask(
+                        mask, merge_factor, conv_params
+                    )
+                    audio_output_lengths.append(int(lengths.sum().item()))
+
+        def get_replacement_glmasr(item_idx: int):
+            # Use pre-computed lengths if available, otherwise fall back to audio_embeds
+            if audio_output_lengths:
+                num_features = audio_output_lengths[item_idx]
+            else:
+                audio_embeds = out_mm_data.get("audio_embeds")
+                if audio_embeds is not None:
+                    embed = audio_embeds[item_idx]
+                    num_features = embed.shape[0]
+                else:
+                    raise ValueError(
+                        "Either feature_attention_mask or audio_embeds must be provided"
+                    )
+
+            if num_features == 0:
+                raise ValueError("Audio is too short")
+
+            audio_tokens = [audio_token_id] * int(num_features)
+            return PromptUpdateDetails.select_token_id(
+                audio_tokens,
+                embed_token_id=audio_token_id,
+            )
+
+        return [
+            PromptReplacement(
+                modality="audio",
+                target=audio_token,
+                replacement=get_replacement_glmasr,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    GlmAsrMultiModalProcessor,
+    info=GlmAsrProcessingInfo,
+    dummy_inputs=GlmAsrDummyInputsBuilder,
+)
+class GlmAsrForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
+):
+    supported_languages = ISO639_1_SUPPORTED_LANGS
+
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        # Use optimized vLLM native encoder
+        self.audio_tower = GlmAsrEncoder(
+            config.audio_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "audio_tower"),
+        )
+        self.multi_modal_projector = GlmAsrMultiModalProjector(
+            config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "multi_modal_projector"),
+        )
+        self.quant_config = quant_config
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+            architectures=["LlamaForCausalLM"],
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("audio"):
+            return "<|begin_of_audio|><|pad|><|end_of_audio|>"
+
+        raise ValueError("Only audio modality is supported")
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        return MultiModelKeys.from_string_field(
+            language_model="language_model.",
+            connector="multi_modal_projector.",
+            tower_model="audio_tower.",
+        )
+
+    def _parse_and_validate_audio_input(self, **kwargs: object) -> GlmAsrInputs | None:
+        audio_embeds = kwargs.pop("audio_embeds", None)
+        if audio_embeds is not None:
+            return GlmAsrEmbeddingInputs(type="audio_embeds", audio_embeds=audio_embeds)
+
+        input_features = kwargs.pop("input_features", None)
+        if input_features is None:
+            return None
+
+        return GlmAsrFeatureInputs(
+            type="audio_features",
+            input_features=input_features,
+            feature_attention_mask=kwargs.pop("feature_attention_mask", None),
+            chunk_counts=kwargs.pop("chunk_counts", None),
+        )
+
+    def _process_audio_input(
+        self, audio_input: GlmAsrInputs
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+        if audio_input["type"] == "audio_embeds":
+            return tuple(audio_input["audio_embeds"])
+
+        input_features = audio_input["input_features"]
+        feature_attention_mask = audio_input["feature_attention_mask"]
+
+        if isinstance(input_features, list):
+            input_features = torch.cat(input_features, dim=0)
+            feature_attention_mask = torch.cat(feature_attention_mask, dim=0)
+
+        num_chunks = input_features.shape[0]
+        chunk_counts = _normalize_chunk_counts(
+            audio_input.get("chunk_counts"), num_chunks=num_chunks
+        )
+
+        # Convert input_features to model dtype (e.g., bfloat16) to match model weights
+        input_features = input_features.to(dtype=self.audio_tower.conv1.weight.dtype)
+
+        # audio_tower returns [batch_size, seq_len, hidden_size] where hidden_size=1280
+        audio_hidden_states = self.audio_tower(input_features).last_hidden_state
+
+        # GLM-ASR merges consecutive frames: 4 frames with hidden_size=1280
+        # -> 1 frame with intermediate_size=5120
+        hidden_size = self.config.audio_config.hidden_size
+        intermediate_size = self.config.audio_config.intermediate_size
+        merge_ratio = intermediate_size // hidden_size
+
+        # Truncate sequence length to be divisible by merge_ratio
+        seq_len = audio_hidden_states.shape[1]
+        seq_len_truncated = (seq_len // merge_ratio) * merge_ratio
+        if seq_len_truncated < seq_len:
+            audio_hidden_states = audio_hidden_states[:, :seq_len_truncated, :]
+
+        # Reshape to merge consecutive frames
+        audio_hidden_states = audio_hidden_states.reshape(
+            num_chunks,
+            -1,
+            intermediate_size,
+        )
+
+        audio_features = self.multi_modal_projector(audio_hidden_states)
+
+        merge_factor = getattr(self.config, "merge_factor", DEFAULT_MERGE_FACTOR)
+        conv_params = getattr(self.config, "conv_params", DEFAULT_CONV_PARAMS)
+
+        audio_output_lengths = _get_audio_output_lengths_for_tower(
+            self.audio_tower,
+            feature_attention_mask.sum(-1),
+            merge_factor,
+            conv_params,
+        )
+
+        masked_audio_features = _flatten_audio_features_by_length(
+            audio_features, audio_output_lengths
+        )
+
+        chunk_embeddings = torch.split(
+            masked_audio_features, audio_output_lengths.flatten().tolist()
+        )
+        return _group_audio_embeddings(chunk_embeddings, chunk_counts)
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        audio_input = self._parse_and_validate_audio_input(**kwargs)
+        if audio_input is None:
+            return []
+
+        masked_audio_features = self._process_audio_input(audio_input)
+
+        return masked_audio_features
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        hidden_states = self.language_model.model(
+            input_ids,
+            positions,
+            intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        skip_prefixes = ["audio_tower.embed_positions"]
+        loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
+        return loader.load_weights(weights)
+
+    @classmethod
+    def _get_audio_token(cls, model_config: ModelConfig) -> str:
+        """Get the audio token from processor.
+
+        Similar to get_placeholder_str but returns single token.
+        """
+        processor = cached_processor_from_config(model_config)
+        return getattr(processor, "audio_token", "<|pad|>")
+
+    @classmethod
+    def get_speech_to_text_config(
+        cls, model_config: ModelConfig, task_type: str
+    ) -> SpeechToTextConfig:
+        processor = cached_processor_from_config(model_config)
+        feature_extractor = processor.feature_extractor
+        max_audio_clip_s = getattr(processor, "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S)
+        return SpeechToTextConfig(
+            max_audio_clip_s=max_audio_clip_s,
+            sample_rate=feature_extractor.sampling_rate,
+        )
+
+    @classmethod
+    def get_generation_prompt(
+        cls,
+        audio: np.ndarray,
+        model_config: ModelConfig,
+        stt_config: SpeechToTextConfig,
+        language: str | None,
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: str | None,
+    ) -> PromptType:
+        """Get the generation prompt to be used for transcription requests."""
+        tokenizer = cached_tokenizer_from_config(model_config)
+        audio_token = cls._get_audio_token(model_config)
+
+        if task_type == "translate":
+            full_lang_name_to = cls.supported_languages.get(to_language, to_language)
+            user_content = f"{audio_token}translate the speech to {full_lang_name_to}"
+        elif task_type == "transcribe":
+            user_content = (
+                f"{audio_token}can you transcribe the speech into a written format?"
+            )
+        else:
+            raise ValueError(f"Unsupported task type {task_type}")
+
+        messages = [{"role": "user", "content": user_content}]
+        prompt = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        prompt_token_ids = tokenizer.encode(prompt)
+        prompt_dict = {
+            "prompt_token_ids": prompt_token_ids,
+            "multi_modal_data": {"audio": audio},
+        }
+        return cast(PromptType, prompt_dict)
diff --git a/vllm/model_executor/models/glmasr_utils.py b/vllm/model_executor/models/glmasr_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..492e4b354b5ec310d0399061bc2442197d160ae9
--- /dev/null
+++ b/vllm/model_executor/models/glmasr_utils.py
@@ -0,0 +1,188 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+from typing import cast
+
+import torch
+import torch.nn as nn
+
+DEFAULT_MAX_AUDIO_LEN_S = 655
+DEFAULT_MERGE_FACTOR = 4
+# Default convolution parameters: (padding, kernel_size, stride)
+# These correspond to the two conv layers in GlmAsrEncoder
+DEFAULT_CONV_PARAMS = [(1, 3, 1), (1, 3, 2)]
+
+
+def _calculate_conv_output_length(
+    input_length: torch.Tensor, padding: int, kernel_size: int, stride: int
+) -> torch.Tensor:
+    """Calculate Conv1d output length using standard formula."""
+    # Standard formula: floor((input + 2*padding - kernel_size) / stride) + 1
+    return (input_length + 2 * padding - kernel_size) // stride + 1
+
+
+def _as_list_chunk_counts(
+    chunk_counts: torch.Tensor | list[int] | list[torch.Tensor],
+) -> list[int]:
+    if isinstance(chunk_counts, torch.Tensor):
+        return chunk_counts.tolist()
+    if chunk_counts and isinstance(chunk_counts[0], torch.Tensor):
+        tensor_counts = cast(list[torch.Tensor], chunk_counts)
+        return [int(c.item()) for c in tensor_counts]
+    return [int(c) for c in chunk_counts]
+
+
+def _normalize_chunk_counts(
+    chunk_counts: torch.Tensor | list[int] | list[torch.Tensor] | None,
+    num_chunks: int,
+) -> list[int]:
+    if chunk_counts is None:
+        return [1] * num_chunks
+    return _as_list_chunk_counts(chunk_counts)
+
+
+def _get_audio_output_lengths_from_lengths(
+    audio_lengths: torch.Tensor,
+    merge_factor: int,
+    conv_params: list[tuple[int, int, int]],
+) -> torch.Tensor:
+    for padding, kernel_size, stride in conv_params:
+        audio_lengths = _calculate_conv_output_length(
+            audio_lengths, padding, kernel_size, stride
+        )
+    return (audio_lengths - merge_factor) // merge_factor + 1
+
+
+def _get_audio_output_lengths_from_mask(
+    mask: torch.Tensor,
+    merge_factor: int,
+    conv_params: list[tuple[int, int, int]],
+) -> torch.Tensor:
+    audio_lengths = mask.sum(-1)
+    return _get_audio_output_lengths_from_lengths(
+        audio_lengths, merge_factor, conv_params
+    )
+
+
+def _get_audio_output_lengths_for_tower(
+    audio_tower: nn.Module,
+    audio_lengths: torch.Tensor,
+    merge_factor: int,
+    conv_params: list[tuple[int, int, int]],
+) -> torch.Tensor:
+    """
+    Calculate the output lengths after audio processing.
+
+    The output length accounts for:
+    1. Convolution layers (downsampling)
+    2. Merge factor (further downsampling during projection)
+
+    Args:
+        audio_tower: The audio encoder module
+        audio_lengths: Input feature lengths [batch_size]
+        merge_factor: Factor for merging adjacent features
+        conv_params: List of (padding, kernel_size, stride) for each conv layer
+
+    Returns:
+        Output lengths after all processing [batch_size]
+    """
+    # First, calculate the output length after convolutions
+    if hasattr(audio_tower, "_get_feat_extract_output_lengths"):
+        _, conv_output_lengths = audio_tower._get_feat_extract_output_lengths(
+            audio_lengths
+        )
+    else:
+        conv_output_lengths = audio_lengths
+        for padding, kernel_size, stride in conv_params:
+            conv_output_lengths = _calculate_conv_output_length(
+                conv_output_lengths, padding, kernel_size, stride
+            )
+
+    # Then, apply merge_factor to get final output length
+    # Formula: (conv_output_lengths - merge_factor) // merge_factor + 1
+    return (conv_output_lengths - merge_factor) // merge_factor + 1
+
+
+def _flatten_audio_features_by_length(
+    audio_features: torch.Tensor,
+    audio_output_lengths: torch.Tensor,
+) -> torch.Tensor:
+    num_chunks, max_audio_tokens, embed_dim = audio_features.shape
+    audio_output_lengths = audio_output_lengths.unsqueeze(1)
+    audio_features_mask = (
+        torch.arange(max_audio_tokens)
+        .expand(num_chunks, max_audio_tokens)
+        .to(audio_output_lengths.device)
+        < audio_output_lengths
+    )
+    return audio_features[audio_features_mask].view(-1, embed_dim)
+
+
+def _group_audio_embeddings(
+    chunk_embeddings: Sequence[torch.Tensor],
+    chunk_counts: Sequence[int],
+) -> tuple[torch.Tensor, ...]:
+    grouped_embeddings = []
+    current_idx = 0
+    for count in chunk_counts:
+        audio_chunks = chunk_embeddings[current_idx : current_idx + count]
+        grouped_embeddings.append(torch.cat(audio_chunks, dim=0))
+        current_idx += count
+    return tuple(grouped_embeddings)
+
+
+def _normalize_to_tensor(mask: torch.Tensor | list[torch.Tensor]) -> torch.Tensor:
+    """Convert mask to tensor, handling both list and tensor formats."""
+    if isinstance(mask, list):
+        return (
+            torch.stack(mask)
+            if mask and isinstance(mask[0], torch.Tensor)
+            else torch.tensor(mask)
+        )
+    return mask
+
+
+def _extract_mask_for_item(
+    feature_attention_mask: torch.Tensor | list[torch.Tensor],
+    chunk_counts: torch.Tensor | list[int] | None,
+    item_idx: int,
+) -> torch.Tensor:
+    """Extract attention mask for a specific audio item."""
+    if chunk_counts is None:
+        # Single item per audio
+        mask = feature_attention_mask[item_idx]
+        if isinstance(feature_attention_mask, torch.Tensor):
+            return mask.unsqueeze(0)
+        return _normalize_to_tensor(mask)
+
+    # Multiple chunks per audio: calculate slice indices
+    counts = _as_list_chunk_counts(chunk_counts)
+    start_idx = sum(counts[:item_idx])
+    end_idx = start_idx + counts[item_idx]
+
+    # Extract slice
+    if isinstance(feature_attention_mask, torch.Tensor):
+        return feature_attention_mask[start_idx:end_idx]
+    mask_slice = feature_attention_mask[start_idx:end_idx]
+    return _normalize_to_tensor(mask_slice)
+
+
+def _get_num_features_for_item(
+    feature_attention_mask: torch.Tensor | None,
+    chunk_counts: torch.Tensor | list[int] | None,
+    item_idx: int,
+    audio_embeds: list[torch.Tensor] | None,
+    merge_factor: int,
+    conv_params: list[tuple[int, int, int]],
+) -> int:
+    """Get number of features for a specific audio item."""
+    if feature_attention_mask is not None:
+        mask = _extract_mask_for_item(feature_attention_mask, chunk_counts, item_idx)
+        audio_output_lengths = _get_audio_output_lengths_from_mask(
+            mask, merge_factor, conv_params
+        )
+        return audio_output_lengths.sum().item()
+    if audio_embeds is not None:
+        return audio_embeds[item_idx].shape[0]
+    raise ValueError("Either feature_attention_mask or audio_embeds must be provided")
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index da5d48a94ff3ebfd5657745f1bb85eccc875e2b5..bacf30d126504b6be0f00a9d948e2b3dfbbb607f 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -41,6 +41,7 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -49,7 +50,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from ..layers.pooler import DispatchPooler, Pooler
 from .interfaces import SupportsCrossEncoding, SupportsPP
 from .utils import (
     AutoWeightsLoader,
@@ -351,19 +351,7 @@ class GPT2ForSequenceClassification(nn.Module, SupportsCrossEncoding):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config, classifier=self.score
-                ),
-                "classify": Pooler.for_classify(
-                    pooler_config, classifier=self.score, act_fn="classify"
-                ),
-                "score": Pooler.for_classify(
-                    pooler_config, classifier=self.score, act_fn="score"
-                ),
-            }
-        )
+        self.pooler = DispatchPooler.for_seq_cls(pooler_config, classifier=self.score)
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.transformer.embed_input_ids(input_ids)
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 6cc3e2d69b149c0c601898f4dc37c8b4eca745d4..c603cb5a8f8e389d5fd67ca10f5bb899dfee9e15 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -169,7 +169,7 @@ class GPTNeoXLayer(nn.Module):
         self.attention = GPTNeoXAttention(
             config, cache_config, quant_config, prefix=f"{prefix}.attention"
         )
-        self.mlp = GPTNeoXMLP(config, quant_config)
+        self.mlp = GPTNeoXMLP(config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 6e4d3f90d9490315fcbdc81e0e9ab6e71473b284..c587dc9beec2115116096699998c6f8c6676e18a 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -7,7 +7,6 @@ import torch.distributed as dist
 from torch import nn
 from transformers import GptOssConfig
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -38,6 +37,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .utils import (
@@ -730,6 +730,7 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
         # Params for weights, weight scales, activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index a4e50f4086281bab2c2fb776b65cc861492cd15d..321704f0f9ced9ad40c3f4b0ad02f4443398073b 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -348,7 +348,9 @@ class GraniteSpeechConformerAttention(nn.Module):
 
         if self.context_size <= 0 or self.context_size > self.max_pos_emb:
             raise ValueError(
-                "Context size is either less than 0 or exceeds the max_pos_emb"
+                f"Context size should be > 0 and "
+                f"<= max_pos_emb ({self.max_pos_emb}), "
+                f"got {self.context_size}."
             )
 
     def forward(
@@ -670,7 +672,13 @@ class GraniteSpeechForConditionalGeneration(
 
         else:
             # Otherwise we have a list of tensors, which are almost certainly
-            # differing in their respective numbers of audio features;
+            # differing in their respective numbers of audio features; when
+            # passed as a batch, we expect a list of 2D var len input features
+            # so unsqueeze them.
+            input_features = [
+                feat.unsqueeze(dim=0) for feat in input_features if feat.ndim == 2
+            ]
+
             # stack them into a 3D tensor of size [bsz, most_num_features, 160].
             input_features = self._pad_and_stack_input_features(
                 input_features,
@@ -722,13 +730,12 @@ class GraniteSpeechForConditionalGeneration(
 
         Args:
             input_features: list[torch.Tensor]
-                Input features to be coerced into a tensor.
+                3D Input features to be coerced into a tensor.
         Returns:
             torch.Tensor: Tensor of shape [bsz, num_features, 160], where
             num_features is the max number of features of any entry in the
             batch.
         """
-        # Input features are of shape [bsz, num_features, 160]
         feat_lens = [feats.shape[1] for feats in input_features]
         padding = [max(feat_lens) - length for length in feat_lens]
         # TODO (Alex) - Validate that it's okay to zero pad like this;
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 0b1064b6343e3f07993c7cd28e27285149794e02..237fabff98f7430f707ecf53ea0b62a3fedea72f 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -353,6 +353,7 @@ class GraniteMoeModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 2aba626a7c737d97f22f7d6aeb4e8728db867b9b..b5c6946b67018ffbc38764747861dc39e2ccc9d2 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -4,21 +4,25 @@ from collections.abc import Set
 
 import numpy as np
 import torch
-import torch.nn as nn
 
 from vllm.config import ModelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.pooler import (
     DispatchPooler,
-    Pooler,
-    PoolerHead,
-    PoolerNormalize,
     PoolingParamsUpdate,
 )
+from vllm.model_executor.layers.pooler.activations import PoolerNormalize
+from vllm.model_executor.layers.pooler.seqwise import (
+    EmbeddingPoolerHead,
+    SequencePooler,
+    SequencePoolingMethod,
+    SequencePoolingMethodOutput,
+    get_seq_pooling_method,
+)
+from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.tasks import PoolingTask
 from vllm.tokenizers import cached_tokenizer_from_config
-from vllm.v1.outputs import PoolerOutput
 from vllm.v1.pool.metadata import PoolingMetadata
 
 from .interfaces_base import default_pooling_type
@@ -26,7 +30,7 @@ from .interfaces_base import default_pooling_type
 logger = init_logger(__name__)
 
 
-class GritLMMeanPool(nn.Module):
+class GritLMMeanPool(SequencePoolingMethod):
     """As `MeanPool`, but only includes non-instruction tokens."""
 
     def __init__(self, model_config: ModelConfig):
@@ -141,16 +145,16 @@ class GritLMMeanPool(nn.Module):
         return instruction_len
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode", "embed"}
+        return {"embed"}
 
     def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
         return PoolingParamsUpdate(requires_token_ids=True)
 
     def forward(
         self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
+        hidden_states: torch.Tensor,
         pooling_metadata: PoolingMetadata,
-    ) -> list[torch.Tensor] | torch.Tensor:
+    ) -> SequencePoolingMethodOutput:
         prompt_lens = pooling_metadata.prompt_lens
         instr_lens = torch.tensor(
             [
@@ -173,30 +177,25 @@ class GritLMMeanPool(nn.Module):
         return pooled_data
 
 
-class GritLMPooler(Pooler):
+class GritLMPooler(SequencePooler):
     def __init__(self, model_config: ModelConfig):
-        super().__init__()
-
-        self.pooling = GritLMMeanPool(model_config)
-        self.head = PoolerHead(PoolerNormalize())
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return self.pooling.get_supported_tasks()
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return self.pooling.get_pooling_updates(task)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooled_data = self.pooling(hidden_states, pooling_metadata)
-        pooled_data = self.head(pooled_data, pooling_metadata)
-        return pooled_data
+        pooler_config = model_config.pooler_config
+        assert pooler_config is not None
+
+        super().__init__(
+            pooling=(
+                GritLMMeanPool(model_config)
+                if pooler_config.seq_pooling_type == "MEAN"
+                else get_seq_pooling_method(pooler_config.seq_pooling_type)
+            ),
+            head=EmbeddingPoolerHead(
+                head_dtype=model_config.head_dtype,
+                activation=PoolerNormalize(),
+            ),
+        )
 
 
-@default_pooling_type("MEAN")
+@default_pooling_type(seq_pooling_type="MEAN")
 class GritLM(LlamaForCausalLM):
     """This class implements the embedding model for parasail-ai/GritLM-7B-vllm.
 
@@ -237,7 +236,7 @@ class GritLM(LlamaForCausalLM):
         if pooler_config is not None:
             self.pooler = DispatchPooler(
                 {
-                    "token_embed": Pooler.for_token_embed(pooler_config),
+                    "token_embed": pooler_for_token_embed(pooler_config),
                     "embed": GritLMPooler(vllm_config.model_config),
                 }
             )
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index 0a2e5cf39ffd83e83e46635742598c79ac621d42..43c658a2c11e11b3ebb3e3ef9333febcb195f8b3 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -21,8 +21,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only Grok1 model."""
+"""Inference-only Grok (Grok1/Grok2) model."""
 
+import math
 from collections.abc import Iterable
 from itertools import islice
 from typing import Any
@@ -35,9 +36,12 @@ from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
     QKVParallelLinear,
     ReplicatedLinear,
     RowParallelLinear,
@@ -68,6 +72,100 @@ from .utils import (
 DEFAULT_ATTN_OUTPUT_MULTIPLIER = 0.08838834764831845
 DEFAULT_OUTPUT_MULTIPLIER_SCALE = 0.5773502691896257
 DEFAULT_EMBEDDING_MULTIPLIER_SCALE = 78.38367176906169
+DEFAULT_ROUTER_LOGIT_SOFTCAP = 30.0
+
+logger = init_logger(__name__)
+
+
+def _get_num_experts(config) -> int:
+    return getattr(config, "num_experts", getattr(config, "num_local_experts", 8))
+
+
+def _get_moe_intermediate_size(config) -> int:
+    return getattr(config, "moe_intermediate_size", config.intermediate_size)
+
+
+def _get_grok_version(config) -> str:
+    """Detect Grok version from HF config using multiple heuristics."""
+    # Check for Grok2-specific attributes (both for robust detection)
+    has_residual_moe = getattr(config, "residual_moe", False)
+    has_moe_intermediate_size = hasattr(config, "moe_intermediate_size")
+
+    if has_residual_moe or has_moe_intermediate_size:
+        return "grok2"
+
+    return "grok1"  # Default to Grok1
+
+
+def _get_rope_parameters(config) -> dict[str, Any] | None:
+    rope_parameters = getattr(config, "rope_parameters", None)
+    if rope_parameters is None:
+        rope_type = getattr(config, "rope_type", None)
+        if rope_type is None:
+            return None
+        rope_parameters = {"rope_type": rope_type}
+        rope_theta = getattr(config, "rope_theta", None)
+        if rope_theta is not None:
+            rope_parameters["rope_theta"] = rope_theta
+        scaling_factor = getattr(config, "scaling_factor", None)
+        if scaling_factor is not None:
+            rope_parameters["factor"] = scaling_factor
+        for name in (
+            "original_max_position_embeddings",
+            "extrapolation_factor",
+            "attn_factor",
+            "beta_fast",
+            "beta_slow",
+        ):
+            value = getattr(config, name, None)
+            if value is not None:
+                rope_parameters[name] = value
+
+    if rope_parameters.get("rope_type") == "original":
+        rope_parameters = dict(rope_parameters)
+        rope_parameters["rope_type"] = "default"
+    return rope_parameters
+
+
+def _get_moe_renormalize(config) -> bool:
+    explicit_value = getattr(
+        config, "moe_router_renormalize", getattr(config, "moe_renormalize", None)
+    )
+    if explicit_value is not None:
+        return bool(explicit_value)
+    return not getattr(config, "residual_moe", False)
+
+
+class Grok1MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+        self.act_fn = GeluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.gate_up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(x)
+        return x
 
 
 class Grok1MoE(nn.Module):
@@ -85,9 +183,11 @@ class Grok1MoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        router_logit_soft_cap: float = 0.0,
         params_dtype: torch.dtype | None = None,
         quant_config: QuantizationConfig | None = None,
         tp_size: int | None = None,
+        renormalize: bool = False,
         prefix: str = "",
     ):
         super().__init__()
@@ -110,12 +210,13 @@ class Grok1MoE(nn.Module):
             intermediate_size=intermediate_size,
             params_dtype=params_dtype,
             reduce_results=True,
-            renormalize=True,
+            renormalize=renormalize,
             quant_config=quant_config,
             tp_size=tp_size,
             activation="gelu",
             prefix=f"{prefix}.experts",
         )
+        self.router_logit_soft_cap = router_logit_soft_cap
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
@@ -123,7 +224,10 @@ class Grok1MoE(nn.Module):
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        router_logits = 30.0 * F.tanh(router_logits / 30.0)
+        if self.router_logit_soft_cap > 0:
+            router_logits = self.router_logit_soft_cap * F.tanh(
+                router_logits / self.router_logit_soft_cap
+            )
         final_hidden_states = self.experts(hidden_states, router_logits)
         return final_hidden_states.view(orig_shape)
 
@@ -187,6 +291,15 @@ class Grok1Attention(nn.Module):
         )
 
         attn_logits_soft_cap = max(getattr(config, "attn_logit_softcapping", 30.0), 0.0)
+        attn_logit_softcapping_method = getattr(
+            config, "attn_logit_softcapping_method", None
+        )
+        if attn_logit_softcapping_method not in (None, "tanh"):
+            logger.warning_once(
+                "Grok attention logit softcapping method '%s' is not "
+                "supported; falling back to default behavior.",
+                attn_logit_softcapping_method,
+            )
 
         self.attn = Attention(
             self.num_heads,
@@ -238,30 +351,50 @@ class Grok1DecoderLayer(nn.Module):
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
-            rope_parameters=getattr(config, "rope_parameters", None),
+            rope_parameters=_get_rope_parameters(config),
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
             config=config,
         )  # Pass config to Grok1Attention
 
-        # Grok1 uses "num_experts" in its config
-        num_experts = getattr(config, "num_experts", 8)
+        num_experts = _get_num_experts(config)
         num_experts_per_tok = getattr(config, "num_experts_per_tok", 2)
+        moe_intermediate_size = _get_moe_intermediate_size(config)
+        moe_renormalize = _get_moe_renormalize(config)
 
         self.moe_block = Grok1MoE(
             num_experts=num_experts,
             top_k=num_experts_per_tok,
             hidden_size=config.hidden_size,
-            intermediate_size=config.intermediate_size,
+            intermediate_size=moe_intermediate_size,
+            router_logit_soft_cap=max(
+                getattr(
+                    config,
+                    "router_logit_softcapping",
+                    DEFAULT_ROUTER_LOGIT_SOFTCAP,
+                ),
+                0.0,
+            ),
             quant_config=quant_config,
+            renormalize=moe_renormalize,
             prefix=f"{prefix}.moe_block",
         )
+        self.residual_moe = getattr(config, "residual_moe", False)
+        self.residual_moe_scale = 1.0 / math.sqrt(2.0)
 
         self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.pre_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = None
+        if self.residual_moe:
+            self.mlp = Grok1MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
 
     def forward(
         self,
@@ -286,7 +419,13 @@ class Grok1DecoderLayer(nn.Module):
 
         # MoE block with normalization
         hidden_states, residual = self.pre_moe_norm(hidden_states, residual)
-        hidden_states = self.moe_block(hidden_states)
+        if self.residual_moe:
+            assert self.mlp is not None
+            hidden_states = (
+                self.moe_block(hidden_states) + self.mlp(hidden_states)
+            ) * self.residual_moe_scale
+        else:
+            hidden_states = self.moe_block(hidden_states)
         hidden_states = self.post_moe_norm(hidden_states)
 
         return hidden_states, residual
@@ -294,7 +433,16 @@ class Grok1DecoderLayer(nn.Module):
 
 @support_torch_compile
 class Grok1Model(nn.Module):
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        ckpt_gate_proj_name: str = "linear",
+        ckpt_down_proj_name: str = "linear_1",
+        ckpt_up_proj_name: str = "linear_v",
+        weight_name_remapping: dict[str, str] | None = None,
+    ):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
@@ -305,6 +453,12 @@ class Grok1Model(nn.Module):
         self.quant_config = quant_config
         self.padding_idx = config.pad_token_id
 
+        # Store expert naming for weight loading
+        self.ckpt_gate_proj_name = ckpt_gate_proj_name
+        self.ckpt_down_proj_name = ckpt_down_proj_name
+        self.ckpt_up_proj_name = ckpt_up_proj_name
+        self.weight_name_remapping = weight_name_remapping or {}
+
         self.vocab_size = config.vocab_size
 
         self.embedding_multiplier_scale = getattr(
@@ -365,13 +519,13 @@ class Grok1Model(nn.Module):
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        # Map Grok1's unique expert parameter names to standard names
-        # Grok1 uses "num_experts" in its config
-        num_experts = getattr(self.config, "num_experts", 8)
+        # Map expert parameter names to standard names
+        num_experts = _get_num_experts(self.config)
         return FusedMoE.make_expert_params_mapping(
-            ckpt_gate_proj_name="linear",  # Grok1 specific
-            ckpt_down_proj_name="linear_1",  # Grok1 specific
-            ckpt_up_proj_name="linear_v",  # Grok1 specific
+            self,
+            ckpt_gate_proj_name=self.ckpt_gate_proj_name,
+            ckpt_down_proj_name=self.ckpt_down_proj_name,
+            ckpt_up_proj_name=self.ckpt_up_proj_name,
             num_experts=num_experts,
         )
 
@@ -381,12 +535,18 @@ class Grok1Model(nn.Module):
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
             ("qkv_proj", "v_proj", "v"),
+            ("mlp.gate_up_proj", "mlp.gate_proj", 0),
+            ("mlp.gate_up_proj", "mlp.up_proj", 1),
         ]
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
+            # Apply version-specific weight name remapping
+            for old_pattern, new_pattern in self.weight_name_remapping.items():
+                if old_pattern in name:
+                    name = name.replace(old_pattern, new_pattern)
             if self.quant_config is not None and (
                 scale_name := self.quant_config.get_cache_scale(name)
             ):
@@ -417,6 +577,8 @@ class Grok1Model(nn.Module):
                     name = maybe_remap_kv_scale_name(name, params_dict)
                     if name is None:
                         continue
+                if name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
@@ -463,6 +625,8 @@ class Grok1Model(nn.Module):
                     if "norm.scale" in name:
                         name = name.replace("scale", "weight")
 
+                    if name not in params_dict:
+                        continue
                     param = params_dict[name]
                     weight_loader = getattr(
                         param, "weight_loader", default_weight_loader
@@ -472,9 +636,12 @@ class Grok1Model(nn.Module):
         return loaded_params
 
 
-class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class GrokBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    """Base class for Grok models with shared logic."""
+
     fall_back_to_pt_during_load = False
 
+    # Subclasses should override these
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -483,6 +650,15 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         ],
     }
 
+    # Expert weight naming - subclasses override these
+    ckpt_gate_proj_name: str = "linear"
+    ckpt_down_proj_name: str = "linear_1"
+    ckpt_up_proj_name: str = "linear_v"
+
+    def get_weight_name_remapping(self) -> dict[str, str]:
+        """Return weight name remapping for this version. Override in subclasses."""
+        return {}
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -490,11 +666,15 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         quant_config = vllm_config.quant_config
 
         self.config = config
-
         self.quant_config = quant_config
 
         self.model = Grok1Model(
-            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            ckpt_gate_proj_name=self.ckpt_gate_proj_name,
+            ckpt_down_proj_name=self.ckpt_down_proj_name,
+            ckpt_up_proj_name=self.ckpt_up_proj_name,
+            weight_name_remapping=self.get_weight_name_remapping(),
         )
 
         self.lm_head = ParallelLMHead(
@@ -511,7 +691,9 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             config, "output_multiplier_scale", DEFAULT_OUTPUT_MULTIPLIER_SCALE
         )
         self.logits_processor = LogitsProcessor(
-            config.vocab_size, scale=self.output_multiplier_scale
+            config.vocab_size,
+            scale=self.output_multiplier_scale,
+            soft_cap=getattr(config, "final_logit_softcapping", None),
         )
 
         self.make_empty_intermediate_tensors = (
@@ -552,3 +734,70 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return self.model.get_expert_mapping()
+
+
+class Grok1ForCausalLM(GrokBaseForCausalLM):
+    """Grok1-specific implementation."""
+
+    # Grok1 expert weight naming
+    ckpt_gate_proj_name = "linear"
+    ckpt_down_proj_name = "linear_1"
+    ckpt_up_proj_name = "linear_v"
+
+    def get_weight_name_remapping(self) -> dict[str, str]:
+        # Grok1 uses standard naming, no remapping needed
+        return {}
+
+
+class Grok2ForCausalLM(GrokBaseForCausalLM):
+    """Grok2-specific implementation."""
+
+    # Grok2 has additional packed modules for MLP
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # Grok2 expert weight naming
+    ckpt_gate_proj_name = "w1"
+    ckpt_down_proj_name = "w2"
+    ckpt_up_proj_name = "w3"
+
+    def get_weight_name_remapping(self) -> dict[str, str]:
+        # Grok2 checkpoint uses different naming conventions
+        return {
+            ".self_attn.": ".attn.",
+            ".block_sparse_moe.": ".moe_block.",
+        }
+
+
+# Version dispatch mapping
+_GROK_VERSIONS: dict[str, type[GrokBaseForCausalLM]] = {
+    "grok1": Grok1ForCausalLM,
+    "grok2": Grok2ForCausalLM,
+}
+
+
+class GrokForCausalLM(GrokBaseForCausalLM):
+    """Factory class that dispatches to version-specific implementation."""
+
+    def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        version = _get_grok_version(config)
+
+        instance_cls = _GROK_VERSIONS.get(version)
+        if instance_cls is None:
+            raise ValueError(f"Unsupported Grok version: {version}")
+
+        # Merge class attributes for LoRA/quantization compatibility
+        cls.packed_modules_mapping = dict(cls.packed_modules_mapping)
+        cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
+
+        return instance_cls(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index 0e82e84c4edbe00b6f7adbd429d042ef46597876..1cf6e824fa28bcc6171c4f93b8b88b8eefa29afe 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -33,7 +33,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
@@ -65,6 +64,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (
@@ -427,6 +427,7 @@ class HunYuanSparseMoeBlock(nn.Module):
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
                 reduce_results=False,
+                prefix=f"{prefix}.shared_mlp",
             )
         else:
             self.shared_mlp = None
@@ -705,6 +706,7 @@ class HunYuanModel(nn.Module):
             # Params for weights, fp8 weight scales, fp8 activation scales
             # (param_name, weight_name, expert_id, shard_id)
             return SharedFusedMoE.make_expert_params_mapping(
+                self,
                 ckpt_gate_proj_name="gate_proj",
                 ckpt_down_proj_name="down_proj",
                 ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index be084f4ee0f8ee05600094c4f6933402a9de5686..9afb86a89f7d1f67463360e132ebb9d70c2c4955 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -33,14 +33,13 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import BatchFeature
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layer import MultiHeadAttention
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -81,6 +80,7 @@ from vllm.transformers_utils.configs.hunyuan_vl import (
 from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
 from vllm.transformers_utils.processors.hunyuan_vl_image import smart_resize
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -232,7 +232,7 @@ class HunYuanVisionAttention(nn.Module):
         )
 
         self.scale = self.hidden_size_per_attention_head**-0.5
-        self.attn = MultiHeadAttention(
+        self.attn = MMEncoderAttention(
             self.num_attention_heads_per_partition,
             self.hidden_size_per_attention_head,
             self.scale,
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 3a083870e4b5aef57a888a4fcaeecb1388b1e0ab..f5226baba5da5e5285457d48c671320bbc29d613 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -6,7 +6,7 @@ from collections import defaultdict
 from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
 from itertools import accumulate
-from typing import Annotated, Any, Literal
+from typing import Annotated, Literal
 
 import numpy as np
 import torch
@@ -18,7 +18,7 @@ from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
 from transformers.modeling_utils import no_init_weights
 
 from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
@@ -361,6 +361,7 @@ def _build_hcxvision_hf_processor(
 def init_vision_tower_for_hcxvision(
     vision_config,
     quant_config: QuantizationConfig | None,
+    multimodal_config: MultiModalConfig | None,
     *,
     use_nth_layer: int | None = None,
     require_post_norm: bool | None = None,
@@ -378,6 +379,7 @@ def init_vision_tower_for_hcxvision(
         return CLIPVisionModel(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -386,6 +388,7 @@ def init_vision_tower_for_hcxvision(
         return SiglipVisionModel(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -597,18 +600,13 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
-    def __init__(
-        self,
-        *,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-        **kwargs: Any | None,
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
 
         # init configs
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
         # text_config
         text_config = config.text_config
         if text_config.model_type in ["gpt2", "hyperclovax", "llama"]:
@@ -631,7 +629,8 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         with no_init_weights():  # weight will be loaded in from_pretrained
             self.vision_model = init_vision_tower_for_hcxvision(
                 vision_config,
-                quant_config,
+                quant_config=quant_config,
+                multimodal_config=multimodal_config,
                 use_nth_layer=getattr(config, "use_nth_layer", -1),
                 require_post_norm=False,
                 prefix=maybe_prefix(prefix, "vision_model"),
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index 06b8468e18db929fecb9cc64b8ad717a550e7f96..c78ad64790e845e36624426b1d01c8b73aba7471 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -27,9 +27,9 @@ from transformers.models.idefics2.configuration_idefics2 import (
     Idefics2VisionConfig,
 )
 
-from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -161,8 +161,8 @@ class Idefics2VisionAttention(nn.Module):
             prefix=f"{prefix}.out_proj",
             disable_tp=use_data_parallel,
         )
-        # Use unified MultiHeadAttention with Flash Attention support
-        self.attn = MultiHeadAttention(
+        # Use unified MMEncoderAttention with Flash Attention support
+        self.attn = MMEncoderAttention(
             self.num_heads_per_partition, self.head_dim, self.scale
         )
 
@@ -175,7 +175,7 @@ class Idefics2VisionAttention(nn.Module):
         )  # batch_size, q_len, 3 * num_heads_per_partition * head_dim
         query_states, key_states, value_states = qkv.chunk(3, dim=-1)
 
-        # Use unified MultiHeadAttention implementation
+        # Use unified MMEncoderAttention implementation
         out = self.attn(query_states, key_states, value_states)
         attn_output, _ = self.out_proj(out)
         return attn_output
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 0eed464487865f6a6fb75dd5cef60b55b5f84d57..459043e91da4a20fcb9bfad05786466dd25d515d 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -714,3 +714,21 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLo
             connector="model.connector",
             tower_model="model.vision_model",
         )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        scale_factor = hf_config.scale_factor
+
+        return num_image_tokens * scale_factor**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        scale_factor = hf_config.scale_factor
+
+        return num_vision_tokens // scale_factor**2
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 04c6c70a6793ca8f56203a5b65408e9075456881..b6f5b210496fcfa787d7bed048d18da8baec34f4 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable, Iterable, Mapping, MutableSequence, Set
+from collections.abc import Callable, Iterable, Mapping, MutableSequence
 from typing import (
     TYPE_CHECKING,
     ClassVar,
@@ -98,15 +98,10 @@ class SupportsMultiModal(Protocol):
     `multimodal_config.mm_encoder_tp_mode="data"`.
     """
 
-    merge_by_field_config: ClassVar[bool | None] = None
+    requires_raw_input_tokens: ClassVar[bool] = False
     """
-    [DEPRECATED] A flag that indicates which implementation of
-    `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
-    """
-
-    multimodal_cpu_fields: ClassVar[Set[str] | None] = None
-    """
-    [DEPRECATED] A set indicating CPU-only multimodal fields.
+    A flag that indicates this model processes input id tokens
+    in their raw form and not input embeddings.
     """
 
     _processor_factory: ClassVar[_ProcessorFactories]
@@ -145,6 +140,32 @@ class SupportsMultiModal(Protocol):
         """
         ...
 
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        """
+        Implement this function to enable LoRA support
+        for the tower module of the multi-modal model.
+        Given the number of image tokens, output the number of
+        multi-modal encoder tokens.
+        """
+        ...
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        """
+        Implement this function to enable LoRA support
+        for the connector module of the multi-modal model.
+        Given the number of vision tokens, output the number of
+        multi-modal connector tokens.
+        """
+        ...
+
+    @classmethod
+    def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]:
+        """
+        Return the language model spec:
+        (language model class, language model attr)
+        """
+        return None, None
+
     @overload
     def embed_input_ids(self, input_ids: Tensor) -> Tensor: ...
 
@@ -267,45 +288,25 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
 def supports_multimodal(
     model: type[object] | object,
 ) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
-    res = getattr(model, "supports_multimodal", False)
-
-    if res:
-        # We can remove this starting from v0.14
-        merge_by_field_config = getattr(model, "merge_by_field_config", None)
-        if merge_by_field_config is False:
-            raise ValueError(
-                "`merge_by_field_config=False` is no longer effective, "
-                "please update your model to consider the new batching logic "
-                "in `group_mm_kwargs_by_modality` (refer to "
-                "https://github.com/vllm-project/vllm/issues/26149), "
-                "and then remove the override from your model."
-            )
-        if merge_by_field_config is True:
-            logger.warning_once(
-                "`merge_by_field_config=True` is redundant, "
-                "please remove the override from your model."
-            )
-
-        multimodal_cpu_fields = getattr(model, "multimodal_cpu_fields", None)
-        if multimodal_cpu_fields is not None:
-            raise ValueError(
-                "`multimodal_cpu_fields` is no longer effective, "
-                "please set `keep_on_cpu=True` in `MultiModalFieldConfig` "
-                "(refer to https://github.com/vllm-project/vllm/pull/30181), "
-                "and then remove the override from your model."
-            )
-
-    return res
+    return getattr(model, "supports_multimodal", False)
 
 
 def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
     return getattr(model, "supports_multimodal_raw_input_only", False)
 
 
+def requires_raw_input_tokens(model: type[object] | object) -> bool:
+    return getattr(model, "requires_raw_input_tokens", False)
+
+
 def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool:
     return getattr(model, "supports_encoder_tp_data", False)
 
 
+def supports_mm_encoder_only(model: type[object] | object) -> bool:
+    return getattr(model, "is_mm_encoder_only_model", False)
+
+
 @overload
 def supports_multimodal_pruning(
     model: type[object],
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 134a1d94838048803292366bb8f9769e6fb75daf..e658825e1ab01a179c6b3faaea678c3e3acc5917 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -20,12 +20,13 @@ from vllm.utils.func_utils import supports_kw
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.config.model import AttnTypeStr
-    from vllm.config.pooler import PoolingTypeStr
+    from vllm.config.pooler import SequencePoolingType, TokenPoolingType
     from vllm.model_executor.layers.pooler import Pooler
 else:
     VllmConfig = Any
     Pooler = Any
-    PoolingTypeStr = Any
+    SequencePoolingType = Any
+    TokenPoolingType = Any
     AttnTypeStr = Any
 
 logger = init_logger(__name__)
@@ -155,9 +156,19 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
         MRO of your model class.
     """
 
-    default_pooling_type: ClassVar[PoolingTypeStr] = "LAST"
+    default_seq_pooling_type: ClassVar[SequencePoolingType] = "LAST"
     """
-    Indicates the [vllm.config.pooler.PoolerConfig.pooling_type][]
+    Indicates the [vllm.config.pooler.PoolerConfig.seq_pooling_type][]
+    to use by default.
+
+    You can use the
+    [vllm.model_executor.models.interfaces_base.default_pooling_type][]
+    decorator to conveniently set this field.
+    """
+
+    default_tok_pooling_type: ClassVar[TokenPoolingType] = "ALL"
+    """
+    Indicates the [vllm.config.pooler.PoolerConfig.tok_pooling_type][]
     to use by default.
 
     You can use the
@@ -200,18 +211,31 @@ def is_pooling_model(
 _T = TypeVar("_T", bound=type[nn.Module])
 
 
-def default_pooling_type(pooling_type: PoolingTypeStr):
-    """Decorator to set `VllmModelForPooling.default_pooling_type`."""
+def default_pooling_type(
+    *,
+    seq_pooling_type: SequencePoolingType = "LAST",
+    tok_pooling_type: TokenPoolingType = "ALL",
+):
+    """Decorator to set `VllmModelForPooling.default_*_pooling_type`."""
 
     def func(model: _T) -> _T:
-        model.default_pooling_type = pooling_type  # type: ignore
+        model.default_seq_pooling_type = seq_pooling_type  # type: ignore
+        model.default_tok_pooling_type = tok_pooling_type  # type: ignore
         return model
 
     return func
 
 
-def get_default_pooling_type(model: type[object] | object) -> PoolingTypeStr:
-    return getattr(model, "default_pooling_type", "LAST")
+def get_default_seq_pooling_type(
+    model: type[object] | object,
+) -> SequencePoolingType:
+    return getattr(model, "default_seq_pooling_type", "LAST")
+
+
+def get_default_tok_pooling_type(
+    model: type[object] | object,
+) -> TokenPoolingType:
+    return getattr(model, "default_tok_pooling_type", "ALL")
 
 
 def attn_type(attn_type: AttnTypeStr):
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 38530355a491c5178b7023195a519e19c5b88f24..49a932f122d9f98e58dcb1ea357ef1fee8e279b8 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -15,7 +15,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
@@ -24,6 +23,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -209,7 +209,7 @@ class InternParallelAttention(nn.Module):
             disable_tp=use_data_parallel,
         )
 
-        self.attn = MultiHeadAttention(
+        self.attn = MMEncoderAttention(
             self.num_heads_per_partition, self.head_dim, self.scale
         )
 
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 3ca88646186288d84b4e8e0026d3be48ec2721b8..45628b4feaf851dffd5dc55896452093a21abed8 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -28,7 +28,7 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_classify
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -402,7 +402,7 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         return loaded_params
 
 
-@default_pooling_type("ALL")
+@default_pooling_type(tok_pooling_type="ALL")
 class InternLM2ForRewardModel(InternLM2ForCausalLM):
     is_pooling_model = True
 
@@ -434,9 +434,7 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {"token_classify": Pooler.for_token_classify(pooler_config)}
-        )
+        self.pooler = pooler_for_token_classify(pooler_config)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index cb0414bbc95a8ce80e9f951e1724ce09127e50e8..2b2866d678a8d2f6bdd6f5e045a943a6cc2090dc 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -14,8 +14,8 @@ import torch.nn as nn
 from transformers import PretrainedConfig
 from transformers.utils import torch_int
 
-from vllm.attention.layer import MultiHeadAttention
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -214,8 +214,8 @@ class InternSdpaAttention(nn.Module):
 
         self.projection_layer = nn.Linear(self.dummy_dim, self.embed_dim)
 
-        # Use unified MultiHeadAttention with automatic backend selection
-        self.attn = MultiHeadAttention(self.num_heads, self.head_dim, self.scale)
+        # Use unified MMEncoderAttention with automatic backend selection
+        self.attn = MMEncoderAttention(self.num_heads, self.head_dim, self.scale)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """x shape: (B, N, C)"""
@@ -228,7 +228,7 @@ class InternSdpaAttention(nn.Module):
             q = self.q_norm(q)
             k = self.k_norm(k)
 
-        # Use unified MultiHeadAttention with automatic backend selection
+        # Use unified MMEncoderAttention with automatic backend selection
         x = self.attn(q, k, v)
 
         x = self.projection_layer(x)
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 15f7d4f418e48d5171c90b7ba8cc1bd3ebc90633..048bc49ea6af610645e2d2ae9894f1ea70e27b98 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -7,7 +7,6 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-import os
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Annotated, Any, Literal, TypeAlias, TypeVar
@@ -52,7 +51,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.tokenizers import TokenizerLike
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
-from vllm.utils.torch_utils import set_default_torch_num_threads
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -143,19 +141,7 @@ def build_transform(input_size: int):
             T.Normalize(mean=MEAN, std=STD),
         ]
     )
-    # Image transformation operations (which include tensor computations
-    # on the CPU) can occupy a substantial number of CPU cores, introducing
-    # overhead due to CPU contention. This issue becomes particularly
-    # noticeable when deploying multiple vLLM instances on a single machine.
-    # Therefore, it is necessary to limit the number of threads allocated to
-    # image transformation tasks.
-    num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
-
-    def apply(img):
-        with set_default_torch_num_threads(num_threads):
-            return transform(img)
-
-    return apply
+    return transform
 
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..1901cc6e81c4c4423acbd11edf4ea5cda2087b4d
--- /dev/null
+++ b/vllm/model_executor/models/iquest_loopcoder.py
@@ -0,0 +1,595 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only LoopCoder model compatible with HuggingFace weights."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import replace
+from typing import Any
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention.layer import Attention
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.llama import LlamaMLP
+from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
+
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    make_layers,
+    maybe_prefix,
+)
+
+
+class LoopCoderAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position: int = 4096 * 32,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
+        layer_idx: int = 0,
+    ) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.dual_chunk_attention_config = dual_chunk_attention_config
+
+        # Get loop_num from config, default to 2 if not specified
+        self.loop_num = getattr(config, "loop_num", 2)
+
+        self.loop_window_size = getattr(config, "loop_window_size", 64)
+
+        # Use total number of hidden layers instead of hardcoded 24
+        total_layers = config.num_hidden_layers
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=max_position,
+            rope_parameters=config.rope_parameters,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.attn = nn.ModuleList()
+
+        base_cache_config = cache_config
+
+        for loop_idx in range(self.loop_num):
+            base_layer_idx = extract_layer_index(prefix)
+            unique_layer_idx = loop_idx * total_layers + base_layer_idx
+
+            unique_prefix = prefix.replace(
+                f"layers.{base_layer_idx}", f"layers.{unique_layer_idx}"
+            )
+
+            if loop_idx == 0:
+                loop_cache_config = cache_config
+            else:
+                if base_cache_config is not None:
+                    loop_cache_config = replace(
+                        base_cache_config,
+                        sliding_window=self.loop_window_size,
+                    )
+                else:
+                    loop_cache_config = CacheConfig(
+                        sliding_window=self.loop_window_size,
+                        cache_dtype="auto",
+                    )
+
+            self.attn.append(
+                Attention(
+                    self.num_heads,
+                    self.head_dim,
+                    self.scaling,
+                    num_kv_heads=self.num_kv_heads,
+                    cache_config=loop_cache_config,
+                    quant_config=quant_config,
+                    attn_type=attn_type,
+                    prefix=f"{unique_prefix}.attn",
+                    **{
+                        "layer_idx": unique_layer_idx,
+                        "dual_chunk_attention_config": dual_chunk_attention_config,
+                    }
+                    if dual_chunk_attention_config and loop_idx == 0
+                    else {},
+                )
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        loop_idx: int,
+        gate_proj: LoopGateProjection | None = None,
+    ) -> torch.Tensor:
+        if loop_idx == 0:
+            attn = self.attn[0]
+            qkv, _ = self.qkv_proj(hidden_states)
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+            q, k = self.rotary_emb(positions, q, k)
+            attn_output = attn(q, k, v)
+            output, _ = self.o_proj(attn_output)
+            return output
+        else:
+            global_attn = self.attn[0]
+            local_attn = self.attn[loop_idx]
+            qkv, _ = self.qkv_proj(hidden_states)
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+            q, k = self.rotary_emb(positions, q, k)
+            num_tokens, _ = q.shape
+            num_heads = self.num_heads
+            head_dim = self.head_dim
+
+            q_reshaped = q.view(num_tokens, num_heads, head_dim).transpose(0, 1)
+
+            global_attn_output = global_attn(q, None, None)
+            local_attn_output = local_attn(q, k, v)
+            assert gate_proj is not None, "gate_proj must be provided for loop_idx > 0"
+            gate = gate_proj(q_reshaped)
+            output = global_attn_output * gate + local_attn_output * (1 - gate)
+            output, _ = self.o_proj(output)
+            return output
+
+
+class LoopCoderDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        layer_idx: int = 0,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        self.layer_idx = layer_idx
+        if getattr(config, "is_causal", True):
+            attn_type = AttentionType.DECODER
+        else:
+            attn_type = AttentionType.ENCODER_ONLY
+
+        self.self_attn = LoopCoderAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            attn_type=attn_type,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            layer_idx=self.layer_idx,
+        )
+        self.mlp = LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        loop_idx: int,
+        gate_proj: LoopGateProjection | None = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            loop_idx=loop_idx,
+            gate_proj=gate_proj,
+        )
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+class LoopGateProjection(nn.Module):
+    """Gate projection for mixed attention in Loop 2+.
+
+    Computes: g = sigmoid(linear(Q)) for each head independently.
+    This gate determines how much to use Loop1's KV (global) vs current
+    loop's KV (local).
+
+    Supports tensor parallelism: each GPU handles a subset of heads.
+    The weight matrix has shape [num_heads, head_dim] and is split along
+    the head dimension.
+    """
+
+    def __init__(
+        self,
+        total_num_heads: int,
+        head_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.total_num_heads = total_num_heads
+        self.head_dim = head_dim
+        tp_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.gate_proj = ColumnParallelLinear(
+            head_dim,
+            self.total_num_heads,
+            bias=True,
+            gather_output=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_proj",
+        )
+
+    def forward(self, query: torch.Tensor) -> torch.Tensor:
+        """Compute gate values from query tensor.
+
+        Args:
+            query: [num_heads, num_tokens, head_dim] (vLLM flattened format)
+                where num_heads is the number of heads on this TP rank
+                and num_tokens = batch * seq_len
+
+        Returns:
+            gate: [num_tokens, num_heads * head_dim] (flattened format matching q shape)
+        """
+        num_heads, num_tokens, head_dim = query.shape
+
+        assert num_heads == self.num_heads, (
+            f"Expected {self.num_heads} heads, got {num_heads}"
+        )
+
+        query_flat = query.reshape(-1, head_dim)
+
+        gate_logits_flat, _ = self.gate_proj(query_flat)
+
+        gate_logits = gate_logits_flat.reshape(
+            num_heads, num_tokens, self.num_heads
+        )  # [num_heads, num_tokens, num_heads]
+
+        # Extract diagonal: each head h's query should use output column h
+        # gate_logits[h, :, h] gives the output for head h at each token
+        gate_logits = torch.diagonal(
+            gate_logits, dim1=0, dim2=2
+        )  # [num_tokens, num_heads]
+        gate_logits = gate_logits.transpose(0, 1)  # [num_heads, num_tokens]
+        gate_logits = gate_logits.unsqueeze(-1)  # [num_heads, num_tokens, 1]
+
+        # Apply sigmoid
+        gate = torch.sigmoid(gate_logits)  # [num_heads, num_tokens, 1]
+
+        # Expand and reshape to match q shape: [num_tokens, num_heads * head_dim]
+        gate = gate.transpose(0, 1)  # [num_tokens, num_heads, 1]
+        gate = gate.expand(-1, -1, head_dim)  # [num_tokens, num_heads, head_dim]
+        gate = gate.reshape(
+            num_tokens, num_heads * head_dim
+        )  # [num_tokens, num_heads * head_dim]
+
+        return gate
+
+
+@support_torch_compile(
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    }
+)
+class IQuestLoopCoderModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = LoopCoderDecoderLayer,
+    ):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        # TODO (@robertgshaw2): see if this can be moved out
+        if cache_config.sliding_window is not None and hasattr(
+            config, "max_window_layers"
+        ):
+            assert config.max_window_layers == config.num_hidden_layers, (
+                "Sliding window for some but all layers is not supported. "
+                "This model uses sliding window but `max_window_layers` = {} "
+                "is less than `num_hidden_layers` = {}. Please open an issue "
+                "to discuss this feature.".format(
+                    config.max_window_layers,
+                    config.num_hidden_layers,
+                )
+            )
+
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
+        )
+
+        self.loop_num = getattr(self.config, "loop_num", 2)
+        self.window_size = getattr(self.config, "loop_window_size", 64)
+
+        # Gate projections for Loop 2+ (one per layer)
+        head_dim = config.hidden_size // config.num_attention_heads
+        _, _, self.gate_projections = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: LoopGateProjection(
+                total_num_heads=config.num_attention_heads,
+                head_dim=head_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.gate_projections",
+        )
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: LoopCoderDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+                layer_idx=extract_layer_index(prefix),
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_input_ids(input_ids)
+
+        for loop_idx in range(self.loop_num):
+            for layer_idx, layer in enumerate(
+                self.layers[self.start_layer : self.end_layer]
+            ):
+                # Get the actual layer index (accounting for pipeline parallelism)
+                actual_layer_idx = self.start_layer + layer_idx
+                # Get gate_proj for this layer (only for loop_idx > 0)
+                gate_proj = (
+                    self.gate_projections[actual_layer_idx] if loop_idx > 0 else None
+                )
+                hidden_states = layer(positions, hidden_states, loop_idx, gate_proj)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if "gate_projections" in name:
+                    continue
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name.endswith("scale"):
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                if weight_loader == default_weight_loader:
+                    weight_loader(param, loaded_weight)
+                else:
+                    weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name.startswith("gate_projections."):
+                    if name.endswith(".weight"):
+                        vllm_name = name.replace(".weight", ".gate_proj.weight")
+                    elif name.endswith(".bias"):
+                        vllm_name = name.replace(".bias", ".gate_proj.bias")
+                    else:
+                        continue
+
+                    if vllm_name in params_dict:
+                        param = params_dict[vllm_name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                        loaded_params.add(vllm_name)
+                        continue
+                    continue
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class IQuestLoopCoderForCausalLM(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+
+        self.quant_config = quant_config
+        self.model = IQuestLoopCoderModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d331b95f9159fddf5243839f50dd8409983afc2
--- /dev/null
+++ b/vllm/model_executor/models/isaac.py
@@ -0,0 +1,1503 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable, Iterator, Mapping, Sequence
+from typing import Annotated, Any
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers.image_processing_utils import BatchFeature
+from transformers.tokenization_utils import TensorType
+from typing_extensions import TypedDict, Unpack
+
+from vllm.config import MultiModalConfig, VllmConfig
+from vllm.config.model import ModelConfig
+from vllm.distributed import parallel_state
+from vllm.distributed import utils as dist_utils
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+)
+from vllm.model_executor.models.interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.siglip import SiglipMLP
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import get_tokenizer
+from vllm.tokenizers.hf import get_cached_tokenizer
+from vllm.transformers_utils.config import patch_rope_parameters
+from vllm.transformers_utils.configs import (
+    IsaacConfig,
+    PixelShuffleSiglip2VisionConfig,
+)
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+
+def create_cumulative_seq_lengths(
+    seq_sizes: torch.Tensor, device: torch.device
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Create cumulative sequence lengths for variable-length attention."""
+    cu_seqlens = torch.zeros(len(seq_sizes) + 1, dtype=torch.int32, device=device)
+    cu_seqlens[1:] = seq_sizes.cumsum(0)
+    max_seqlen = (
+        seq_sizes.max()
+        if len(seq_sizes) > 0
+        else torch.tensor(0, dtype=torch.int32, device=device)
+    )
+    return cu_seqlens, max_seqlen
+
+
+class Siglip2VariableSequenceEmbeddings(nn.Module):
+    def __init__(self, config: PixelShuffleSiglip2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = ReplicatedLinear(
+            input_size=config.num_channels * self.patch_size * self.patch_size,
+            output_size=self.embed_dim,
+            return_bias=False,
+        )
+
+        self.num_patches = config.num_patches
+        self.position_embedding_size = int(self.num_patches**0.5)
+        self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
+
+    def positional_embeddings(
+        self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+    ) -> torch.Tensor:
+        # Prepare positional embeddings grid: (1, embed_dim, h, w)
+        positional_embeddings = (
+            self.position_embedding.weight.reshape(
+                self.position_embedding_size, self.position_embedding_size, -1
+            )
+            .permute(2, 0, 1)
+            .unsqueeze(0)
+        )
+
+        _seq_patches, _seq_sizes, spatial_shapes = packed_seq_patches
+        pos_embeds_list = []
+        mode = "bilinear"
+        align_corners = False
+        antialias = True
+        for spatial_shape in spatial_shapes:
+            height, width = int(spatial_shape[0]), int(spatial_shape[1])
+            # Guard to ensure height and width are positive for torch.compile
+            if height > 0 and width > 0:
+                resized_pos_embed = F.interpolate(
+                    positional_embeddings,
+                    size=(height, width),
+                    mode=mode,
+                    align_corners=align_corners,
+                    antialias=antialias,
+                )
+                # Reshape from (1, embed_dim, height, width) to
+                # (height*width, embed_dim)
+                resized_pos_embed = resized_pos_embed.reshape(
+                    self.embed_dim, height * width
+                ).transpose(0, 1)
+            else:
+                # Fallback - should never happen in practice
+                resized_pos_embed = positional_embeddings.reshape(
+                    self.embed_dim,
+                    self.position_embedding_size * self.position_embedding_size,
+                ).transpose(0, 1)[: height * width]
+            pos_embeds_list.append(resized_pos_embed)
+
+        # Concatenate all positional embeddings along the sequence dimension
+        pos_embeds = torch.cat(pos_embeds_list, dim=0)
+        return pos_embeds
+
+    def forward(
+        self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+    ):
+        seq_patches, _seq_sizes, _spatial_shapes = packed_seq_patches
+
+        target_weight = self.patch_embedding.weight
+        seq_patches = seq_patches.to(
+            device=target_weight.device, dtype=target_weight.dtype
+        )
+        patch_embeds = self.patch_embedding(seq_patches)
+        pos_embeds = self.positional_embeddings(packed_seq_patches)
+
+        # Flatten patch embeddings to match positional embeddings format
+        if patch_embeds.dim() == 3:
+            patch_embeds = patch_embeds.view(-1, patch_embeds.size(-1))
+
+        # Add positional embeddings to patch embeddings
+        embeddings = patch_embeds + pos_embeds
+        return embeddings
+
+
+def create_pixel_shuffle_index_map(
+    seq_sizes: torch.Tensor,
+    token_grids: torch.Tensor,
+    scale_factor: int = 1,
+    device: torch.device | None = None,
+) -> torch.Tensor:
+    """
+    Build a gather-index map that tells us, for every *output* token after
+    pixel-shuffle, which `scale_factor**2` *input* tokens are being merged.
+
+    Args
+    ----
+    seq_sizes     : (num_images,)  - #patches in each image (row-major order)
+    token_grids   : (num_images,2) - (height, width) for every image
+    scale_factor  : spatial down-scale factor (≥2)
+    device        : (optional) overrides `seq_sizes.device`
+
+    Returns
+    -------
+    gather_idx : (new_total_seq_len, scale_factor**2) int64 tensor.
+                 gather_idx[i, j] is the *flat* index into the *original*
+                 packed sequence for the j-th sub-patch that forms the
+                 i-th output token.
+    """
+    if device is None:
+        device = seq_sizes.device
+
+    r = int(scale_factor)
+    if r < 2:
+        raise ValueError("`scale_factor` must be ≥ 2")
+
+    # Safety: all spatial dims must be divisible by r
+    # Cannot run under torch compile fullgraph mode hence
+    if not torch.compiler.is_compiling() and not (
+        (token_grids[:, 0] % r == 0).all() and (token_grids[:, 1] % r == 0).all()
+    ):
+        raise AssertionError(
+            "Every (H,W) in `token_grids` must be divisible by "
+            f"scale_factor={r}, got {token_grids.tolist()}"
+        )
+
+    gather_chunks: list[torch.Tensor] = []
+    tok_offset = 0
+
+    for seq_len, (h, w) in zip(seq_sizes.tolist(), token_grids.tolist(), strict=False):
+        # Build the (H, W) grid of flat indices for this image
+        grid = torch.arange(seq_len, device=device, dtype=torch.int64) + tok_offset
+        grid = grid.view(h, w)  # (H, W)
+
+        # -------- identical ordering to your fixed-res routine --------
+        # Step 1: split width into blocks of r
+        grid = grid.view(h, w // r, r)  # (H, W/r, r)
+        # Step 2: now split height into blocks of r
+        grid = grid.view(h // r, r, w // r, r)  # (H/r, r, W/r, r)
+        # Step 3: final permutation to (H/r, W/r, r, r)
+        grid = grid.permute(0, 2, 1, 3).contiguous()  # (H/r, W/r, r, r)
+        # Step 4: each (r, r) block forms one output token
+        gather_chunks.append(grid.reshape(-1, r * r))  # (H*W / r², r²)
+
+        tok_offset += seq_len
+
+    # Concatenate over all images in the packed batch
+    gather_idx = torch.cat(gather_chunks, dim=0)  # (Σ_i HᵢWᵢ/r², r²)
+    return gather_idx
+
+
+def pixel_shuffle_varlen(
+    x: torch.Tensor,
+    token_grids: torch.Tensor,
+    scale_factor: int = 1,
+) -> torch.Tensor:
+    r"""Apply pixel shuffle to a packed vision sequence without unpacking per image.
+
+    Args:
+        x (`torch.Tensor`):
+            Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or
+            `(1, seq_len, hidden_size)` shapes produced by stacking image
+            patches.
+        token_grids (`torch.Tensor`):
+            Integer tensor of shape `(num_images, 2)` whose rows give the
+            `(height, width)` patch grid sizes corresponding to each image
+            segment inside `x`.
+        scale_factor (`int`, *optional*, defaults to 1):
+            Spatial down-sampling factor specific to pixel shuffle. Values
+            greater than one merge `scale_factor**2` neighboring patches into a
+            single embedding channel-group.
+
+    Returns:
+        `torch.Tensor`: Pixel-shuffled embeddings with shape matching the input
+        convention: `(seq_len, hidden_size * scale_factor**2)` when the input
+        was 2D, or `(1, seq_len, hidden_size * scale_factor**2)` if the
+        singleton batch dimension was present.
+
+    Raises:
+        ValueError: If more than one batch item is provided.
+    """
+    keep_batch_dim = x.dim() == 3
+    if keep_batch_dim:
+        if x.size(0) != 1:
+            raise AssertionError("Packed sequence is expected to have batch_size == 1")
+        x_ = x.squeeze(0)  # (seq, embed)
+    else:
+        x_ = x  # (seq, embed)
+
+    embed_dim = x_.size(-1)
+    r = int(scale_factor)
+
+    # Calculate seq_sizes from token_grids
+    seq_sizes = torch.prod(token_grids, dim=-1)
+
+    # Build index map and gather in one go
+    gather_idx = create_pixel_shuffle_index_map(
+        seq_sizes=seq_sizes,
+        token_grids=token_grids,
+        scale_factor=r,
+        device=x_.device,
+    )  # (new_seq, r²)
+
+    # Gather → (new_seq, r², embed_dim)
+    gathered = x_[gather_idx]  # fancy indexing keeps gradient
+
+    # Merge the r² group dimension into channels to finish the shuffle
+    out = gathered.reshape(gathered.size(0), embed_dim * r * r)
+
+    # Restore batch dimension if needed
+    if keep_batch_dim:
+        out = out.unsqueeze(0)
+    return out
+
+
+# ============================================================================
+# Configuration
+# ============================================================================
+
+MAX_PIXELS = 60_000_000  # 60-megapixel ceiling ≈ 8200 × 7300 px
+
+# Vision preprocessing constants
+VISION_MEAN = (0.5, 0.5, 0.5)
+VISION_STD = (0.5, 0.5, 0.5)
+VISION_SCALE = 1 / 255
+
+
+def _make_writeable(arr: np.ndarray) -> np.ndarray:
+    """Return *arr* itself if it is already writeable, otherwise try to flip the
+    write flag in-place and finally fall back to `arr.copy()`.
+    This guarantees the buffer handed to `torch.from_numpy()` is always
+    writeable, silencing the PyTorch warning about undefined behaviour.
+    """
+    if arr.flags.writeable:
+        return arr
+
+    # First, try the cheap path — in-place flag toggle (works for mmap'd arrays
+    # and some shared memory buffers):
+    try:
+        arr.setflags(write=True)
+        return arr  # success: no data copy
+    except ValueError:
+        # Buffer is inherently read-only (e.g. backed by PyAV / PIL): make copy
+        return arr.copy()
+
+
+def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None:
+    if image.width * image.height > MAX_PIXELS:
+        raise ValueError(
+            f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
+        )
+    img = image if image.mode == "RGB" else image.convert("RGB")
+    arr = np.asarray(img)
+    arr = _make_writeable(arr)
+    return torch.from_numpy(arr)
+
+
+def get_image_size_for_max_num_patches(
+    image_height: int,
+    image_width: int,
+    patch_size: int,
+    max_num_patches: int,
+    min_num_patches: int | None = None,
+    eps: float = 1e-5,
+    pixel_shuffle_scale: int = 1,
+) -> tuple[int, int]:
+    r"""Compute a target resolution whose patch grid satisfies patching parametrization.
+
+    Args:
+        image_height (`int`):
+            Height in pixels of the source image prior to any resizing.
+        image_width (`int`):
+            Width in pixels of the source image prior to any resizing.
+        patch_size (`int`):
+            Size of the square patch used by the vision encoder.
+        max_num_patches (`int`):
+            Upper bound on `(height / patch_size) * (width / patch_size)` after
+            resizing.
+        min_num_patches (`int`, *optional*):
+            Lower bound on the number of patches. When provided the image will
+            be scaled up if necessary.
+        eps (`float`, *optional*, defaults to 1e-5):
+            Convergence tolerance for the internal binary search to determine
+            the target dimensions.
+        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
+            Additional stride multiplier applied when pixel shuffle later
+            reduces spatial resolution.
+
+    Returns:
+        `tuple[int, int]`: Height and width (in pixels) that are multiples of
+        `patch_size * pixel_shuffle_scale` and respect both the maximum and
+        optional minimum patch-count constraints.
+    """
+
+    def get_scaled_image_size(scale, original_size, patch_size, pixel_shuffle_scale):
+        scaled_size = scale * original_size
+        divisor = patch_size * pixel_shuffle_scale
+        scaled_size = math.ceil(scaled_size / divisor) * divisor
+        scaled_size = max(divisor, scaled_size)
+        return int(scaled_size)
+
+    # Ensure divisibility
+    divisor = patch_size * pixel_shuffle_scale
+    adjusted_height = math.ceil(image_height / divisor) * divisor
+    adjusted_height = max(divisor, adjusted_height)
+    adjusted_width = math.ceil(image_width / divisor) * divisor
+    adjusted_width = max(divisor, adjusted_width)
+
+    num_patches = (adjusted_height / patch_size) * (adjusted_width / patch_size)
+
+    if min_num_patches is not None and num_patches < min_num_patches:
+        # Scale up
+        scale_min, scale_max = 1.0, 100.0
+        while (scale_max - scale_min) >= eps:
+            scale = (scale_min + scale_max) / 2
+            target_height = get_scaled_image_size(
+                scale, image_height, patch_size, pixel_shuffle_scale
+            )
+            target_width = get_scaled_image_size(
+                scale, image_width, patch_size, pixel_shuffle_scale
+            )
+            num_patches = (target_height / patch_size) * (target_width / patch_size)
+            if num_patches >= min_num_patches:
+                scale_max = scale
+            else:
+                scale_min = scale
+        scale = scale_max
+        target_height = get_scaled_image_size(
+            scale, image_height, patch_size, pixel_shuffle_scale
+        )
+        target_width = get_scaled_image_size(
+            scale, image_width, patch_size, pixel_shuffle_scale
+        )
+        return target_height, target_width
+    elif num_patches <= max_num_patches:
+        return adjusted_height, adjusted_width
+    else:
+        # Scale down
+        scale_min, scale_max = eps / 10, 1.0
+        while (scale_max - scale_min) >= eps:
+            scale = (scale_min + scale_max) / 2
+            target_height = get_scaled_image_size(
+                scale, image_height, patch_size, pixel_shuffle_scale
+            )
+            target_width = get_scaled_image_size(
+                scale, image_width, patch_size, pixel_shuffle_scale
+            )
+            num_patches = (target_height / patch_size) * (target_width / patch_size)
+            if num_patches <= max_num_patches:
+                scale_min = scale
+            else:
+                scale_max = scale
+        scale = scale_min
+        target_height = get_scaled_image_size(
+            scale, image_height, patch_size, pixel_shuffle_scale
+        )
+        target_width = get_scaled_image_size(
+            scale, image_width, patch_size, pixel_shuffle_scale
+        )
+        return target_height, target_width
+
+
+_MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1)
+_STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1)
+
+
+def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int:
+    tokenizer_name = model_config.tokenizer or model_config.model
+    tokenizer = get_cached_tokenizer(
+        get_tokenizer(
+            tokenizer_name,
+            tokenizer_mode=model_config.tokenizer_mode,
+            trust_remote_code=model_config.trust_remote_code,
+            revision=model_config.tokenizer_revision or model_config.revision,
+        )
+    )
+    return tokenizer.encode(vision_token, add_special_tokens=False)[0]
+
+
+def prepare_image_tensor(
+    image: torch.Tensor,
+    scale: float = VISION_SCALE,
+) -> torch.Tensor:
+    r"""Standardize RGB images prior to patch extraction via rescaling and whitening.
+
+    Args:
+        image (`torch.Tensor`):
+            Tensor with shape `(..., height, width, 3)` containing RGB values.
+            The tensor is converted to floating point if needed.
+        scale (`float`, *optional*, defaults to `VISION_SCALE`):
+            Scalar multiplier applied before normalization.
+    Returns:
+        `torch.Tensor`: Normalized tensor with the same shape as the input and
+        dtype `torch.float32`.
+    """
+    if not torch.is_floating_point(image):
+        image = image.float()
+    rescaled = image * scale
+
+    # Use precomputed tensors and move to the correct device if needed
+    mean_tensor = _MEAN_TENSOR.to(image.device)
+    std_tensor = _STD_TENSOR.to(image.device)
+
+    normalized = (rescaled - mean_tensor) / std_tensor
+    return normalized
+
+
+def patchify_vision(image: torch.Tensor, patch_size: int) -> torch.Tensor:
+    r"""Convert normalized images into flattened ViT-style patches.
+
+    Args:
+        image (`torch.Tensor`):
+            Tensor of shape `(num_images, height, width, channels)`.
+        patch_size (`int`):
+            Edge length of the square patches
+
+    Returns:
+        `torch.Tensor`:
+            Patch tensor where each position stores the flattened pixels
+            belonging to that patch.
+
+    Raises:
+        ValueError: If `height` or `width` is not divisible by `patch_size`.
+    """
+    num_images, height, width, channels = image.shape
+    if height % patch_size or width % patch_size:
+        raise ValueError(
+            "Dimensions of images "
+            f"{image.shape} are not divisible by patch_size={patch_size}."
+        )
+    patches = image.reshape(
+        num_images,
+        height // patch_size,
+        patch_size,
+        width // patch_size,
+        patch_size,
+        channels,
+    )
+    patches = patches.permute(0, 1, 3, 2, 4, 5)
+    patches = patches.reshape(
+        num_images,
+        height // patch_size,
+        width // patch_size,
+        channels * patch_size * patch_size,
+    )
+    return patches
+
+
+def process_vision_for_patches(
+    images: torch.Tensor,
+    patch_size: int,
+    max_num_patches: int,
+    min_num_patches: int | None = None,
+    pixel_shuffle_scale: int = 1,
+) -> tuple[torch.Tensor, list[int]]:
+    r"""Resize, normalize, and patchify RGB images for the vision encoder.
+
+    Args:
+        images (`torch.Tensor`):
+            Either `(height, width, channels)` for a single image or
+            `(num_images, height, width, channels)` for a batch. Channels are
+            expected to be RGB.
+        patch_size (`int`):
+            Edge length of square patches; implictly controls resize grid granularity.
+        max_num_patches (`int`):
+            Maximum number of patches allowed after resizing.
+        min_num_patches (`int`, *optional*):
+            Minimum number of patches. If provided, the routine upsamples images
+            as needed to satisfy the lower bound.
+        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
+            Pixel shuffle scale factor; influences the target grid that the
+            function produces.
+
+    Returns:
+        `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)`
+        where `patches` has shape `(num_images, target_h / patch_size, target_w
+        / patch_size, channels * patch_size**2)` and `dims_virtual` encodes
+        effective `(images, height, width)` dimensions after optional pixel
+        shuffling.
+    """
+    # Add batch dim if single image
+    if images.dim() == 3:
+        images = images.unsqueeze(0)
+
+    # Permute to channel first for resize
+    images = images.permute(0, 3, 1, 2)
+
+    # Get target dimensions
+    _, _, orig_height, orig_width = images.shape
+    target_height, target_width = get_image_size_for_max_num_patches(
+        orig_height,
+        orig_width,
+        patch_size,
+        max_num_patches,
+        min_num_patches=min_num_patches,
+        pixel_shuffle_scale=pixel_shuffle_scale,
+    )
+
+    # Resize
+    images = F.interpolate(
+        images,
+        size=(target_height, target_width),
+        mode="bilinear",
+        align_corners=False,
+    )
+
+    # Back to channel last
+    images = images.permute(0, 2, 3, 1)
+
+    # Normalize
+    images = prepare_image_tensor(images)
+
+    # Patchify
+    patches = patchify_vision(images, patch_size=patch_size)
+
+    # Calculate dimensions for the patches
+    n_images, h_patches, w_patches, _ = patches.shape
+    dims_virtual = (
+        [1, h_patches, w_patches]
+        if pixel_shuffle_scale == 1
+        else [1, h_patches // pixel_shuffle_scale, w_patches // pixel_shuffle_scale]
+    )
+
+    return patches, dims_virtual
+
+
+class IsaacImageProcessorKwargs(TypedDict, total=False):
+    patch_size: int
+    max_num_patches: int
+    min_num_patches: int
+    pixel_shuffle_scale: int
+
+
+class IsaacImageProcessor:
+    patch_size = 16
+    max_num_patches = 6144
+    min_num_patches = 256
+    pixel_shuffle_scale = 2
+
+    valid_kwargs = IsaacImageProcessorKwargs
+    model_input_names = ["pixel_values", "image_grid_thw"]
+
+    def __init__(self, kwargs):
+        self.patch_size = kwargs.pop("patch_size", self.patch_size)
+        self.vision_max_num_patches = kwargs.pop(
+            "vision_max_num_patches", self.max_num_patches
+        )
+        self.vision_min_num_patches = kwargs.pop(
+            "vision_min_num_patches", self.min_num_patches
+        )
+        self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2)
+
+    def preprocess(
+        self,
+        images: list[torch.Tensor],
+        return_tensors: str | TensorType | None,
+        **kwargs: Unpack[IsaacImageProcessorKwargs],
+    ) -> BatchFeature:
+        """Preprocess images into format compatibile with vLLM input processing."""
+
+        all_pixel_values: list[torch.Tensor] = []
+        all_image_grids: list[torch.Tensor] = []
+
+        for image in images:
+            image_tensor = extract_image_pil(image)
+
+            patches, dims_virtual = process_vision_for_patches(
+                image_tensor,
+                patch_size=self.patch_size,
+                max_num_patches=self.vision_max_num_patches,
+                min_num_patches=self.vision_min_num_patches,
+                pixel_shuffle_scale=self.pixel_shuffle_scale,
+            )
+
+            # Isaac packs a dummy temporal dim for images
+            patches = patches.unsqueeze(1)  # [N, T=1, Hp, Wp, D]
+
+            hp, wp, dim = patches.shape[-3], patches.shape[-2], patches.shape[-1]
+            current_num_patches = hp * wp
+            pixel_values = patches.reshape(current_num_patches, dim)  # [N_tokens, D]
+
+            # Use real patch dimensions for image_grid_thw, not virtual dimensions
+            # This ensures the vision model receives correct grid info for pixel shuffle
+            dims_real = [1, hp, wp]  # Real patch dimensions
+            image_grid_thw = torch.tensor(dims_real).unsqueeze(0)
+
+            all_pixel_values.append(pixel_values)
+            all_image_grids.append(image_grid_thw)
+
+        if all_pixel_values:
+            final_pixel_values = torch.cat(all_pixel_values, dim=0)
+            final_image_grids = torch.cat(all_image_grids, dim=0)
+        else:
+            final_pixel_values = torch.empty(0, 0)
+            final_image_grids = torch.empty(0, 3)
+
+        return BatchFeature(
+            data={
+                "pixel_values": final_pixel_values,
+                "image_grid_thw": final_image_grids,
+            },
+            tensor_type=return_tensors,
+        )
+
+
+class IsaacProcessor:
+    """Processor wrapper (tokenizer + IsaacImageProcessor)."""
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        self.image_token = kwargs.pop("image_token", "<image>")
+        self.image_processor = image_processor or IsaacImageProcessor(kwargs)
+        self.tokenizer = tokenizer
+
+    def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
+        result = {}
+
+        if images is not None:
+            image_inputs = self.image_processor.preprocess(images, **kwargs)
+            image_grid_thw = image_inputs["image_grid_thw"]
+            result.update(image_inputs)
+
+            if text is not None:
+                if not isinstance(text, list):
+                    text = [text]
+
+                text = text.copy()  # below lines change text in-place
+                merge_length = self.image_processor.pixel_shuffle_scale**2
+                index = 0
+                for i in range(len(text)):
+                    while self.image_token in text[i]:
+                        num_image_tokens = image_grid_thw[index].prod() // merge_length
+                        text[i] = text[i].replace(
+                            self.image_token, "<|placeholder|>" * num_image_tokens, 1
+                        )
+                        index += 1
+                    text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>")
+
+        if text is not None:
+            result.update(self.tokenizer(text, **kwargs))
+
+        return BatchFeature(result)
+
+    def apply_chat_template(
+        self,
+        messages: list[dict[str, Any]],
+        tokenize: bool = False,
+        add_generation_prompt: bool = False,
+        **kwargs,
+    ) -> Any:
+        # Convert mixed content messages to simple text format
+        processed_messages = []
+
+        for message in messages:
+            if "content" in message and isinstance(message["content"], list):
+                # Handle mixed content (text + image)
+                text_parts = []
+                for content_item in message["content"]:
+                    if content_item.get("type") == "text":
+                        text_parts.append(content_item.get("text", ""))
+                    elif content_item.get("type") == "image":
+                        # Replace image with vision token
+                        text_parts.append(self.image_token)
+
+                processed_message = {
+                    "role": message.get("role", "user"),
+                    "content": "".join(text_parts),
+                }
+                processed_messages.append(processed_message)
+            else:
+                # Regular text message
+                processed_messages.append(message)
+
+        return self.tokenizer.apply_chat_template(
+            processed_messages,
+            tokenize=tokenize,
+            add_generation_prompt=add_generation_prompt,
+            **kwargs,
+        )
+
+
+class IsaacProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self) -> IsaacConfig:
+        if hasattr(self.ctx, "get_hf_config"):
+            original_config = self.ctx.get_hf_config()
+            # Map HF config parameters to our vLLM config parameters
+            return IsaacConfig(
+                # Vision parameters - map from HF names
+                vision_config=getattr(original_config, "vision_config", None),
+                vision_patch_size=getattr(original_config, "video_patch_size", 16),
+                vision_max_num_patches=getattr(
+                    original_config, "vision_max_num_patches", 256
+                ),
+                vision_min_num_patches=getattr(
+                    original_config, "vision_min_num_patches", None
+                ),
+                pixel_shuffle_scale=getattr(original_config, "pixel_shuffle_scale", 1),
+                max_sequence_length=getattr(
+                    original_config, "max_sequence_length", 16384
+                ),
+                vision_token=getattr(original_config, "vision_token", "<image>"),
+                vision_attn_implementation=getattr(
+                    original_config, "vision_attn_implementation", None
+                ),
+            )
+        return IsaacConfig()
+
+    def get_hf_processor(self, **kwargs) -> IsaacProcessor:
+        hf_config = self.get_hf_config()
+        processor_kwargs = {
+            "image_token": hf_config.vision_token,
+        }
+        processor_kwargs.update(kwargs)
+        return self.ctx.get_hf_processor(IsaacProcessor, **processor_kwargs)
+
+    def get_tokenizer(self):
+        return self.ctx.tokenizer
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self.get_hf_config()
+        # Get target dimensions
+        target_height, target_width = get_image_size_for_max_num_patches(
+            9999999,
+            9999999,
+            hf_config.video_patch_size,
+            hf_config.vision_max_num_patches,
+            min_num_patches=hf_config.vision_min_num_patches,
+            pixel_shuffle_scale=hf_config.pixel_shuffle_scale,
+        )
+        return ImageSize(width=target_width, height=target_height)
+
+    def get_image_processor(self, **kwargs) -> IsaacImageProcessor:
+        return self.get_hf_processor(**kwargs).image_processor
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"image": None}
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        hf_config = self.get_hf_config()
+        num_vision_tokens = hf_config.vision_max_num_patches // (
+            hf_config.pixel_shuffle_scale**2
+        )
+        return {"image": num_vision_tokens}
+
+
+class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        hf_processor = self.info.get_hf_processor()
+        image_token: str = hf_processor.image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+        }
+
+
+class IsaacImagePixelInputs(TensorSchema):
+    """
+    Schema for validating Isaac image inputs.
+
+    Dimensions:
+        - np: Number of patches
+        - d: Patch dimension
+        - ni: Number of images
+
+    The schema enforces:
+        - pixel_values must be 2D: (num_patches, patch_dim)
+        - image_grid_thw must be 2D: (num_images, 3)
+          where 3 represents [T, H, W]
+    """
+
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("np", "d"),
+    ]
+
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
+
+
+class IsaacMultiModalProcessor(BaseMultiModalProcessor):
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        # Configure multimodal fields for Isaac model
+        image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
+        image_grid_sizes = image_grid_thw.prod(-1)
+
+        return {
+            "pixel_values": MultiModalFieldConfig.flat_from_sizes(
+                "image", image_grid_sizes
+            ),
+            "image_grid_thw": MultiModalFieldConfig.batched("image"),
+        }
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, Any],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
+
+        pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2)
+        merge_length = pixel_shuffle_scale**2
+
+        def get_replacement_isaac(item_idx: int):
+            out_item = out_mm_kwargs["image"][item_idx]
+            grid_thw = out_item["image_grid_thw"].data
+            assert isinstance(grid_thw, torch.Tensor)
+
+            feature_size = int(grid_thw.prod()) // merge_length
+            repl_full = "<|image_pad|>" * feature_size
+            return PromptUpdateDetails.select_text(repl_full, "<|image_pad|>")
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement=get_replacement_isaac,
+            )
+        ]
+
+
+class Siglip2VisionAttention(nn.Module):
+    def __init__(
+        self,
+        config: PixelShuffleSiglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
+        prefix: str = "",
+        multimodal_config: MultiModalConfig | None = None,
+    ) -> None:
+        super().__init__()
+
+        use_data_parallel = (
+            multimodal_config.mm_encoder_tp_mode == "data"
+            if multimodal_config
+            else False
+        )
+        self.tp_size = (
+            1
+            if use_data_parallel
+            else parallel_state.get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            config.hidden_size, config.num_attention_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            config.num_attention_heads, self.tp_size
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=config.hidden_size,
+            head_size=self.hidden_size_per_attention_head,
+            total_num_heads=config.num_attention_heads,
+            total_num_kv_heads=config.num_attention_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+            disable_tp=use_data_parallel,
+        )
+        self.out_proj = RowParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+            disable_tp=use_data_parallel,
+        )
+
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_attention_heads_per_partition,
+            head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
+            prefix=f"{prefix}.attn",
+            multimodal_config=multimodal_config,
+        )
+
+    def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
+        seq_len, bs, _ = qkv.shape
+        q, k, v = qkv.chunk(3, dim=2)
+        new_shape = (
+            seq_len,
+            bs,
+            self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        )
+        q, k, v = (x.view(*new_shape) for x in (q, k, v))
+        return q, k, v
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: torch.Tensor | None,
+    ) -> torch.Tensor:
+        batch_size, _, _ = hidden_states.shape
+        if batch_size != 1:
+            raise ValueError("packed variable-length attention expects batch_size=1")
+        x = rearrange(hidden_states, "b s d -> s b d")
+        x, _ = self.qkv_proj(x)
+        q, k, v = self.split_qkv(x)
+        q, k, v = (rearrange(t, "s b h d -> b s h d") for t in (q, k, v))
+
+        context_layer = self.attn(
+            query=q,
+            key=k,
+            value=v,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        context_layer = rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
+
+        output, _ = self.out_proj(context_layer)
+        output = rearrange(output, "s b d -> b s d")
+        return output
+
+
+class Siglip2EncoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PixelShuffleSiglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
+        prefix: str = "",
+        multimodal_config: MultiModalConfig | None = None,
+    ) -> None:
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.self_attn = Siglip2VisionAttention(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            multimodal_config=multimodal_config,
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: torch.Tensor | None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class Siglip2Encoder(nn.Module):
+    def __init__(
+        self,
+        config: PixelShuffleSiglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
+        prefix: str = "",
+        multimodal_config: MultiModalConfig | None = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                Siglip2EncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    multimodal_config=multimodal_config,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        *,
+        cu_seqlens: torch.Tensor | None = None,
+        max_seqlen: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=max_seqlen,
+            )
+        return hidden_states
+
+
+class Siglip2VisionTransformer(nn.Module):
+    def __init__(
+        self,
+        config: PixelShuffleSiglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        multimodal_config: MultiModalConfig | None = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        embed_dim = config.hidden_size
+
+        self.embeddings = Siglip2VariableSequenceEmbeddings(config)
+        self.pixel_shuffle_scale_factor = config.pixel_shuffle_scale_factor
+        self.encoder = Siglip2Encoder(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+            multimodal_config=multimodal_config,
+        )
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        packed_seq_patches: tuple[torch.Tensor, torch.Tensor],
+    ) -> torch.Tensor:
+        r"""
+        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
+            Tensor containing the spatial dimensions (height, width)
+            of the input images.
+        """
+
+        seq_patches, token_grids = packed_seq_patches
+        seq_sizes = torch.prod(token_grids, dim=-1)
+
+        # Get embeddings from packed sequence
+        hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids))
+
+        # Add a pseudo batch dimension for the encoder
+        hidden_states = hidden_states.unsqueeze(0)
+
+        cu_seqlens, max_seqlen = create_cumulative_seq_lengths(
+            seq_sizes, hidden_states.device
+        )
+
+        hidden_states = self.encoder(
+            inputs_embeds=hidden_states,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        hidden_states = self.post_layernorm(hidden_states)
+
+        if self.pixel_shuffle_scale_factor > 1:
+            hidden_states = pixel_shuffle_varlen(
+                x=hidden_states,
+                token_grids=token_grids,
+                scale_factor=self.pixel_shuffle_scale_factor,
+            )
+        # Remove the pseudo batch dimension we added earlier
+        hidden_states = hidden_states.squeeze(0)
+
+        # return last_hidden_state
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class IsaacVisionEmbedding(nn.Module):
+    def __init__(
+        self,
+        vision_cfg: PixelShuffleSiglip2VisionConfig,
+        hidden_dim: int,
+        output_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.transformer = Siglip2VisionTransformer(
+            vision_cfg,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=maybe_prefix(prefix, "0"),
+        )
+        self.linear_fc1 = ColumnParallelLinear(
+            hidden_dim,
+            4 * hidden_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "1"),
+            return_bias=False,
+        )
+        self.act = nn.SiLU()
+        self.linear_fc2 = RowParallelLinear(
+            4 * hidden_dim,
+            output_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "3"),
+            return_bias=False,
+        )
+
+    def forward(
+        self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor]
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(packed_seq_patches)
+        hidden_states = self.linear_fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_fc2(hidden_states)
+        return hidden_states
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    IsaacMultiModalProcessor,
+    info=IsaacProcessingInfo,
+    dummy_inputs=IsaacDummyInputsBuilder,
+)
+class IsaacForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    supports_encoder_tp_data = True
+
+    # To ensure correct weight loading and mapping.
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "lm_head.": "language_model.lm_head.",
+            "model.text_model.lm_head.": "language_model.lm_head.",
+            "model.text_model.": "language_model.model.",
+            "model.vision_embedding.0": "vision_embedding.transformer",
+            "model.vision_embedding.1": "vision_embedding.linear_fc1",
+            "model.vision_embedding.2": "vision_embedding.act",
+            "model.vision_embedding.3": "vision_embedding.linear_fc2",
+            "model.vision_embedding.": "vision_embedding.",
+            "model.lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        }
+    )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<image>"
+
+        raise ValueError("Only image modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
+        super().__init__()
+        config: IsaacConfig = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.multimodal_config = vllm_config.model_config.multimodal_config
+
+        head_dim = config.head_dim
+        calculated_mrope_section = [
+            head_dim // 4,  # 2x more for temporal dim
+            head_dim // 8,
+            head_dim // 8,
+        ]
+
+        self.vision_token_id = _resolve_vision_token_id(
+            vllm_config.model_config, config.vision_token
+        )
+        config.image_token_id = self.vision_token_id
+
+        text_cfg = getattr(config, "text_config", None)
+        target_cfg = (
+            text_cfg
+            if text_cfg is not None and not isinstance(text_cfg, dict)
+            else config
+        )
+
+        rope_scaling = getattr(target_cfg, "rope_scaling", None)
+        if rope_scaling is None and target_cfg is config:
+            rope_scaling = getattr(config, "_rope_scaling", None)
+
+        patch_rope_parameters(target_cfg)
+        rope_parameters = target_cfg.rope_parameters
+        rope_parameters["mrope_section"] = calculated_mrope_section
+        if rope_scaling is not None and "mrope_interleaved" in rope_scaling:
+            rope_parameters.setdefault(
+                "mrope_interleaved", rope_scaling["mrope_interleaved"]
+            )
+        target_cfg.rope_parameters = rope_parameters
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            architectures=["Qwen3ForCausalLM"],
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+        vision_cfg = config.vision_config
+        if vision_cfg is None:
+            raise ValueError("IsaacConfig should always have vision_config")
+        attn_impl = (
+            config.vision_attn_implementation
+            if config.vision_attn_implementation is not None
+            else getattr(config, "_attn_implementation", None)
+        )
+        if attn_impl is not None:
+            vision_cfg._attn_implementation = attn_impl
+
+        hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2)
+        self.vision_embedding = IsaacVisionEmbedding(
+            vision_cfg=vision_cfg,
+            hidden_dim=hidden_dim,
+            output_dim=config.hidden_size,
+            quant_config=quant_config,
+            multimodal_config=self.multimodal_config,
+            prefix=maybe_prefix(prefix, "vision_embedding"),
+        )
+
+    def iter_mm_grid_hw(
+        self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec]
+    ) -> Iterator[tuple[int, int, int]]:
+        spatial_merge_size = self.config.vision_config.pixel_shuffle_scale_factor
+        for mm_feature in sorted(mm_features, key=lambda f: f.mm_position.offset):
+            offset = mm_feature.mm_position.offset
+            if mm_feature.modality == "image":
+                t, h, w = mm_feature.data["image_grid_thw"].data.tolist()
+                assert t == 1, f"Image must have 1 frame, got {t}"
+                yield offset, h // spatial_merge_size, w // spatial_merge_size
+            else:
+                raise ValueError(f"Unsupported modality: {mm_feature.modality}")
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        llm_pos_ids_list = []
+        st = 0
+        for offset, llm_grid_h, llm_grid_w in self.iter_mm_grid_hw(
+            input_tokens, mm_features
+        ):
+            text_len = offset - st
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx
+            )
+
+            grid_indices = np.indices((1, llm_grid_h, llm_grid_w)).reshape(3, -1)
+            grid_indices[0, :] = grid_indices[0, :] + text_len + st_idx
+            llm_pos_ids_list.append(grid_indices)
+            st = offset + llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1][0, -1] + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx
+            )
+
+        llm_positions = np.concatenate(llm_pos_ids_list, axis=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+
+        return torch.from_numpy(llm_positions), mrope_position_delta
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> IsaacImagePixelInputs | None:
+        pixel_values = kwargs.get("pixel_values")
+        image_grid_thw = kwargs.get("image_grid_thw")
+        if pixel_values is None or image_grid_thw is None:
+            return None
+
+        # TensorSchema will automatically validate shapes on initialization
+        return IsaacImagePixelInputs(
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+        )
+
+    def _process_image_input(
+        self,
+        image_input: IsaacImagePixelInputs,
+    ) -> tuple[torch.Tensor, ...]:
+        pixel_values = image_input["pixel_values"]
+        image_grid_thw = image_input["image_grid_thw"]
+        if pixel_values.numel() == 0:
+            return ()
+
+        device = next(self.language_model.parameters()).device
+        dtype = self.vision_embedding.linear_fc1.weight.dtype
+        pixel_values = pixel_values.to(device=device, dtype=dtype)
+        spatial_grids = image_grid_thw[:, 1:3].to(device, dtype=torch.int32)
+
+        vision_embeddings = self.vision_embedding((pixel_values, spatial_grids))
+        merge_size = self.config.vision_config.pixel_shuffle_scale_factor
+        sizes = spatial_grids.prod(-1) // (merge_size * merge_size)
+        return tuple(vision_embeddings.split(sizes.tolist()))
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return ()
+        return self._process_image_input(image_input)
+
+    def get_multimodal_embeddings(
+        self, **kwargs: object
+    ) -> MultiModalEmbeddings | None:
+        # Backward compatibility for older runners.
+        embeddings = self.embed_multimodal(**kwargs)
+        if not embeddings:
+            return []
+        return embeddings
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        return self.language_model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="vision_embedding.linear_fc2",  # The final linear layer
+            tower_model="vision_embedding",
+        )
diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py
index 01e75338a8ced9ae009a433bdb67cd3bfab72942..aacc4abd43e615ba291ef47f56c95faa763d3f75 100644
--- a/vllm/model_executor/models/jais2.py
+++ b/vllm/model_executor/models/jais2.py
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
     ParallelLMHead,
     VocabParallelEmbedding,
 )
@@ -167,7 +166,6 @@ class Jais2Attention(nn.Module):
 
         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
             rope_parameters=getattr(config, "rope_parameters", None),
             is_neox_style=is_neox_style,
@@ -304,17 +302,12 @@ class Jais2Model(nn.Module):
 
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
         self.padding_idx = config.pad_token_id
-        lora_vocab = (
-            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
-            if lora_config
-            else 0
-        )
-        self.vocab_size = config.vocab_size + lora_vocab
+
+        self.vocab_size = config.vocab_size
         self.org_vocab_size = config.vocab_size
         if get_pp_group().is_first_rank or (
             config.tie_word_embeddings and get_pp_group().is_last_rank
@@ -456,29 +449,15 @@ class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
         self.config = config
-        self.lora_config = lora_config
-
         self.model = self._init_model(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
 
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=(
-                    DEFAULT_VOCAB_PADDING_SIZE
-                    # We need bigger padding if using lora for kernel
-                    # compatibility
-                    if not lora_config
-                    else lora_config.lora_vocab_padding_size
-                ),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -487,7 +466,7 @@ class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
 
             logit_scale = getattr(config, "logit_scale", 1.0)
             self.logits_processor = LogitsProcessor(
-                self.unpadded_vocab_size, config.vocab_size, logit_scale
+                config.vocab_size, scale=logit_scale
             )
         else:
             self.lm_head = PPMissingLayer()
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index b2ad12be1e3586587bc4e8e3eaf160b661217067..91b58a83e09a7667a7d05fe77d71e95006bd55ac 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -27,7 +27,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
     MambaStateDtypeCalculator,
     MambaStateShapeCalculator,
 )
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -78,6 +78,7 @@ class JambaMoE(nn.Module):
                 bias=False,
                 quant_config=None,
                 params_dtype=params_dtype,
+                prefix=f"{prefix}.router",
             )
 
         self.experts = FusedMoE(
@@ -377,6 +378,7 @@ class JambaModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -594,16 +596,4 @@ class JambaForSequenceClassification(JambaForCausalLM):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config, classifier=self.score
-                ),
-                "classify": Pooler.for_classify(
-                    pooler_config, classifier=self.score, act_fn="classify"
-                ),
-                "score": Pooler.for_classify(
-                    pooler_config, classifier=self.score, act_fn="score"
-                ),
-            }
-        )
+        self.pooler = DispatchPooler.for_seq_cls(pooler_config, classifier=self.score)
diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py
index 8bba7b62882f1f6030be08b4697da36185e5ca2b..c03fa211a4dfabb8a8bf2fa04582790dbad6ffde 100644
--- a/vllm/model_executor/models/jina_vl.py
+++ b/vllm/model_executor/models/jina_vl.py
@@ -10,7 +10,7 @@ from vllm.config import ModelConfig, VllmConfig
 from vllm.inputs import TokensPrompt
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors
 
@@ -27,15 +27,23 @@ logger = init_logger(__name__)
 
 
 class JinaVLScorer(nn.Module):
-    def __init__(self, model_config: "ModelConfig"):
+    def __init__(self, model_config: "ModelConfig", prefix: str = ""):
         super().__init__()
         config = model_config.hf_config.get_text_config()
         head_dtype = model_config.head_dtype
         self.dense = ColumnParallelLinear(
-            config.hidden_size, config.hidden_size, params_dtype=head_dtype, bias=True
+            config.hidden_size,
+            config.hidden_size,
+            params_dtype=head_dtype,
+            bias=True,
+            prefix=f"{prefix}.dense",
         )
         self.out_proj = RowParallelLinear(
-            config.hidden_size, config.num_labels, params_dtype=head_dtype, bias=True
+            config.hidden_size,
+            config.num_labels,
+            params_dtype=head_dtype,
+            bias=True,
+            prefix=f"{prefix}.out_proj",
         )
 
     def forward(self, x, **kwargs):
@@ -94,20 +102,10 @@ class JinaVLForSequenceClassification(
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.score = JinaVLScorer(vllm_config.model_config)
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config, classifier=self.score
-                ),
-                "classify": Pooler.for_classify(
-                    pooler_config, classifier=self.score, act_fn="classify"
-                ),
-                "score": Pooler.for_classify(
-                    pooler_config, classifier=self.score, act_fn="score"
-                ),
-            }
+        self.score = JinaVLScorer(
+            vllm_config.model_config, prefix=maybe_prefix(prefix, "score")
         )
+        self.pooler = DispatchPooler.for_seq_cls(pooler_config, classifier=self.score)
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
diff --git a/vllm/model_executor/models/kanana_v.py b/vllm/model_executor/models/kanana_v.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e1667c1e489ec7dcadf8b6f752b961059ba4697
--- /dev/null
+++ b/vllm/model_executor/models/kanana_v.py
@@ -0,0 +1,756 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable, Mapping, Sequence
+from functools import partial
+from typing import Annotated, Literal, TypeAlias
+
+import numpy as np
+import regex as re
+import torch
+from einops import rearrange
+from PIL import Image
+from timm.layers import LayerNorm2d
+from timm.layers.pos_embed import resample_abs_pos_embed
+from timm.models.regnet import RegStage
+from torch import nn
+from transformers import BatchFeature
+from transformers.modeling_outputs import BaseModelOutput
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
+
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .qwen2_vl import Qwen2VisionTransformer
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
+
+logger = init_logger(__name__)
+
+
+class KananaVImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - np: The total number of patches over all images in the batch
+        - cps: Number of channels * patch_size * patch_size
+        - ni: Number of images
+    """
+
+    type: Literal["pixel_values"]
+
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("np", "cps"),
+    ]
+
+    vision_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
+
+
+KananaVImageInputs: TypeAlias = KananaVImagePixelInputs
+
+
+def build_pos_embeds(
+    config: Qwen2VLVisionConfig,
+    num_input_tokens: int,
+    vision_hidden_size: int,
+) -> nn.Parameter | None:
+    """Build positional embeddings for the visual encoder output."""
+    if config.pos_emb:
+        pos_emb = nn.Parameter(torch.zeros(1, num_input_tokens, vision_hidden_size))
+        nn.init.trunc_normal_(pos_emb, mean=0.0, std=0.02)
+    else:
+        pos_emb = None
+
+    return pos_emb
+
+
+def build_mlp(
+    depth: int,
+    hidden_size: int,
+    output_hidden_size: int,
+) -> nn.Sequential:
+    """Simple SiLU-activated MLP used as a projector readout."""
+    layers = [nn.Linear(hidden_size, output_hidden_size)]
+    for _ in range(1, depth):
+        layers.append(nn.SiLU())
+        layers.append(nn.Linear(output_hidden_size, output_hidden_size))
+    return nn.Sequential(*layers)
+
+
+class PatchMerge(nn.Module):
+    """Merge neighboring patches spatially to reduce resolution."""
+
+    def __init__(self, merge_size: int) -> None:
+        super().__init__()
+        self.merge_size = merge_size
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        channel_last: bool = False,
+    ) -> torch.Tensor:
+        """Merge patches by `merge_size x merge_size`."""
+        if channel_last:
+            x = rearrange(x, "B H W D -> B D H W")
+        _, _, H, W = x.shape
+        merged_x = rearrange(
+            x,
+            "B D (H h2) (W w2) -> B (D h2 w2) H W",
+            h2=self.merge_size,
+            w2=self.merge_size,
+        )
+        return merged_x
+
+
+class DynamicCAbstractor(nn.Module):
+    """Dynamic C-Abstractor based on RegNet blocks."""
+
+    def __init__(
+        self,
+        config: Qwen2VLVisionConfig,
+        num_input_tokens: int,
+    ) -> None:
+        super().__init__()
+        assert hasattr(config, "merge_size"), "merge_size must be provided."
+        self.config = config
+        self.merge_size = config.merge_size
+        self.pos_emb_size = config.pos_emb_size
+        if num_input_tokens == -1:
+            num_input_tokens = config.pos_emb_size
+        self.num_input_tokens = num_input_tokens
+        self.pos_emb = build_pos_embeds(
+            config, num_input_tokens, config.encoder_hidden_size
+        )
+        self.build_net()
+
+    def _load_from_state_dict(self, state_dict, *args, **kwargs) -> None:
+        if not state_dict:
+            return
+
+        if self.pos_emb is not None:
+            key_re = re.compile(r"[\w,.]*abstractor[\w,.]*pos_emb")
+            pos_emb_key = None
+            for key in state_dict:
+                if key_re.match(key):
+                    pos_emb_key = key
+                    break
+
+            assert pos_emb_key is not None
+            # update old ckpt compatible with current code
+            pos_emb = state_dict[pos_emb_key]
+            if pos_emb.size(1) == self.pos_emb.size(1) + 1:
+                # remove obsolete first pos emb (for cls token originally)
+                state_dict[pos_emb_key] = pos_emb[:, 1:]
+
+        super()._load_from_state_dict(state_dict, *args, **kwargs)
+
+    def build_net(self) -> None:
+        encoder_hidden_size = self.config.encoder_hidden_size
+        hidden_size = self.config.hidden_size
+        output_hidden_size = self.config.output_hidden_size
+        depth = self.config.depth
+        mlp_depth = self.config.mlp_depth
+
+        RegBlock = partial(
+            RegStage,
+            stride=1,
+            dilation=1,
+            act_layer=nn.SiLU,
+            norm_layer=LayerNorm2d,
+        )
+
+        s1 = RegBlock(
+            depth,
+            encoder_hidden_size,
+            hidden_size,
+        )
+        sampler = PatchMerge(merge_size=self.merge_size)
+        s2 = RegBlock(
+            depth,
+            self.merge_size**2 * hidden_size,
+            hidden_size,
+        )
+
+        if depth:
+            self.net = nn.ModuleList([s1, sampler, s2])
+            self.readout = build_mlp(mlp_depth, hidden_size, output_hidden_size)
+        else:
+            self.net = sampler
+            self.readout = build_mlp(mlp_depth, encoder_hidden_size, output_hidden_size)
+
+    def forward(
+        self,
+        flattened_visual_embeds: torch.Tensor,
+        grid_thw: torch.Tensor,
+        **unused_kwargs: object,
+    ) -> BaseModelOutput:
+        """Apply the dynamic abstractor over flattened visual embeddings."""
+        n_token_loc = torch.prod(grid_thw, dim=1)
+        split_visual_embeds = torch.split(flattened_visual_embeds, n_token_loc.tolist())
+
+        flattened_visual_embeds = []
+        for _visual_embeds, _grid_thw in zip(split_visual_embeds, grid_thw):
+            T, H, W = _grid_thw
+            assert T == 1, "T must be 1. Video is not supported yet."
+            reshaped_visual_embeds = rearrange(
+                _visual_embeds, "(t h w) d -> 1 t h w d", t=T, h=H, w=W
+            )
+            # remove temporal dim
+            reshaped_visual_embeds = reshaped_visual_embeds[:, 0]
+
+            if self.pos_emb is not None:
+                # interpolate pos emb and add to visual embeds
+                _local_pos_emb = resample_abs_pos_embed(
+                    posemb=self.pos_emb,
+                    old_size=tuple([int(self.pos_emb_size**0.5)] * 2),
+                    new_size=(H, W),
+                    num_prefix_tokens=0,
+                )
+                _local_pos_emb = rearrange(
+                    _local_pos_emb,
+                    "1 (h w) d -> 1 h w d",
+                    h=H,
+                    w=W,
+                )
+                reshaped_visual_embeds = reshaped_visual_embeds + _local_pos_emb
+
+            reshaped_visual_embeds = self._forward(
+                reshaped_visual_embeds,
+                input_size=(H, W),
+            )
+            flattened_visual_embeds.append(reshaped_visual_embeds)
+        reshaped_visual_embeds = torch.cat(flattened_visual_embeds, dim=0)
+        return BaseModelOutput(last_hidden_state=reshaped_visual_embeds)
+
+    def _forward(
+        self,
+        x: torch.Tensor,
+        input_size: tuple[int, int],
+    ) -> torch.Tensor:
+        h, w = input_size
+        x = rearrange(x, "1 h w d -> 1 d h w", h=h, w=w)
+        if self.config.depth:
+            x = self.net[0](x)
+            x = self.net[1](x)
+            x = self.net[2](x)
+        else:
+            # When depth=0, self.net is a single PatchMerge module
+            x = self.net(x)
+        x = rearrange(x, "1 d h w -> (h w) d")
+        x = self.readout(x)
+        return x
+
+
+class CustomQwen2VLVE(Qwen2VisionTransformer):
+    """Thin wrapper around the Qwen2-VL used as a vision encoder.
+
+    This mirrors the original HF-based vision encoder used in Kanana-V, but
+    reuses vLLM's optimized `Qwen2VisionTransformer` building blocks.
+    """
+
+    def __init__(self, config: Qwen2VLVisionConfig) -> None:
+        super().__init__(
+            vision_config=config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=None,
+            prefix="",
+        )
+
+        # Kanana-V uses its own projector/abstractor instead of the Qwen2
+        # built-in patch merger, so we drop the merger module to keep the
+        # parameter set compatible with the original checkpoint.
+        if hasattr(self, "merger"):
+            del self.merger
+
+    @classmethod
+    def _from_config(cls, config: Qwen2VLVisionConfig) -> "CustomQwen2VLVE":
+        """Drop-in replacement for the HF `_from_config` constructor."""
+        return cls(config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        grid_thw: torch.Tensor,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+    ) -> tuple | BaseModelOutput:
+        """Run the vision transformer and optionally return intermediate states.
+
+        Unlike the base `Qwen2VisionTransformer`, this wrapper exposes the
+        pre-merger patch-level representations and a HF-style `BaseModelOutput`
+        so that the existing projector / abstractor code can be reused.
+        """
+        assert return_dict, "Only return_dict=True is supported."
+
+        # Patchify
+        x = pixel_values.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)  # (num_patches, embed_dim)
+
+        # Prepare grid and rotary embeddings – mirror base implementation.
+        if isinstance(grid_thw, list):
+            grid_thw_list = grid_thw
+            grid_thw_np = np.array(grid_thw, dtype=np.int32)
+        else:
+            grid_thw_list = grid_thw.tolist()
+            grid_thw_np = grid_thw.cpu().numpy()
+
+        rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list)
+
+        # Compute cu_seqlens in numpy then move to device, same as base model.
+        cu_seqlens = np.repeat(
+            grid_thw_np[:, 1] * grid_thw_np[:, 2],
+            grid_thw_np[:, 0],
+        ).cumsum(axis=0, dtype=np.int32)
+        cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens])
+        cu_seqlens = torch.from_numpy(cu_seqlens).to(
+            self.device,
+            non_blocking=True,
+        )
+
+        # Shape to (S, B, D) with batch dimension 1 as expected by the blocks.
+        x = x.unsqueeze(1)
+
+        # Pre-compute seqlens for attention backend.
+        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
+
+        encoder_states = () if output_hidden_states else None
+
+        for blk in self.blocks:
+            if output_hidden_states:
+                # Store patch-level states (S, D).
+                encoder_states = encoder_states + (x.squeeze(1),)
+
+            x = blk(
+                x,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb_cos=rotary_pos_emb_cos,
+                rotary_pos_emb_sin=rotary_pos_emb_sin,
+                max_seqlen=max_seqlen,
+            )
+
+        # Final hidden state at patch level (S, D).
+        hidden_states = x.squeeze(1)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+        )
+
+    def get_num_tokens(self) -> int:
+        # Not used in the current Kanana-V pipeline, kept for API compatibility.
+        return -1
+
+
+class KananaVProcessingInfo(BaseProcessingInfo):
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"image": None}
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        max_image_size, _ = self._get_vision_info(
+            image_width=9999,
+            image_height=9999,
+            num_frames=1,
+        )
+        return max_image_size
+
+    def _get_vision_info(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        num_frames: int = 1,
+        do_resize: bool = True,
+    ) -> tuple[ImageSize, int]:
+        image_processor = self.ctx.get_hf_processor().image_processor
+        smart_resize = resolve_obj_by_qualname(
+            f"{type(image_processor).__module__}.smart_resize"
+        )
+
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        patch_size = vision_config.patch_size
+        merge_size = vision_config.spatial_merge_size
+        temporal_patch_size = vision_config.temporal_patch_size
+
+        if do_resize:
+            resized_height, resized_width = smart_resize(
+                height=image_height,
+                width=image_width,
+                factor=patch_size * merge_size,
+                min_pixels=image_processor.min_pixels,
+                max_pixels=image_processor.max_pixels,
+            )
+            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
+        else:
+            preprocessed_size = ImageSize(width=image_width, height=image_height)
+
+        # NOTE: Frames are padded to be divisible by `temporal_patch_size`
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L294
+        padded_num_frames = num_frames + num_frames % temporal_patch_size
+
+        grid_t = max(padded_num_frames // temporal_patch_size, 1)
+        grid_h = preprocessed_size.height // patch_size
+        grid_w = preprocessed_size.width // patch_size
+
+        num_patches = grid_t * grid_h * grid_w
+        num_vision_tokens = num_patches // (merge_size**2)
+
+        return preprocessed_size, num_vision_tokens
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        target_width, target_height = self.get_image_size_with_most_features()
+        num_vision_tokens = self._get_vision_info(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=1,
+        )[1]
+        return {"image": num_vision_tokens}
+
+
+class KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+        return "<image>" * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+        return {
+            "image": self._get_dummy_images(
+                width=9999, height=9999, num_images=num_images
+            ),
+        }
+
+
+class KananaVMultiModalProcessor(BaseMultiModalProcessor[KananaVProcessingInfo]):
+    """vLLM multimodal processor for Kanana-V (text + image)."""
+
+    @property
+    def media_token_id(self) -> int:
+        return self.info.get_hf_config().text_config.eos_token_id + 1
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        """Run the underlying HF processor on text and image data."""
+        # Text-only input is handled as a special case here.
+        if not mm_data or not mm_data.get("images", []):
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        # Images
+        image_inputs = mm_data.get("images", [])
+        pixel_sizes = []
+        if not isinstance(image_inputs[0], Image.Image):
+            image_inputs = [Image.fromarray(image) for image in image_inputs]
+
+        image_processor = self.info.get_hf_processor().image_processor
+        processor_output = [image_processor(image) for image in image_inputs]
+        pixel_values = [o["pixel_values"] for o in processor_output]
+        image_meta = [o["image_meta"] for o in processor_output]
+        # list of dict -> dict of list
+        image_meta = {k: [d[k] for d in image_meta] for k in image_meta[0]}
+
+        for pixel_value in pixel_values:
+            pixel_sizes.append(pixel_value.shape[0])
+        # flattened pixel_values for single example (already includes batch dim)
+        pixel_values = torch.concat(pixel_values, dim=0)
+
+        tokenizer = self.info.get_tokenizer()
+        media_token = tokenizer.convert_ids_to_tokens([self.media_token_id])[0]
+        prompt_replaced = prompt.replace("<image>", media_token)
+        input_ids = tokenizer.encode(prompt_replaced)
+        input_ids = torch.tensor(input_ids)
+
+        # Ensure HF output is consistent with vLLM prompt-update expectations:
+        # if the HF tokenizer emits exactly 1 placeholder token per image, expand
+        # it to `T*H*W` placeholder tokens per image so placeholder detection works.
+        num_images = len(image_inputs)
+        image_token_thw = torch.tensor(image_meta["image_token_thw"])
+        per_image_token_counts = image_token_thw.prod(dim=1).tolist()
+        expected_total = int(sum(int(x) for x in per_image_token_counts))
+
+        n_placeholders = int((input_ids == self.media_token_id).sum().item())
+        if n_placeholders == num_images and expected_total != num_images:
+            expanded: list[int] = []
+            img_i = 0
+            for tok in input_ids.tolist():
+                if tok == self.media_token_id and img_i < num_images:
+                    expanded.extend(
+                        [self.media_token_id] * int(per_image_token_counts[img_i])
+                    )
+                    img_i += 1
+                else:
+                    expanded.append(tok)
+            input_ids = input_ids.new_tensor(expanded)
+
+        combined_outputs = dict(
+            # Add batch dimension to input_ids.
+            input_ids=input_ids.unsqueeze(0),
+            pixel_values=pixel_values,
+            vision_grid_thw=torch.tensor(image_meta["vision_grid_thw"]),
+            image_token_thw=torch.tensor(image_meta["image_token_thw"]),
+            pixel_sizes=torch.tensor(pixel_sizes),
+        )
+        return BatchFeature(combined_outputs, tensor_type="pt")
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        def get_replacement(idx: int) -> Sequence[int]:
+            out_item = out_mm_kwargs["image"][idx]
+            image_token_thw = out_item["image_token_thw"].data
+            assert isinstance(image_token_thw, torch.Tensor)
+
+            num_tokens = int(image_token_thw.prod().item())
+            return [self.media_token_id] * num_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement=get_replacement,
+            ),
+        ]
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        pixel_sizes = hf_inputs.get("pixel_sizes", torch.empty(0))
+
+        mm_fields_config = dict(
+            pixel_values=MultiModalFieldConfig.flat_from_sizes("image", pixel_sizes),
+            vision_grid_thw=MultiModalFieldConfig.batched("image"),
+            image_token_thw=MultiModalFieldConfig.batched("image"),
+        )
+        return mm_fields_config
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    KananaVMultiModalProcessor,
+    info=KananaVProcessingInfo,
+    dummy_inputs=KananaVDummyInputsBuilder,
+)
+class KananaVForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<image>"
+        else:
+            raise ValueError(f"Unsupported modality: {modality}")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        self.config = config
+
+        self.vision_model = CustomQwen2VLVE._from_config(config.vision_config)
+        self.abstractor = DynamicCAbstractor(
+            config.projector_config, num_input_tokens=self.vision_model.get_num_tokens()
+        )
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "model"),
+            architectures=["LlamaForCausalLM"],
+        )
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> KananaVImageInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        vision_grid_thw = kwargs.pop("vision_grid_thw", None)
+
+        if pixel_values is None:
+            return None
+
+        if vision_grid_thw is None:
+            raise ValueError(
+                "vision_grid_thw is required when pixel_values is provided"
+            )
+
+        # Normalize pixel_values to 2D tensor (num_patches, channels*patch*patch)
+        if isinstance(pixel_values, torch.Tensor):
+            if pixel_values.ndim == 2:
+                pass  # Already in expected shape
+            elif pixel_values.ndim == 3:
+                pixel_values = pixel_values.flatten(0, 1)
+            else:
+                raise ValueError(
+                    f"pixel_values should be 2D or batched 3D tensor. "
+                    f"Got ndim: {pixel_values.ndim} "
+                    f"(shape={pixel_values.shape})"
+                )
+        else:
+            pixel_values = torch.concat(pixel_values)
+
+        # Normalize vision_grid_thw to 2D tensor (num_images, 3)
+        if isinstance(vision_grid_thw, torch.Tensor):
+            if vision_grid_thw.ndim == 3:
+                vision_grid_thw = vision_grid_thw.flatten(0, 1)
+        else:
+            vision_grid_thw = torch.concat(vision_grid_thw)
+
+        return KananaVImagePixelInputs(
+            type="pixel_values",
+            pixel_values=pixel_values,
+            vision_grid_thw=vision_grid_thw,
+        )
+
+    def _process_image_input(self, image_input: KananaVImageInputs) -> torch.Tensor:
+        pixel_values = image_input["pixel_values"]
+        vision_grid_thw = image_input["vision_grid_thw"]
+
+        image_metas = {"vision_grid_thw": vision_grid_thw}
+        visual_embeds = self.forward_and_project_vision(pixel_values, image_metas)
+
+        merge_size = self.abstractor.merge_size
+        batch_size = vision_grid_thw.size(0)
+        multi_modal_embeddings: tuple[torch.Tensor, ...] = ()
+        sample_index = 0
+        for i in range(batch_size):
+            t, h, w = (
+                vision_grid_thw[i][0],
+                vision_grid_thw[i][1] // merge_size,
+                vision_grid_thw[i][2] // merge_size,
+            )
+            num_tokens = t * h * w
+            visual_embed = visual_embeds[sample_index : sample_index + num_tokens]
+            multi_modal_embeddings += (visual_embed,)
+            sample_index += num_tokens
+
+        return multi_modal_embeddings
+
+    def _get_visual_feature_at(
+        self,
+        v_output: Sequence[torch.Tensor],
+        layer_index: int | Sequence[int],
+    ) -> torch.Tensor:
+        if isinstance(layer_index, (list, tuple)):
+            visual_features = torch.stack(v_output, dim=1)[
+                :, layer_index
+            ]  # [B, n_scales, L, dim]
+        else:
+            visual_features = v_output[layer_index]  # [B, L, dim]
+        return visual_features
+
+    def forward_vision(
+        self,
+        pixel_values: torch.Tensor,
+        image_metas: dict | None = None,
+    ) -> torch.Tensor:
+        vision_model_args = {
+            "pixel_values": pixel_values,
+            "return_dict": True,
+            "output_hidden_states": True,
+            "grid_thw": image_metas["vision_grid_thw"],
+        }
+        v_outputs = self.vision_model(**vision_model_args)
+        layer_index = self.config.projector_config.feature_layer_index
+        visual_features = self._get_visual_feature_at(
+            v_outputs.hidden_states, layer_index
+        )
+        return visual_features
+
+    def forward_projector(
+        self,
+        visual_features: torch.Tensor,
+        image_metas: dict | None = None,
+    ) -> torch.Tensor:
+        visual_embeds = self.abstractor(
+            visual_features,
+            grid_thw=image_metas["vision_grid_thw"],
+        )["last_hidden_state"]
+        return visual_embeds
+
+    def forward_and_project_vision(
+        self,
+        pixel_values: torch.Tensor,
+        image_metas: dict | None = None,
+    ) -> torch.Tensor:
+        assert pixel_values is not None
+        visual_features = self.forward_vision(pixel_values, image_metas=image_metas)
+        visual_embeds = self.forward_projector(visual_features, image_metas=image_metas)
+        return visual_embeds
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return []
+
+        return self._process_image_input(image_input)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        hidden_states = self.language_model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.language_model.compute_logits(hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index fcf88953ba20fb40a7bb5866da42297202659676..8e6b6642591d30399bd1de0e8daf203d044c2320 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -16,13 +16,13 @@ from transformers.feature_extraction_utils import BatchFeature
 from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from transformers.utils import torch_int
 
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -339,15 +339,11 @@ def apply_rotary_pos_emb_flashatt(
     k: torch.Tensor,
     cos: torch.Tensor,
     sin: torch.Tensor,
+    apply_rotary_emb: ApplyRotaryEmb,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     cos = cos.chunk(2, dim=-1)[0].contiguous()
     sin = sin.chunk(2, dim=-1)[0].contiguous()
 
-    apply_rotary_emb = ApplyRotaryEmb(
-        enforce_enable=True,
-        enable_fp32_compute=True,
-    )
-
     q_embed = apply_rotary_emb(q, cos, sin)
     k_embed = apply_rotary_emb(k, cos, sin)
 
@@ -404,11 +400,17 @@ class KeyeSiglipAttention(nn.Module):
         self.attn = MMEncoderAttention(
             num_heads=self.num_heads,
             head_size=self.head_dim,
+            scale=self.scale,
             num_kv_heads=self.num_kv_heads,
             prefix=f"{prefix}.attn",
             multimodal_config=multimodal_config,
         )
 
+        self.apply_rotary_emb = ApplyRotaryEmb(
+            enforce_enable=True,
+            enable_fp32_compute=True,
+        )
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -447,7 +449,7 @@ class KeyeSiglipAttention(nn.Module):
                 self.num_kv_heads,
                 self.head_dim,
             )
-            q, k = apply_rotary_pos_emb_flashatt(q, k, cos, sin)
+            q, k = apply_rotary_pos_emb_flashatt(q, k, cos, sin, self.apply_rotary_emb)
             v = v.view(
                 *v.shape[:-1],
                 self.num_kv_heads,
@@ -511,6 +513,7 @@ class KeyeSiglipEncoderLayer(nn.Module):
         self.mlp = SiglipMLP(
             config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=f"{prefix}.mlp",
         )
 
diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py
index 4562b2202c5ec86c843ac183940f55d99293dd6c..d149c364240678ea5b3c1e3a44f991f89f0be7f8 100644
--- a/vllm/model_executor/models/kimi_linear.py
+++ b/vllm/model_executor/models/kimi_linear.py
@@ -560,6 +560,7 @@ class KimiLinearForCausalLM(
             # Params for weights, fp8 weight scales, fp8 activation scales
             # (param_name, weight_name, expert_id, shard_id)
             expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                self,
                 ckpt_gate_proj_name="w1",
                 ckpt_down_proj_name="w2",
                 ckpt_up_proj_name="w3",
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index 85267ccda8a916b4313c5a6c092fa47e780a8938..b5436ca1af056080ce2ec7e24df919b2a67f460d 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -325,7 +325,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         self.hidden_size = config.text_config.hidden_size
         self.vision_tower = MoonVitPretrainedModel(
             config.vision_config,
-            self.use_data_parallel,
+            multimodal_config=model_config.multimodal_config,
             prefix=maybe_prefix(prefix, "vision_tower"),
         )
 
@@ -462,6 +462,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
             # Params for weights, fp8 weight scales, fp8 activation scales
             # (param_name, weight_name, expert_id, shard_id)
             expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                self,
                 ckpt_gate_proj_name="gate_proj",
                 ckpt_down_proj_name="down_proj",
                 ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index 70804e0a843e86f89c04be41c8a8748cf053f078..6677eb9f93e8fb4b4832c98b068695bef31684d6 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -486,6 +486,7 @@ class Lfm2MoeModel(nn.Module):
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..d87b23d00cbaa46b460c9898fd952b9ce5c4719a
--- /dev/null
+++ b/vllm/model_executor/models/lfm2_vl.py
@@ -0,0 +1,732 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Annotated, Literal
+
+import torch
+import torch.nn as nn
+from transformers import BatchFeature
+from transformers.activations import ACT2FN
+from transformers.models.lfm2_vl import Lfm2VlProcessor
+from transformers.models.lfm2_vl.configuration_lfm2_vl import Lfm2VlConfig
+from transformers.models.lfm2_vl.image_processing_lfm2_vl_fast import (
+    Lfm2VlImageProcessorFast,
+    find_closest_aspect_ratio,
+    round_by_factor,
+)
+
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.forward_context import set_forward_context
+from vllm.model_executor.layers.mamba.mamba_utils import (
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdateDetails,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .interfaces import (
+    IsHybrid,
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .siglip2 import Siglip2Model
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+
+
+class Lfm2VLImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Number of images in the prompt
+        - bn: Batch size * number of images
+        - d: Number of dimensions
+        - fd: Number of features per dimension
+    """
+
+    type: Literal["pixel_values"] = "pixel_values"
+    pixel_values: Annotated[torch.Tensor, TensorShape("bn", "d", "fd")]
+    spatial_shapes: Annotated[torch.Tensor, TensorShape("bn", 2)]
+    num_patches: Annotated[torch.Tensor, TensorShape("b")]
+
+
+LFM2VLImageInputs = Lfm2VLImagePixelInputs
+
+
+class Lfm2VLProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config(Lfm2VlConfig)
+
+    def get_hf_processor(self, **kwargs):
+        return self.ctx.get_hf_processor(Lfm2VlProcessor, **kwargs)
+
+    def get_image_processor(self, **kwargs: object) -> Lfm2VlImageProcessorFast:
+        return self.get_hf_processor(**kwargs).image_processor
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"image": None}
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        processor = self.get_image_processor()
+        max_image_tokens = processor.max_image_tokens
+        encoder_patch_size = processor.encoder_patch_size
+        downsample_factor = processor.downsample_factor
+        max_pixels = max_image_tokens * (encoder_patch_size**2) * (downsample_factor**2)
+        side = int(math.sqrt(max_pixels))
+        return ImageSize(width=side, height=side)
+
+    def _is_image_too_large(
+        self,
+        height: int,
+        width: int,
+        max_image_tokens: int,
+        encoder_patch_size: int,
+        downsample_factor: int,
+        max_pixels_tolerance: float,
+    ) -> bool:
+        """Check if the image is too large to be processed as one tile."""
+        total_factor = encoder_patch_size * downsample_factor
+
+        h_bar = max(encoder_patch_size, round_by_factor(height, total_factor))
+        w_bar = max(encoder_patch_size, round_by_factor(width, total_factor))
+        return (
+            h_bar * w_bar
+            > max_image_tokens
+            * encoder_patch_size**2
+            * downsample_factor**2
+            * max_pixels_tolerance
+        )
+
+    def smart_resize(
+        self,
+        height: int,
+        width: int,
+        downsample_factor: int,
+        min_image_tokens: int,
+        max_image_tokens: int,
+        encoder_patch_size: int,
+    ) -> tuple[int, int]:
+        total_factor = encoder_patch_size * downsample_factor
+        smart_resize_min_pixels = (
+            min_image_tokens * encoder_patch_size**2 * downsample_factor**2
+        )
+        smart_resize_max_pixels = (
+            max_image_tokens * encoder_patch_size**2 * downsample_factor**2
+        )
+
+        h_bar = max(total_factor, round_by_factor(height, total_factor))
+        w_bar = max(total_factor, round_by_factor(width, total_factor))
+
+        if h_bar * w_bar > smart_resize_max_pixels:
+            beta = math.sqrt((height * width) / smart_resize_max_pixels)
+            h_bar = max(
+                total_factor, math.floor(height / beta / total_factor) * total_factor
+            )
+            w_bar = max(
+                total_factor, math.floor(width / beta / total_factor) * total_factor
+            )
+        elif h_bar * w_bar < smart_resize_min_pixels:
+            beta = math.sqrt(smart_resize_min_pixels / (height * width))
+            h_bar = math.ceil(height * beta / total_factor) * total_factor
+            w_bar = math.ceil(width * beta / total_factor) * total_factor
+
+        return w_bar, h_bar
+
+    def _target_ratios(self, min_tiles: int, max_tiles: int) -> list[tuple[int, int]]:
+        ratios = [
+            (w, h)
+            for n in range(min_tiles, max_tiles + 1)
+            for w in range(1, n + 1)
+            for h in range(1, n + 1)
+            if min_tiles <= w * h <= max_tiles
+        ]
+        return sorted(set(ratios), key=lambda x: x[0] * x[1])
+
+    def _get_grid_layout(
+        self,
+        height: int,
+        width: int,
+        min_tiles: int,
+        max_tiles: int,
+        tile_size: int,
+    ) -> tuple[int, int]:
+        aspect_ratio = width / height
+        target_ratios = self._target_ratios(min_tiles, max_tiles)
+        # find best matching grid configuration
+        grid_width, grid_height = find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, width, height, tile_size
+        )
+        total_patches = grid_width * grid_height
+        return grid_width, grid_height, total_patches
+
+    def _get_image_feature_grid_size(
+        self,
+        image_width: int,
+        image_height: int,
+        processor: Lfm2VlProcessor | None,
+    ) -> tuple[int, int]:
+        if processor is None:
+            processor = self.get_image_processor()
+
+        downsample_factor = processor.image_processor.downsample_factor
+        encoder_patch_size = processor.image_processor.encoder_patch_size
+        max_pixels_tolerance = processor.image_processor.max_pixels_tolerance
+        min_tiles = processor.image_processor.min_tiles
+        max_tiles = processor.image_processor.max_tiles
+        max_image_tokens = processor.image_processor.max_image_tokens
+        tile_size = processor.image_processor.tile_size
+
+        do_image_splitting = not min_tiles == max_tiles == 1
+        is_image_large = self._is_image_too_large(
+            height=image_height,
+            width=image_width,
+            max_image_tokens=max_image_tokens,
+            encoder_patch_size=encoder_patch_size,
+            downsample_factor=downsample_factor,
+            max_pixels_tolerance=max_pixels_tolerance,
+        )
+
+        # Big image will be cropped into patches and small images are just resized
+        if is_image_large and do_image_splitting:
+            grid_width, grid_height, total_patches = self._get_grid_layout(
+                image_height,
+                image_width,
+                min_tiles=min_tiles,
+                max_tiles=max_tiles,
+                tile_size=tile_size,
+            )
+        else:
+            grid_width = grid_height = total_patches = 1
+
+        if grid_width * grid_height != 1:  # Thumbnail
+            total_patches += 1
+
+        return grid_width, grid_height, total_patches
+
+    def get_num_patches(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        processor: Lfm2VlProcessor | None,
+    ) -> int:
+        _, _, total_patches = self._get_image_feature_grid_size(
+            image_width=image_width,
+            image_height=image_height,
+            processor=processor,
+        )
+        return total_patches
+
+    def get_image_repl(
+        self,
+        image_width: int,
+        image_height: int,
+        spatial_shapes: torch.Tensor,
+        processor: Lfm2VlProcessor | None,
+    ) -> str:
+        if processor is None:
+            processor = self.get_hf_processor()
+
+        grid_placeholder = "<|img_row_{n_h}_col_{n_w}|>"
+        image_token = processor.image_token
+        image_start_token = processor.image_start_token
+        image_end_token = processor.image_end_token
+        image_thumbnail_token = processor.image_thumbnail_token
+
+        num_thumbnail_tokens, num_tokens_per_tile = self.get_num_image_tokens(
+            spatial_shapes=spatial_shapes,
+            processor=processor,
+        )
+        tile_img_placeholder = grid_placeholder + (image_token * num_tokens_per_tile)
+
+        grid_w, grid_h, _ = self._get_image_feature_grid_size(
+            image_width=image_width,
+            image_height=image_height,
+            processor=processor,
+        )
+
+        if grid_w > 1 or grid_h > 1:
+            tiles_placeholder: list[str] = [
+                tile_img_placeholder.format(n_h=i + 1, n_w=j + 1)
+                for i in range(grid_h)
+                for j in range(grid_w)
+            ]
+
+            if num_thumbnail_tokens > 0:
+                tiles_placeholder.append(
+                    image_thumbnail_token + (image_token * num_thumbnail_tokens)
+                )
+        else:
+            tiles_placeholder = [image_token * num_thumbnail_tokens]
+
+        placeholder = "".join(
+            itertools.chain([image_start_token], tiles_placeholder, [image_end_token])
+        )
+        return placeholder
+
+    def get_num_image_tokens(
+        self,
+        *,
+        spatial_shapes: torch.Tensor,
+        processor: Lfm2VlProcessor | None,
+    ) -> tuple[int, int]:
+        tile_size = processor.image_processor.tile_size
+        downsample_factor = processor.image_processor.downsample_factor
+        encoder_patch_size = processor.image_processor.encoder_patch_size
+        num_thumbnail_tokens = spatial_shapes[-1].prod() // (downsample_factor**2)
+        num_patches_tile = tile_size // encoder_patch_size
+        dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor)
+        num_tiles_tokens = dwn_num_patches_tile * dwn_num_patches_tile
+        return num_thumbnail_tokens, num_tiles_tokens
+
+
+class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+        }
+
+
+class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]):
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        # Text-only input not supported in composite processor
+        if not (images := mm_data.get("images", [])):
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        processed_outputs = super()._call_hf_processor(
+            prompt,
+            mm_data,
+            mm_kwargs,
+            tok_kwargs,
+        )
+
+        parsed_images = (
+            self._get_data_parser()
+            .parse_mm_data({"image": images})
+            .get_items("image", ImageProcessorItems)
+        )
+        image_sizes = [
+            parsed_images.get_image_size(i) for i in range(len(parsed_images))
+        ]
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
+
+        num_patches = [
+            self.info.get_num_patches(
+                image_width=size.width,
+                image_height=size.height,
+                processor=hf_processor,
+            )
+            for size in image_sizes
+        ]
+        processed_outputs["num_patches"] = torch.tensor(num_patches)
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        num_patches = hf_inputs.get("num_patches", torch.empty(0))
+
+        return dict[str, MultiModalFieldConfig](
+            pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches),
+            spatial_shapes=MultiModalFieldConfig.flat_from_sizes(
+                "image", num_patches, keep_on_cpu=True
+            ),
+            num_patches=MultiModalFieldConfig.batched("image", keep_on_cpu=True),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptReplacement]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        image_token = hf_processor.image_token
+
+        def get_image_replacement_lfm2vl(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
+            out_item = out_mm_kwargs["image"][item_idx]
+            spatial_shapes = out_item["spatial_shapes"].data
+            assert isinstance(spatial_shapes, torch.Tensor)
+            image_repl = self.info.get_image_repl(
+                image_width=image_size.width,
+                image_height=image_size.height,
+                spatial_shapes=spatial_shapes,
+                processor=hf_processor,
+            )
+            return PromptUpdateDetails.select_text(
+                image_repl,
+                embed_text=image_token,
+            )
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=image_token,
+                replacement=get_image_replacement_lfm2vl,
+            )
+        ]
+
+
+class Lfm2VLMultiModalProjector(nn.Module):
+    def __init__(
+        self, config: Lfm2VlConfig, use_data_parallel: bool = False, prefix: str = ""
+    ):
+        super().__init__()
+        self.use_data_parallel = use_data_parallel
+
+        in_channels = config.vision_config.hidden_size * (config.downsample_factor**2)
+        self.factor = config.downsample_factor
+        self.projector_use_layernorm = config.projector_use_layernorm
+        if self.projector_use_layernorm:
+            self.layer_norm = nn.LayerNorm(in_channels)
+        self.linear_1 = nn.Linear(
+            in_channels,
+            config.projector_hidden_size,
+            bias=config.projector_bias,
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(
+            config.projector_hidden_size,
+            config.text_config.hidden_size,
+            bias=config.projector_bias,
+        )
+
+    def forward(self, image_features: torch.Tensor):
+        image_features = self.pixel_unshuffle(image_features)
+        if self.projector_use_layernorm:
+            image_features = self.layer_norm(image_features)
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+    def pixel_unshuffle(self, hidden_states: torch.Tensor):
+        batch_size, width, height, channels = hidden_states.size()
+        hidden_states = hidden_states.reshape(
+            batch_size, width, height // self.factor, channels * self.factor
+        )
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(
+            batch_size,
+            height // self.factor,
+            width // self.factor,
+            channels * self.factor**2,
+        )
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        return hidden_states
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    Lfm2VLMultiModalProcessor,
+    info=Lfm2VLProcessingInfo,
+    dummy_inputs=Lfm2VLDummyInputsBuilder,
+)
+class Lfm2VLForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, IsHybrid
+):
+    merge_by_field_config = True
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "lm_head.": "language_model.lm_head.",
+            "model.language_model.": "language_model.model.",
+            "model.vision_tower.": "vision_tower.",
+            "model.multi_modal_projector.": "multi_modal_projector.",
+        }
+    )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<image>"
+
+        raise ValueError("Only image modality is supported")
+
+    @classmethod
+    def get_mamba_state_dtype_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+    ) -> tuple[torch.dtype, ...]:
+        return MambaStateDtypeCalculator.short_conv_state_dtype(
+            vllm_config.model_config.dtype,
+            vllm_config.cache_config.mamba_cache_dtype,
+        )
+
+    @classmethod
+    def get_mamba_state_shape_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+    ) -> tuple[tuple[int, int]]:
+        """Calculate shapes for LFM2's convolutional cache.
+
+        Args:
+            vllm_config: vLLM config
+
+        Returns:
+            Tuple containing:
+            - conv_state_shape: Shape for convolutional state cache
+        """
+        parallel_config = vllm_config.parallel_config
+        hf_language_config = vllm_config.model_config.hf_config.text_config
+
+        return MambaStateShapeCalculator.short_conv_state_shape(
+            tp_world_size=parallel_config.tensor_parallel_size,
+            intermediate_size=hf_language_config.hidden_size,
+            conv_kernel=hf_language_config.conv_L_cache,
+        )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
+        super().__init__()
+        config: Lfm2VlConfig = vllm_config.model_config.hf_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+        vision_config = config.vision_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+        self.vllm_config = vllm_config
+        self.multimodal_config = multimodal_config
+        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
+
+        if vision_config.model_type == "siglip2_vision_model":
+            self.vision_tower = Siglip2Model(
+                config=vision_config,
+                quant_config=quant_config,
+                multimodal_config=multimodal_config,
+                prefix=maybe_prefix(prefix, "vision_tower"),
+            )
+        else:
+            raise ValueError(
+                f"Unsupported visual tokenizer model_type: {vision_config.model_type}"
+            )
+
+        self.multi_modal_projector = Lfm2VLMultiModalProjector(
+            config=config,
+            use_data_parallel=self.use_data_parallel,
+            prefix=f"{prefix}.multi_modal_projector",
+        )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "language"),
+            architectures=config.text_config.architectures,
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> LFM2VLImageInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        spatial_shapes = kwargs.pop("spatial_shapes", None)
+        num_patches = kwargs.pop("num_patches", None)
+        if pixel_values is None:
+            return None
+
+        return LFM2VLImageInputs(
+            type="pixel_values",
+            pixel_values=pixel_values,
+            spatial_shapes=spatial_shapes,
+            num_patches=num_patches,
+        )
+
+    def image_pixels_to_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        spatial_shapes: torch.Tensor,
+    ) -> torch.Tensor:
+        pixel_values = pixel_values.to(
+            dtype=self.vision_tower.vision_model.embeddings.patch_embedding.weight.dtype
+        )  # fp16 compatibility
+
+        # LFM2-VL's HF processor pads patch sequences with trailing zeros.
+        # Derive the valid-patch mask from spatial_shapes instead of carrying
+        # pixel_attention_mask through the vLLM multimodal pipeline.
+        max_seq_len = pixel_values.shape[1]
+        lengths_cpu = (spatial_shapes[:, 0] * spatial_shapes[:, 1]).to(
+            dtype=torch.int32
+        )
+        max_seqlen = (
+            lengths_cpu.max().reshape(1).to(device=pixel_values.device)
+            if lengths_cpu.numel()
+            else torch.tensor([0], dtype=torch.int32, device=pixel_values.device)
+        )
+        lengths = lengths_cpu.to(device=pixel_values.device)
+        packed_mask = (
+            torch.arange(max_seq_len, device=pixel_values.device)[None, :]
+            < lengths[:, None]
+        )
+        cu_seqlens = torch.zeros(
+            lengths.shape[0] + 1,
+            dtype=torch.int32,
+            device=lengths.device,
+        )
+        cu_seqlens[1:] = torch.cumsum(lengths, dim=0)
+
+        with set_forward_context(None, self.vllm_config):
+            vision_outputs = self.vision_tower(
+                pixel_values=pixel_values,
+                spatial_shapes=spatial_shapes,
+                packed_mask=packed_mask,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=max_seqlen,
+            )
+        image_outputs = getattr(vision_outputs, "last_hidden_state", vision_outputs)
+
+        image_features = []
+
+        # spatial_shapes is on CPU (keep_on_cpu=True), so .tolist() is instant
+        spatial_shapes_list = spatial_shapes.tolist()
+        for img_idx, (feature_org_h, feature_org_w) in enumerate(spatial_shapes_list):
+            feature_len = feature_org_h * feature_org_w
+            feature = image_outputs[img_idx, :feature_len]
+
+            # reshape to original height and width
+            feature = feature.reshape(1, feature_org_h, feature_org_w, -1)
+
+            # project the image representation
+            img_embedding = self.multi_modal_projector(feature)
+
+            # flatten here to handle variable length in naflex
+            img_embedding = img_embedding.reshape(-1, img_embedding.size(-1))
+            image_features.append(img_embedding)
+
+        return image_features
+
+    def _process_image_input(
+        self,
+        image_input: LFM2VLImageInputs,
+    ) -> torch.Tensor | list[torch.Tensor]:
+        pixel_values = image_input["pixel_values"]
+        spatial_shapes = image_input["spatial_shapes"]
+        num_patches = image_input["num_patches"]
+
+        image_features = self.image_pixels_to_features(
+            pixel_values,
+            spatial_shapes=spatial_shapes,
+        )
+
+        # Group patches by image - num_patches is on CPU (keep_on_cpu=True)
+        # so .tolist() is instant with no DtoH sync
+        num_patches_list = num_patches.tolist()
+        batched_features: list[torch.Tensor] = []
+        patch_idx = 0
+        for count in num_patches_list:
+            # Slice the list of patch tensors for this image
+            image_patches = image_features[patch_idx : patch_idx + count]
+            # Concatenate patches for this image
+            batched_features.append(torch.cat(image_patches, dim=0))
+            patch_idx += count
+
+        return batched_features
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return []
+
+        return self._process_image_input(image_input)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        hidden_states = self.language_model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.language_model.compute_logits(hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="multi_modal_projector",
+            tower_model="vision_tower",
+        )
diff --git a/vllm/model_executor/models/lightonocr.py b/vllm/model_executor/models/lightonocr.py
index 353ee7806b1b1c8e9ba069052cfd2be17316f1cd..27ec12a8f1065c384f9d4b42fa36de8e57acc30f 100644
--- a/vllm/model_executor/models/lightonocr.py
+++ b/vllm/model_executor/models/lightonocr.py
@@ -155,6 +155,7 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         nn.Module.__init__(self)
+
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
@@ -164,7 +165,8 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
 
         self.vision_tower = init_vision_tower_for_llava(
             config,
-            quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             require_post_norm=False,
             prefix=maybe_prefix(prefix, "vision_tower"),
         )
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 773fd061f3bf900de3443638515662f157587e5d..db22ac1ad5d142aa860e45dd7356e38ebf536e5e 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -30,17 +30,17 @@ from itertools import islice
 import torch
 from torch import nn
 from transformers import LlamaConfig
-
 import os
 import re
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -59,8 +59,15 @@ from vllm.model_executor.model_loader.weight_utils import (
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
-
-from .interfaces import SupportsEagle, SupportsEagle3, SupportsLoRA, SupportsPP
+from vllm.v1.attention.backend import AttentionType
+
+from .adapters import as_embedding_model, as_seq_cls_model
+from .interfaces import (
+    SupportsEagle,
+    SupportsEagle3,
+    SupportsLoRA,
+    SupportsPP,
+)
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
@@ -201,8 +208,8 @@ class LlamaAttention(nn.Module):
                 # This is a target model, use layer_idx directly
                 effective_layer_idx = layer_idx
             assert effective_layer_idx < len(layer_types), (
-                f"effective_layer_idx: {effective_layer_idx} \
-                is out of bounds for layer_types: {layer_types}"
+                f"effective_layer_idx: {effective_layer_idx} "
+                f"is out of bounds for layer_types: {layer_types}"
             )
 
             is_sliding = layer_types[effective_layer_idx] == "sliding_attention"
@@ -374,7 +381,11 @@ def llama_model_invariants(
         torch._check(positions.size()[0] == input_ids.size()[0])
 
 
-@support_torch_compile(shape_invariants=llama_model_invariants)
+@support_torch_compile(
+    # TODO[#32068]: Investigate recompilation
+    # mark_unbacked_dims={"input_ids": 0},
+    shape_invariants=llama_model_invariants
+)
 class LlamaModel(nn.Module):
     def __init__(
         self,
@@ -769,3 +780,15 @@ class LlamaForCausalLM(
                 name = name.replace(item, mapping[item])
 
         return name, loaded_weight
+
+
+class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)):
+    # This class sets the correct attention type and pooling type
+    # through LlamaBidirectionalConfig.
+    pass
+
+
+class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)):
+    # This class sets the correct attention type and pooling type
+    # through LlamaBidirectionalConfig.
+    pass
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 7b3da3e10ab8a024a15ae5549107f24c9f32bd42..dde6db7c204b7f7593782fca1a4e37f80a8bca63 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -25,7 +25,6 @@ from torch import nn
 from transformers import Llama4TextConfig
 
 from vllm.attention.layer import Attention
-from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -34,6 +33,9 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.chunked_local_attention import (
+    ChunkedLocalAttention,
+)
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -539,6 +541,7 @@ class Llama4Model(LlamaModel):
         # Expert parameter mapping for the case where the expert weights are
         # not fused into a single weight tensor.
         expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -548,6 +551,7 @@ class Llama4Model(LlamaModel):
         # Expert parameter mapping for the case where the expert weights are
         # fused into a single weight tensor.
         expert_params_mapping_fused = SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_up_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="gate_up_proj",
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 66a327bb7603d80f37e6fd2a1d7c25a089bab98f..ba54623d9ddc7f27b6040052b715854e29d3b03b 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -19,7 +19,7 @@ from transformers.models.llava import LlavaProcessor
 from transformers.models.pixtral import PixtralProcessor
 
 from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -51,7 +51,13 @@ from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .clip import CLIPVisionModel
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .module_mapping import MultiModelKeys
 from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
 from .siglip import SiglipVisionModel
 from .utils import (
@@ -462,6 +468,7 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
 def init_vision_tower_for_llava(
     hf_config: LlavaLikeConfig,
     quant_config: QuantizationConfig | None,
+    multimodal_config: MultiModalConfig | None,
     *,
     require_post_norm: bool | None = None,
     prefix: str = "",
@@ -475,6 +482,7 @@ def init_vision_tower_for_llava(
         return CLIPVisionModel(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -483,6 +491,7 @@ def init_vision_tower_for_llava(
         return SiglipVisionModel(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -491,6 +500,7 @@ def init_vision_tower_for_llava(
         return PixtralHFVisionModel(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -505,7 +515,9 @@ def init_vision_tower_for_llava(
     info=_build_llava_or_pixtral_hf_info,
     dummy_inputs=LlavaDummyInputsBuilder,
 )
-class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+class LlavaForConditionalGeneration(
+    nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP
+):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -555,7 +567,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         if multimodal_config.get_limit_per_prompt("image"):
             self.vision_tower = init_vision_tower_for_llava(
                 config,
-                quant_config,
+                quant_config=quant_config,
+                multimodal_config=multimodal_config,
                 require_post_norm=False,
                 prefix=maybe_prefix(prefix, "vision_tower"),
             )
@@ -734,6 +747,32 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="multi_modal_projector",
+            tower_model="vision_tower",
+        )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        # LLaVA's vision encoder outputs one token per patch without
+        # spatial merging or pixel shuffle
+        return num_image_tokens
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        # LLaVA's MLP projector outputs the same number of tokens
+        # as it receives from the vision encoder (1:1 mapping)
+        return num_vision_tokens
+
 
 class MantisProcessingInfo(LlavaProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 526846d0d98128529af8b31073ad5ceb53f747c7..21a9c2f28231b7e83b67b5e110e33f2a154a2e66 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -243,6 +243,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
+
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
@@ -270,7 +271,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
         # TODO: Optionally initializes this for supporting embeddings.
         self.vision_tower = init_vision_tower_for_llava(
             config,
-            quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             require_post_norm=False,
             prefix=maybe_prefix(prefix, "vision_tower"),
         )
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index cd55cfec6cdec90086da8f2b13789c0803f087a0..b146a144ea0bf7a3298febd89ef7b5287fb9a6e7 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -321,6 +321,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
+
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
@@ -331,7 +332,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
         # Initialize the vision tower only up to the required feature layer
         self.vision_tower = init_vision_tower_for_llava(
             config,
-            quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             require_post_norm=False,
             prefix=maybe_prefix(prefix, "vision_tower"),
         )
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 5aa8de7dc252ef92e306b59e921261c7821f8eaa..a89f456ef92f119ac0458389780a5231f8c8cac8 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -511,7 +511,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
         # Initialize the vision tower only up to the required feature layer
         self.vision_tower = init_vision_tower_for_llava(
             config,
-            quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             require_post_norm=False,
             prefix=maybe_prefix(prefix, "vision_tower"),
         )
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index c5441283f9711173a1e1127605953659bf771ecd..fed3a1caee8f63f1c0f3f2ee4e0f2af0aafc2eec 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -46,7 +46,7 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import FusedMoE, ZeroExpertFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -179,7 +179,7 @@ class FlashConfig(PretrainedConfig):
         self.intermediate_size = (
             self.ffn_hidden_size
             if hasattr(self, "ffn_hidden_size")
-            else self.intermediate_size
+            else intermediate_size
         )
         if hasattr(self, "moe_intermediate_size"):
             self.moe_intermediate_size = self.moe_intermediate_size
@@ -280,10 +280,6 @@ class LongcatMoe(nn.Module):
     ):
         super().__init__()
         self.hidden_size = hidden_size
-        self.zero_expert_num = config.zero_expert_num
-        self.zero_expert_type = config.zero_expert_type
-        self.routed_scaling_factor = config.routed_scaling_factor
-        self.enable_eplb = enable_eplb
         # Gate always runs at half / full precision for now.
         self.rounter_params_dtype = params_dtype
         if config.router_dtype == "float32":
@@ -291,25 +287,27 @@ class LongcatMoe(nn.Module):
 
         self.router = LongcatRouter(
             config=config,
-            zero_expert_num=self.zero_expert_num,
+            zero_expert_num=config.zero_expert_num,
             rounter_params_dtype=self.rounter_params_dtype,
             prefix=f"{prefix}.gate",
         )
 
-        self.experts = FusedMoE(
+        assert config.zero_expert_num is not None
+        assert config.zero_expert_type is not None
+        self.experts = ZeroExpertFusedMoE(
+            zero_expert_num=config.zero_expert_num,
+            zero_expert_type=config.zero_expert_type,
+            router=self.router,
             num_experts=num_experts,
             top_k=top_k,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             reduce_results=True,
             params_dtype=params_dtype,
-            e_score_correction_bias=self.router.e_score_correction_bias,
             renormalize=False,
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
-            zero_expert_num=self.zero_expert_num,
-            zero_expert_type=self.zero_expert_type,
-            enable_eplb=self.enable_eplb,
+            enable_eplb=enable_eplb,
             routed_scaling_factor=config.routed_scaling_factor,
         )
 
@@ -317,11 +315,34 @@ class LongcatMoe(nn.Module):
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
 
-        router_logits = self.router(hidden_states.to(self.rounter_params_dtype))
+        # Align to FusedMoE padded hidden size to avoid dim mismatch
+        padded_hidden = self.experts.hidden_size
+        if hidden_dim < padded_hidden:
+            hidden_states_padded = torch.nn.functional.pad(
+                hidden_states,
+                (0, padded_hidden - hidden_dim),
+                mode="constant",
+                value=0.0,
+            )
+        else:
+            hidden_states_padded = hidden_states
+
+        router_logits_full = self.router(
+            hidden_states_padded.to(self.rounter_params_dtype)
+        )
+
+        # ZeroExpertFusedMoE handles routing memoization and zero expert computation
+        # internally. Pass full router_logits (including zero experts) so that
+        # zero experts can be properly identified in routing.
         final_hidden_states = self.experts(
-            hidden_states=hidden_states, router_logits=router_logits
+            hidden_states=hidden_states_padded,
+            router_logits=router_logits_full,  # Full logits (includes zero experts)
         )
 
+        # Crop back to original hidden dimension if padded earlier
+        if padded_hidden != hidden_dim:
+            final_hidden_states = final_hidden_states[..., :hidden_dim]
+
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
@@ -419,6 +440,7 @@ class FlashDecoderLayer(nn.Module):
         hidden_states = self.self_attn[0](
             positions=positions,
             hidden_states=hidden_states,
+            llama_4_scaling=None,
         )
 
         hidden_states, residual = self.post_attention_layernorm[0](
@@ -438,6 +460,7 @@ class FlashDecoderLayer(nn.Module):
         hidden_states = self.self_attn[1](
             positions=positions,
             hidden_states=hidden_states,
+            llama_4_scaling=None,
         )
         hidden_states, residual = self.post_attention_layernorm[1](
             hidden_states, residual
@@ -603,6 +626,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/longcat_flash_mtp.py b/vllm/model_executor/models/longcat_flash_mtp.py
index e554d1e2de9274d1b19b6bbb076fa2374373baff..f96d3cf28f1134b7c21dc28ca2f174768a469c9c 100644
--- a/vllm/model_executor/models/longcat_flash_mtp.py
+++ b/vllm/model_executor/models/longcat_flash_mtp.py
@@ -24,7 +24,6 @@ from vllm.model_executor.models.longcat_flash import FlashConfig
 from vllm.sequence import IntermediateTensors
 
 from .deepseek_v2 import DeepseekV2DecoderLayer
-from .interfaces import SupportsPP
 from .utils import maybe_prefix
 
 
@@ -124,7 +123,7 @@ class LongCatMultiTokenPredictor(nn.Module):
         )
 
 
-class LongCatFlashMTP(nn.Module, SupportsPP):
+class LongCatFlashMTP(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         # LongCat MTP without MoE layers
diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..db85073b38a410945aebe9c65c11056b05e664c6
--- /dev/null
+++ b/vllm/model_executor/models/mimo_v2_flash.py
@@ -0,0 +1,718 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable
+from itertools import islice
+
+import torch
+from torch import nn
+
+from vllm.attention.layer import Attention
+from vllm.config import (
+    CacheConfig,
+    VllmConfig,
+    get_current_vllm_config,
+    str_dtype_to_torch_dtype,
+)
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.utils import sequence_parallel_chunk
+from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
+
+from .interfaces import MixtureOfExperts, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class MiMoV2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class MiMoV2MoE(nn.Module):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_text_config
+        parallel_config = vllm_config.parallel_config
+        quant_config = vllm_config.quant_config
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts = config.n_routed_experts
+
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = parallel_config.enable_eplb
+
+        self.n_logical_experts = self.n_routed_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        dtype = getattr(config, "moe_router_dtype", "float32")
+        self.gate_dtype = str_dtype_to_torch_dtype(dtype)
+        self.gate = nn.Linear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            dtype=self.gate_dtype,
+        )
+        self.gate.e_score_correction_bias = nn.Parameter(
+            torch.empty(config.n_routed_experts, dtype=self.gate_dtype)
+        )
+
+        self.experts = FusedMoE(
+            num_experts=self.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=True,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            scoring_func="sigmoid",
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        assert hidden_states.dim() <= 2, "MiMoV2MoE only supports 1D or 2D inputs"
+        is_input_1d = hidden_states.dim() == 1
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if self.is_sequence_parallel:
+            hidden_states = sequence_parallel_chunk(hidden_states)
+
+        if self.gate_dtype is not None:
+            gate_input = hidden_states.to(self.gate_dtype)
+        else:
+            gate_input = hidden_states
+        router_logits = self.gate(gate_input)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            final_hidden_states = final_hidden_states[:num_tokens]
+
+        return final_hidden_states.squeeze(0) if is_input_1d else final_hidden_states
+
+
+class MiMoV2Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        v_head_dim: int | None = None,
+        v_scale: float | None = None,
+        sliding_window_size: int = -1,
+        attention_bias: bool = False,
+        add_swa_attention_sink_bias: bool = False,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        max_position_embeddings: int = 32768,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        partial_rotary_factor: float = 1.0,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.layer_id = layer_id
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = num_heads
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.total_num_kv_heads = num_kv_heads
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.head_dim = head_dim
+
+        self.v_head_dim = v_head_dim if v_head_dim is not None else head_dim
+
+        self.q_size = self.num_heads * self.head_dim
+        self.k_size = self.num_kv_heads * self.head_dim
+        self.v_size = self.num_kv_heads * self.v_head_dim
+
+        self.v_scale = v_scale
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+            v_head_size=self.v_head_dim,
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.v_head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=True,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            max_position=max_position_embeddings,
+            rope_parameters={
+                "rope_type": "default",
+                "rope_theta": rope_theta,
+                "partial_rotary_factor": partial_rotary_factor,
+            },
+        )
+
+        self.attention_sink_bias = (
+            torch.nn.Parameter(torch.empty(self.num_heads), requires_grad=False)
+            if add_swa_attention_sink_bias
+            else None
+        )
+
+        sliding_window = sliding_window_size if sliding_window_size > -1 else None
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=sliding_window,
+            attn_type=AttentionType.DECODER,
+            prefix=f"{prefix}.attn",
+            sinks=self.attention_sink_bias,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+
+        # Apply v_scale before attention
+        if self.v_scale is not None:
+            v = v * self.v_scale
+
+        v = v.view(-1, self.num_kv_heads, self.v_head_dim)
+        v = torch.nn.functional.pad(v, [0, self.head_dim - self.v_head_dim], value=0)
+        v = v.view(-1, self.num_kv_heads * self.head_dim)
+
+        attn_output = self.attn(q, k, v)
+
+        attn_output = attn_output.view(-1, self.num_heads, self.head_dim)[
+            ..., : self.v_head_dim
+        ].reshape(-1, self.num_heads * self.v_head_dim)
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiMoV2FlashDecoderLayer(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        config = vllm_config.model_config.hf_text_config
+        quant_config = vllm_config.quant_config
+        layer_id = extract_layer_index(prefix)
+
+        self.hidden_size = config.hidden_size
+        self.config = config
+        self.layer_id = layer_id
+
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+
+        v_scale = getattr(config, "attention_value_scale", None)
+
+        if self.is_compressed_softmax_layer():
+            self.self_attn = MiMoV2Attention(
+                hidden_size=self.hidden_size,
+                num_heads=config.swa_num_attention_heads,
+                num_kv_heads=config.swa_num_key_value_heads,
+                head_dim=config.swa_head_dim,
+                v_head_dim=getattr(config, "swa_v_head_dim", None),
+                v_scale=v_scale,
+                sliding_window_size=config.sliding_window_size,
+                attention_bias=config.attention_bias,
+                add_swa_attention_sink_bias=getattr(
+                    config, "add_swa_attention_sink_bias", False
+                ),
+                layer_id=layer_id,
+                rope_theta=getattr(config, "swa_rope_theta", rope_theta),
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0),
+                prefix=f"{prefix}.self_attn",
+            )
+        else:
+            self.self_attn = MiMoV2Attention(
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                num_kv_heads=config.num_key_value_heads,
+                head_dim=config.head_dim,
+                v_head_dim=getattr(config, "v_head_dim", None),
+                v_scale=v_scale,
+                sliding_window_size=-1,  # normal attention
+                attention_bias=config.attention_bias,
+                layer_id=layer_id,
+                rope_theta=rope_theta,
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0),
+                prefix=f"{prefix}.self_attn",
+            )
+
+        self.is_layer_sparse = self.is_moe_layer(layer_id)
+        if self.is_layer_sparse:
+            self.mlp = MiMoV2MoE(
+                vllm_config=vllm_config,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = MiMoV2MLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+    def is_moe_layer(self, layer_idx: int) -> bool:
+        return (
+            hasattr(self.config, "moe_layer_freq")
+            and layer_idx >= 0
+            and not isinstance(self.config.moe_layer_freq, int)
+            and self.config.moe_layer_freq[layer_idx]
+        )
+
+    def is_compressed_softmax_layer(self) -> bool:
+        return self.config.hybrid_layer_pattern[self.layer_id] == 1
+
+
+class MiMoV2Model(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config.get_text_config()
+        quant_config = vllm_config.quant_config
+        eplb_config = vllm_config.parallel_config.eplb_config
+
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.num_redundant_experts = eplb_config.num_redundant_experts
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: MiMoV2FlashDecoderLayer(
+                vllm_config=vllm_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        else:
+            self.norm = PPMissingLayer()
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for idx, layer in enumerate(
+            islice(self.layers, self.start_layer, self.end_layer)
+        ):
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+            num_redundant_experts=self.num_redundant_experts,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        tp_rank = get_tensor_model_parallel_rank()
+        tp_size = get_tensor_model_parallel_world_size()
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                continue
+            if "mtp" in name:
+                continue
+
+            if self.quant_config is not None:
+                cache_scale_name = self.quant_config.get_cache_scale(name)
+                if cache_scale_name is not None and cache_scale_name in params_dict:
+                    param = params_dict[cache_scale_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+
+                    kv_scale = loaded_weight
+                    if kv_scale.dim() > 0 and kv_scale.numel() > 1:
+                        kv_scale = kv_scale.view(-1)[0]
+
+                    weight_loader(param, kv_scale)
+                    loaded_params.add(cache_scale_name)
+                    continue
+
+            expert_matched = False
+            for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                name_rewritten = name.replace(weight_name, param_name)
+
+                if is_pp_missing_parameter(name_rewritten, self):
+                    continue
+
+                if (
+                    name_rewritten.endswith(".bias") or name_rewritten.endswith("_bias")
+                ) and name_rewritten not in params_dict:
+                    continue
+
+                if name_rewritten not in params_dict:
+                    continue
+
+                param = params_dict[name_rewritten]
+                weight_loader = param.weight_loader
+
+                weight_loader(
+                    param,
+                    loaded_weight,
+                    name_rewritten,
+                    shard_id=shard_id,
+                    expert_id=expert_id,
+                )
+                loaded_params.add(name_rewritten)
+                expert_matched = True
+                break
+
+            if expert_matched:
+                continue
+
+            stacked_matched = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name_rewritten = name.replace(weight_name, param_name)
+
+                if (
+                    name_rewritten.endswith(".bias")
+                    and name_rewritten not in params_dict
+                ):
+                    continue
+
+                if is_pp_missing_parameter(name_rewritten, self):
+                    continue
+
+                if name_rewritten not in params_dict:
+                    continue
+
+                param = params_dict[name_rewritten]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name_rewritten)
+
+                stacked_matched = True
+                break
+
+            if stacked_matched:
+                continue
+
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+
+            orig_name = name
+            mapped_name = maybe_remap_kv_scale_name(name, params_dict)
+            name = mapped_name if mapped_name is not None else orig_name
+
+            if name not in params_dict:
+                continue
+
+            param = params_dict[name]
+
+            if "attention_sink_bias" in name:
+                total_heads = loaded_weight.shape[0]
+                heads_per_rank = total_heads // tp_size
+                head_start = tp_rank * heads_per_rank
+                narrow_weight = loaded_weight.narrow(0, head_start, heads_per_rank)
+
+                param.data.copy_(narrow_weight)
+                loaded_params.add(name)
+            else:
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+
+        return loaded_params
+
+
+class MiMoV2FlashForCausalLM(nn.Module, SupportsPP, MixtureOfExperts):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+        self.quant_config = quant_config
+        self.model = MiMoV2Model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        num_layers = len(self.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index f104018d3aa6c61d50ea0e046449072d5679d4aa..a05be794a29cd71f8d61f769f1902e5cc16ffe31 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -90,6 +90,7 @@ class MiniCPMMoE(nn.Module):
         intermediate_size: int,
         params_dtype: torch.dtype | None = None,
         tp_size: int | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.tp_size = tp_size or get_tensor_model_parallel_world_size()
@@ -108,6 +109,7 @@ class MiniCPMMoE(nn.Module):
             bias=False,
             params_dtype=self.params_dtype,
             quant_config=None,
+            prefix=f"{prefix}.gate",
         )
 
         self.ws = nn.Parameter(
@@ -352,6 +354,7 @@ class MiniCPMDecoderLayer(nn.Module):
                 hidden_act=self.config.hidden_act,
                 hidden_act_param=getattr(self.config, "hidden_act_param", 0.0),
                 quant_config=self.quant_config,
+                prefix=f"{self.prefix}.mlp",
             )
         else:
             self.mlp = MiniCPMMoE(
@@ -359,6 +362,7 @@ class MiniCPMDecoderLayer(nn.Module):
                 top_k=self.config.num_experts_per_tok,
                 hidden_size=self.config.hidden_size,
                 intermediate_size=self.config.intermediate_size,
+                prefix=f"{self.prefix}.mlp",
             )
 
     def forward(
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
index 9f3587a6d2fa54203b293355387ed1571b67e6e6..e9f1a91bfc4a46379ca1c19ffb64e6a881a1deea 100644
--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -108,6 +108,7 @@ class EagleMiniCPMDecoderLayer(nn.Module):
                 top_k=self.config.num_experts_per_tok,
                 hidden_size=self.config.hidden_size,
                 intermediate_size=self.config.intermediate_size,
+                prefix=f"{self.prefix}.mlp",
             )
 
     def forward(
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index c45bdf95e7487e0ef12b0a963be62e2044f70d5f..930ff737bcdac26f80dd6f1ec75cf290591aecb6 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -139,7 +139,7 @@ class MiniCPMVImageEmbeddingInputs(TensorSchema):
     type: Literal["image_embeds"]
     image_embeds: Annotated[
         torch.Tensor | list[torch.Tensor],
-        TensorShape("bn", "ns", "hs"),
+        TensorShape("bn", "ns", "hs", dynamic_dims={"ns"}),
     ]
 
 
diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py
index ee19288ae685240b56ab8c9926a37b99bdcb49b5..292969db6d03ae0b72381d3625c3224b31e32249 100644
--- a/vllm/model_executor/models/minimax_m2.py
+++ b/vllm/model_executor/models/minimax_m2.py
@@ -234,8 +234,9 @@ class MiniMaxM2Attention(nn.Module):
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q = self.q_norm(q)
-        k = self.k_norm(k)
+        q, k = MiniMaxText01RMSNormTP.forward_qk(
+            self.q_norm, self.k_norm, q.contiguous(), k.contiguous()
+        )
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
@@ -391,6 +392,7 @@ class MiniMaxM2Model(nn.Module):
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 4bfe3c391c26f5d1a5b2d3da65ae3e83878de0bc..955a73ff19edab140fe9a8cdcfc444d3f83c2d7c 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -14,7 +14,6 @@ import torch
 from torch import nn
 from transformers import MiniMaxConfig
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
@@ -48,6 +47,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionMetadata
 
 from .interfaces import HasInnerState, IsHybrid
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
@@ -332,7 +332,8 @@ class MiniMaxText01DecoderLayer(nn.Module):
             )
         else:
             raise ValueError(
-                f"Unsupported attention type: {self.config.attention_type}"
+                f"Unsupported attention_type {self.config.attention_type}: "
+                f"should be 0 (linear) or 1 (full)."
             )
 
         if expert_num == 1:
diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py
index e480454953df8c7412c1ffe1ca79e75cdc20336f..b4a496dcb688d0e585b64f9bad215e3b80fae073 100644
--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -204,7 +204,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
         # TODO: Optionally initializes this for supporting embeddings.
         self.vision_tower = init_vision_tower_for_llava(
             config,
-            quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             require_post_norm=False,
             prefix=maybe_prefix(prefix, "vision_tower"),
         )
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index e9161e69e731baffbebb6c69a60e2571821c38c0..d3617fd4bec24f720387b29b12176f104f24b718 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -16,7 +16,7 @@ from transformers import (
 from transformers.models.pixtral import PixtralProcessor
 
 from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -395,6 +395,7 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
 def init_vision_tower_for_llava(
     hf_config: LlavaLikeConfig,
     quant_config: QuantizationConfig | None,
+    multimodal_config: MultiModalConfig | None,
     *,
     require_post_norm: bool | None = None,
     prefix: str = "",
@@ -409,6 +410,7 @@ def init_vision_tower_for_llava(
     return PixtralHFVisionModel(
         vision_config,
         quant_config=quant_config,
+        multimodal_config=multimodal_config,
         num_hidden_layers_override=num_hidden_layers,
         require_post_norm=require_post_norm,
         prefix=prefix,
@@ -472,7 +474,8 @@ class Mistral3ForConditionalGeneration(
         if multimodal_config.get_limit_per_prompt("image"):
             self.vision_tower = init_vision_tower_for_llava(
                 config,
-                quant_config,
+                quant_config=quant_config,
+                multimodal_config=multimodal_config,
                 require_post_norm=False,
                 prefix=maybe_prefix(prefix, "vision_tower"),
             )
diff --git a/vllm/model_executor/models/mistral_large_3_eagle.py b/vllm/model_executor/models/mistral_large_3_eagle.py
index 37cd4324e53d9cc70e64cb7768b1d7b4bd946164..830f210e743861b71edb0b513ae3851445458230 100644
--- a/vllm/model_executor/models/mistral_large_3_eagle.py
+++ b/vllm/model_executor/models/mistral_large_3_eagle.py
@@ -67,6 +67,7 @@ class EagleMistralLarge3Model(DeepseekV2Model):
             input_is_parallel=False,
             quant_config=quant_config,
             return_bias=False,
+            prefix=maybe_prefix(prefix, "fc"),
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 35002ca51559f3f9c28775b91d1262c62ba01720..d07eb30ab2ea5be340a83a1da8ba151824a3df74 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -375,6 +375,7 @@ class MixtralModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index fe963cc6644fb7cb7fba704af7cbd86a73e425e6..13f79c91fec569c1aac91030d26076c4811075b5 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -31,10 +31,12 @@ from transformers.models.llama4.image_processing_llama4_fast import (
     get_best_fit,
 )
 
-from vllm.attention.layer import MultiHeadAttention
-from vllm.config import VllmConfig
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.forward_context import set_forward_context
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -47,6 +49,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.utils import initialize_model
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.vision import should_torch_compile_mm_vit
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
@@ -255,7 +258,7 @@ class Llama4VisionAttention(nn.Module):
         self.attention_dropout = config.attention_dropout
         self.scaling = self.head_dim**-0.5
 
-        self.attn = MultiHeadAttention(
+        self.attn = MMEncoderAttention(
             self.num_local_heads, self.head_dim, self.scaling
         )
 
@@ -456,6 +459,9 @@ class Llama4UnfoldConvolution(nn.Module):
         return hidden_states
 
 
+@support_torch_compile(
+    dynamic_arg_dims={"images_flattened": 0}, enable_if=should_torch_compile_mm_vit
+)
 class Llama4VisionModel(nn.Module):
     def __init__(
         self,
@@ -497,6 +503,7 @@ class Llama4VisionModel(nn.Module):
             prefix=f"{prefix}.model",
             use_data_parallel=use_data_parallel,
         )
+
         self.vision_adapter = Llama4VisionPixelShuffleMLP(
             config,
             quant_config,
@@ -762,18 +769,28 @@ class Llama4ForConditionalGeneration(
         multimodal_config = vllm_config.model_config.multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
 
+        self.vllm_config = vllm_config
         self.config = config
         self.quant_config = quant_config
         self.multimodal_config = multimodal_config
         if multimodal_config.get_limit_per_prompt("image"):
-            self.vision_model = Llama4VisionModel(
-                config.vision_config,
-                None,
-                prefix=maybe_prefix(prefix, "vision_model"),
-                use_data_parallel=self.use_data_parallel,
-            )
+            from vllm.compilation.backends import set_model_tag
+
+            with (
+                set_current_vllm_config(vllm_config),
+                set_model_tag("Llama4VisionModel", is_encoder=True),
+            ):
+                self.vision_model = Llama4VisionModel(
+                    config=config.vision_config,
+                    quant_config=None,
+                    prefix=maybe_prefix(prefix, "vision_model"),
+                    use_data_parallel=self.use_data_parallel,
+                )
+
             self.multi_modal_projector = Llama4MultiModalProjector(
-                self.config, None, prefix=maybe_prefix(prefix, "multi_modal_projector")
+                config=self.config,
+                quant_config=None,
+                prefix=maybe_prefix(prefix, "multi_modal_projector"),
             )
         else:
             self.vision_model = None
@@ -883,7 +900,10 @@ class Llama4ForConditionalGeneration(
         if image_input is None:
             return []
 
-        return self._process_image_input(image_input)
+        with (
+            set_forward_context(None, self.vllm_config),
+        ):
+            return self._process_image_input(image_input)
 
     def forward(
         self,
@@ -1084,6 +1104,7 @@ class Llama4ForConditionalGeneration(
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 4655ffa7b2f61b95e3126164c1f3624a4edea679..f3fec2bfb861d774d298160ef22e167a42fef267 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -1,31 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable, Set
+from collections.abc import Iterable
 
 import torch
 from torch import nn
 from transformers import ModernBertConfig
 from transformers.activations import ACT2FN
 
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
-from vllm.model_executor.layers.pooler import (
-    ClassifierPooler,
-    DispatchPooler,
-    Pooler,
-    PoolingMethod,
-    PoolingParamsUpdate,
-    PoolingType,
+from vllm.model_executor.layers.pooler import DispatchPooler
+from vllm.model_executor.layers.pooler.activations import LambdaPoolerActivation
+from vllm.model_executor.layers.pooler.seqwise import (
+    EmbeddingPoolerHead,
+    SequencePooler,
+    get_seq_pooling_method,
 )
+from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_classify
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
-from vllm.tasks import PoolingTask
-from vllm.v1.pool.metadata import PoolingMetadata
 
 from .interfaces import SupportsCrossEncoding
 from .interfaces_base import attn_type, default_pooling_type
@@ -63,7 +63,9 @@ class ModernBertEmbeddings(nn.Module):
 
 
 class ModernBertAttention(nn.Module):
-    def __init__(self, config: ModernBertConfig, layer_id: int | None = None):
+    def __init__(
+        self, config: ModernBertConfig, layer_id: int | None = None, prefix: str = ""
+    ):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -80,6 +82,7 @@ class ModernBertAttention(nn.Module):
             self.head_dim,
             self.num_heads,
             bias=config.attention_bias,
+            prefix=f"{prefix}.Wqkv",
         )
 
         if layer_types := getattr(config, "layer_types", None):
@@ -117,7 +120,10 @@ class ModernBertAttention(nn.Module):
             per_layer_sliding_window=sliding_window,
         )
         self.Wo = RowParallelLinear(
-            config.hidden_size, config.hidden_size, bias=config.attention_bias
+            config.hidden_size,
+            config.hidden_size,
+            bias=config.attention_bias,
+            prefix=f"{prefix}.Wo",
         )
 
     def forward(
@@ -135,7 +141,7 @@ class ModernBertAttention(nn.Module):
 
 
 class ModernBertMLP(nn.Module):
-    def __init__(self, config: ModernBertConfig):
+    def __init__(self, config: ModernBertConfig, prefix: str = ""):
         super().__init__()
         self.config = config
         self.Wi = nn.Linear(
@@ -143,7 +149,10 @@ class ModernBertMLP(nn.Module):
         )
         self.act = nn.GELU()
         self.Wo = RowParallelLinear(
-            config.intermediate_size, config.hidden_size, bias=config.mlp_bias
+            config.intermediate_size,
+            config.hidden_size,
+            bias=config.mlp_bias,
+            prefix=f"{prefix}.Wo",
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -163,11 +172,13 @@ class ModernBertLayer(nn.Module):
             self.attn_norm = nn.LayerNorm(
                 config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
             )
-        self.attn = ModernBertAttention(config=config, layer_id=layer_id)
+        self.attn = ModernBertAttention(
+            config=config, layer_id=layer_id, prefix=f"{prefix}.attn"
+        )
         self.mlp_norm = nn.LayerNorm(
             config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
         )
-        self.mlp = ModernBertMLP(config)
+        self.mlp = ModernBertMLP(config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
@@ -189,7 +200,11 @@ class ModernBertEncoderLayer(nn.Module):
         config = vllm_config.model_config.hf_config
         self.layers = nn.ModuleList(
             [
-                ModernBertLayer(config=config, layer_id=layer_id)
+                ModernBertLayer(
+                    config=config,
+                    layer_id=layer_id,
+                    prefix=f"{prefix}.layers.{layer_id}",
+                )
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
@@ -205,7 +220,7 @@ class ModernBertEncoderLayer(nn.Module):
 
 
 @support_torch_compile
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class ModernBertModel(nn.Module):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={"layers.": "encoder_layer.layers."}
@@ -220,7 +235,9 @@ class ModernBertModel(nn.Module):
         config = vllm_config.model_config.hf_config
         self.config = config
         self.embeddings = ModernBertEmbeddings(config)
-        self.encoder_layer = ModernBertEncoderLayer(vllm_config)
+        self.encoder_layer = ModernBertEncoderLayer(
+            vllm_config, prefix=f"{prefix}.encoder_layer"
+        )
         self.final_norm = nn.LayerNorm(
             config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
         )
@@ -263,52 +280,55 @@ class ModernBertModel(nn.Module):
         return norm_outputs
 
 
-class ModernBertPooler(Pooler):
-    def __init__(self, config: ModernBertConfig):
-        super().__init__()
+class ModernBertPooler(SequencePooler):
+    def __init__(self, model_config: ModelConfig):
+        pooler_config = model_config.pooler_config
+        assert pooler_config is not None
+
+        config: ModernBertConfig = model_config.hf_config
+        hf_pooling_type = config.classifier_pooling.upper()
+        # vllm_pooling_type = pooler_config.seq_pooling_type
+        # Currently we don't have a way to see if the user set the pooling type
+        # explicitly or not, so we always use the HF pooling type for now.
+
+        super().__init__(
+            pooling=get_seq_pooling_method(hf_pooling_type),
+            # We set this dummy to avoid adding parameters to nn.Module too early
+            head=nn.Identity(),
+        )
 
-        pooling_type = PoolingType[config.classifier_pooling.upper()]
-        self.pooling = PoolingMethod.from_pooling_type(pooling_type)
+        head_dtype = model_config.head_dtype
         self.dense = nn.Linear(
-            config.hidden_size, config.hidden_size, config.classifier_bias
+            config.hidden_size,
+            config.hidden_size,
+            config.classifier_bias,
+            dtype=head_dtype,
         )
         self.act = nn.GELU()
         self.norm = nn.LayerNorm(
-            config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
+            config.hidden_size,
+            eps=config.norm_eps,
+            bias=config.norm_bias,
+            dtype=head_dtype,
         )
 
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return self.pooling.get_supported_tasks()
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return self.pooling.get_pooling_updates(task)
-
-    def _head(self, pooled_output: torch.Tensor):
-        pooled_output = pooled_output.to(self.dense.weight.dtype)
-        return self.norm(self.act(self.dense(pooled_output)))
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> torch.Tensor | list[torch.Tensor]:
-        pooled_output = self.pooling(hidden_states, pooling_metadata)
-
-        if isinstance(pooled_output, list):
-            pooled_output = [self._head(output) for output in pooled_output]
-        else:
-            pooled_output = self._head(pooled_output)
-
-        return pooled_output
+        # Use lambdas so that weights are not registered under `self.head`
+        self.head = EmbeddingPoolerHead(
+            head_dtype=head_dtype,
+            projector=lambda x: self.dense(x),
+            activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))),
+        )
 
 
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
     is_pooling_model = True
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
         config = vllm_config.model_config.hf_config
+
         self.config = config
         self.model = ModernBertModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")
@@ -318,23 +338,16 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
             config.num_labels,
             dtype=vllm_config.model_config.head_dtype,
         )
-        self.pooling = ModernBertPooler(config)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config, classifier=self.classifier
-                ),
-                "classify": ClassifierPooler(
-                    pooling=self.pooling, classifier=self.classifier, act_fn="classify"
-                ),
-                "score": ClassifierPooler(
-                    pooling=self.pooling, classifier=self.classifier, act_fn="score"
-                ),
-            }
+        self.pooling = ModernBertPooler(vllm_config.model_config)
+
+        self.pooler = DispatchPooler.for_seq_cls(
+            pooler_config,
+            pooling=self.pooling,
+            classifier=self.classifier,
         )
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
@@ -397,7 +410,7 @@ class ModernBertPredictionHead(nn.Module):
 
 
 @attn_type("encoder_only")
-@default_pooling_type("ALL")
+@default_pooling_type(tok_pooling_type="ALL")
 class ModernBertForTokenClassification(nn.Module):
     is_pooling_model = True
 
@@ -417,13 +430,7 @@ class ModernBertForTokenClassification(nn.Module):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config=pooler_config
-                ),
-            }
-        )
+        self.pooler = pooler_for_token_classify(pooler_config)
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.embed_input_ids(input_ids)
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 71c6b1aa2e8144478fffa90fa9c0ea0beee21f45..bdfa6178b4e3200c856a3bfec068aaedec70204f 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -17,7 +17,7 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin, TensorT
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 
-from vllm.attention.layer import Attention, MultiHeadAttention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -29,6 +29,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -141,6 +142,7 @@ class ViTMLP(nn.Module):
         self,
         config: VisionBackboneConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.w1 = ColumnParallelLinear(
@@ -148,6 +150,7 @@ class ViTMLP(nn.Module):
             config.image_mlp_dim,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.w1",
         )
         # Activation function.
         assert config.image_mlp_activations == "quick_gelu"
@@ -157,6 +160,7 @@ class ViTMLP(nn.Module):
             config.image_emb_dim,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.w2",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -175,6 +179,7 @@ class MultiHeadDotProductAttention(nn.Module):
         use_bias: bool = True,
         nlayers: int = 1,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
 
@@ -201,28 +206,32 @@ class MultiHeadDotProductAttention(nn.Module):
             self.total_num_heads * self.head_dim,
             bias=use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.wq",
         )
         self.wk = ColumnParallelLinear(
             nlayers * self.hidden_size,
             self.total_num_kv_heads * self.head_dim,
             bias=use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.wk",
         )
         self.wv = ColumnParallelLinear(
             nlayers * self.hidden_size,
             self.total_num_kv_heads * self.head_dim,
             bias=use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.wv",
         )
         self.wo = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             self.hidden_size,
             bias=use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.wo",
         )
 
         self.scale = self.head_dim**-0.5
-        self.attn = MultiHeadAttention(
+        self.attn = MMEncoderAttention(
             self.num_heads, self.head_dim, self.scale, num_kv_heads=self.num_kv_heads
         )
 
@@ -253,10 +262,15 @@ class ResidualAttentionBlock(nn.Module):
         self,
         config: VisionBackboneConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
-        self.attention = MultiHeadDotProductAttention(config, quant_config=quant_config)
-        self.feed_forward = ViTMLP(config, quant_config)
+        self.attention = MultiHeadDotProductAttention(
+            config, quant_config=quant_config, prefix=f"{prefix}.attention"
+        )
+        self.feed_forward = ViTMLP(
+            config, quant_config, prefix=f"{prefix}.feed_forward"
+        )
         self.attention_norm = nn.LayerNorm(
             config.image_emb_dim,
             eps=config.image_norm_eps,
@@ -279,12 +293,15 @@ class BlockCollection(nn.Module):
         self,
         config: VisionBackboneConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.resblocks = nn.ModuleList(
             [
-                ResidualAttentionBlock(config, quant_config)
-                for _ in range(config.image_num_layers)
+                ResidualAttentionBlock(
+                    config, quant_config, prefix=f"{prefix}.resblocks.{i}"
+                )
+                for i in range(config.image_num_layers)
             ]
         )
 
@@ -307,6 +324,7 @@ class VisionTransformer(nn.Module):
         self,
         config: VisionBackboneConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         scale = config.image_emb_dim**-0.5
@@ -323,7 +341,9 @@ class VisionTransformer(nn.Module):
             bias=False,
         )
         self.pre_ln = nn.LayerNorm(config.image_emb_dim, eps=config.image_norm_eps)
-        self.transformer = BlockCollection(config, quant_config)
+        self.transformer = BlockCollection(
+            config, quant_config, prefix=f"{prefix}.transformer"
+        )
 
     def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor:
         cls_emb = self.positional_embedding[0:1]
@@ -418,6 +438,7 @@ class MolmoAttention(nn.Module):
             self.total_num_kv_heads,
             bias=config.qkv_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
 
         self.tp_rank: int | None = None
@@ -453,6 +474,7 @@ class MolmoAttention(nn.Module):
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
     def _apply_qk_norm(
@@ -492,6 +514,7 @@ class LanguageModelMLP(nn.Module):
         config: PretrainedConfig,
         input_dim: int | None = None,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -502,6 +525,7 @@ class LanguageModelMLP(nn.Module):
             [self.intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         # Activation function.
         self.act_fn = MulAndSilu()
@@ -511,6 +535,7 @@ class LanguageModelMLP(nn.Module):
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
 
     def forward(
@@ -531,6 +556,7 @@ class ImageProjectorMLP(nn.Module):
         config: PretrainedConfig,
         input_dim: int | None = None,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -541,6 +567,7 @@ class ImageProjectorMLP(nn.Module):
             [self.intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.merged_linear",
         )
         # Activation function.
         self.act_fn = SiluAndMul()
@@ -551,6 +578,7 @@ class ImageProjectorMLP(nn.Module):
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
 
     def forward(
@@ -578,7 +606,9 @@ class MolmoDecoderLayer(nn.Module):
         )
 
         # MLP block.
-        self.mlp = LanguageModelMLP(config, quant_config=quant_config)
+        self.mlp = LanguageModelMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
 
         # LayerNorm
         assert config.layer_norm_type == "rms"
@@ -642,6 +672,7 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
         config: PretrainedConfig,
         vision_config: VisionBackboneConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.vit_layers = VIT_LAYERS
@@ -650,18 +681,24 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
             (self.image_num_patch[0] + 1) // POOLING_SIZE,
             (self.image_num_patch[1] + 1) // POOLING_SIZE,
         )
-        self.image_vit = VisionTransformer(vision_config, quant_config=quant_config)
+        self.image_vit = VisionTransformer(
+            vision_config, quant_config=quant_config, prefix=f"{prefix}.image_vit"
+        )
         self.num_prefix_tokens = self.image_vit.num_prefix_tokens
         assert self.num_prefix_tokens in {0, 1}, (
             "Only 0 or 1 prefix tokens are supported"
         )
         self.image_pooling_2d = MultiHeadDotProductAttention(
-            vision_config, nlayers=len(self.vit_layers), quant_config=quant_config
+            vision_config,
+            nlayers=len(self.vit_layers),
+            quant_config=quant_config,
+            prefix=f"{prefix}.image_pooling_2d",
         )
         self.image_projector = ImageProjectorMLP(
             config,
             input_dim=vision_config.image_emb_dim,
             quant_config=quant_config,
+            prefix=f"{prefix}.image_projector",
         )
 
         image_dim = vision_config.image_emb_dim * len(self.vit_layers)
@@ -1404,7 +1441,12 @@ class MolmoForCausalLM(
         self.multimodal_config = multimodal_config
 
         vision_config = VisionBackboneConfig()
-        self.vision_backbone = MolmoVisionBackbone(config, vision_config, quant_config)
+        self.vision_backbone = MolmoVisionBackbone(
+            config,
+            vision_config,
+            quant_config,
+            prefix=maybe_prefix(prefix, "vision_backbone"),
+        )
         self.model = MolmoModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index 63ea6b259a71d6641c1c343d623c811a5046ceec..c675b2cd6594c4f5aeab2f0a239d1f7eba4c36b1 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -51,118 +51,20 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers.activations import ACT2FN
 from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import is_flash_attn_2_available
 
+from vllm.config import MultiModalConfig
+from vllm.distributed import divide, get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
-from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.platforms import current_platform
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 
-if is_flash_attn_2_available():
-    from flash_attn import flash_attn_varlen_func
-elif current_platform.is_xpu():
-    from vllm.attention.utils.fa_utils import flash_attn_varlen_func
-else:
-    flash_attn_varlen_func = None
-
-
-def multihead_attention(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    q_cu_seqlens: torch.Tensor | None = None,
-    k_cu_seqlens: torch.Tensor | None = None,
-) -> torch.Tensor:
-    """Multi-head attention using flash attention 2.
-
-    Args:
-        q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
-            or (tot_seqlens, num_heads, head_dim) if packing.
-        k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
-            or (tot_seqlens, num_heads, head_dim) if packing.
-        v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
-            or (tot_seqlens, num_heads, head_dim) if packing.
-        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
-            The first element should be 0 and the last element should be q.shape[0].
-        k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k.
-            The first element should be 0 and the last element should be k.shape[0].
-
-    Returns:
-        output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing,
-            where dim = num_heads * head_dim
-    """
-    # Unified format legal check
-    assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims"
-    assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]"
-    assert k_cu_seqlens[-1] == k.shape[0] == v.shape[0], (
-        "k_cu_seqlens must sum to k.shape[0]"
-    )
-    assert q.dtype in [
-        torch.bfloat16,
-        torch.float16,
-    ], f"unsupported dtype {q.dtype} for multihead attn"
-
-    max_seqlen_q = (q_cu_seqlens[1:] - q_cu_seqlens[:-1]).max().item()
-    max_seqlen_k = (k_cu_seqlens[1:] - k_cu_seqlens[:-1]).max().item()
-    attn_out = flash_attn_varlen_func(
-        q,
-        k,
-        v,
-        cu_seqlens_q=q_cu_seqlens,
-        cu_seqlens_k=k_cu_seqlens,
-        max_seqlen_q=max_seqlen_q,
-        max_seqlen_k=max_seqlen_k,
-        causal=False,
-    )
-    attn_out = attn_out.flatten(start_dim=-2)
-
-    return attn_out
-
-
-def sdpa_attention(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    q_cu_seqlens: torch.Tensor | None = None,
-    k_cu_seqlens: torch.Tensor | None = None,
-) -> torch.Tensor:
-    """SDPA attention.
-
-    Args:
-        q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
-            or (tot_seqlens, num_heads, head_dim) if packing.
-        k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
-            or (tot_seqlens, num_heads, head_dim) if packing.
-        v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
-            or (tot_seqlens, num_heads, head_dim) if packing.
-        q_cu_seqlens: Optional cumulative sequence lengths of q.
-        k_cu_seqlens: Optional cumulative sequence lengths of k.
-    """
-    seq_length = q.shape[0]
-    attention_mask = torch.zeros(
-        [1, seq_length, seq_length], device=q.device, dtype=torch.bool
-    )
-    for i in range(1, len(q_cu_seqlens)):
-        attention_mask[
-            ...,
-            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
-            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
-        ] = True
-    q = q.transpose(0, 1)
-    k = k.transpose(0, 1)
-    v = v.transpose(0, 1)
-    attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
-    attn_output = attn_output.transpose(0, 1)
-    attn_output = attn_output.reshape(seq_length, -1)
-    return attn_output
-
-
-VL_VISION_ATTENTION_FUNCTIONS = {
-    "flash_attention_2": multihead_attention,
-    "sdpa": sdpa_attention,
-}
-
 
 def _apply_rope_input_validation(x, freqs_cis):
     assert x.ndim == freqs_cis.ndim + 1, (x.shape, freqs_cis.shape)
@@ -411,11 +313,19 @@ class MLP2(nn.Module):
         super().__init__()
         assert len(dims) == 3
         self.use_data_parallel = use_data_parallel
-        self.fc0 = ReplicatedLinear(
-            dims[0], dims[1], bias=bias, prefix=maybe_prefix(prefix, "fc0")
+        self.fc0 = ColumnParallelLinear(
+            dims[0],
+            dims[1],
+            bias=bias,
+            prefix=maybe_prefix(prefix, "fc0"),
+            disable_tp=self.use_data_parallel,
         )
-        self.fc1 = ReplicatedLinear(
-            dims[1], dims[2], bias=bias, prefix=maybe_prefix(prefix, "fc1")
+        self.fc1 = RowParallelLinear(
+            dims[1],
+            dims[2],
+            bias=bias,
+            prefix=maybe_prefix(prefix, "fc1"),
+            disable_tp=self.use_data_parallel,
         )
         self.activation = activation
 
@@ -433,35 +343,56 @@ class MoonVitEncoderLayer(nn.Module):
         hidden_dim: int,
         mlp_dim: int,
         prefix: str = "",
-        use_data_parallel: bool = False,
+        multimodal_config: MultiModalConfig | None = None,
         *,
-        attn_implementation: str = "sdpa",
         activation=F.gelu,
         attn_bias: bool = False,
     ):
         super().__init__()
+        self.use_data_parallel = (
+            multimodal_config.mm_encoder_tp_mode == "data"
+            if multimodal_config
+            else False
+        )
+
         self.num_heads = num_heads
         self.hidden_dim = hidden_dim
         self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads
-        self.attn_implementation = attn_implementation
-        # use fa2 in vllm by default
-        if is_flash_attn_2_available() or current_platform.is_xpu():
-            self.attn_implementation = "flash_attention_2"
+        self.tp_size = (
+            1 if self.use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.num_attention_heads_per_partition = divide(num_heads, self.tp_size)
 
         self.norm0 = nn.LayerNorm(hidden_dim)
         self.norm1 = nn.LayerNorm(hidden_dim)
-        self.use_data_parallel = use_data_parallel
         self.mlp = MLP2(
             [hidden_dim, mlp_dim, hidden_dim],
             activation,
             prefix=f"{prefix}.mlp",
-            use_data_parallel=use_data_parallel,
+            use_data_parallel=self.use_data_parallel,
         )
-        self.wqkv = ReplicatedLinear(
-            hidden_dim, hidden_dim * 3, bias=attn_bias, prefix=f"{prefix}.wqkv"
+        self.wqkv = QKVParallelLinear(
+            hidden_size=hidden_dim,
+            head_size=self.hidden_size_per_attention_head,
+            total_num_heads=num_heads,
+            total_num_kv_heads=num_heads,
+            bias=attn_bias,
+            prefix=f"{prefix}.wqkv",
+            disable_tp=self.use_data_parallel,
         )
-        self.wo = ReplicatedLinear(
-            hidden_dim, hidden_dim, bias=attn_bias, prefix=f"{prefix}.wo"
+        self.wo = RowParallelLinear(
+            hidden_dim,
+            hidden_dim,
+            bias=attn_bias,
+            prefix=f"{prefix}.wo",
+            disable_tp=self.use_data_parallel,
+        )
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_attention_heads_per_partition,
+            head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.attn",
         )
 
     def attention_qkvpacked(
@@ -472,14 +403,15 @@ class MoonVitEncoderLayer(nn.Module):
     ):
         """
         Args:
-            x (torch.Tensor): (batch_size, seqlen, hidden_dim)
+            x (torch.Tensor): (seqlen, hidden_dim)
             cu_seqlens (torch.Tensor):
         """
+        seq_length = x.size(0)
         xqkv, _ = self.wqkv(x)
 
         qkv_shape = xqkv.size()[:-1] + (
             3,
-            self.num_heads,
+            self.num_attention_heads_per_partition,
             self.hidden_size_per_attention_head,
         )
         # xqkv: (batch_size, seqlen, 3, nheads, headdim)
@@ -488,9 +420,18 @@ class MoonVitEncoderLayer(nn.Module):
 
         xq, xk = apply_rope(xq, xk, rope_freqs_cis)
 
-        attn_func = VL_VISION_ATTENTION_FUNCTIONS[self.attn_implementation]
-        attn_out = attn_func(
-            xq, xk, xv, q_cu_seqlens=cu_seqlens, k_cu_seqlens=cu_seqlens
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        attn_out = self.attn(
+            xq.unsqueeze(0),
+            xk.unsqueeze(0),
+            xv.unsqueeze(0),
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        attn_out = attn_out.reshape(
+            seq_length,
+            self.num_attention_heads_per_partition
+            * self.hidden_size_per_attention_head,
         )
         attn_out, _ = self.wo(attn_out)
         return attn_out
@@ -528,7 +469,7 @@ class MoonVitEncoder(nn.Module):
         num_layers: int,
         block_cfg: dict,
         prefix: str = "",
-        use_data_parallel: bool = False,
+        multimodal_config: MultiModalConfig | None = None,
     ) -> None:
         super().__init__()
 
@@ -538,7 +479,7 @@ class MoonVitEncoder(nn.Module):
         self.blocks = nn.ModuleList(
             [
                 MoonVitEncoderLayer(
-                    use_data_parallel=use_data_parallel,
+                    multimodal_config=multimodal_config,
                     prefix=f"{prefix}.blocks.{layer_idx}",
                     **block_cfg,
                 )
@@ -599,31 +540,6 @@ def patch_merger(
     return outputs
 
 
-class MoonVitVLProjector(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        merge_kernel_size: list[int, int],
-        hidden_act: str = "gelu",
-        ln_eps: float = 1e-5,
-        out_dim: int = 4096,
-    ):
-        super().__init__()
-        self.hidden_size = in_channels * merge_kernel_size[0] * merge_kernel_size[1]
-
-        self.pre_norm = nn.nn.LayerNorm(in_channels, eps=ln_eps)
-        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
-        self.act = ACT2FN[hidden_act]
-        self.linear_2 = nn.Linear(self.hidden_size, out_dim, bias=True)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.pre_norm(hidden_states).view(-1, self.hidden_size)
-        hidden_states = self.linear_1(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.linear_2(hidden_states)
-        return hidden_states
-
-
 class MoonVitPretrainedModel(PreTrainedModel):
     config_class = MoonViTConfig
     model_type = "moonvit"
@@ -634,14 +550,13 @@ class MoonVitPretrainedModel(PreTrainedModel):
     def __init__(
         self,
         config: MoonViTConfig,
-        use_data_parallel: bool = False,
+        multimodal_config: MultiModalConfig | None = None,
         prefix: str = "",
         *inputs,
         **kwargs,
     ):
         super().__init__(config, *inputs, **kwargs)
         config = deepcopy(config)
-        self.use_data_parallel = use_data_parallel
         self.merge_kernel_size = config.merge_kernel_size
         self.hidden_size = config.hidden_size
         self.patch_size = config.patch_size
@@ -662,9 +577,9 @@ class MoonVitPretrainedModel(PreTrainedModel):
                 "mlp_dim": config.intermediate_size,
                 "activation": ACT2FN["gelu_pytorch_tanh"],
                 "attn_bias": True,
-                "attn_implementation": config._attn_implementation,
             },
             prefix=f"{prefix}.encoder",
+            multimodal_config=multimodal_config,
         )
 
     def forward(
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 6dfab595e5b92b4bedec554a6d4ee0f7f123cf04..a88496eca91da7cb8a600fca805c8451f5dddf2a 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1220,7 +1220,7 @@ class NemotronH_Nano_VL_V2(
         n = pixel_values.shape[0]
         vit_embeds_list = []
         for i in range(0, n, micro_batch_size):
-            vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
+            _, vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
             vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
             h = w = int(vit_embeds.shape[1] ** 0.5)
             vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
@@ -1695,12 +1695,7 @@ class NemotronH_Nano_VL_V2(
             patch_size=patch_size,
             norm_mean=hf_config.norm_mean,
             norm_std=hf_config.norm_std,
-            reg_tokens=(
-                hf_config_vision.args.get("register_multiple")
-                if hasattr(hf_config_vision, "args")
-                and isinstance(hf_config_vision.args, dict)
-                else None
-            ),
+            **hf_config_vision.args,
         )
 
         return RadioModel(config=radio_config)
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 2d9dfbd3e7688056fc0fbc201b59f3db608d2728..aff1d5fd4107afd72eea9c6df84001a11f28093f 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -210,16 +210,12 @@ class NemotronHMoE(nn.Module):
         )
 
         if self.use_latent_moe:
-            # TODO: check if using ReplicatedLinear is better than
-            # ColumnParallelLinear + all_gather
-            self.fc1_latent_proj = ColumnParallelLinear(
+            self.fc1_latent_proj = ReplicatedLinear(
                 input_size=config.hidden_size,
                 output_size=self.moe_hidden_size,
                 bias=config.mlp_bias,
                 quant_config=quant_config,
                 disable_tp=self.is_sequence_parallel,
-                # We need to gather the output to prepare input for moe
-                gather_output=True,
                 prefix=f"{prefix}.fc1_latent_proj",
             )
             self.fc2_latent_proj = ReplicatedLinear(
@@ -487,6 +483,7 @@ class NemotronHAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             cache_config=cache_config,
+            quant_config=quant_config,
             prefix=f"{prefix}.attn",
         )
 
@@ -632,14 +629,7 @@ class NemotronHModel(nn.Module):
         hidden_states, _ = self.norm_f(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-        ]
-
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         if self.has_moe:
             # (param_name, weight_name, expert_id, shard_id)
             expert_params_mapping = FusedMoE.make_expert_params_mapping(
@@ -647,14 +637,26 @@ class NemotronHModel(nn.Module):
                 #   what the activation is applied to
                 # - FusedMoe.w3 (aka up_proj) should be ignored since we're
                 #   using non-gated MoE
+                self,
                 ckpt_gate_proj_name="up_proj",
                 ckpt_down_proj_name="down_proj",
                 ckpt_up_proj_name="",
                 num_experts=self.config.n_routed_experts,
                 num_redundant_experts=getattr(self, "num_redundant_experts", 0),
             )
-        else:
-            expert_params_mapping = []
+            return expert_params_mapping
+
+        return []
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        expert_params_mapping = self.get_expert_mapping()
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 19a942a5277ccecef1472e7d8f97fabdfcb7b807..da0688f719586b8094e8cabedb5a9e1165674b7d 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -31,7 +31,6 @@ import torch
 from torch import nn
 from transformers import LlamaConfig
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
@@ -49,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 )
 from vllm.model_executor.models.llama import LlamaAttention, LlamaMLP
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import HasNoOps, SupportsLoRA, SupportsPP
 from .utils import (
@@ -169,10 +169,13 @@ class DeciLMDecoderLayer(nn.Module):
             self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         if not self._is_no_op_ffn:
-            ffn_mult = block_config.ffn.ffn_mult
-            intermediate_size = _ffn_mult_to_intermediate_size(
-                ffn_mult, config.hidden_size
-            )
+            if hasattr(block_config.ffn, "ffn_mult"):
+                ffn_mult = block_config.ffn.ffn_mult
+                intermediate_size = _ffn_mult_to_intermediate_size(
+                    ffn_mult, config.hidden_size
+                )
+            else:
+                intermediate_size = block_config.ffn.intermediate_size
 
             self.mlp = LlamaMLP(
                 hidden_size=self.hidden_size,
diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py
new file mode 100644
index 0000000000000000000000000000000000000000..a88e52b5585b1506e927aa59def40d68eb95427d
--- /dev/null
+++ b/vllm/model_executor/models/nemotron_parse.py
@@ -0,0 +1,958 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py
+# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py
+#
+# Bart classes based on old vLLM codebase:
+# https://github.com/vllm-project/vllm/blob/v0.10.2/vllm/model_executor/models/bart.py
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Annotated, Literal
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from PIL import Image
+from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from torchvision import transforms as T
+from transformers import (
+    BartConfig,
+    BatchFeature,
+    PretrainedConfig,
+    TensorType,
+)
+
+from vllm.config import CacheConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+)
+from vllm.model_executor.models.radio import RadioModel
+from vllm.model_executor.models.whisper import WhisperAttention, WhisperCrossAttention
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseProcessingInfo,
+    EncDecMultiModalProcessor,
+    PromptReplacement,
+    PromptUpdate,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.transformers_utils.configs.radio import RadioConfig
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backend import AttentionType
+
+logger = init_logger(__name__)
+DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
+
+
+class BartScaledWordEmbedding(VocabParallelEmbedding):
+    """
+    This module overrides VocabParallelEmbedding's
+    forward by multiplying with embeddings scale.
+    """
+
+    def __init__(
+        self, num_embeddings: int, embedding_dim: int, embed_scale: float = 1.0
+    ):
+        super().__init__(num_embeddings, embedding_dim)
+        self.embed_scale = embed_scale
+
+    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return super().forward(input_ids) * self.embed_scale
+
+
+class BartParallelLMHead(ParallelLMHead):
+    """
+    This module overrides ParallelLMHead's
+    forward by dividing by embeddings scale,
+    yielding effectively the inverse of
+    BartScaledWordEmbedding
+    """
+
+    def __init__(
+        self, num_embeddings: int, embedding_dim: int, embed_scale: float = 1.0
+    ):
+        super().__init__(num_embeddings, embedding_dim)
+        self.embed_scale = embed_scale
+
+    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return super().forward(input_ids) / self.embed_scale
+
+
+class BartDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: BartConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            attn_type=AttentionType.DECODER,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.activation_fn = get_act_fn(config.activation_function)
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        """
+        afeldman-nm: personally I would call this "cross-attention",
+        however I left the name as "encoder_attn" to maintain consistency
+        with the name of the pretrained weights.
+        """
+        self.encoder_attn = WhisperCrossAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder_attn",
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        ffn_hidden_size = self.embed_dim
+        ffn_intermediate_size = config.encoder_ffn_dim
+        ffn_has_bias = True
+        self.fc1 = ColumnParallelLinear(
+            ffn_hidden_size,
+            ffn_intermediate_size,
+            bias=ffn_has_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.fc2 = RowParallelLinear(
+            ffn_intermediate_size,
+            ffn_hidden_size,
+            bias=ffn_has_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        decoder_hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            decoder_hidden_states: torch.Tensor of *decoder* input embeddings.
+            encoder_hidden_states: torch.Tensor of *encoder* input embeddings.
+        Returns:
+            Decoder layer output torch.Tensor
+        """
+        residual = decoder_hidden_states
+
+        # Self Attention
+        hidden_states = self.self_attn(hidden_states=decoder_hidden_states)
+
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+
+        residual = hidden_states
+
+        hidden_states = self.encoder_attn(
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+        )
+
+        hidden_states = residual + hidden_states
+        hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        fc1_out, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(fc1_out)
+
+        hidden_states, _ = self.fc2(hidden_states)
+
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        return hidden_states
+
+
+class MBartDecoderLayer(BartDecoderLayer):
+    def forward(
+        self,
+        decoder_hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        residual = decoder_hidden_states
+        hidden_states = self.self_attn_layer_norm(decoder_hidden_states)
+
+        # Self Attention
+        hidden_states = self.self_attn(hidden_states=hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+
+        residual = hidden_states
+        hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+        hidden_states = self.encoder_attn(
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+        )
+
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        fc1_out, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(fc1_out)
+
+        hidden_states, _ = self.fc2(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class MBartDecoderNoPos(nn.Module):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers.
+    Each layer is a [`BartDecoderLayer`]
+    Args:
+        config: BartConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(
+        self,
+        config: BartConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        lora_config: LoRAConfig | None = None,
+        embed_tokens: nn.Embedding | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+        self.lora_config = lora_config
+        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+        self.embed_tokens = BartScaledWordEmbedding(
+            config.vocab_size, config.d_model, embed_scale=embed_scale
+        )
+
+        if embed_tokens is not None:
+            self.embed_tokens.weight = embed_tokens.weight
+
+        self.layers = nn.ModuleList(
+            [
+                MBartDecoderLayer(
+                    config,
+                    cache_config,
+                    quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                )
+                for layer_idx in range(config.decoder_layers)
+            ]
+        )
+
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+    def forward(
+        self,
+        decoder_input_ids: torch.Tensor,
+        *,
+        encoder_hidden_states: torch.Tensor | None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            decoder_input_ids: Indices of *decoder* input sequence tokens in the
+                vocabulary. Padding will be ignored by default should you provide it.
+            encoder_hidden_states: Tensor of encoder output embeddings
+        Returns:
+            Decoder output torch.Tensor
+        """
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(decoder_input_ids)
+
+        hidden_states = self.layernorm_embedding(inputs_embeds)
+
+        # decoder layers
+
+        for decoder_layer in self.layers:
+            hidden_states = decoder_layer(
+                decoder_hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+            )
+
+        hidden_states = self.layer_norm(hidden_states)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
+            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
+            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
+            (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"),
+            (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if name.startswith("embed_positions"):
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class NemotronParsePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Batch size
+        - c: Number of channels (3)
+        - h: Height
+        - w: Width
+    """
+
+    type: Literal["pixel_values"]
+    data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")]
+
+
+class NemotronParseImageProcessor:
+    """
+    NemotronParse Image Processor
+    """
+
+    def __init__(
+        self,
+        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
+        **kwargs,
+    ):
+        # Ensure final_size is properly formatted
+        if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
+            self.final_size = (int(final_size[0]), int(final_size[1]))
+        elif isinstance(final_size, (int, float)):
+            self.final_size = (int(final_size), int(final_size))
+        else:
+            self.final_size = DEFAULT_FINAL_IMAGE_SIZE  # Default fallback
+
+        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
+        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)
+
+        # Create transforms
+        self._create_transforms()
+
+    def _create_transforms(self):
+        """Create transform objects."""
+        try:
+            import albumentations as A
+        except ImportError as err:
+            raise ImportError(
+                "The package `albumentations` is required to use "
+                "NemotronParse model. Please install it with `pip install "
+                "albumentations`."
+            ) from err
+
+        # Ensure final_size is a tuple of integers
+        if isinstance(self.final_size, (list, tuple)):
+            self.target_height, self.target_width = (
+                int(self.final_size[0]),
+                int(self.final_size[1]),
+            )
+        else:
+            self.target_height = self.target_width = int(self.final_size)
+
+        self.transform = A.Compose(
+            [
+                A.PadIfNeeded(
+                    min_height=self.target_height,
+                    min_width=self.target_width,
+                    border_mode=cv2.BORDER_CONSTANT,
+                    fill=[255, 255, 255],
+                    p=1.0,
+                ),
+            ]
+        )
+
+        self.torch_transform = T.Compose(
+            [
+                T.ToTensor(),
+            ]
+        )
+
+    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
+        """Resize image maintaining aspect ratio (exact replica of original
+        LongestMaxSizeHW)."""
+        height, width = image.shape[:2]
+        max_size_height = self.target_height
+        max_size_width = self.target_width
+
+        # Original LongestMaxSizeHW algorithm from custom_augmentations.py
+        aspect_ratio = width / height
+        new_height = height
+        new_width = width
+
+        # If height too big then scale image down
+        if height > max_size_height:
+            new_height = max_size_height
+            new_width = int(new_height * aspect_ratio)
+
+        # If width too big, scale image down further
+        if new_width > max_size_width:
+            new_width = max_size_width
+            new_height = int(new_width / aspect_ratio)
+
+        # Use cv2.INTER_LINEAR like the original
+        return cv2.resize(
+            image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
+        )
+
+    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
+        """Pad image to target size with white padding (matches A.PadIfNeeded
+        behavior)."""
+        h, w = image.shape[:2]
+        min_height, min_width = self.target_height, self.target_width
+
+        # Only pad if image is smaller than target (matches A.PadIfNeeded logic)
+        pad_h = max(0, min_height - h)
+        pad_w = max(0, min_width - w)
+
+        if pad_h == 0 and pad_w == 0:
+            return image
+
+        # A.PadIfNeeded pads to bottom-right with constant value
+        if len(image.shape) == 3:
+            # Color image - pad bottom and right with white (255, 255, 255)
+            padded = np.pad(
+                image,
+                ((0, pad_h), (0, pad_w), (0, 0)),
+                mode="constant",
+                constant_values=255,
+            )
+        else:
+            # Grayscale image - pad with white (255)
+            padded = np.pad(
+                image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
+            )
+
+        return padded
+
+    def preprocess(
+        self,
+        images: Image.Image | list[Image.Image],
+        **kwargs,
+    ) -> dict[str, torch.Tensor]:
+        """
+        Preprocess an image or batch of images for the NemotronParse model.
+
+        Args:
+            images: Input image(s)
+        """
+        # Ensure images is a list
+        if not isinstance(images, list):
+            images = [images]
+
+        # Convert PIL images to numpy arrays if needed
+        processed_images = []
+        for image in images:
+            if isinstance(image, Image.Image):
+                image = np.asarray(image)
+            processed_images.append(image)
+
+        # Apply NemotronParse-specific transforms
+        pixel_values = []
+        for image in processed_images:
+            # Manual resize with aspect ratio preservation
+            # (replaces LongestMaxSizeHW)
+            processed_image = self._resize_with_aspect_ratio(image)
+
+            # Apply remaining albumentations transforms if available
+            if self.transform is not None:
+                transformed = self.transform(image=processed_image)
+                processed_image = transformed["image"]
+            else:
+                # Fallback: just pad to target size
+                processed_image = self._pad_to_size(processed_image)
+
+            # Convert to tensor
+            pixel_values_tensor = self.torch_transform(processed_image)
+
+            # Handle grayscale images
+            if pixel_values_tensor.shape[0] == 1:
+                pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
+
+            pixel_values.append(pixel_values_tensor)
+
+        # Stack into batch
+        pixel_values = torch.stack(pixel_values)
+
+        # Normalize pixel values
+        normalized_values = (pixel_values - self.norm_mean) / self.norm_std
+        return {"pixel_values": normalized_values}
+
+    def __call__(
+        self, images: Image.Image | list[Image.Image], **kwargs
+    ) -> dict[str, torch.Tensor]:
+        return self.preprocess(images, **kwargs)
+
+
+class NemotronParseProcessor:
+    """
+    NemotronParse Processor
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)
+
+    def _make_batch_input(self, input_item=None):
+        if input_item is None:
+            input_item = []
+        if not isinstance(input_item, list):
+            input_item = [input_item]
+        return input_item
+
+    def __call__(
+        self,
+        text: str | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        text, images = [self._make_batch_input(x) for x in (text, images)]
+        image_inputs = {} if len(images) == 0 else self.image_processor(images)
+
+        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
+        combined_outputs = BatchFeature(
+            data={**text_inputs, **image_inputs},
+            tensor_type=return_tensors,
+        )
+        return combined_outputs
+
+
+class NemotronParseProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs) -> NemotronParseProcessor:
+        return self.ctx.init_processor(
+            NemotronParseProcessor,
+            config=self.get_hf_config(),
+            tokenizer=self.get_tokenizer(),
+            **kwargs,
+        )
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"image": 1}
+
+    def get_num_image_tokens(self) -> int:
+        config = self.get_hf_config()
+        final_size = config.image_size
+        patch_size = config.encoder.patch_size
+
+        return (final_size[0] // patch_size) * ((final_size[1] // patch_size) // 4) + 1
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int] | None:
+        image_tokens = self.get_num_image_tokens()
+        return {"image": image_tokens}
+
+
+class NemotronParseDummyInputsBuilder(
+    BaseDummyInputsBuilder[NemotronParseProcessingInfo]
+):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = self.info.get_hf_config().image_size
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width, height=target_height, num_images=num_images
+            )
+        }
+
+
+class NemotronParseMultiModalProcessor(
+    EncDecMultiModalProcessor[NemotronParseProcessingInfo]
+):
+    def create_encoder_prompt(
+        self,
+        prompt: str | list[int],
+        mm_data: MultiModalDataDict,
+    ) -> str | list[int]:
+        return [0]
+
+    @property
+    def pad_dummy_encoder_prompt(self) -> bool:
+        return True
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if mm_data:
+            processed_outputs = super()._call_hf_processor(
+                prompt, mm_data, mm_kwargs, tok_kwargs
+            )
+        else:
+            hf_processor = self.info.get_hf_processor()
+            tokenizer = hf_processor.tokenizer
+            processed_outputs = tokenizer(
+                prompt, add_special_tokens=False, return_tensors="pt"
+            )
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(pixel_values=MultiModalFieldConfig.batched("image"))
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        num_image_tokens = self.info.get_num_image_tokens()
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[0],
+                replacement=[0] * num_image_tokens,
+            )
+        ]
+
+
+class RadioWithNeck(nn.Module):
+    """Vision encoder using RADIO model with custom neck."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config.encoder
+
+        self.model_encoder = self.get_vit_model_from_radio_config(
+            config, quant_config=quant_config
+        )
+
+        # Neck components
+        last_hidden_state = 1024
+        self.conv1 = nn.Conv1d(1280, last_hidden_state, 1)
+        self.layer_norm1 = nn.LayerNorm(
+            last_hidden_state, eps=1e-06, elementwise_affine=True
+        )
+        self.conv2 = nn.Conv2d(
+            last_hidden_state,
+            last_hidden_state,
+            kernel_size=(1, 4),
+            stride=(1, 4),
+            padding=0,
+            bias=False,
+        )
+        self.layer_norm2 = nn.LayerNorm(
+            last_hidden_state, eps=1e-06, elementwise_affine=True
+        )
+        self.sum_proj = ColumnParallelLinear(
+            3840,
+            last_hidden_state,
+            quant_config=quant_config,
+            prefix=f"{prefix}.sum_proj",
+        )
+        self.layer_norm3 = nn.LayerNorm(
+            last_hidden_state, eps=1e-06, elementwise_affine=True
+        )
+
+    def get_vit_model_from_radio_config(
+        self,
+        hf_config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+    ) -> RadioModel:
+        hf_config_vision = hf_config.encoder
+        model_name = hf_config_vision.args.get("model")
+        if model_name is None:
+            raise ValueError(f"Unsupported vit model type: {model_name}")
+
+        radio_config = RadioConfig(
+            model_name=model_name,
+            image_size=hf_config.image_size,
+            **hf_config_vision.args,
+        )
+
+        return RadioModel(config=radio_config, quant_config=quant_config)
+
+    def forward(self, pixel_values: torch.Tensor, **kwargs) -> torch.Tensor:
+        summary, feature = self.model_encoder(pixel_values)
+
+        output = self.conv1(feature.permute(0, 2, 1)).permute(0, 2, 1)
+        output = self.layer_norm1(output)
+
+        patch_size = self.config.patch_size
+        output = rearrange(
+            output,
+            "b (h w) d -> b d h w",
+            h=pixel_values.shape[-2] // patch_size,
+            w=pixel_values.shape[-1] // patch_size,
+        )
+
+        output = self.conv2(output)
+        output = rearrange(output, "b d h w -> b (h w) d")
+        output = self.layer_norm2(output)
+        summary = self.layer_norm3(self.sum_proj(summary)[0])
+        output = torch.cat((output, summary.unsqueeze(1)), dim=1)
+
+        return output
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        model_encoder_weights = []
+        adaptor_dict = {
+            name: param
+            for name, param in dict(self.named_parameters()).items()
+            if not name.startswith("model_encoder")
+        }
+        for name, w in weights:
+            if name.startswith("model_encoder"):
+                model_encoder_weights.append((".".join(name.split(".")[1:]), w))
+            else:
+                param = adaptor_dict[name]
+                with torch.no_grad():
+                    default_weight_loader(param, w)
+
+        self.model_encoder.load_weights(model_encoder_weights)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    NemotronParseMultiModalProcessor,
+    info=NemotronParseProcessingInfo,
+    dummy_inputs=NemotronParseDummyInputsBuilder,
+)
+class NemotronParseForConditionalGeneration(nn.Module, SupportsMultiModal):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+
+        self.config = config
+        self.vision_config = config.encoder
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.encoder = RadioWithNeck(
+            config=config, quant_config=quant_config, prefix=f"{prefix}.encoder"
+        )
+
+        self.decoder = MBartDecoderNoPos(
+            config.decoder,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.decoder",
+        )
+
+        self.vocab_size = config.decoder.vocab_size
+        self.lm_head = ParallelLMHead(
+            config.decoder.vocab_size, config.decoder.d_model, quant_config=quant_config
+        )
+        self.logits_processor = LogitsProcessor(
+            self.vocab_size, config.decoder.vocab_size
+        )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return None
+
+        raise ValueError("Only image modality is supported")
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> NemotronParsePixelInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None and image_embeds is not None:
+            raise ValueError("Both pixel values and image embeds are provided.")
+
+        if pixel_values is not None:
+            h, w = self.config.image_size
+            return NemotronParsePixelInputs(
+                type="pixel_values",
+                data=pixel_values,
+                resolve_bindings={
+                    "h": h,
+                    "w": w,
+                },
+            )
+
+        if image_embeds is not None:
+            raise NotImplementedError
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _process_image_input(
+        self, image_input: NemotronParsePixelInputs
+    ) -> torch.Tensor:
+        assert image_input["type"] == "pixel_values"
+        pixel_values = image_input["data"]
+        dtype = next(self.encoder.parameters()).dtype
+        pixel_values = pixel_values.to(dtype)
+        return self.encoder(pixel_values)
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.decoder
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+        vision_embeddings = self._process_image_input(image_input)
+        return vision_embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        encoder_outputs: list[torch.Tensor] | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
+            encoder_outputs: List of encoder output tensors (vision embeddings).
+                During profiling, this may be None or empty.
+        Returns:
+            Output torch.Tensor
+        """
+        inputs_embeds = None
+        if encoder_outputs:
+            inputs_embeds = torch.cat(encoder_outputs, dim=0)
+        hidden_states = self.decoder(
+            decoder_input_ids=input_ids, encoder_hidden_states=inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.logits_processor(self.lm_head, hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        lm_head_dict = dict(self.lm_head.named_parameters())
+
+        def is_encoder(name: str) -> bool:
+            return name.startswith("encoder")
+
+        def is_decoder(name: str) -> bool:
+            return name.startswith("decoder")
+
+        def is_lm_head(name: str):
+            return name.startswith("lm_head")
+
+        # Separate weights by component
+        encoder_weights = []
+        decoder_weights = []
+
+        for name, w in weights:
+            if is_encoder(name):
+                encoder_weights.append((".".join(name.split(".")[1:]), w))
+            elif is_decoder(name):
+                decoder_weights.append((".".join(name.split(".")[1:]), w))
+            elif is_lm_head(name):
+                trimmed_name = ".".join(name.split(".")[1:])
+                param = lm_head_dict[trimmed_name]
+                with torch.no_grad():
+                    default_weight_loader(param, w)
+            else:
+                logger.info("Found unexpected weight: %s", name)
+
+        # Load encoder weights
+        self.encoder.load_weights(encoder_weights)
+        # Load decoder weights
+        self.decoder.load_weights(decoder_weights)
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index a5a926151c5c96ff64d735bd54cab0bcfdccfc25..b4cf98de18103a0a9586b6a33c8042cdc944c64f 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -86,7 +86,11 @@ class OlmoeMoE(nn.Module):
 
         # Gate always runs at half / full precision for now.
         self.gate = ReplicatedLinear(
-            hidden_size, num_experts, bias=False, quant_config=None
+            hidden_size,
+            num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
         )
 
         self.experts = FusedMoE(
@@ -334,6 +338,7 @@ class OlmoeModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 47abd7bf0b68acd018389ea534c38e404af01ea9..9f569bcc71cf34847837ea04db2d6a8c52e3be04 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -29,18 +29,21 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.backends.abstract import AttentionType
-from vllm.attention.layer import Attention
+from vllm.attention.layer import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.distributed import (
     get_ep_group,
     get_pp_group,
+    get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tp_group,
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.static_sink_attention import (
+    StaticSinkAttention,
+)
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -77,8 +80,11 @@ from vllm.model_executor.models.utils import (
     maybe_prefix,
     sequence_parallel_chunk,
 )
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backends.flash_attn_diffkv import FlashAttentionDiffKVBackend
 
 
 def check_ffn_act_fn(act_fn: str):
@@ -155,7 +161,15 @@ class OpenPanguMoE(nn.Module):
             quant_config=None,
             prefix=f"{prefix}.gate",
         )
-        self.gate.e_score_correction_bias = None
+        if (
+            hasattr(config, "router_enable_expert_bias")
+            and config.router_enable_expert_bias
+        ):
+            self.gate.e_score_correction_bias = nn.Parameter(
+                torch.empty(self.n_routed_experts, dtype=torch.float32)
+            )
+        else:
+            self.gate.e_score_correction_bias = None
 
         # Load balancing settings.
         eplb_config = parallel_config.eplb_config
@@ -530,6 +544,264 @@ class OpenPanguEmbeddedAttention(nn.Module):
         )
 
 
+class OpenPanguSinkAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_parameters: dict[str, Any] | None = None,
+        max_position_embeddings: int = 8192,
+        quant_config: QuantizationConfig | None = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        cache_config: CacheConfig | None = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+    ) -> None:
+        super().__init__()
+        layer_idx = extract_layer_index(prefix)
+        self.hidden_size = hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.total_num_heads = num_heads
+        if self.total_num_heads % self.tp_size != 0:
+            raise ValueError(
+                f"total_num_heads {self.total_num_heads} "
+                f"is not divisible by tp_size {self.tp_size}."
+            )
+        self.num_heads = self.total_num_heads // self.tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if (
+            self.total_num_kv_heads > self.tp_size
+            and self.total_num_kv_heads % self.tp_size != 0
+        ):
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel ranks.
+            raise ValueError(
+                "Number of KV heads is greater than TP size, "
+                f"but total_num_kv_heads {self.total_num_kv_heads} "
+                f"is not divisible by tp_size {self.tp_size}."
+            )
+        elif self.total_num_kv_heads < self.tp_size:
+            # TODO: Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel ranks.
+            raise ValueError(
+                f"Number of KV heads {self.total_num_kv_heads} is less than "
+                f"TP size {self.tp_size}, KV heads replication is not support yet."
+            )
+        self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size)
+        self.qk_nope_dim = getattr(config, "qk_nope_dim", None)
+        self.qk_rope_dim = getattr(config, "qk_rope_dim", None)
+        self.v_channels = getattr(config, "v_channels", None)
+        self.head_dim = self.qk_rope_dim + self.qk_nope_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.k_size = self.num_kv_heads * self.head_dim
+        self.v_size = self.num_kv_heads * self.v_channels
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        self.param_sink_number = getattr(config, "param_sink_number", 0)
+        self.param_sink_with_value = getattr(config, "param_sink_with_value", False)
+        self.param_sink_scalar = getattr(config, "param_sink_scalar", None)
+        self.param_sink_of_head_num = getattr(config, "param_sink_of_head_dim", False)
+
+        self.qkv_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[
+                self.q_size * self.tp_size,
+                self.k_size * self.tp_size,
+                self.v_size * self.tp_size,
+            ],
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.v_channels,
+            output_size=hidden_size,
+            bias=bias_o_proj,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.k_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self._init_rotary_emb(
+            config, rope_parameters=rope_parameters, quant_config=quant_config
+        )
+
+        if hasattr(config, "interleaved_sliding_window"):
+            interleaved_sliding_window = config.interleaved_sliding_window
+            if isinstance(interleaved_sliding_window, int):
+                sliding_window = interleaved_sliding_window
+            elif isinstance(interleaved_sliding_window, list):
+                sw_idx = layer_idx % len(interleaved_sliding_window)
+                sliding_window = interleaved_sliding_window[sw_idx]
+            else:
+                raise ValueError(
+                    f"{type(interleaved_sliding_window)} "
+                    "for interleaved_sliding_window is not supported."
+                )
+        else:
+            sliding_window = None
+
+        FlashAttentionDiffKVBackend.set_head_size_v(self.v_channels)
+        self.attn = StaticSinkAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            sink_len=self.param_sink_number,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=sliding_window,
+            attn_type=attn_type,
+            prefix=f"{prefix}.attn",
+            attn_backend=FlashAttentionDiffKVBackend,
+            head_size_v=self.v_channels,
+        )
+
+        if self.param_sink_number > 0:
+            self.param_sink_key = torch.nn.Parameter(
+                torch.empty(
+                    (
+                        self.param_sink_number,
+                        self.num_kv_heads,
+                        self.head_dim,
+                    ),
+                    device=current_platform.current_device(),
+                    dtype=config.torch_dtype,
+                )
+            )
+            set_weight_attrs(
+                self.param_sink_key,
+                {
+                    "output_dim": 1,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+
+            if self.param_sink_with_value:
+                self.param_sink_value = torch.nn.Parameter(
+                    torch.empty(
+                        (
+                            self.param_sink_number,
+                            self.num_kv_heads,
+                            self.v_channels,
+                        ),
+                        device=current_platform.current_device(),
+                        dtype=config.torch_dtype,
+                    )
+                )
+                set_weight_attrs(
+                    self.param_sink_value,
+                    {
+                        "output_dim": 1,
+                        "weight_loader": self.weight_loader,
+                    },
+                )
+            else:
+                self.param_sink_value = torch.zeros(
+                    (
+                        self.param_sink_number,
+                        self.num_kv_heads,
+                        self.v_channels,
+                    ),
+                    device=current_platform.current_device(),
+                    dtype=config.torch_dtype,
+                )
+        # To enable dummy run with out weight
+        self.post_weight_load()
+
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        output_dim = getattr(param, "output_dim", None)
+
+        is_sharded_weight = getattr(param, "is_sharded_weight", False)
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow
+        is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
+
+        # Special case for GGUF
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.weight_type = loaded_weight.item()
+
+        # Materialize GGUF UninitializedParameter
+        if is_gguf_weight and isinstance(param, nn.UninitializedParameter):
+            final_shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                assert final_shape[output_dim] % self.tp_size == 0
+                final_shape[output_dim] = final_shape[output_dim] // self.tp_size
+            param.materialize(final_shape, dtype=loaded_weight.dtype)
+
+        param_data = param.data
+        if output_dim is not None and not is_sharded_weight:
+            shard_size = param_data.shape[output_dim]
+            start_idx = self.tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
+        k = self.k_layernorm(k.view(-1, self.num_kv_heads, self.head_dim))
+        q, k = self.rotary_emb(positions, q, k)
+
+        q = q.view(-1, self.q_size)
+        k = k.view(-1, self.k_size)
+
+        attn_output = self.attn(
+            q,
+            k,
+            v,
+            output_shape=torch.Size(
+                [q.shape[0], q.shape[1] // self.head_dim * self.v_channels]
+            ),
+        )
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def _init_rotary_emb(
+        self,
+        config: PretrainedConfig,
+        rope_parameters: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
+    ) -> None:
+        is_neox_style = False
+        rope_parameters = {"partial_rotary_factor": self.qk_rope_dim / self.head_dim}
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=self.max_position_embeddings,
+            rope_parameters=rope_parameters,
+            is_neox_style=is_neox_style,
+        )
+
+    def post_weight_load(self) -> None:
+        if hasattr(self, "k_layernorm") and self.k_layernorm is not None:
+            param_sink_key = self.k_layernorm(self.param_sink_key)
+        else:
+            param_sink_key = self.param_sink_key
+
+        self.attn.update_sink_kv(param_sink_key, self.param_sink_value)
+
+
 class OpenPanguDecoderLayer(nn.Module):
     def __init__(
         self,
@@ -557,6 +829,9 @@ class OpenPanguDecoderLayer(nn.Module):
             and hasattr(config, "v_head_dim")
             and hasattr(config, "kv_lora_rank")
         )
+        self.use_sink_attention = (
+            hasattr(config, "param_sink_number") and config.param_sink_number > 0
+        )
         if self.use_mla:
             self.self_attn = OpenPanguMLAAttention(
                 config=config,
@@ -574,6 +849,42 @@ class OpenPanguDecoderLayer(nn.Module):
                 quant_config=quant_config,
                 prefix=f"{prefix}.self_attn",
             )
+        elif self.use_sink_attention:
+            attention_bias = getattr(config, "attention_bias", False) or getattr(
+                config, "bias", False
+            )
+            bias_o_proj = attention_bias
+            if hasattr(config, "qkv_bias"):
+                attention_bias = config.qkv_bias
+            if getattr(config, "is_causal", True):
+                attn_type = AttentionType.DECODER
+            else:
+                raise ValueError(
+                    f"is_causal={config.is_causal} is not support "
+                    "for attention with sink"
+                )
+            rope_parameters = getattr(config, "rope_scaling", None)
+            if rope_parameters is None:
+                rope_parameters = {
+                    "rope_type": "default",
+                    "rope_theta": config.rope_theta,
+                }
+            self.self_attn = OpenPanguSinkAttention(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                num_kv_heads=getattr(
+                    config, "num_key_value_heads", config.num_attention_heads
+                ),
+                rope_parameters=rope_parameters,
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                bias=attention_bias,
+                bias_o_proj=bias_o_proj,
+                cache_config=cache_config,
+                prefix=f"{prefix}.self_attn",
+                attn_type=attn_type,
+            )
         else:
             attention_bias = getattr(config, "attention_bias", False) or getattr(
                 config, "bias", False
@@ -852,6 +1163,7 @@ class OpenPanguModel(nn.Module):
         has_experts = hasattr(self.config, "n_routed_experts")
         if has_experts:
             expert_merge_mapping = SharedFusedMoE.make_expert_params_mapping(
+                self,
                 ckpt_gate_proj_name="gate_proj",
                 ckpt_down_proj_name="down_proj",
                 ckpt_up_proj_name="up_proj",
@@ -903,6 +1215,10 @@ class OpenPanguModel(nn.Module):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
                 name = maybe_remap_kv_scale_name(name, params_dict)
+                if name.endswith("e_score_correction_bias"):
+                    name = name.replace(
+                        "e_score_correction_bias", "gate.e_score_correction_bias"
+                    )
                 if name is None:
                     continue
                 if is_pp_missing_parameter(name, self):
@@ -912,8 +1228,17 @@ class OpenPanguModel(nn.Module):
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
                 loaded_params.add(name)
+
+        self.post_weight_load()
         return loaded_params
 
+    def post_weight_load(self) -> None:
+        for name, module in self.named_modules():
+            if module is self:
+                continue
+            if hasattr(module, "post_weight_load"):
+                module.post_weight_load()
+
 
 class OpenPanguModelBase(nn.Module, SupportsPP, SupportsLoRA):
     packed_modules_mapping = {
@@ -1047,3 +1372,7 @@ class PanguEmbeddedForCausalLM(OpenPanguEmbeddedModel):
 
 class PanguUltraMoEForCausalLM(OpenPanguMoEModel):
     pass
+
+
+class PanguProMoEV2ForCausalLM(OpenPanguMoEModel):
+    pass
diff --git a/vllm/model_executor/models/openpangu_mtp.py b/vllm/model_executor/models/openpangu_mtp.py
index 436b7f981b1f909365326d706079c9382580cbdf..27335105179746ffa08ee18c246fa8ab4dd4064f 100644
--- a/vllm/model_executor/models/openpangu_mtp.py
+++ b/vllm/model_executor/models/openpangu_mtp.py
@@ -43,7 +43,6 @@ from vllm.model_executor.models.deepseek_mtp import (
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsPP
 from .openpangu import OpenPanguDecoderLayer
 
 
@@ -92,7 +91,7 @@ class OpenPanguMultiTokenPredictor(DeepSeekMultiTokenPredictor):
 
 
 @support_torch_compile
-class OpenPanguMTP(nn.Module, SupportsPP):
+class OpenPanguMTP(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
@@ -149,6 +148,7 @@ class OpenPanguMTP(nn.Module, SupportsPP):
         ]
 
         expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
index 829148b4c1fb7a9a9bdc6636b4db53a5efb431c0..f51c0f095072daf7e06f651e3fb66c51d0b7786e 100644
--- a/vllm/model_executor/models/ouro.py
+++ b/vllm/model_executor/models/ouro.py
@@ -33,7 +33,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -57,6 +56,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsLoRA
 from .utils import (
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 56565266c0dcc583125da2569c573dd429c4e1a5..530974f7fa8b1bb5d4706007fdac1bd189bb4229 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -30,18 +30,15 @@ from transformers.modeling_outputs import (
 )
 from transformers.utils import torch_int
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
     QKVParallelLinear,
     RowParallelLinear,
 )
@@ -74,9 +71,11 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .ernie45 import Ernie4_5ForCausalLM
 from .interfaces import MultiModalEmbeddings, SupportsMRoPE, SupportsMultiModal
+from .siglip import SiglipMLP
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
@@ -565,6 +564,7 @@ class SiglipAttention(nn.Module):
         self.attn = MMEncoderAttention(
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
             multimodal_config=multimodal_config,
             prefix=f"{prefix}.attn",
         )
@@ -657,46 +657,6 @@ class SigLIPRotaryEmbedding(nn.Module):
         return freqs
 
 
-class SiglipMLP(nn.Module):
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.activation_fn = get_act_fn(config.hidden_act)
-        # Special handling for BNB and torchao quantization
-        if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]:
-            quantizable = True
-        else:
-            # For other quantization, we require the hidden size to be a
-            # multiple of 64
-            quantizable = (
-                config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0
-            )
-        self.fc1 = ColumnParallelLinear(
-            config.hidden_size,
-            config.intermediate_size,
-            quant_config=quant_config if quantizable else None,
-            prefix=f"{prefix}.fc1",
-        )
-        self.fc2 = RowParallelLinear(
-            config.intermediate_size,
-            config.hidden_size,
-            quant_config=quant_config if quantizable else None,
-            prefix=f"{prefix}.fc2",
-        )
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states, _ = self.fc1(hidden_states)
-        hidden_states = self.activation_fn(hidden_states)
-        hidden_states, _ = self.fc2(hidden_states)
-        return hidden_states
-
-
 class SiglipEncoderLayer(nn.Module):
     def __init__(
         self,
@@ -720,6 +680,7 @@ class SiglipEncoderLayer(nn.Module):
         self.mlp = SiglipMLP(
             config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=f"{prefix}.mlp",
         )
 
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 67240c6e71249445506b4682834f0e9aad12b6f7..8671bbd5c7dcefcbd9a7095c353ae1cd415bd020 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -35,7 +35,13 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .module_mapping import MultiModelKeys
 from .siglip import SiglipVisionModel
 from .utils import (
     AutoWeightsLoader,
@@ -250,7 +256,9 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
     info=PaliGemmaProcessingInfo,
     dummy_inputs=PaliGemmaDummyInputsBuilder,
 )
-class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+class PaliGemmaForConditionalGeneration(
+    nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP
+):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -406,3 +414,16 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="multi_modal_projector",
+            tower_model="vision_tower",
+        )
+
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        return num_image_tokens
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        return num_vision_tokens
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 900b0eade308cf446e37edcd7f8497023f7e0da5..75823ec582513169fbd5b43b3e822e2130b886d1 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -29,7 +29,7 @@ from transformers import (
 )
 
 from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -96,6 +96,7 @@ CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
 def _init_img_processor(
     hf_config: PretrainedConfig,
     quant_config: QuantizationConfig | None,
+    multimodal_config: MultiModalConfig | None,
     prefix: str = "",
 ) -> CLIPVisionModel:
     clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
@@ -109,7 +110,8 @@ def _init_img_processor(
 
     img_processor = CLIPVisionModel(
         clip_config,
-        quant_config,
+        quant_config=quant_config,
+        multimodal_config=multimodal_config,
         num_hidden_layers_override=num_hidden_layers,
         prefix=prefix,
     )
@@ -160,38 +162,15 @@ class Phi3VImageEmbeddingInputs(TensorSchema):
 Phi3VImageInputs: TypeAlias = Phi3VImagePixelInputs | Phi3VImageEmbeddingInputs
 
 
-class Phi3ImageEmbeddingBase(nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-        self.layer_idx: int
-        self.type_feature: str
-        self.img_processor: CLIPVisionModel
-
-    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
-        TYPE_FEATURE = self.type_feature
-
-        # NOTE: we skip the step to select the vision feature layer since
-        # this is already done inside the img_processor
-        img_feature = self.img_processor(img_embeds)
-
-        if TYPE_FEATURE == "patch":
-            patch_feature = img_feature[:, 1:]
-            return patch_feature
-
-        if TYPE_FEATURE == "cls_patch":
-            return img_feature
-
-        raise NotImplementedError
-
-
 # adapted from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_embedding_phi3_v.py
-class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
+class Phi3HDImageEmbedding(nn.Module):
     """Phi3 Image embedding with HD transform."""
 
     def __init__(
         self,
         config: PretrainedConfig,
         quant_config: QuantizationConfig | None,
+        multimodal_config: MultiModalConfig | None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -200,7 +179,10 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
         hidden_size = config.n_embd if hasattr(config, "n_embd") else config.hidden_size
 
         self.img_processor = _init_img_processor(
-            config, quant_config, prefix=f"{prefix}.img_processor"
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.img_processor",
         )
 
         image_dim_out = config.img_processor["image_dim_out"]
@@ -223,13 +205,29 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
 
         dim_projection = hidden_size
         depth = 2
-        layers = [nn.Linear(image_dim_out * 4, dim_projection)]
+        layers: list[nn.Module] = [nn.Linear(image_dim_out * 4, dim_projection)]
         for _ in range(1, depth):
             layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
         self.img_projection = nn.Sequential(*layers)
 
         self.type_feature = config.img_processor.get("type_feature", "patch")
 
+    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
+        type_feature = self.type_feature
+
+        # NOTE: we skip the step to select the vision feature layer since
+        # this is already done inside the img_processor
+        img_feature = self.img_processor(img_embeds)
+
+        if type_feature == "patch":
+            patch_feature = img_feature[:, 1:]
+            return patch_feature
+
+        if type_feature == "cls_patch":
+            return img_feature
+
+        raise NotImplementedError(type_feature)
+
     def forward(
         self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor
     ) -> torch.FloatTensor:
@@ -582,6 +580,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
         self.multimodal_config = multimodal_config
@@ -590,14 +589,15 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
-            quant_config=self.quant_config,
+            quant_config=quant_config,
             prefix=maybe_prefix(prefix, "model.embed_tokens"),
         )
 
         # TODO: Optionally initializes this for supporting input embeddings.
         self.vision_embed_tokens = Phi3HDImageEmbedding(
             config,
-            self.quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=maybe_prefix(prefix, "model.vision_embed_tokens"),
         )
 
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 179d5df869beafc5615a9d8773b4e50cce665ffa..c58abefc85446f8c5ea8087a21c99420b43b490b 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -428,14 +428,13 @@ class Phi4MMImageEncoder(nn.Module):
                 output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
             else:
                 raise NotImplementedError(
-                    f'hd_transform_order = {self.hd_transform_order}, "\
-                        "not implemented'
+                    f"hd_transform_order = {self.hd_transform_order}, not implemented"
                 )
 
             # temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
             assert temp_len == output_imgs[-1].shape[1], (
-                f'temp_len: {temp_len}, output_imgs[-1].shape[1]: "\
-                    "{output_imgs[-1].shape[1]}'
+                f"temp_len: {temp_len}, output_imgs[-1].shape[1]: "
+                f"{output_imgs[-1].shape[1]}"
             )
 
             output_len.append(temp_len)
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 14f73d0c64586946e15ea6ca9a3ea88543f63e81..835f360df05848f86ecb4fffb37fdd94917df83c 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -272,6 +272,7 @@ class PhiMoE(nn.Module):
             bias=False,
             params_dtype=params_dtype,
             quant_config=None,
+            prefix=f"{prefix}.gate",
         )
 
         self.experts = FusedMoE(
@@ -515,6 +516,7 @@ class PhiMoEModel(nn.Module):
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 555e6ea4b8cb296926af11bbc564e449a94c0fe6..f57047bfa4327cb76d7937689971f65b4b0f9119 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -28,7 +28,7 @@ from transformers.models.pixtral.modeling_pixtral import (
 from transformers.tokenization_utils_base import TextInput
 
 from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.conv import Conv2dLayer
@@ -63,7 +63,13 @@ from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.tokenizers.mistral import MistralTokenizer
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .module_mapping import MultiModelKeys
 from .utils import init_vllm_registered_model, maybe_prefix
 from .vision import (
     VisionEncoderInfo,
@@ -365,7 +371,9 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo])
     info=PixtralProcessingInfo,
     dummy_inputs=PixtralDummyInputsBuilder,
 )
-class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+class PixtralForConditionalGeneration(
+    nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP
+):
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
@@ -581,6 +589,25 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
         # Now we call the language model load with the generator
         self.language_model.load_weights(llm_weights_generator())
 
+    def get_mm_mapping(self) -> MultiModelKeys:
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="vision_language_adapter",
+            tower_model="vision_encoder",
+        )
+
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        if getattr(self, "patch_merger", None) is None:
+            return num_image_tokens
+        merge_size = self.vision_args.spatial_merge_size
+        return num_image_tokens * (merge_size**2)
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        if getattr(self, "patch_merger", None) is None:
+            return num_vision_tokens
+        merge_size = self.vision_args.spatial_merge_size
+        return num_vision_tokens // (merge_size**2)
+
 
 # Vision encoder
 @dataclass
@@ -1043,11 +1070,18 @@ class PixtralHFMLP(nn.Module):
         self,
         config: PixtralVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
+        use_data_parallel = (
+            multimodal_config.mm_encoder_tp_mode == "data"
+            if multimodal_config
+            else False
+        )
+
         assert config.intermediate_size is not None
         self.gate_up_proj = MergedColumnParallelLinear(
             input_size=config.hidden_size,
@@ -1055,6 +1089,7 @@ class PixtralHFMLP(nn.Module):
             bias=False,
             quant_config=quant_config,
             prefix=f"{prefix}.gate_up_proj",
+            disable_tp=use_data_parallel,
         )
         self.down_proj = RowParallelLinear(
             input_size=config.intermediate_size,
@@ -1062,6 +1097,7 @@ class PixtralHFMLP(nn.Module):
             bias=False,
             quant_config=quant_config,
             prefix=f"{prefix}.down_proj",
+            disable_tp=use_data_parallel,
         )
         self.act_and_mul = get_act_and_mul_fn(config.hidden_act)
 
@@ -1077,6 +1113,7 @@ class PixtralHFAttention(nn.Module):
         self,
         config: PixtralVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
@@ -1085,10 +1122,14 @@ class PixtralHFAttention(nn.Module):
         self.config = config
         assert not config.hidden_size % config.num_attention_heads
         self.total_num_heads = config.num_attention_heads
-        tp_size = get_tensor_model_parallel_world_size()
-        self.n_heads = divide(config.num_attention_heads, tp_size)
         self.head_dim = config.hidden_size // config.num_attention_heads
+        assert self.total_num_heads * self.head_dim == config.hidden_size
 
+        use_data_parallel = (
+            multimodal_config.mm_encoder_tp_mode == "data"
+            if multimodal_config
+            else False
+        )
         self.qkv_proj = QKVParallelLinear(
             hidden_size=config.hidden_size,
             head_size=self.head_dim,
@@ -1096,15 +1137,21 @@ class PixtralHFAttention(nn.Module):
             bias=False,
             quant_config=quant_config,
             prefix=f"{prefix}.qkv_proj",
+            disable_tp=use_data_parallel,
         )
-        assert self.total_num_heads * self.head_dim == config.hidden_size
         self.o_proj = RowParallelLinear(
             input_size=config.hidden_size,
             output_size=config.hidden_size,
             bias=False,
             quant_config=quant_config,
             prefix=f"{prefix}.o_proj",
+            disable_tp=use_data_parallel,
+        )
+
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
         )
+        self.n_heads = divide(config.num_attention_heads, self.tp_size)
 
     def forward(
         self,
@@ -1147,6 +1194,7 @@ class PixtralHFTransformerBlock(nn.Module):
         self,
         config: PixtralVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
@@ -1154,10 +1202,16 @@ class PixtralHFTransformerBlock(nn.Module):
 
         self.attention_norm = RMSNorm(config.hidden_size, eps=1e-5)
         self.attention = PixtralHFAttention(
-            config, quant_config=quant_config, prefix=f"{prefix}.attention"
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.attention",
         )
         self.feed_forward = PixtralHFMLP(
-            config, quant_config=quant_config, prefix=f"{prefix}.feed_forward"
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.feed_forward",
         )
         self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5)
 
@@ -1183,6 +1237,7 @@ class PixtralHFTransformer(nn.Module):
         self,
         config: PixtralVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         num_hidden_layers_override: int | None = None,
         prefix: str = "",
@@ -1199,6 +1254,7 @@ class PixtralHFTransformer(nn.Module):
                 PixtralHFTransformerBlock(
                     config=config,
                     quant_config=quant_config,
+                    multimodal_config=multimodal_config,
                     prefix=f"{prefix}.layers.{layer_idx}",
                 )
                 for layer_idx in range(num_hidden_layers)
@@ -1230,6 +1286,7 @@ class PixtralHFVisionModel(nn.Module):
         self,
         config: PixtralVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         num_hidden_layers_override: int | None = None,
         require_post_norm: bool | None = None,
@@ -1249,7 +1306,8 @@ class PixtralHFVisionModel(nn.Module):
         self.ln_pre = RMSNorm(config.hidden_size, eps=1e-5)
         self.transformer = PixtralHFTransformer(
             config,
-            quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_override,
             prefix=f"{prefix}.transformer",
         )
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 6765ee0c5779cfe5e3e60d76c537990247320b32..45512d23d269a90ac9973de2eb9fcfc4ceefad45 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -9,7 +9,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig, get_current_vllm_config
@@ -50,16 +49,24 @@ from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader,
     sharded_weight_loader,
 )
-from vllm.model_executor.models.interfaces import HasInnerState, IsHybrid, SupportsPP
+from vllm.model_executor.models.interfaces import (
+    HasInnerState,
+    IsHybrid,
+    SupportsLoRA,
+    SupportsPP,
+)
 from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
     is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
 )
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
 
 
@@ -97,14 +104,18 @@ def is_mamba(config: Plamo2Config, i: int) -> bool:
 # Adapted from:
 # vllm.model_executor.layers.mamba.mamba_mixer2.MambaMixer2
 # transformers.models.mamba.modeling_mamba.MambaMixer
-@CustomOp.register(name="plamo2_mamba_mixer")
+# --8<-- [start:plamo2_mamba_mixer]
+@CustomOp.register("plamo2_mamba_mixer")
 class Plamo2MambaMixer(MambaBase, CustomOp):
+    # --8<-- [end:plamo2_mamba_mixer]
+
     def __init__(self, vllm_config: VllmConfig, *, prefix: str = "", **kwargs) -> None:
         super().__init__()
         self.config = vllm_config.model_config.hf_config
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
         self.quant_config = vllm_config.quant_config
+        self.is_lora_enabled = bool(vllm_config.lora_config)
         self.hidden_size = self.config.hidden_size
         self.ssm_state_size = self.config.mamba_d_state
         self.conv_kernel_size = self.config.mamba_d_conv
@@ -202,7 +213,11 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         self.prefix = prefix
 
     def _project_ssm_parameters(self, hidden_states):
-        ssm_parameters = self.bcdt_proj(hidden_states)
+        if self.is_lora_enabled:
+            #  Lora kernel requires contiguous tensor.
+            ssm_parameters = self.bcdt_proj(hidden_states.contiguous())
+        else:
+            ssm_parameters = self.bcdt_proj(hidden_states)
         B, C, time_step = torch.split(
             ssm_parameters,
             [self.ssm_state_size, self.ssm_state_size, self.time_step_rank],
@@ -400,6 +415,13 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
                 conv_state_indices=state_indices_tensor_d,
             )
 
+            # ROCm: Ensure contiguous tensor for bcdt_proj linear layer.
+            # causal_conv1d_update returns a non-contiguous view (stride 8192
+            # instead of 4096 for shape [batch, 4096]), causing incorrect GEMM
+            # results when batch > 1 on ROCm.
+            if current_platform.is_rocm():
+                hidden_states_d = hidden_states_d.contiguous()
+
             B, C, dt = self._project_ssm_parameters(hidden_states_d)
 
             # 3. State Space Model sequence transformation
@@ -780,13 +802,13 @@ class Plamo2Model(torch.nn.Module):
         return hidden_states
 
 
-class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
+class Plamo2ForCausalLM(
+    torch.nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid
+):
     packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
+        "qkv_proj": ["qkv_proj"],
+        "gate_up_proj": ["gate_up_proj"],
+        "in_proj": ["in_proj"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
@@ -892,6 +914,12 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
             if name == "lm_head.weight" and self.config.tie_word_embeddings:
                 assert "lm_head.weight" not in params_dict
                 continue
+            # Same workaround as AutoWeightsLoader for GPTQModel
+            if any(
+                substr in name
+                for substr in AutoWeightsLoader.ROTARY_EMBEDS_UNUSED_WEIGHTS
+            ):
+                continue
 
             # Update the weight names to be compatible with the vllm version
             # of the model.
diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py
index 3557104d905cbad14f08995172413020d37ae13f..3550c9fa7f65dc1b63fd95c65cba401de8e6902c 100644
--- a/vllm/model_executor/models/plamo3.py
+++ b/vllm/model_executor/models/plamo3.py
@@ -35,7 +35,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     composed_weight_loader,
     default_weight_loader,
 )
-from vllm.model_executor.models.interfaces import SupportsPP
+from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
 from vllm.model_executor.models.utils import (
     AutoWeightsLoader,
     extract_layer_index,
@@ -369,13 +369,10 @@ class Plamo3Model(nn.Module):
         return hidden_states
 
 
-class Plamo3ForCausalLM(nn.Module, SupportsPP):
+class Plamo3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
+        "qkv_proj": ["qkv_proj"],
+        "gate_up_proj": ["gate_up_proj"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 025706f712028228cf72784847b7aaa1ca6830dd..b4768e4bfc75155e3fa6eb831c0a0c8ba7dfdece 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -61,13 +61,22 @@ class QWenMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str = "silu",
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.c_proj = RowParallelLinear(
-            intermediate_size, hidden_size, bias=False, quant_config=quant_config
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -174,7 +183,10 @@ class QWenBlock(nn.Module):
         self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
         self.mlp = QWenMLP(
-            config.hidden_size, config.intermediate_size // 2, quant_config=quant_config
+            config.hidden_size,
+            config.intermediate_size // 2,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
 
     def forward(
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 493c4a0a9cb188f8ba92eea28723d58f1e54f4bc..ea9f189bf8c977454af82c6646f4f4546a11c555 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -35,15 +35,15 @@ from transformers import Qwen2Config
 
 import os
 import re
-from vllm import envs
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -63,6 +63,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 )
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .utils import (
@@ -402,8 +403,6 @@ class Qwen2Model(nn.Module):
         else:
             self.embed_tokens = PPMissingLayer()
 
-        # Use the provided decoder layer type or default to Qwen2DecoderLayer
-        decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: decoder_layer_type(
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index f9bce4bf981b2e8dbb6b1729b88a3eed3cb6c517..744c21b54f40bef79b14d3b68c1dc5be0e99b742 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -70,7 +70,6 @@ from vllm.multimodal.inputs import (
     MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
-    NestedTensors,
 )
 from vllm.multimodal.parse import (
     AudioProcessorItems,
@@ -227,6 +226,10 @@ class Qwen2_5OmniThinkerProcessingInfo(
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
+    def get_target_channels(self) -> int:
+        """Return target audio channels for Qwen2.5 Omni models (mono)."""
+        return 1
+
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
@@ -311,6 +314,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         return Qwen2_5OmniThinkerMultiModalDataParser(
             spatial_merge_size=self.info.get_hf_config().vision_config.spatial_merge_size,
             target_sr=feature_extractor.sampling_rate,
+            target_channels=self.info.get_target_channels(),
         )
 
     def _call_hf_processor(
@@ -1129,8 +1133,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
                 multimodal_embeddings += tuple(audio_embeddings)
         return multimodal_embeddings
 
-    # TODO (ywang96): support overlapping modality embeddings so that
-    # `use_audio_in_video` will work on V1.
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,
@@ -1150,27 +1152,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             handle_oov_mm_token=handle_oov_mm_token,
         )
 
-    def embed_multimodal_v0(self, **kwargs: object) -> NestedTensors | None:
-        audio_input = self._parse_and_validate_audio_input(**kwargs)
-        image_input = self._parse_and_validate_image_input(**kwargs)
-        video_input = self._parse_and_validate_video_input(**kwargs)
-
-        if audio_input is None and image_input is None and video_input is None:
-            return None
-
-        multimodal_embeddings: list[tuple[NestedTensors, str]] = []
-
-        if audio_input is not None:
-            audio_embeds = self._process_audio_input(audio_input)
-            multimodal_embeddings.append((audio_embeds, "audio"))
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            multimodal_embeddings.append((image_embeds, "image"))
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            multimodal_embeddings.append((video_embeds, "video"))
-        return multimodal_embeddings
-
     def forward(
         self,
         input_ids: torch.Tensor,
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 71107985cde2c045f2ceeee902205eb36bb3e317..86605ccdde98a88bd5169fc8f0f0d9062b5cb2a4 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -34,15 +34,13 @@ import einops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import BatchFeature
+from transformers import BatchFeature, Qwen2ForCausalLM
 from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
 from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
     Qwen2_5_VLConfig,
     Qwen2_5_VLVisionConfig,
 )
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.distributed import parallel_state
@@ -50,6 +48,7 @@ from vllm.distributed import utils as dist_utils
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv3dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -83,6 +82,7 @@ from vllm.multimodal.processing import PromptReplacement, PromptUpdate
 from vllm.sequence import IntermediateTensors
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -358,6 +358,7 @@ class Qwen2_5_VisionAttention(nn.Module):
         self.attn = MMEncoderAttention(
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
             multimodal_config=multimodal_config,
         )
 
@@ -1087,6 +1088,7 @@ class Qwen2_5_VLForConditionalGeneration(
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
+        "qkv": ["qkv"],  # For vision tower's already-packed QKV
     }
 
     # To ensure correct weight loading and mapping.
@@ -1627,4 +1629,31 @@ class Qwen2_5_VLForConditionalGeneration(
             language_model="language_model",
             connector="visual.merger.",
             tower_model="visual.",
-        )
\ No newline at end of file
+        )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
+    @classmethod
+    def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]:
+        """
+        Return the language model spec:
+        (language model class, language model attr)
+        """
+        return Qwen2ForCausalLM, "language_model"
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 2037c3c40ba38063e7eaaadfc7439584c25a070f..4eb5f77796d897e2c831dc32c9b33d481d6196a3 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -101,7 +101,7 @@ class Qwen2AudioEmbeddingInputs(TensorSchema):
 
     audio_embeds: Annotated[
         list[torch.Tensor],
-        TensorShape("bn", "naf", "hs"),
+        TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
     ]
 
 
@@ -140,6 +140,10 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
+    def get_target_channels(self) -> int:
+        """Return target audio channels for Qwen2 Audio models (mono)."""
+        return 1
+
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
@@ -201,7 +205,10 @@ class Qwen2AudioMultiModalDataParser(MultiModalDataParser):
 class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
-        return Qwen2AudioMultiModalDataParser(target_sr=feature_extractor.sampling_rate)
+        return Qwen2AudioMultiModalDataParser(
+            target_sr=feature_extractor.sampling_rate,
+            target_channels=self.info.get_target_channels(),
+        )
 
     def _call_hf_processor(
         self,
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 542883d5a009915badc967483c111cdea9fca390..7928ffc65a421f713c09b7ee292cc7067777e5bb 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -115,7 +115,7 @@ class Qwen2MoeMLP(nn.Module):
         out, _ = self.down_proj(out)
 
         if self.expert_gate is not None:
-            out = F.sigmoid(self.expert_gate(x)) * out
+            out = F.sigmoid(self.expert_gate(x)[0]) * out
 
         return out
 
@@ -144,7 +144,13 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             prefix=f"{prefix}.gate",
         )
 
-        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
+        self.shared_expert_gate = ReplicatedLinear(
+            config.hidden_size,
+            1,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.shared_expert_gate",
+        )
 
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen2MoeMLP(
@@ -430,6 +436,7 @@ class Qwen2MoeModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index eac46e0f8b05561798efe172d56e883e4bfc7eba..b0fa576f5ef25f0321ac94176efd4af1df85aaaa 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -14,7 +14,8 @@ from torch import nn
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.layers.pooler import Pooler
+from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_classify
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -95,7 +96,7 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
         return loader.load_weights(weights)
 
 
-@default_pooling_type("ALL")
+@default_pooling_type(tok_pooling_type="ALL")
 class Qwen2ForRewardModel(Qwen2RewardBaseModel):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         vllm_config.model_config.hf_config.num_labels = 1
@@ -104,12 +105,10 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {"token_classify": Pooler.for_token_classify(pooler_config)}
-        )
+        self.pooler = pooler_for_token_classify(pooler_config)
 
 
-@default_pooling_type("STEP")
+@default_pooling_type(tok_pooling_type="STEP")
 class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         vllm_config.model_config.hf_config.num_labels = 2
@@ -118,6 +117,4 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {"token_classify": Pooler.for_token_classify(pooler_config)}
-        )
+        self.pooler = pooler_for_token_classify(pooler_config)
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 1dbd98369ef645d46f80c64c187cce2861a94535..3da698893a35f8463d379ac5c8e2ed820f40ef09 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -43,14 +43,13 @@ from transformers.models.qwen2_vl.configuration_qwen2_vl import (
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv3dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -90,6 +89,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.tokenizers import TokenizerLike
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -333,6 +333,7 @@ class Qwen2VisionAttention(nn.Module):
         self.attn = MMEncoderAttention(
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
+            scale=self.hidden_size_per_attention_head**-0.5,
             multimodal_config=multimodal_config,
         )
 
@@ -1555,6 +1556,25 @@ class Qwen2VLForConditionalGeneration(
             tower_model="visual.",
         )
 
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
 
 class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
     pass
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 815aaf6baed16cc9dce1837a21661572d60e1d89..6aa7ef719243c2ae76a8643130b918fec99a1b55 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -30,7 +30,6 @@ import torch
 from torch import nn
 from transformers import Qwen3Config
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -44,6 +43,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .qwen2 import Qwen2MLP as Qwen3MLP
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index d766ec04cbbdb7b32ed74fe5d11928a80e7cbf50..9c193c0bb0a58ab1d7449c64fef7c5b69456b4f8 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -694,6 +694,7 @@ class Qwen3MoeModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 7c3ffcef329ebbdb11310a2f8a4122e9a7f4627a..e16a861ba94680c13793b8ff3c80b36d6fbfa872 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -10,7 +10,6 @@ from einops import rearrange
 from torch import nn
 from transformers.activations import ACT2FN
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
@@ -65,6 +64,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 )
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader,
+    maybe_remap_kv_scale_name,
     sharded_weight_loader,
 )
 from vllm.model_executor.models.qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP
@@ -75,6 +75,7 @@ from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Qwen3NextConfig
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
 
 from .interfaces import (
@@ -146,7 +147,13 @@ class Qwen3NextSparseMoeBlock(nn.Module):
             prefix=f"{prefix}.gate",
         )
 
-        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
+        self.shared_expert_gate = ReplicatedLinear(
+            config.hidden_size,
+            1,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.shared_expert_gate",
+        )
 
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen3NextMLP(
@@ -865,6 +872,7 @@ class Qwen3NextDecoderLayer(nn.Module):
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
             )
 
         self.input_layernorm = Qwen3NextRMSNorm(
@@ -1031,6 +1039,7 @@ class Qwen3NextModel(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return SharedFusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -1058,6 +1067,12 @@ class Qwen3NextModel(nn.Module):
             if name.startswith("mtp."):
                 continue
 
+            # Remapping the name of FP8 kv-scale.
+            if name.endswith("scale"):
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py
index 83694caa52480a884ae63f26e2b3048cead1758c..565fd7d8f9b8f8aca8acad4d0a4bd10838875e67 100644
--- a/vllm/model_executor/models/qwen3_next_mtp.py
+++ b/vllm/model_executor/models/qwen3_next_mtp.py
@@ -27,7 +27,6 @@ from vllm.model_executor.models.qwen3_next import (
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Qwen3NextConfig
 
-from .interfaces import SupportsPP
 from .utils import (
     AutoWeightsLoader,
     is_pp_missing_parameter,
@@ -147,6 +146,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
         expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            self,
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -220,7 +220,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
 
 
 @support_torch_compile
-class Qwen3NextMTP(nn.Module, SupportsPP, QwenNextMixtureOfExperts):
+class Qwen3NextMTP(nn.Module, QwenNextMixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -252,9 +252,6 @@ class Qwen3NextMTP(nn.Module, SupportsPP, QwenNextMixtureOfExperts):
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors
-        )
         self.set_moe_parameters()
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 089129e443c0142af4f4f5b55a738ad9fd8f51d6..d17ac6ce8cb324a7b0e3ee1cd449d13ea18750fe 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -46,7 +46,6 @@ from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import (
 )
 from transformers.models.whisper import WhisperFeatureExtractor
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.distributed import get_pp_group
@@ -75,6 +74,7 @@ from vllm.multimodal.processing import (
     PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -118,7 +118,7 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
     output_lengths = (
         ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
     )
-    return feat_lengths, output_lengths
+    return output_lengths
 
 
 class Qwen3_VisionPatchEmbed(nn.Module):
@@ -323,7 +323,7 @@ class Qwen3Omni_VisionTransformer(nn.Module):
             hidden_size=self.hidden_size,
         )
 
-        # vit pos embeding, TODO: spatial_patch_size vs patch_size
+        # vit pos embedding, TODO: spatial_patch_size vs patch_size
         if self.apply_vit_abs_pos_embed:
             self.pos_embed = nn.Embedding(self.num_grid_per_side**2, self.hidden_size)
         else:
@@ -750,15 +750,42 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
             # https://github.com/huggingface/transformers/pull/41473
             mm_kwargs = dict(mm_kwargs)
             tok_kwargs = dict(tok_kwargs)
+            mm_kwargs["audio_kwargs"] = dict(mm_kwargs.get("audio_kwargs") or {})
+            mm_kwargs["text_kwargs"] = dict(mm_kwargs.get("text_kwargs") or {})
             if Version(TRANSFORMERS_VERSION) < Version("4.58.0"):
+                # Extract audio_sample_rate before restructuring
+                audio_sample_rate = mm_kwargs.pop("audio_sample_rate", None)
+
                 # move truncation to audio_kwargs level to avoid conflict
                 # with tok_kwargs
-                mm_kwargs["audio_kwargs"] = {
-                    "truncation": mm_kwargs.pop("truncation", False)
-                }
-                mm_kwargs["text_kwargs"] = {
-                    "truncation": tok_kwargs.pop("truncation", False)
-                }
+                mm_kwargs["audio_kwargs"].setdefault(
+                    "truncation", mm_kwargs.pop("truncation", False)
+                )
+                mm_kwargs["text_kwargs"].setdefault(
+                    "truncation", tok_kwargs.pop("truncation", False)
+                )
+
+                # Validate and conditionally pass audio_sample_rate
+                # WhisperFeatureExtractor has a fixed sampling rate, and vLLM's
+                # audio loader already resamples audio to the target rate.
+                # Only pass the value if it matches to avoid unexpected behavior.
+                if audio_sample_rate is not None:
+                    expected_sr = feature_extractor.sampling_rate
+                    if audio_sample_rate != expected_sr:
+                        logger.warning(
+                            "[%s] audio_sample_rate mismatch: user provided %dHz "
+                            "but model expects %dHz. Ignoring user value. "
+                            "vLLM's audio loader already resampled to %dHz.",
+                            self.__class__.__name__,
+                            audio_sample_rate,
+                            expected_sr,
+                            expected_sr,
+                        )
+                    else:
+                        # Sample rate matches, safe to pass
+                        mm_kwargs["audio_kwargs"]["audio_sample_rate"] = (
+                            audio_sample_rate
+                        )
 
         hf_inputs = super()._call_hf_processor(
             prompt=prompt,
@@ -921,13 +948,11 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
         if audio_feature_lengths is None and feature_attention_mask is None:
             audio_output_lengths = []
         elif audio_feature_lengths is not None:
-            _, audio_output_lens = _get_feat_extract_output_lengths(
-                audio_feature_lengths
-            )
+            audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths)
             audio_output_lengths = audio_output_lens.tolist()
         elif feature_attention_mask is not None:
             assert isinstance(feature_attention_mask, torch.Tensor)
-            _, audio_output_lens = _get_feat_extract_output_lengths(
+            audio_output_lens = _get_feat_extract_output_lengths(
                 feature_attention_mask.sum(-1)
             )
             audio_output_lengths = audio_output_lens.tolist()
@@ -1111,18 +1136,16 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix
         audio_input: Qwen2_5OmniAudioFeatureInputs,
         audio_hashes: list[str] | None = None,
         cached_audio_features: torch.Tensor | None = None,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, ...]:
         input_features = audio_input["input_features"]
         audio_feature_lengths = audio_input["audio_feature_lengths"]
 
-        audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
-            audio_feature_lengths
-        )
+        audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths)
 
         audio_outputs = self.audio_tower(
             input_features.to(self.audio_tower.dtype),
             feature_lens=audio_feature_lengths,
-            aftercnn_lens=audio_feat_lengths,
+            aftercnn_lens=audio_output_lengths,
         )
         audio_features = audio_outputs.last_hidden_state
         return audio_features.split(audio_output_lengths.tolist())
@@ -1350,8 +1373,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
             return inputs_embeds
 
         deepstack_input_embeds = None
-        # TODO (ywang96): support overlapping modalitiy embeddings so that
-        # `use_audio_in_video` will work on V1.
         # split the feat dim to obtain multi-scale visual feature
         has_vision_embeddings = [
             embeddings.shape[-1] != self.config.text_config.hidden_size
@@ -1579,7 +1600,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
                     + st_idx
                 )
                 st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                _, audio_len = _get_feat_extract_output_lengths(
+                audio_len = _get_feat_extract_output_lengths(
                     audio_feature_lengths[audio_idx]
                 )
                 llm_pos_ids = (
@@ -1700,7 +1721,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
                 llm_pos_ids_list.append(bos_block)
                 llm_pos_ids_list.append(bos_block)
                 st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                _, audio_len = _get_feat_extract_output_lengths(
+                audio_len = _get_feat_extract_output_lengths(
                     audio_feature_lengths[audio_idx]
                 )
                 audio_llm_pos_ids = (
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 5b6f8412c85e7d4813459110d6816d893a50a2ae..98e239898119ac12ac2289da55d1bcc9d8797fb2 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -49,7 +49,6 @@ from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
 )
 from transformers.video_utils import VideoMetadata
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
@@ -93,6 +92,7 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.collection_utils import is_list_of
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -1243,6 +1243,7 @@ class Qwen3VLForConditionalGeneration(
             "gate_proj",
             "up_proj",
         ],
+        "qkv": ["qkv"],  # For vision tower's already-packed QKV
     }
 
     supports_encoder_tp_data = True
@@ -2090,6 +2091,33 @@ class Qwen3VLForConditionalGeneration(
         """
         return MultiModelKeys.from_string_field(
             language_model="language_model",
-            connector="visual.merger",
+            connector=["visual.merger", "visual.deepstack_merger_list"],
             tower_model="visual.",
         )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
+    @classmethod
+    def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]:
+        """
+        Return the language model spec:
+        (language model class, language model attr)
+        """
+        return Qwen3LLMForCausalLM, "language_model"
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index caac14716782ac38afc7a1cb499625385adec6d7..df0733de98039a767f3e5a831b55d64fd84c3e59 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -109,6 +109,7 @@ class VisualAttention(nn.Module):
         bias: bool = True,
         kdim: int | None = None,
         vdim: int | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.embed_dim = embed_dim
@@ -128,8 +129,12 @@ class VisualAttention(nn.Module):
         assert self._qkv_same_embed_dim, (
             "Visual Attention implementation only supports self-attention"
         )
-        self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim)
-        self.out_proj = ReplicatedLinear(embed_dim, embed_dim)
+        self.in_proj = ReplicatedLinear(
+            embed_dim, 3 * embed_dim, prefix=f"{prefix}.in_proj"
+        )
+        self.out_proj = ReplicatedLinear(
+            embed_dim, embed_dim, prefix=f"{prefix}.out_proj"
+        )
         self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
 
     def forward(
@@ -214,10 +219,15 @@ class QwenVLMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.c_fc = ColumnParallelLinear(
-            hidden_size, intermediate_size, bias=True, quant_config=quant_config
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.c_fc",
         )
         self.act_fn = get_act_fn("gelu")
         self.c_proj = RowParallelLinear(
@@ -225,6 +235,7 @@ class QwenVLMLP(nn.Module):
             hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
 
     def forward(self, x):
@@ -242,17 +253,19 @@ class VisualAttentionBlock(nn.Module):
         mlp_ratio: float = 4.0,
         norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
 
         self.ln_1 = norm_layer(d_model)
         self.ln_2 = norm_layer(d_model)
         mlp_width = int(d_model * mlp_ratio)
-        self.attn = VisualAttention(d_model, n_head)
+        self.attn = VisualAttention(d_model, n_head, prefix=f"{prefix}.attn")
         self.mlp = QwenVLMLP(
             hidden_size=d_model,
             intermediate_size=mlp_width,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
 
     def attention(
@@ -282,6 +295,7 @@ class TransformerBlock(nn.Module):
         mlp_ratio: float = 4.0,
         norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.width = width
@@ -295,8 +309,9 @@ class TransformerBlock(nn.Module):
                     mlp_ratio,
                     norm_layer=norm_layer,
                     quant_config=quant_config,
+                    prefix=f"{prefix}.resblocks.{i}",
                 )
-                for _ in range(layers)
+                for i in range(layers)
             ]
         )
 
@@ -327,6 +342,7 @@ class VisionTransformer(nn.Module):
         output_dim: int = 512,
         image_start_id: int = 151857,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
         **kwargs,
     ):
         super().__init__()
@@ -356,6 +372,7 @@ class VisionTransformer(nn.Module):
             mlp_ratio,
             norm_layer=norm_layer,
             quant_config=quant_config,
+            prefix=f"{prefix}.transformer",
         )
 
         self.attn_pool = Resampler2(
@@ -366,6 +383,7 @@ class VisionTransformer(nn.Module):
             norm_layer=norm_layer,
             adaptive=False,
             do_post_projection=False,
+            prefix=f"{prefix}.attn_pool",
         ).to(
             device=self.positional_embedding.device,
             dtype=self.positional_embedding.dtype,
@@ -413,7 +431,9 @@ class QwenVLModel(QWenModel):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
-        self.visual = VisionTransformer(**config.visual, quant_config=quant_config)
+        self.visual = VisionTransformer(
+            **config.visual, quant_config=quant_config, prefix=f"{prefix}.visual"
+        )
 
 
 @lru_cache(maxsize=1)
diff --git a/vllm/model_executor/models/radio.py b/vllm/model_executor/models/radio.py
index 6a42564ac70a7c0b466f8697313b0b5a45606238..ea0e7500f8a2b8c94005d157020fd1b17638bcca 100644
--- a/vllm/model_executor/models/radio.py
+++ b/vllm/model_executor/models/radio.py
@@ -427,15 +427,17 @@ class RadioInternVisionModel(nn.Module):
             to_2tuple(config.patch_size), config.image_size
         )
         max_img_size = int(
-            round(config.max_img_size / config.patch_size) * config.patch_size
+            round(config.cpe_max_size / config.patch_size) * config.patch_size
         )
+        unique_teachers = set(t["name"] for t in config.teachers)
         self.patch_generator = ViTPatchGenerator(
             config.patch_size,
             config.hidden_size,
             input_dims=self.img_size,
             max_input_dims=max_img_size,
             cls_token=True,
-            register_multiple=config.reg_tokens,
+            num_cls_tokens=len(unique_teachers) if config.cls_token_per_teacher else 1,
+            register_multiple=config.register_multiple,
         )
 
         self.encoder = InternVisionEncoder(
@@ -489,11 +491,20 @@ class RadioModel(nn.Module):
             prefix=prefix,
         )
 
+        summary_idxs = None
+        if config.teachers:
+            summary_idxs = torch.tensor(
+                [i for i, t in enumerate(config.teachers) if t.get("use_summary", True)]
+            )
+            if summary_idxs.numel() > 0:
+                self.register_buffer("summary_idxs", summary_idxs)
+        self.summary_idxs = summary_idxs
+
     def forward(
         self,
         pixel_values: torch.Tensor | None = None,
         pixel_embeds: torch.Tensor | None = None,
-    ) -> torch.FloatTensor:
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
         y = self.model(pixel_values)
         return self._extract_final(y)
 
@@ -546,10 +557,17 @@ class RadioModel(nn.Module):
 
         return loaded_params
 
-    def _extract_final(self, y: torch.Tensor):
+    def _extract_final(
+        self, y: torch.Tensor
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
         # Remove CLS + REGISTERS tokens
         patch_gen = getattr(self.model, "patch_generator", None)
         if patch_gen is not None:
+            all_summary = y[:, : patch_gen.num_cls_tokens]
+            if self.summary_idxs is not None:
+                bb_summary = all_summary[:, self.summary_idxs]
+            else:
+                bb_summary = all_summary
             all_feat = y[:, patch_gen.num_skip :]
 
-        return all_feat
+        return bb_summary.flatten(1), all_feat
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index e858d31232de81f25448fa54c3bfe4eb7183d786..8be13d3eb4414f0248ed7816851328cbded66e37 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -35,10 +35,11 @@ from vllm.utils.hashing import safe_hash
 
 if TYPE_CHECKING:
     from vllm.config.model import AttnTypeStr
-    from vllm.config.pooler import PoolingTypeStr
+    from vllm.config.pooler import SequencePoolingType, TokenPoolingType
 else:
     AttnTypeStr = Any
-    PoolingTypeStr = Any
+    SequencePoolingType = Any
+    TokenPoolingType = Any
 
 
 from .interfaces import (
@@ -46,6 +47,7 @@ from .interfaces import (
     has_noops,
     is_attention_free,
     is_hybrid,
+    requires_raw_input_tokens,
     supports_cross_encoding,
     supports_mamba_prefix_caching,
     supports_multimodal,
@@ -56,7 +58,8 @@ from .interfaces import (
 )
 from .interfaces_base import (
     get_attn_type,
-    get_default_pooling_type,
+    get_default_seq_pooling_type,
+    get_default_tok_pooling_type,
     is_pooling_model,
     is_text_generation_model,
 )
@@ -95,6 +98,7 @@ _TEXT_GENERATION_MODELS = {
     "Ernie4_5_MoeForCausalLM": ("ernie45_moe", "Ernie4_5_MoeForCausalLM"),
     "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"),
     "Exaone4ForCausalLM": ("exaone4", "Exaone4ForCausalLM"),
+    "ExaoneMoEForCausalLM": ("exaone_moe", "ExaoneMoeForCausalLM"),
     "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
     "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
@@ -118,7 +122,8 @@ _TEXT_GENERATION_MODELS = {
     "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),  # noqa: E501
     "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),  # noqa: E501
     "GritLM": ("gritlm", "GritLM"),
-    "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
+    "Grok1ModelForCausalLM": ("grok1", "GrokForCausalLM"),
+    "Grok1ForCausalLM": ("grok1", "GrokForCausalLM"),
     "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
     "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"),
     "HunYuanForCausalLM": ("hunyuan", "HunYuanForCausalLM"),
@@ -127,6 +132,8 @@ _TEXT_GENERATION_MODELS = {
     "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
     "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
     "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"),
+    "IQuestCoderForCausalLM": ("llama", "LlamaForCausalLM"),
+    "IQuestLoopCoderForCausalLM": ("iquest_loopcoder", "IQuestLoopCoderForCausalLM"),
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "Jais2ForCausalLM": ("jais2", "Jais2ForCausalLM"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
@@ -153,6 +160,7 @@ _TEXT_GENERATION_MODELS = {
     "MptForCausalLM": ("mpt", "MPTForCausalLM"),
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
     "MiMoForCausalLM": ("mimo", "MiMoForCausalLM"),
+    "MiMoV2FlashForCausalLM": ("mimo_v2_flash", "MiMoV2FlashForCausalLM"),
     "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
     "NemotronHForCausalLM": ("nemotron_h", "NemotronHForCausalLM"),
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
@@ -163,6 +171,7 @@ _TEXT_GENERATION_MODELS = {
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
     "OuroForCausalLM": ("ouro", "OuroForCausalLM"),
     "PanguEmbeddedForCausalLM": ("openpangu", "PanguEmbeddedForCausalLM"),
+    "PanguProMoEV2ForCausalLM": ("openpangu", "PanguProMoEV2ForCausalLM"),
     "PanguUltraMoEForCausalLM": ("openpangu", "PanguUltraMoEForCausalLM"),
     "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
@@ -203,6 +212,7 @@ _EMBEDDING_MODELS = {
     "GteNewModel": ("bert_with_rope", "GteNewModel"),
     "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"),
     "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"),  # noqa: E501
+    "LlamaBidirectionalModel": ("llama", "LlamaBidirectionalModel"),
     "LlamaModel": ("llama", "LlamaForCausalLM"),
     **{
         # Multiple models share the same architecture, so we include them all
@@ -246,6 +256,11 @@ _CROSS_ENCODER_MODELS = {
         "bert_with_rope",
         "GteNewForSequenceClassification",
     ),
+    "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"),
+    "LlamaBidirectionalForSequenceClassification": (
+        "llama",
+        "LlamaBidirectionalForSequenceClassification",
+    ),
     "ModernBertForSequenceClassification": (
         "modernbert",
         "ModernBertForSequenceClassification",
@@ -259,8 +274,6 @@ _CROSS_ENCODER_MODELS = {
         "roberta",
         "RobertaForSequenceClassification",
     ),
-    # [Auto-converted (see adapters.py)]
-    "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"),  # noqa: E501,
 }
 
 _MULTIMODAL_MODELS = {
@@ -298,6 +311,7 @@ _MULTIMODAL_MODELS = {
         "gemma3n_mm",
         "Gemma3nForConditionalGeneration",
     ),
+    "GlmAsrForConditionalGeneration": ("glmasr", "GlmAsrForConditionalGeneration"),
     "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
     "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),  # noqa: E501
     "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"),  # noqa: E501
@@ -328,7 +342,9 @@ _MULTIMODAL_MODELS = {
         "idefics3",
         "Idefics3ForConditionalGeneration",
     ),
+    "IsaacForConditionalGeneration": ("isaac", "IsaacForConditionalGeneration"),
     "SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"),  # noqa: E501
+    "KananaVForConditionalGeneration": ("kanana_v", "KananaVForConditionalGeneration"),
     "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
     "KeyeVL1_5ForConditionalGeneration": (
         "keye_vl1_5",
@@ -340,6 +356,7 @@ _MULTIMODAL_MODELS = {
         "lightonocr",
         "LightOnOCRForConditionalGeneration",
     ),
+    "Lfm2VlForConditionalGeneration": ("lfm2_vl", "Lfm2VLForConditionalGeneration"),
     "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
     "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
     "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
@@ -418,7 +435,12 @@ _MULTIMODAL_MODELS = {
     ),
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
+    "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"),  # noqa: E501
     # [Encoder-decoder]
+    "NemotronParseForConditionalGeneration": (
+        "nemotron_parse",
+        "NemotronParseForConditionalGeneration",
+    ),
     "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
 }
 
@@ -438,6 +460,7 @@ _SPECULATIVE_DECODING_MODELS = {
     "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"),
     "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
     "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"),
+    "ExaoneMoeMTP": ("exaone_moe_mtp", "ExaoneMoeMTP"),
     "LongCatFlashMTPModel": ("longcat_flash_mtp", "LongCatFlashMTP"),
     "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"),
     "MedusaModel": ("medusa", "Medusa"),
@@ -531,10 +554,12 @@ class _ModelInfo:
     is_text_generation_model: bool
     is_pooling_model: bool
     attn_type: AttnTypeStr
-    default_pooling_type: PoolingTypeStr
+    default_seq_pooling_type: SequencePoolingType
+    default_tok_pooling_type: TokenPoolingType
     supports_cross_encoding: bool
     supports_multimodal: bool
     supports_multimodal_raw_input_only: bool
+    requires_raw_input_tokens: bool
     supports_multimodal_encoder_tp_data: bool
     supports_pp: bool
     has_inner_state: bool
@@ -551,13 +576,15 @@ class _ModelInfo:
             architecture=model.__name__,
             is_text_generation_model=is_text_generation_model(model),
             is_pooling_model=is_pooling_model(model),
-            default_pooling_type=get_default_pooling_type(model),
+            default_seq_pooling_type=get_default_seq_pooling_type(model),
+            default_tok_pooling_type=get_default_tok_pooling_type(model),
             attn_type=get_attn_type(model),
             supports_cross_encoding=supports_cross_encoding(model),
             supports_multimodal=supports_multimodal(model),
             supports_multimodal_raw_input_only=supports_multimodal_raw_input_only(
                 model
             ),
+            requires_raw_input_tokens=requires_raw_input_tokens(model),
             supports_multimodal_encoder_tp_data=supports_multimodal_encoder_tp_data(
                 model
             ),
@@ -861,6 +888,7 @@ class _ModelRegistry:
                         module,
                         model_config.model,
                         revision=model_config.revision,
+                        trust_remote_code=model_config.trust_remote_code,
                         warn_on_fail=False,
                     )
 
@@ -873,6 +901,7 @@ class _ModelRegistry:
                         module,
                         model_config.model,
                         revision=model_config.revision,
+                        trust_remote_code=model_config.trust_remote_code,
                         warn_on_fail=True,
                     )
                     if model_module is not None:
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 31cc645099141197e8a621232c524aaa1702798e..7bf9a68824d8ce7e02fde0a997363e76400f84b1 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -8,12 +8,7 @@ from torch import nn
 from transformers import RobertaConfig
 
 from vllm.config import ModelConfig, VllmConfig
-from vllm.model_executor.layers.pooler import (
-    ClassifierPooler,
-    CLSPool,
-    DispatchPooler,
-    Pooler,
-)
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.models.bert import (
     TOKEN_TYPE_SHIFT,
@@ -57,12 +52,6 @@ class RobertaEmbedding(nn.Module):
             torch.arange(config.max_position_embeddings).unsqueeze(0),
         )
 
-        self.position_embedding_type = config.position_embedding_type
-        if self.position_embedding_type != "absolute":
-            raise ValueError(
-                "Only 'absolute' position_embedding_type" + " is supported"
-            )
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -96,14 +85,14 @@ class RobertaClassificationHead(nn.Module):
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # CLSPool has already been applied in `pooling`
+        # Token extraction has already been applied in `pooler.pooling`
         x = self.dense(x)
         x = torch.tanh(x)
         x = self.out_proj(x)
         return x
 
 
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class RobertaEmbeddingModel(BertEmbeddingModel):
     """A model that uses Roberta to provide embedding functionalities."""
 
@@ -135,12 +124,12 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
     def _build_model(
         self, vllm_config: VllmConfig, prefix: str = ""
     ) -> BertModel | BertWithRope:
-        if vllm_config.model_config.hf_config.position_embedding_type == "rotary":
-            return JinaRobertaModel(vllm_config=vllm_config, prefix=prefix)
+        hf_config = vllm_config.model_config.hf_config
+        kwargs = dict(vllm_config=vllm_config, prefix=prefix)
+        if getattr(hf_config, "position_embedding_type", "absolute") == "absolute":
+            return BertModel(**kwargs, embedding_class=RobertaEmbedding)
         else:
-            return BertModel(
-                vllm_config=vllm_config, prefix=prefix, embedding_class=RobertaEmbedding
-            )
+            return JinaRobertaModel(**kwargs)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         weights_list = list(weights)
@@ -160,7 +149,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
         return loader.load_weights(weights_list, mapper=mapper)
 
 
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
     """A model that uses Roberta to provide embedding functionalities.
 
@@ -202,18 +191,9 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config=pooler_config, classifier=self.classifier
-                ),
-                "classify": ClassifierPooler(
-                    pooling=CLSPool(), classifier=self.classifier, act_fn="classify"
-                ),
-                "score": ClassifierPooler(
-                    pooling=CLSPool(), classifier=self.classifier, act_fn="score"
-                ),
-            }
+        self.pooler = DispatchPooler.for_seq_cls(
+            pooler_config,
+            classifier=self.classifier,
         )
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index f25223c782552f86e02f0227c81932b3b946ba8b..91a60bfd1668e329feb796d6c62cc75712e693a2 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -30,7 +30,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig as SeedOssConfig
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -56,6 +55,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 )
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 2600dc1c9f79c47f0a92922e98aa2009b3224f2e..1bda00653b7de5dc3994ac647676a3a9df05af95 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import math
 from collections.abc import Callable, Iterable, Mapping
 from functools import cached_property
 from typing import Annotated, Literal
@@ -16,19 +15,21 @@ from transformers import (
     SiglipVisionConfig,
 )
 
-from vllm.attention.layer import MultiHeadAttention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import (
@@ -128,7 +129,7 @@ class SiglipProcessingInfo(BaseProcessingInfo):
                 image_width=image_width,
                 image_height=image_height,
             ),
-            _get_vision_feature_select_strategy(pooler_config.pooling_type),
+            _get_vision_feature_select_strategy(pooler_config.seq_pooling_type),
         )
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -276,7 +277,7 @@ class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
         return image_size // patch_size
 
 
-# Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa
+# Adapted from https://github.com/huggingface/transformers/blob/v4.57.3/src/transformers/models/siglip/modeling_siglip.py#L216
 class SiglipVisionEmbeddings(nn.Module):
     def __init__(self, config: SiglipVisionConfig):
         super().__init__()
@@ -295,9 +296,7 @@ class SiglipVisionEmbeddings(nn.Module):
 
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
-        self.position_embedding = VocabParallelEmbedding(
-            self.num_positions, self.embed_dim
-        )
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
         self.register_buffer(
             "position_ids",
             torch.arange(self.num_positions, dtype=torch.int64).expand((1, -1)),
@@ -307,50 +306,30 @@ class SiglipVisionEmbeddings(nn.Module):
     def interpolate_pos_encoding(
         self, embeddings: torch.Tensor, height: int, width: int
     ) -> torch.Tensor:
-        """
-        This method is an adapted method for SigLIP (due to SigLIP not having
-        class embedding unlike other ViTs) that allows the model to interpolate
-        the pre-trained position encodings such that it can be usable on higher
-        resolution images.
-
-        Source:
-        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
-        """
-        position_embeddings = self.position_embedding.weight.unsqueeze(0)
         num_patches = embeddings.shape[1]
-        num_positions = position_embeddings.shape[1]
+        num_positions = self.position_embedding.weight.shape[1]
         if num_patches == num_positions and height == width:
-            return position_embeddings
+            return self.position_embedding(self.position_ids)
+
+        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
 
         dim = embeddings.shape[-1]
-        height = height // self.patch_size
-        width = width // self.patch_size
-        # we add a small number to avoid floating point error
-        # in the interpolation
-        # see discussion at https://github.com/facebookresearch/dino/issues/8
-        height, width = height + 0.1, width + 0.1
-
-        patch_pos_embed = position_embeddings.reshape(
-            1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(
+            1, sqrt_num_positions, sqrt_num_positions, dim
         )
         patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
         patch_pos_embed = nn.functional.interpolate(
             patch_pos_embed,
-            scale_factor=(
-                height / math.sqrt(num_positions),
-                width / math.sqrt(num_positions),
-            ),
+            size=(new_height, new_width),
             mode="bicubic",
             align_corners=False,
         )
-        if (
-            int(height) != patch_pos_embed.shape[-2]
-            or int(width) != patch_pos_embed.shape[-1]
-        ):
-            raise ValueError(
-                "Width or height does not match with "
-                "the interpolated position embeddings"
-            )
 
         patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
         return patch_pos_embed
@@ -377,9 +356,10 @@ class SiglipAttention(nn.Module):
         self,
         config: SiglipVisionConfig | SiglipTextConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         prefix: str = "",
-        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
+        attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention],
     ) -> None:
         super().__init__()
 
@@ -389,19 +369,25 @@ class SiglipAttention(nn.Module):
         self.head_dim = self.embed_dim // self.num_heads
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got "
-                "`embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads})."
+                f"embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and "
+                f"`num_heads`: {self.num_heads})."
             )
 
         self.scale = self.head_dim**-0.5
-        self.dropout = config.attention_dropout
+
+        use_data_parallel = (
+            multimodal_config.mm_encoder_tp_mode == "data"
+            if multimodal_config
+            else False
+        )
         self.qkv_proj = QKVParallelLinear(
             hidden_size=self.embed_dim,
             head_size=self.head_dim,
             total_num_heads=self.num_heads,
             quant_config=quant_config,
             prefix=f"{prefix}.qkv_proj",
+            disable_tp=use_data_parallel,
         )
 
         self.out_proj = RowParallelLinear(
@@ -409,17 +395,29 @@ class SiglipAttention(nn.Module):
             output_size=self.embed_dim,
             quant_config=quant_config,
             prefix=f"{prefix}.out_proj",
+            disable_tp=use_data_parallel,
         )
 
-        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = attn_cls(
-            self.num_heads_per_partition,
-            self.head_dim,
-            self.scale,
-            prefix=f"{prefix}.attn",
-        )
+        if attn_cls == MMEncoderAttention:
+            self.attn = attn_cls(
+                self.num_heads_per_partition,
+                self.head_dim,
+                self.scale,
+                prefix=f"{prefix}.attn",
+                multimodal_config=multimodal_config,
+            )
+        else:
+            self.attn = attn_cls(
+                self.num_heads_per_partition,
+                self.head_dim,
+                self.scale,
+                prefix=f"{prefix}.attn",
+            )
 
     def forward(
         self,
@@ -439,12 +437,19 @@ class SiglipMLP(nn.Module):
         self,
         config: SiglipVisionConfig | SiglipTextConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
         self.config = config
+        use_data_parallel = (
+            multimodal_config.mm_encoder_tp_mode == "data"
+            if multimodal_config
+            else False
+        )
         self.activation_fn = get_act_fn(config.hidden_act)
+
         # Special handling for BNB and torchao quantization
         if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]:
             quantizable = True
@@ -454,17 +459,20 @@ class SiglipMLP(nn.Module):
             quantizable = (
                 config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0
             )
+
         self.fc1 = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
             quant_config=quant_config if quantizable else None,
             prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
         )
         self.fc2 = RowParallelLinear(
             config.intermediate_size,
             config.hidden_size,
             quant_config=quant_config if quantizable else None,
             prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -479,9 +487,10 @@ class SiglipEncoderLayer(nn.Module):
         self,
         config: SiglipVisionConfig | SiglipTextConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         prefix: str = "",
-        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
+        attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention],
     ) -> None:
         super().__init__()
 
@@ -490,6 +499,7 @@ class SiglipEncoderLayer(nn.Module):
         self.self_attn = SiglipAttention(
             config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=f"{prefix}.self_attn",
             attn_cls=attn_cls,
         )
@@ -497,6 +507,7 @@ class SiglipEncoderLayer(nn.Module):
         self.mlp = SiglipMLP(
             config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=f"{prefix}.mlp",
         )
         self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
@@ -524,10 +535,11 @@ class SiglipEncoder(nn.Module):
         self,
         config: SiglipVisionConfig | SiglipTextConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         num_hidden_layers_override: int | None = None,
         *,
         prefix: str = "",
-        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
+        attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention],
     ) -> None:
         super().__init__()
 
@@ -543,6 +555,7 @@ class SiglipEncoder(nn.Module):
                 SiglipEncoderLayer(
                     config,
                     quant_config=quant_config,
+                    multimodal_config=multimodal_config,
                     prefix=f"{prefix}.layers.{layer_idx}",
                     attn_cls=attn_cls,
                 )
@@ -647,6 +660,7 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
         self,
         config: SiglipVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -658,7 +672,10 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
         )
         self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.mlp = SiglipMLP(
-            config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+            config=config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.mlp",
         )
 
     def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
@@ -683,6 +700,7 @@ class SiglipVisionTransformer(nn.Module):
         self,
         config: SiglipVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         num_hidden_layers_override: int | None = None,
         require_post_norm: bool | None = None,
@@ -698,9 +716,10 @@ class SiglipVisionTransformer(nn.Module):
         self.encoder = SiglipEncoder(
             config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_override,
             prefix=f"{prefix}.encoder",
-            attn_cls=MultiHeadAttention,
+            attn_cls=MMEncoderAttention,
         )
 
         num_hidden_layers = config.num_hidden_layers
@@ -726,6 +745,7 @@ class SiglipVisionTransformer(nn.Module):
             self.head = SiglipMultiheadAttentionPoolingHead(
                 config=config,
                 quant_config=quant_config,
+                multimodal_config=multimodal_config,
                 prefix=f"{prefix}.head",
             )
 
@@ -812,13 +832,11 @@ class SiglipVisionTransformer(nn.Module):
 
 
 class SiglipVisionModel(nn.Module):
-    config_class = SiglipVisionConfig
-    main_input_name = "pixel_values"
-
     def __init__(
         self,
         config: SiglipVisionConfig,
         quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
         *,
         num_hidden_layers_override: int | None = None,
         require_post_norm: bool | None = None,
@@ -829,7 +847,8 @@ class SiglipVisionModel(nn.Module):
         self.quant_config = quant_config
         self.vision_model = SiglipVisionTransformer(
             config,
-            quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_override,
             require_post_norm=require_post_norm,
             prefix=f"{prefix}.vision_model",
@@ -979,7 +998,7 @@ class SiglipTextEmbeddings(nn.Module):
 
 
 # Assume EOS token corresponds to CLS token in text model
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
 @MULTIMODAL_REGISTRY.register_processor(
     SiglipMultiModalProcessor,
     info=SiglipProcessingInfo,
@@ -1023,6 +1042,7 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
         self.vision_model = SiglipVisionTransformer(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             prefix=maybe_prefix(prefix, "vision_model"),
         )
 
@@ -1032,12 +1052,7 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
         assert pooler_config is not None
         self.pooler_config = pooler_config
 
-        self.pooler = DispatchPooler(
-            {
-                "token_embed": Pooler.for_token_embed(pooler_config),
-                "embed": Pooler.for_embed(pooler_config),
-            }
-        )
+        self.pooler = DispatchPooler.for_embedding(pooler_config)
 
         self._is_text_input = True
 
@@ -1110,7 +1125,7 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     ) -> torch.Tensor:
         if feature_select_strategy is None:
             feature_select_strategy = _get_vision_feature_select_strategy(
-                self.pooler_config.pooling_type
+                self.pooler_config.seq_pooling_type
             )
 
         pooled_output = self.vision_model(
diff --git a/vllm/model_executor/models/siglip2.py b/vllm/model_executor/models/siglip2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fbc408ec23e4e86c13e094b8e6c224f3d3f1873
--- /dev/null
+++ b/vllm/model_executor/models/siglip2.py
@@ -0,0 +1,495 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Implementation of Siglip2VisionModel intended to be only used
+within a vision language model."""
+
+from collections.abc import Iterable
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from transformers import Siglip2VisionConfig
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import MultiModalConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+from .vision import should_torch_compile_mm_vit
+
+
+class Siglip2VisionEmbeddings(nn.Module):
+    def __init__(self, config: Siglip2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.patch_size = config.patch_size
+        self.patch_embedding = nn.Linear(
+            in_features=config.num_channels * self.patch_size * self.patch_size,
+            out_features=self.embed_dim,
+        )
+        self.num_patches = config.num_patches
+        self.position_embedding_size = int(self.num_patches**0.5)
+        self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
+
+    @staticmethod
+    def resize_positional_embeddings(
+        positional_embeddings: torch.Tensor,
+        spatial_shapes: torch.LongTensor,
+        max_length: int,
+    ) -> torch.Tensor:
+        """
+        Resize positional embeddings to image-specific size and pad to a fixed size.
+
+        Args:
+            positional_embeddings (`torch.Tensor`):
+                Position embeddings of shape (height, width, embed_dim)
+            spatial_shapes (`torch.LongTensor`):
+                Spatial shapes of shape (batch_size, 2) to resize the positional
+                embeddings to
+            max_length (`int`):
+                Maximum length of the positional embeddings to pad resized
+                positional embeddings to
+
+        Returns:
+            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
+        """
+        batch_size = spatial_shapes.shape[0]
+        embed_dim = positional_embeddings.shape[-1]
+        source_dtype = positional_embeddings.dtype
+
+        resulted_positional_embeddings = torch.empty(
+            (batch_size, max_length, embed_dim),
+            device=positional_embeddings.device,
+            dtype=source_dtype,
+        )
+
+        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
+        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)
+
+        # Upcast to float32 on CPU because antialias is not supported for
+        # bfloat16/float16 on CPU
+        if positional_embeddings.device.type == "cpu":
+            positional_embeddings = positional_embeddings.to(torch.float32)
+
+        for i in range(batch_size):
+            # (1, dim, height, width) -> (1, dim, target_height, target_width)
+            height, width = spatial_shapes[i]
+            resized_embeddings = F.interpolate(
+                positional_embeddings,
+                size=(height, width),
+                mode="bilinear",
+                align_corners=False,
+                antialias=True,
+            )
+
+            # (1, dim, target_height, target_width) ->
+            # (target_height * target_width, dim)
+            resized_embeddings = resized_embeddings.reshape(
+                embed_dim, height * width
+            ).transpose(0, 1)
+
+            # Cast to original dtype
+            resized_embeddings = resized_embeddings.to(source_dtype)
+
+            resulted_positional_embeddings[i, : height * width] = resized_embeddings
+            resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]
+
+        return resulted_positional_embeddings
+
+    def forward(
+        self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            pixel_values (`torch.FloatTensor`):
+                Pixel values of shape (batch_size, max_num_patches,
+                num_channels * patch_size * patch_size)
+            spatial_shapes (`list[tuple[int, int]]`):
+                Spatial shapes of shape (batch_size, 2) to resize the positional
+                embeddings to
+        """
+
+        # Apply patch embeddings to already patchified pixel values
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+
+        # Get positional resized and padded positional embeddings
+        positional_embeddings = self.position_embedding.weight.reshape(
+            self.position_embedding_size, self.position_embedding_size, -1
+        )
+        resized_positional_embeddings = self.resize_positional_embeddings(
+            positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
+        )
+
+        # Add positional embeddings to patch embeddings
+        embeddings = patch_embeds + resized_positional_embeddings
+        return embeddings
+
+
+class Siglip2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        config: Siglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        use_data_parallel = (
+            multimodal_config is not None
+            and multimodal_config.mm_encoder_tp_mode == "data"
+        )
+        tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        assert self.num_heads % tp_size == 0
+        self.num_heads_per_partition = self.num_heads // tp_size
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.embed_dim,
+            head_size=self.head_dim,
+            total_num_heads=self.num_heads,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+            disable_tp=use_data_parallel,
+        )
+        self.out_proj = RowParallelLinear(
+            input_size=self.embed_dim,
+            output_size=self.embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+            disable_tp=use_data_parallel,
+        )
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_heads_per_partition,
+            head_size=self.head_dim,
+            scale=self.scale,
+            prefix=f"{prefix}.attn",
+            multimodal_config=multimodal_config,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int | torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(
+            hidden_states
+        )  # batch_size, q_len, 3 * num_heads_per_partition * head_dim
+        bsz, q_len, _ = qkv.shape
+        query_states, key_states, value_states = qkv.chunk(3, dim=-1)
+        query_states = query_states.view(
+            bsz, q_len, self.num_heads_per_partition, self.head_dim
+        )
+        key_states = key_states.view(
+            bsz, q_len, self.num_heads_per_partition, self.head_dim
+        )
+        value_states = value_states.view(
+            bsz, q_len, self.num_heads_per_partition, self.head_dim
+        )
+
+        # Use unified MultiHeadAttention implementation
+        out = self.attn(
+            query=query_states,
+            key=key_states,
+            value=value_states,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        out = out.reshape(bsz, q_len, -1)
+        attn_output, _ = self.out_proj(out)
+        return attn_output
+
+
+class Siglip2MLP(nn.Module):
+    def __init__(
+        self,
+        config: Siglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        use_data_parallel = (
+            multimodal_config is not None
+            and multimodal_config.mm_encoder_tp_mode == "data"
+        )
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+@support_torch_compile(
+    dynamic_arg_dims={"hidden_states": [0, 1], "cu_seqlens": 0},
+    enable_if=should_torch_compile_mm_vit,
+)
+class Siglip2EncoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Siglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.self_attn = Siglip2Attention(
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Siglip2MLP(
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int | torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
+            cu_seqlens: Cumulative sequence lengths tensor.
+            max_seqlen: Maximum sequence length.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Siglip2Encoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers`
+    self attention layers. Each layer is a [`Siglip2EncoderLayer`].
+
+    Args:
+        config: PretrainedConfig
+    """
+
+    def __init__(
+        self,
+        config: Siglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                Siglip2EncoderLayer(
+                    config=config,
+                    quant_config=quant_config,
+                    multimodal_config=multimodal_config,
+                    prefix=f"{prefix}.layers.{idx}",
+                )
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int | torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            layer_outputs = encoder_layer(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=max_seqlen,
+            )
+            hidden_states = layer_outputs
+        return hidden_states
+
+
+class Siglip2VisionTransformer(nn.Module):
+    def __init__(
+        self,
+        config: Siglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.config = config
+        self.embeddings = Siglip2VisionEmbeddings(config)
+        # Keep the import local to avoid circular dependencies during model init.
+        from vllm.compilation.backends import set_model_tag
+
+        with set_model_tag("Siglip2Encoder", is_encoder=True):
+            self.encoder = Siglip2Encoder(
+                config,
+                quant_config=quant_config,
+                multimodal_config=multimodal_config,
+                prefix=f"{prefix}.encoder",
+            )
+        num_hidden_layers = config.num_hidden_layers
+        if len(self.encoder.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {num_hidden_layers} "
+                f"layers, but you requested {len(self.encoder.layers)} layers."
+            )
+
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        spatial_shapes: torch.LongTensor,
+        packed_mask: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int | torch.Tensor,
+    ) -> torch.Tensor:
+        r"""
+        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
+            Tensor containing the spatial dimensions (height, width)
+            of the input images.
+        """
+        hidden_states = self.embeddings(pixel_values, spatial_shapes)
+        flat_mask = packed_mask.view(-1)
+        packed_indices = flat_mask.nonzero(as_tuple=True)[0]
+        flat_hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+        hidden_states = flat_hidden_states.index_select(0, packed_indices).unsqueeze(0)
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        unpacked = encoder_outputs.new_zeros(
+            packed_mask.numel(), encoder_outputs.shape[-1]
+        )
+        unpacked.index_copy_(0, packed_indices, encoder_outputs.squeeze(0))
+        encoder_outputs = unpacked.view(
+            packed_mask.shape + (encoder_outputs.shape[-1],)
+        )
+        last_hidden_state = self.post_layernorm(encoder_outputs)
+        return last_hidden_state
+
+
+class Siglip2Model(torch.nn.Module):
+    def __init__(
+        self,
+        config: Siglip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.vision_model = Siglip2VisionTransformer(
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.vision_model",
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        spatial_shapes: torch.LongTensor,
+        packed_mask: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int | torch.Tensor,
+    ) -> torch.Tensor:
+        return self.vision_model(
+            pixel_values=pixel_values,
+            spatial_shapes=spatial_shapes,
+            packed_mask=packed_mask,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index efdee255ab5eb2a23ced0182c6d35ca237971f25..f4b79da5c3a43e4a6cb78281a4010543dc2319f1 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -11,11 +11,10 @@ from torch.nn import functional as F
 from transformers import Siglip2VisionConfig
 from transformers.configuration_utils import PretrainedConfig
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -153,18 +152,16 @@ def apply_rotary_pos_emb(
     k: torch.Tensor,
     cos: torch.Tensor,
     sin: torch.Tensor,
-    is_flash_attn_backend: bool = False,
+    is_flash_attn_backend: bool,
+    apply_rotary_emb: ApplyRotaryEmb,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     cos = cos.chunk(2, dim=-1)[0].contiguous()
     sin = sin.chunk(2, dim=-1)[0].contiguous()
 
-    apply_rotary_emb = ApplyRotaryEmb(
-        enforce_enable=True,
-        enable_fp32_compute=True,
-    )
-
-    if is_flash_attn_backend and not current_platform.is_cuda():
+    if is_flash_attn_backend and current_platform.is_cuda():
         apply_rotary_emb_func = apply_rotary_emb.forward_cuda
+    elif is_flash_attn_backend and current_platform.is_rocm():
+        apply_rotary_emb_func = apply_rotary_emb.forward_hip
     else:
         apply_rotary_emb_func = apply_rotary_emb.forward_native
 
@@ -184,7 +181,6 @@ class Siglip2Attention(nn.Module):
         multimodal_config: MultiModalConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
-        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.config = config
@@ -194,12 +190,11 @@ class Siglip2Attention(nn.Module):
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
                 f"embed_dim must be divisible by num_heads "
-                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads})."
+                f"(got `embed_dim`: {self.embed_dim} and "
+                f"`num_heads`: {self.num_heads})."
             )
         self.scale = self.head_dim**-0.5
         self.dropout = config.attention_dropout
-        self.is_causal = False
 
         use_data_parallel = (
             multimodal_config.mm_encoder_tp_mode == "data"
@@ -231,10 +226,16 @@ class Siglip2Attention(nn.Module):
         self.attn = MMEncoderAttention(
             num_heads=self.num_heads_per_partition,
             head_size=self.head_dim,
+            scale=self.scale,
             prefix=f"{prefix}.attn",
             multimodal_config=multimodal_config,
         )
 
+        self.apply_rotary_emb = ApplyRotaryEmb(
+            enforce_enable=True,
+            enable_fp32_compute=True,
+        )
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -260,6 +261,7 @@ class Siglip2Attention(nn.Module):
                 cos,
                 sin,
                 self.attn.is_flash_attn_backend,
+                self.apply_rotary_emb,
             )
             queries = queries.squeeze(0)
             keys = keys.squeeze(0)
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index e5038e56a270859e14b1eee77484c815eba03298..771e5974ae0007879f90a6d4acbf2a3a4ed3ef5a 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -15,11 +15,11 @@ from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
 from transformers import BatchFeature, PretrainedConfig, TensorType
 
-from vllm.attention.layer import MultiHeadAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -753,8 +753,8 @@ class Step3VisionAttention(nn.Module):
             disable_tp=use_data_parallel,
         )
 
-        # Use unified MultiHeadAttention with automatic backend selection
-        self.attn = MultiHeadAttention(self.num_heads, self.head_dim, self.scale)
+        # Use unified MMEncoderAttention with automatic backend selection
+        self.attn = MMEncoderAttention(self.num_heads, self.head_dim, self.scale)
 
     def forward(
         self,
@@ -767,7 +767,7 @@ class Step3VisionAttention(nn.Module):
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.chunk(chunks=3, dim=-1)
 
-        # Use unified MultiHeadAttention with automatic backend selection
+        # Use unified MMEncoderAttention with automatic backend selection
         attn_output = self.attn(q, k, v)
 
         attn_output, _ = self.out_proj(attn_output)
diff --git a/vllm/model_executor/models/swin.py b/vllm/model_executor/models/swin.py
index a74fd80c06d8c9cf18ac36cf76b5f15e8e1c80b8..fbf5594851ece0c01e3b014646d64b4b01b96d55 100644
--- a/vllm/model_executor/models/swin.py
+++ b/vllm/model_executor/models/swin.py
@@ -102,7 +102,6 @@ class SwinSelfAttention(nn.Module):
         self,
         hidden_states: torch.Tensor,
         attention_mask: torch.FloatTensor | None = None,
-        head_mask: torch.FloatTensor | None = None,
         output_attentions: bool | None = False,
     ) -> tuple[torch.Tensor, ...]:
         batch_size, dim, num_channels = hidden_states.shape
@@ -201,12 +200,9 @@ class SwinAttention(nn.Module):
         self,
         hidden_states: torch.Tensor,
         attention_mask: torch.FloatTensor | None = None,
-        head_mask: torch.FloatTensor | None = None,
         output_attentions: bool | None = False,
     ) -> tuple[torch.Tensor]:
-        self_outputs = self.self(
-            hidden_states, attention_mask, head_mask, output_attentions
-        )
+        self_outputs = self.self(hidden_states, attention_mask, output_attentions)
         attention_output = self.output(self_outputs[0], hidden_states)
         outputs = (attention_output,) + self_outputs[1:]
         return outputs
@@ -339,18 +335,14 @@ class SwinStage(nn.Module):
         self,
         hidden_states: torch.Tensor,
         input_dimensions: tuple[int, int],
-        head_mask: torch.FloatTensor | None = None,
         output_attentions: bool | None = False,
         always_partition: bool | None = False,
     ) -> tuple[torch.Tensor]:
         height, width = input_dimensions
         for i, layer_module in enumerate(self.blocks):
-            layer_head_mask = head_mask[i] if head_mask is not None else None
-
             layer_outputs = layer_module(
                 hidden_states,
                 input_dimensions,
-                layer_head_mask,
                 output_attentions,
                 always_partition,
             )
@@ -425,17 +417,13 @@ class SwinEncoder(nn.Module):
         self,
         hidden_states: torch.Tensor,
         input_dimensions: tuple[int, int],
-        head_mask: torch.FloatTensor | None = None,
         output_attentions: bool | None = False,
         always_partition: bool | None = False,
     ) -> tuple[torch.Tensor]:
         for i, layer_module in enumerate(self.layers):
-            layer_head_mask = head_mask[i] if head_mask is not None else None
-
             layer_outputs = layer_module(
                 hidden_states,
                 input_dimensions,
-                layer_head_mask,
                 output_attentions,
                 always_partition,
             )
@@ -473,7 +461,6 @@ class SwinModel(nn.Module):
     def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
-        head_mask: torch.FloatTensor | None = None,
         output_attentions: bool | None = None,
     ) -> tuple[torch.Tensor]:
         embedding_output, input_dimensions = self.embeddings(pixel_values)
@@ -481,7 +468,6 @@ class SwinModel(nn.Module):
         encoder_outputs = self.encoder(
             embedding_output,
             input_dimensions,
-            head_mask=head_mask,
             output_attentions=output_attentions,
         )
 
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 7e82a4d725a620b9f1c16108aadb83ad0fa303db..dcfd43272148c62a2f269a26ba0c9a18727fd44a 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -19,7 +19,7 @@ from transformers.models.llava import LlavaProcessor
 from transformers.processing_utils import ProcessingKwargs, Unpack
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
-from vllm.config import VllmConfig
+from vllm.config import MultiModalConfig, VllmConfig
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -346,6 +346,7 @@ def _build_tarsier_hf_processor(
 def init_vision_tower_for_tarsier(
     hf_config: TarsierHfConfig,  # Use the Tarsier specific config protocol
     quant_config: QuantizationConfig | None,
+    multimodal_config: MultiModalConfig | None,
     *,
     require_post_norm: bool | None = None,
     prefix: str = "",
@@ -377,6 +378,7 @@ def init_vision_tower_for_tarsier(
         return CLIPVisionModel(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_to_init,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -385,6 +387,7 @@ def init_vision_tower_for_tarsier(
         return SiglipVisionModel(
             vision_config,
             quant_config=quant_config,
+            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_to_init,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -414,12 +417,16 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
+
         config: TarsierHfConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
         self.config = config  # Storing the Tarsier-specific HF config
         self.vision_tower = init_vision_tower_for_tarsier(
             config,
-            quant_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
             require_post_norm=False,
             prefix=maybe_prefix(prefix, "vision_tower"),
         )
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index 402081a70631e47a342488d51930bb0fd3ae6b2d..c97af0db5e8d265accad6f10ded9e1ef2af12a79 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -34,7 +34,7 @@ from transformers import BatchFeature
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
-from vllm.model_executor.layers.pooler import DispatchPooler, DummyPooler
+from vllm.model_executor.layers.pooler import IdentityPooler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import AutoWeightsLoader
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -248,7 +248,7 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({"plugin": DummyPooler()})
+        self.pooler = IdentityPooler()
 
     def embed_input_ids(
         self,
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index 45e746ac2d356acfd5909c572d175eba6eba8fe4..d094bb2895f27872ff9c6270c2dd5e55a97306bd 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -27,13 +27,14 @@ from torch import nn
 from transformers import AutoModel
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
 
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config.utils import getattr_iter
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.models.interfaces import (
     SupportsEagle,
@@ -47,6 +48,7 @@ from vllm.model_executor.models.transformers.utils import (
     get_feature_request_tip,
     init_on_device_without_buffers,
     log_replacement,
+    replace_conv_class,
     replace_linear_class,
     replace_rms_norm_class,
 )
@@ -58,6 +60,7 @@ from vllm.model_executor.models.utils import (
     maybe_prefix,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
 
 if TYPE_CHECKING:
     from transformers import PreTrainedModel
@@ -314,6 +317,8 @@ class Base(
                     new_module = replace_linear_class(
                         child_module, style, self.quant_config, prefix=qual_name
                     )
+                elif isinstance(child_module, (nn.Conv2d, nn.Conv3d)):
+                    new_module = replace_conv_class(child_module)
                 elif child_module.__class__.__name__.endswith("RMSNorm"):
                     new_module = replace_rms_norm_class(
                         child_module, self.text_config.hidden_size
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py
index 31db9d682bd40ada73e58a63ec0bc1490f7b002b..2fa23f96f390d73c1a5b77e02bbbb72d6aacd82a 100644
--- a/vllm/model_executor/models/transformers/moe.py
+++ b/vllm/model_executor/models/transformers/moe.py
@@ -37,10 +37,13 @@ if TYPE_CHECKING:
     from vllm.config import VllmConfig
 
 
+# --8<-- [start:transformers_fused_moe]
 @CustomOp.register("transformers_fused_moe")
 class TransformersFusedMoE(FusedMoE):
     """Custom FusedMoE for the Transformers modeling backend."""
 
+    # --8<-- [end:transformers_fused_moe]
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._topk_ids: torch.Tensor = None
@@ -165,6 +168,7 @@ class MoEMixin(MixtureOfExperts):
         for gate_proj, down_proj, up_proj in ckpt_names:
             expert_mapping.extend(
                 FusedMoE.make_expert_params_mapping(
+                    self,
                     ckpt_gate_proj_name=gate_proj,
                     ckpt_down_proj_name=down_proj,
                     ckpt_up_proj_name=up_proj,
diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
index 9d77dee2810c3fa3f824f0b196ddab0065f59b80..fcf9a0d077abe051c885eeccde8f7c6788e29f62 100644
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -22,6 +22,7 @@ from typing import TYPE_CHECKING
 import torch
 
 from vllm.config.utils import getattr_iter
+from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.multimodal import MultiModalKwargsItems
@@ -36,6 +37,7 @@ from vllm.multimodal.inputs import (
 from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 if TYPE_CHECKING:
@@ -52,6 +54,8 @@ DYNAMIC_ARG_DIMS = {
     "inputs_embeds": 0,
 }
 
+logger = init_logger(__name__)
+
 
 class MultiModalProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self):
@@ -345,8 +349,29 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
 
         num_image_patches = kwargs.pop("num_image_patches")
         kwargs.pop("token_type_ids", None)  # used only in `forward`
+
         if pixel_values is not None:
-            vision_embeddings = self.model.get_image_features(pixel_values, **kwargs)
+            # ROCm: Force math SDP backend for vision encoder to avoid accuracy issues
+            # with flash_sdp and mem_efficient_sdp
+            if current_platform.is_rocm():
+                # TODO: [ROCm] Fix accuracy issues with flash backend
+                logger.debug(
+                    "ROCm platform detected. Forcing math SDP backend "
+                    "for vision encoder. Currently ROCm platform has "
+                    "accuracy issues with `flash_sdp` and"
+                    "`mem_efficient_sdp` backends. See issue: "
+                    "https://github.com/vllm-project/vllm/issues/30167"
+                )
+                with torch.nn.attention.sdpa_kernel(
+                    backends=[torch.nn.attention.SDPBackend.MATH]
+                ):
+                    vision_embeddings = self.model.get_image_features(
+                        pixel_values, **kwargs
+                    )
+            else:
+                vision_embeddings = self.model.get_image_features(
+                    pixel_values, **kwargs
+                )
 
             if isinstance(vision_embeddings, torch.Tensor):
                 if vision_embeddings.ndim == 2:
@@ -364,6 +389,11 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
                 ]
 
             return vision_embeddings
+        else:
+            logger.debug(
+                "No pixel values or image embeddings provided for multimodal embedding."
+            )
+            return None
 
     def get_mrope_input_positions(
         self,
diff --git a/vllm/model_executor/models/transformers/pooling.py b/vllm/model_executor/models/transformers/pooling.py
index 4c2a74bccb6a96e7e105a61d21a7f2da0ca2ed0f..8f3173c33e4c556d6de48d7300d66d7923bec870 100644
--- a/vllm/model_executor/models/transformers/pooling.py
+++ b/vllm/model_executor/models/transformers/pooling.py
@@ -22,12 +22,7 @@ import torch
 from transformers import AutoModelForSequenceClassification
 
 from vllm.config.utils import getattr_iter
-from vllm.model_executor.layers.pooler import (
-    ClassifierPooler,
-    CLSPool,
-    DispatchPooler,
-    Pooler,
-)
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.models.interfaces import SupportsCrossEncoding
 from vllm.model_executor.models.interfaces_base import VllmModelForPooling
 
@@ -36,7 +31,7 @@ if TYPE_CHECKING:
 
 
 class EmbeddingMixin(VllmModelForPooling):
-    default_pooling_type = "CLS"
+    default_seq_pooling_type = "CLS"
 
     def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
         # Skip VllmModelForPooling.__init__ and call the next class in MRO
@@ -47,16 +42,11 @@ class EmbeddingMixin(VllmModelForPooling):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {
-                "token_embed": Pooler.for_token_embed(pooler_config),
-                "embed": Pooler.for_embed(pooler_config),
-            }
-        )
+        self.pooler = DispatchPooler.for_embedding(pooler_config)
 
 
 class SequenceClassificationMixin(SupportsCrossEncoding, VllmModelForPooling):
-    default_pooling_type = "CLS"
+    default_seq_pooling_type = "CLS"
 
     def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
         # Skip VllmModelForPooling.__init__ and call the next class in MRO
@@ -94,8 +84,10 @@ class SequenceClassificationMixin(SupportsCrossEncoding, VllmModelForPooling):
         self.init_parameters(self.classifier, dtype=self.model_config.head_dtype)
 
         class ClassifierWithReshape(self.classifier.__class__):
-            """CLSPool has already been applied in `pooling`.
-            Add dim to match expected input shape of `classifier.forward`."""
+            """
+            Token extraction has already been applied in `pooler.pooling`.
+            Add dim to match expected input shape of `classifier.forward`.
+            """
 
             def forward(self, *args, **kwargs):
                 if len(args) > 0:
@@ -104,16 +96,7 @@ class SequenceClassificationMixin(SupportsCrossEncoding, VllmModelForPooling):
 
         self.classifier.__class__ = ClassifierWithReshape
 
-        self.pooler = DispatchPooler(
-            {
-                "token_classify": Pooler.for_token_classify(
-                    pooler_config, classifier=self.classifier
-                ),
-                "classify": ClassifierPooler(
-                    pooling=CLSPool(), classifier=self.classifier, act_fn="classify"
-                ),
-                "score": ClassifierPooler(
-                    pooling=CLSPool(), classifier=self.classifier, act_fn="score"
-                ),
-            }
+        self.pooler = DispatchPooler.for_seq_cls(
+            pooler_config,
+            classifier=self.classifier,
         )
diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py
index b807f45b5d52bbfdfbc7791c12d10af03e3dd15b..e47f3bba5cfb2741917eae7339de0ad329ccce9a 100644
--- a/vllm/model_executor/models/transformers/utils.py
+++ b/vllm/model_executor/models/transformers/utils.py
@@ -22,16 +22,17 @@ from typing import TYPE_CHECKING, Literal
 
 import torch
 from torch import nn
-from transformers.configuration_utils import ALLOWED_LAYER_TYPES
 
 from vllm.config.utils import getattr_iter
 from vllm.logger import init_logger
+from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     ReplicatedLinear,
     RowParallelLinear,
 )
+from vllm.transformers_utils.config import is_rope_parameters_nested
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -136,6 +137,45 @@ def replace_linear_class(
     )
 
 
+TorchConv = nn.Conv2d | nn.Conv3d
+VllmConv = Conv2dLayer | Conv3dLayer
+
+
+def replace_conv_class(conv: TorchConv) -> VllmConv | TorchConv:
+    """Replace a Transformers Conv2d/Conv3d with vLLM's Conv2d/Conv3d.
+
+    Args:
+        conv: `nn.Conv2d` or `nn.Conv3d` to be replaced.
+    Returns:
+        The new `Conv2dLayer` or `Conv3dLayer`. If the conv module is not supported,
+        returns the original conv module.
+    """
+    # vLLM does not handle non-zero padding modes
+    if conv.padding_mode != "zeros":
+        return conv
+
+    vllm_conv_cls = {
+        nn.Conv2d: Conv2dLayer,
+        nn.Conv3d: Conv3dLayer,
+    }.get(type(conv))
+
+    if vllm_conv_cls is None:
+        return conv
+
+    return vllm_conv_cls(
+        in_channels=conv.in_channels,
+        out_channels=conv.out_channels,
+        kernel_size=conv.kernel_size,
+        stride=conv.stride,
+        padding=conv.padding,
+        dilation=conv.dilation,
+        groups=conv.groups,
+        bias=conv.bias is not None,
+        padding_mode=conv.padding_mode,
+        params_dtype=conv.weight.dtype,
+    )
+
+
 def replace_rms_norm_class(rms_norm: nn.Module, hidden_size: int) -> RMSNorm:
     """Replace a Transformers RMSNorm with vLLM's RMSNorm.
 
@@ -207,7 +247,7 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool:
     rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {}
     if rope_parameters:
         # Nest rope_parameters if not nested already to simplify logic
-        if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
+        if not is_rope_parameters_nested(rope_parameters):
             rope_parameters = {"": rope_parameters}
         return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values())
     return True
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 32a2ba1ef38f7649ac5756aed5d05acb19f4418d..7fb46979ba9ea8ca57feefd69ca3ae061e587560 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -5,6 +5,7 @@
 """PyTorch Ultravox model."""
 
 import copy
+import inspect
 from collections.abc import Iterable, Mapping, Sequence
 from types import SimpleNamespace
 from typing import Annotated, Any, Literal, TypeAlias
@@ -132,6 +133,10 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
+    def get_target_channels(self) -> int:
+        """Return target audio channels for Ultravox models (mono)."""
+        return 1
+
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
@@ -168,7 +173,10 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo])
 class UltravoxMultiModalProcessor(BaseMultiModalProcessor[UltravoxProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
-        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
+        return MultiModalDataParser(
+            target_sr=feature_extractor.sampling_rate,
+            target_channels=self.info.get_target_channels(),
+        )
 
     def _call_hf_processor(
         self,
@@ -380,11 +388,17 @@ class UltravoxTransformerProjector(nn.Module, ModuleUtilsMixin):
         )
         hidden_states = hidden_states + positions
 
+        # Backward compatibility for Transformers v4 where layer_head_mask
+        # was a required argument for WhisperEncoderLayer.forward
+        kwargs = {}
+        if "layer_head_mask" in inspect.signature(self.layers[0].forward).parameters:
+            kwargs["layer_head_mask"] = None
+
         for layer in self.layers:
             layer_outputs = layer(
                 hidden_states,
                 attention_mask=extended_attention_mask,
-                layer_head_mask=None,
+                **kwargs,
             )
             hidden_states = layer_outputs[0]
 
@@ -479,11 +493,17 @@ class ModifiedWhisperEncoder(WhisperEncoder):
 
         attention_mask = self.get_attention_mask_by_audio_len(audio_lens, hidden_states)
 
+        # Backward compatibility for Transformers v4 where layer_head_mask
+        # was a required argument for WhisperEncoderLayer.forward
+        kwargs = {}
+        if "layer_head_mask" in inspect.signature(self.layers[0].forward).parameters:
+            kwargs["layer_head_mask"] = None
+
         for encoder_layer in self.layers:
             layer_outputs = encoder_layer(
                 hidden_states,
                 attention_mask,
-                layer_head_mask=None,
+                **kwargs,
             )
 
             hidden_states = layer_outputs[0]
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 024c50f1207ed0574a9e7e89bd9f5d4ac126ac85..2a4bec774b0923450ec4dccc8cb2a2a117ac63f9 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -10,7 +10,6 @@ from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
 import torch
 from transformers import PretrainedConfig
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import VllmConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
@@ -19,6 +18,7 @@ from vllm.distributed import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 331f0c54ecfbc26e1fe70e8af696abcb846800dc..63b26c7890910e474bb821e99929de7ce5f4a9b4 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import inspect
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
@@ -16,7 +17,11 @@ from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChu
 from mistral_common.protocol.instruct.messages import UserMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.protocol.transcription.request import TranscriptionRequest
-from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
+from mistral_common.tokens.tokenizers.audio import (
+    Audio,
+    AudioEncoder,
+    TranscriptionFormat,
+)
 from transformers import BatchFeature, TensorType, WhisperConfig
 from transformers.tokenization_utils_base import TextInput
 
@@ -116,10 +121,7 @@ class VoxtralProcessorAdapter:
         self,
         audio_length: int,
     ) -> int:
-        pad_audio_length = self._audio_processor.next_multiple_of_chunk_frames(
-            audio_length, self.sampling_rate
-        )
-        return ceil(pad_audio_length / (self.sampling_rate // self.frame_rate))
+        return ceil(audio_length / (self.sampling_rate // self.frame_rate))
 
     def __call__(
         self,
@@ -158,7 +160,18 @@ class VoxtralProcessorAdapter:
             assert audio.ndim == 1
 
             # pad if necessary
-            audio = self._audio_processor.pad(audio, self.sampling_rate)
+            # TODO(Patrick) - remove once mistral-common is bumped
+            if (
+                self._audio_processor.audio_config.transcription_format
+                != TranscriptionFormat.STREAMING
+            ):
+                sig = inspect.signature(self._audio_processor.pad)
+                if "is_online_streaming" in sig.parameters:
+                    audio = self._audio_processor.pad(
+                        audio, self.sampling_rate, is_online_streaming=False
+                    )
+                else:
+                    audio = self._audio_processor.pad(audio, self.sampling_rate)
 
             audio_tokens = [self.begin_audio_token_id] + [
                 self.audio_token_id
@@ -510,6 +523,7 @@ class VoxtralForConditionalGeneration(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         remapping_rules = [
+            (r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"),
             (r"mm_whisper_embeddings\.(.*)", r"\1"),
             (r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"),
             (
@@ -535,13 +549,16 @@ class VoxtralForConditionalGeneration(
         def llm_weights_generator():
             nonlocal loaded_weights
             for name, w in weights:
-                is_encoder = (
-                    name.startswith("mm_whisper_embeddings")
-                    and not name.startswith("mm_whisper_embeddings.tok_embeddings")
-                    and not name.startswith(
-                        "mm_whisper_embeddings.audio_language_projection"
+                is_encoder = False
+                for k in [
+                    "mm_whisper_embeddings",
+                    "mm_streams_embeddings.embedding_module",
+                ]:
+                    is_encoder |= (
+                        name.startswith(k)
+                        and not name.startswith(f"{k}.tok_embeddings")
+                        and not name.startswith(f"{k}.audio_language_projection")
                     )
-                )
 
                 for pattern, repl in remapping_rules:
                     if re.fullmatch(pattern, name):
@@ -676,6 +693,7 @@ class VoxtralEncoderModel(nn.Module):
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
 
     mistral_remapping = [
+        (r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"),
         (
             r"whisper_encoder\.conv_layers\.0\.(weight|bias)",
             r"whisper_encoder.conv1.\1",
@@ -684,6 +702,14 @@ class VoxtralEncoderModel(nn.Module):
             r"whisper_encoder\.conv_layers\.1\.(weight|bias)",
             r"whisper_encoder.conv2.\1",
         ),
+        (
+            r"whisper_encoder\.conv_layers\.0\.conv\.(weight|bias)",
+            r"whisper_encoder.conv1.\1",
+        ),  # noqa: E501
+        (
+            r"whisper_encoder\.conv_layers\.1\.conv\.(weight|bias)",
+            r"whisper_encoder.conv2.\1",
+        ),  # noqa: E501
         (
             r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)",  # noqa: E501
             r"whisper_encoder.layers.\1.self_attn.\2_proj.\3",
diff --git a/vllm/model_executor/models/voxtral_streaming.py b/vllm/model_executor/models/voxtral_streaming.py
new file mode 100644
index 0000000000000000000000000000000000000000..a89a0eedd8e75ae0a18cc4e484db39113136ea1c
--- /dev/null
+++ b/vllm/model_executor/models/voxtral_streaming.py
@@ -0,0 +1,304 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import math
+from collections.abc import Mapping
+from typing import Literal, cast
+
+import numpy as np
+import torch
+from mistral_common.protocol.instruct.chunk import RawAudio
+from mistral_common.protocol.transcription.request import (
+    StreamingMode,
+    TranscriptionRequest,
+)
+from mistral_common.tokens.tokenizers.audio import Audio
+
+from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.inputs.data import PromptType
+from vllm.logger import init_logger
+from vllm.model_executor.models.interfaces import MultiModalEmbeddings
+from vllm.model_executor.models.voxtral import (
+    VoxtralDummyInputsBuilder,
+    VoxtralForConditionalGeneration,
+    VoxtralMultiModalProcessor,
+    VoxtralProcessingInfo,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.cache import _I, BaseMultiModalProcessorCache
+from vllm.multimodal.inputs import (
+    MultiModalKwargsOptionalItems,
+)
+from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.processing import (
+    MultiModalPromptUpdates,
+    PlaceholderFeaturesInfo,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import cached_tokenizer_from_config
+
+from .utils import (
+    _flatten_embeddings,
+)
+
+logger = init_logger(__name__)
+
+
+class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
+    def __init__(
+        self,
+        info: _I,
+        dummy_inputs: BaseDummyInputsBuilder[_I],
+        *,
+        cache: BaseMultiModalProcessorCache | None = None,
+    ) -> None:
+        # streaming can't make use of a cache yet
+        super().__init__(info, dummy_inputs, cache=None)
+
+    def _maybe_apply_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        prompt_ids: list[int],
+        mm_kwargs: MultiModalKwargsOptionalItems,
+        mm_prompt_updates: MultiModalPromptUpdates,
+        is_update_applied: bool,
+    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
+        # there are no placeholder audio tokens for streaming
+        # so we need to build the place placeholder positions manually
+
+        # in streaming there is always only one audio input
+        audios = mm_kwargs.get("audio", [])
+        assert len(audios) == 1, (
+            f"Expected only one audio input for streaming, got {mm_kwargs=}"
+        )
+        tokenizer = self.info.get_tokenizer()
+        audio_config = tokenizer.instruct.audio_encoder.audio_config
+
+        num_audio_samples = audios[0]["audio_arrays"].data.shape[0]
+        length = audio_config.num_audio_tokens(num_audio_samples)
+
+        features_info = PlaceholderFeaturesInfo(
+            modality="audio",
+            item_idx=0,
+            start_idx=0,
+            tokens=length
+            * [0],  # only used for length computation, so we can take dummy inputs
+            is_embed=None,
+        )
+        return prompt_ids, {"audio": [features_info]}
+
+
+class TimeEmbedding(torch.nn.Module):
+    """Sinusoidal Embedding for encoding time"""
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = torch.exp(
+            -math.log(self.theta)
+            * torch.arange(self.dim // 2).float()
+            / (self.dim // 2)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, t: torch.Tensor) -> torch.Tensor:
+        t = t[..., None]  # (B,) -> (B, 1) or (B, T) -> (B, T, 1)
+        inv_freq = self.inv_freq.to(device=t.device, dtype=t.dtype)
+        emb = (
+            t * inv_freq
+        )  # (B, 1) x (D/2,) -> (B, D/2) or (B, T, 1) x (D/2,) -> (B, T, D/2)
+        return torch.cat((emb.cos(), emb.sin()), dim=-1)  # (B, D) or (B, T, D)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    VoxtralStreamingMultiModalProcessor,
+    info=VoxtralProcessingInfo,
+    dummy_inputs=VoxtralDummyInputsBuilder,
+)
+class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
+    requires_raw_input_tokens = True
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        self.time_embedding: TimeEmbedding = TimeEmbedding(
+            dim=self.config.text_config.hidden_size
+        )
+
+        audio_config = self.tokenizer.instruct.audio_encoder.audio_config
+        _n_delay_tokens = (
+            audio_config.frame_rate * audio_config.transcription_delay_ms / 1000
+        )
+        assert _n_delay_tokens.is_integer(), (
+            f"n_delay_tokens must be integer, got {_n_delay_tokens}"
+        )
+
+        self.n_delay_tokens = int(_n_delay_tokens)
+
+    @property
+    def audio_config(self):
+        return self.tokenizer.instruct.audio_encoder.audio_config
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        # Multi-modal token ID may exceed vocab size
+        handle_oov_mm_token: bool = True,
+    ) -> torch.Tensor:
+        """Pass post-conv embeddings directly as input"""
+        # for streaming we simply flatten the multimodal embeddings
+        # to be in tensor format, we treat the input ids later
+        assert multimodal_embeddings is not None
+        assert len(multimodal_embeddings) > 0, (
+            "For streaming you must provide a multimodal_embedding at every step."
+        )
+        mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
+        return mm_embeds_flat
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        assert inputs_embeds is not None
+        assert input_ids is not None
+
+        pool_size = self.config.audio_config.block_pool_size
+        inputs_embeds = inputs_embeds.view(
+            inputs_embeds.shape[0] * pool_size, inputs_embeds.shape[1] // pool_size
+        )
+
+        audio_hidden_states = self.whisper_encoder.whisper_encoder.forward_layers(
+            inputs_embeds
+        )
+
+        num_tokens, audio_hidden_size = audio_hidden_states.shape
+        assert num_tokens % self.downsample_factor == 0
+        audio_hidden_states = audio_hidden_states.reshape(
+            num_tokens // self.downsample_factor,
+            audio_hidden_size * self.downsample_factor,
+        )
+        audio_text_embeds = self.audio_language_adapter(audio_hidden_states)
+
+        text_embeds = self.language_model.embed_input_ids(input_ids)
+
+        # sum pool text and audio embeddings
+        inputs_embeds = audio_text_embeds + text_embeds
+
+        time_tensor = torch.tensor(
+            [self.n_delay_tokens],
+            device=inputs_embeds.device,
+            dtype=inputs_embeds.dtype,
+        )
+        inputs_embeds = inputs_embeds + self.time_embedding(time_tensor)
+
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
+
+        return hidden_states
+
+    def embed_multimodal(
+        self, **kwargs
+    ) -> list[torch.Tensor] | torch.Tensor | tuple[torch.Tensor, ...] | None:
+        """Transform audio waveforms -> initial whisper post-conv embeddings"""
+        audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
+
+        assert audio_inputs is not None, (
+            "For streaming you must provide an audio input at every step."
+        )
+
+        def _truncate_left(
+            sample: torch.Tensor, mult_of: int, pos: int
+        ) -> torch.Tensor:
+            assert pos in [0, 1], pos
+            if (ctx := sample.shape[pos] % mult_of) != 0:
+                sample = sample[ctx:] if pos == 0 else sample[:, ctx:]
+                assert sample.shape[pos] > 0, (
+                    f"Sample is empty after truncation with ctx {ctx}"
+                )
+
+            return sample
+
+        mel_features = [
+            self.whisper_encoder.compute_whisper_melspec(audio).to(
+                self.whisper_encoder.dtype
+            )
+            for audio in audio_inputs
+        ]
+
+        # we truncate the left most mel feature
+        # if the sequence length in impair
+        mel_features = [_truncate_left(mel, 2, 1) for mel in mel_features]
+
+        seq_lens = [mel.shape[1] for mel in mel_features]
+        # [total_num_20ms_frames, hidden_size]
+        audio_embeddings = self.whisper_encoder.whisper_encoder.forward_conv(
+            mel_features
+        )
+        conv_stride = self.whisper_encoder.whisper_encoder.total_stride
+        audio_embeddings_per_sample = audio_embeddings.split(
+            [s // conv_stride for s in seq_lens], dim=0
+        )
+
+        # audio_embeddings per sample need to be divisible by 4
+        pool_size = self.config.audio_config.block_pool_size
+
+        audio_embeddings_per_sample = [
+            _truncate_left(sample, pool_size, 0)
+            for sample in audio_embeddings_per_sample
+        ]
+
+        audio_embeddings_per_sample = [
+            e.view(e.shape[0] // pool_size, e.shape[1] * pool_size)
+            for e in audio_embeddings_per_sample
+        ]
+        return audio_embeddings_per_sample
+
+    @classmethod
+    def get_speech_to_text_config(
+        cls, model_config: ModelConfig, task_type: str
+    ) -> SpeechToTextConfig:
+        tokenizer = cached_tokenizer_from_config(model_config)
+        audio_config = tokenizer.instruct.audio_encoder.audio_config
+        sample_rate = audio_config.sampling_rate
+        return SpeechToTextConfig(
+            max_audio_clip_s=None,  # only limited by memory
+            sample_rate=sample_rate,
+            min_energy_split_window_size=None,
+        )
+
+    @classmethod
+    # for speech-to-text transcription
+    def get_generation_prompt(
+        cls,
+        audio: np.ndarray,
+        model_config: ModelConfig,
+        stt_config: SpeechToTextConfig,
+        language: str | None,
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: str | None,
+    ) -> PromptType:
+        tokenizer = cached_tokenizer_from_config(model_config)
+        audio = Audio(audio, int(stt_config.sample_rate), format="wav")  # lossless
+
+        req = TranscriptionRequest(
+            model=model_config.model,
+            audio=RawAudio.from_audio(audio),
+            language=language,
+            streaming=StreamingMode.OFFLINE,
+        )
+
+        tokenized = tokenizer.instruct.encode_transcription(req)
+        audio = (tokenized.audios[0].audio_array, stt_config.sample_rate)
+        prompts_dict = {"multi_modal_data": {"audio": audio}}
+        prompts_dict["prompt_token_ids"] = tokenized.tokens
+        return cast(PromptType, prompts_dict)
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index b513e3513b2e2084482003fdfc566881a9db3c58..14d646f8587610b82652de94a8681a1703e9971f 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -1,9 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import enum
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from contextlib import nullcontext
+from functools import partial
 from typing import Annotated, Literal, cast
 
 import numpy as np
@@ -16,15 +18,15 @@ from transformers import (
 )
 from transformers.models.whisper.modeling_whisper import sinusoids
 
-from vllm.attention.backends.abstract import AttentionType
-from vllm.attention.layer import Attention, MultiHeadAttention
-from vllm.attention.layers.cross_attention import CrossAttention
+from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.cross_attention import CrossAttention
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
@@ -34,6 +36,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.whisper_utils import (
+    ISO639_1_SUPPORTED_LANGS,
+    WhisperAttentionWithBlockPooling,
+    WhisperCausalConv1d,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
@@ -52,6 +59,9 @@ from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.utils.torch_utils import set_default_torch_dtype
+from vllm.v1.attention.backend import (
+    AttentionType,
+)
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription
 from .utils import (
@@ -64,67 +74,11 @@ from .utils import (
 
 logger = init_logger(__name__)
 
-# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
-
-ISO639_1_SUPPORTED_LANGS = {
-    "af": "Afrikaans",
-    "ar": "Arabic",
-    "hy": "Armenian",
-    "az": "Azerbaijani",
-    "be": "Belarusian",
-    "bs": "Bosnian",
-    "bg": "Bulgarian",
-    "ca": "Catalan",
-    "zh": "Chinese",
-    "hr": "Croatian",
-    "cs": "Czech",
-    "da": "Danish",
-    "nl": "Dutch",
-    "en": "English",
-    "et": "Estonian",
-    "fi": "Finnish",
-    "fr": "French",
-    "gl": "Galician",
-    "de": "German",
-    "el": "Greek",
-    "he": "Hebrew",
-    "hi": "Hindi",
-    "hu": "Hungarian",
-    "is": "Icelandic",
-    "id": "Indonesian",
-    "it": "Italian",
-    "ja": "Japanese",
-    "kn": "Kannada",
-    "kk": "Kazakh",
-    "ko": "Korean",
-    "lv": "Latvian",
-    "lt": "Lithuanian",
-    "mk": "Macedonian",
-    "ms": "Malay",
-    "mr": "Marathi",
-    "mi": "Maori",
-    "ne": "Nepali",
-    "no": "Norwegian",
-    "fa": "Persian",
-    "pl": "Polish",
-    "pt": "Portuguese",
-    "ro": "Romanian",
-    "ru": "Russian",
-    "sr": "Serbian",
-    "sk": "Slovak",
-    "sl": "Slovenian",
-    "es": "Spanish",
-    "sw": "Swahili",
-    "sv": "Swedish",
-    "tl": "Tagalog",
-    "ta": "Tamil",
-    "th": "Thai",
-    "tr": "Turkish",
-    "uk": "Ukrainian",
-    "ur": "Urdu",
-    "vi": "Vietnamese",
-    "cy": "Welsh",
-}
+
+class WhisperPosEmbedType(enum.Enum):
+    SINUSOIDAL = "sinusoidal"
+    NOPE = "nope"
+    LEARNED = "learned"
 
 
 class WhisperAudioInputs(TensorSchema):
@@ -141,7 +95,7 @@ class WhisperAudioInputs(TensorSchema):
     ]
 
 
-class WhisperEncoderAttention(MultiHeadAttention):
+class WhisperEncoderAttention(MMEncoderAttention):
     """Multi-headed attention for Whisper encoder with 2D tensor support."""
 
     def forward(
@@ -184,6 +138,8 @@ class WhisperAttention(nn.Module):
         num_heads: int,
         bias: bool = True,
         attn_type: AttentionType = AttentionType.DECODER,
+        per_layer_sliding_window: int | None = None,
+        block_pool_size: int = 1,
         cache_config: CacheConfig | None = None,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
@@ -242,7 +198,14 @@ class WhisperAttention(nn.Module):
                 attn_type=self.attn_type,
             )
         else:  # AttentionType.DECODER (regular decoder self-attention)
-            self.attn = Attention(
+            if block_pool_size > 1:
+                attn_cls = partial(
+                    WhisperAttentionWithBlockPooling, block_pool_size=block_pool_size
+                )
+            else:
+                attn_cls = Attention
+
+            self.attn = attn_cls(
                 self.num_heads,
                 self.head_dim,
                 self.scaling,
@@ -251,6 +214,7 @@ class WhisperAttention(nn.Module):
                 quant_config=quant_config,
                 prefix=f"{prefix}.attn",
                 attn_type=self.attn_type,
+                per_layer_sliding_window=per_layer_sliding_window,
             )
 
     def _init_qkv(
@@ -386,6 +350,9 @@ class WhisperEncoderLayer(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
+        is_causal = getattr(config, "is_causal", False)
+        sliding_window = getattr(config, "sliding_window", None)
+        block_pool_size = getattr(config, "block_pool_size", 1)
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
 
@@ -393,7 +360,9 @@ class WhisperEncoderLayer(nn.Module):
         self.self_attn = WhisperAttention(
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
-            attn_type=AttentionType.ENCODER,
+            attn_type=AttentionType.DECODER if is_causal else AttentionType.ENCODER,
+            block_pool_size=block_pool_size,
+            per_layer_sliding_window=sliding_window,
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
@@ -492,12 +461,23 @@ class WhisperEncoder(nn.Module):
         super().__init__()
         config = vllm_config.model_config.hf_config
         embed_dim = config.d_model
+
+        self.pos_embed_type = WhisperPosEmbedType(
+            getattr(config, "pos_embed", "sinusoidal")
+        )
         self.num_mel_bins = config.num_mel_bins
         self.max_source_positions = config.max_source_positions
         self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
 
-        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
-        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
+        self.is_causal = getattr(config, "is_causal", False)
+        Conv1d = (
+            WhisperCausalConv1d if self.is_causal else partial(nn.Conv1d, padding=1)
+        )
+
+        self.conv1 = Conv1d(self.num_mel_bins, embed_dim, kernel_size=3)
+        self.conv2 = Conv1d(embed_dim, embed_dim, stride=2, kernel_size=3)
+
+        self.total_stride = self.conv1.stride[0] * self.conv2.stride[0]
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.encoder_layers,
             lambda prefix: WhisperEncoderLayer(
@@ -507,44 +487,79 @@ class WhisperEncoder(nn.Module):
         )
         self.layer_norm = nn.LayerNorm(config.d_model)
 
-        maybe_fp32_init_ctx = (
-            set_default_torch_dtype(torch.float32) if init_in_fp32 else nullcontext()
-        )
-
-        with (
-            torch.no_grad(),
-            maybe_fp32_init_ctx,
+        if self.is_causal and self.pos_embed_type != WhisperPosEmbedType.NOPE:
+            raise ValueError(
+                "Only NOPE position embeddings are supported "
+                f"for causal models, but got {self.pos_embed_type}"
+            )
+        elif self.pos_embed_type in (
+            WhisperPosEmbedType.SINUSOIDAL,
+            WhisperPosEmbedType.LEARNED,
         ):
-            self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
-            self.embed_positions.weight.copy_(
-                sinusoids(*self.embed_positions.weight.shape)
+            maybe_fp32_init_ctx = (
+                set_default_torch_dtype(torch.float32)
+                if init_in_fp32
+                else nullcontext()
             )
 
-    def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
+            with (
+                torch.no_grad(),
+                maybe_fp32_init_ctx,
+            ):
+                self.embed_positions = nn.Embedding(
+                    self.max_source_positions, embed_dim
+                )
+                self.embed_positions.weight.copy_(
+                    sinusoids(*self.embed_positions.weight.shape)
+                )
+
+    def forward_conv(
+        self, input_features: torch.Tensor | list[torch.Tensor]
+    ) -> torch.Tensor:
         hidden_states = []
         input_is_batched = False
         for features in input_features:
             embeds = nn.functional.gelu(self.conv1(features))
             embeds = nn.functional.gelu(self.conv2(embeds))
-            embeds = embeds.transpose(-1, -2)
-            embeds = (embeds + self.embed_positions.weight[: embeds.size(-2), :]).to(
-                embeds.dtype
-            )
+
+            if self.pos_embed_type in (
+                WhisperPosEmbedType.SINUSOIDAL,
+                WhisperPosEmbedType.LEARNED,
+            ):
+                embeds = embeds.transpose(-1, -2)
+                embeds = (
+                    embeds + self.embed_positions.weight[: embeds.size(-2), :]
+                ).to(embeds.dtype)
+            elif self.pos_embed_type == WhisperPosEmbedType.NOPE:
+                embeds = embeds.transpose(-1, -2).to(embeds.dtype)
+            else:
+                raise ValueError(f"Unknown pos_embed_type: {self.pos_embed_type}")
+
             hidden_states.append(embeds)
             input_is_batched = embeds.ndim > 2
         # Input to MHA must be B x T x D
-        if input_is_batched:
+        if input_is_batched or self.is_causal:
             # Models using WhisperEncoder may handle batching internally.
+            # If WhisperEncoder is causal, sequences
+            # are not padded to have identical seq length (T)
+            # => concat over feature dim
             hidden_states = torch.cat(hidden_states)
         else:
             hidden_states = torch.stack(hidden_states, dim=0)
 
+        return hidden_states
+
+    def forward_layers(self, hidden_states: torch.Tensor) -> torch.Tensor:
         for encoder_layer in self.layers:
             hidden_states = encoder_layer(hidden_states)
 
         hidden_states = self.layer_norm(hidden_states)
         return hidden_states
 
+    def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
+        hidden_states = self.forward_conv(input_features)
+        return self.forward_layers(hidden_states)
+
 
 class WhisperDecoder(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -675,6 +690,10 @@ class WhisperProcessingInfo(BaseProcessingInfo):
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
+    def get_target_channels(self) -> int:
+        """Return target audio channels for Whisper models (mono)."""
+        return 1
+
     def get_num_audio_tokens(self) -> int:
         return self.get_hf_config().max_source_positions
 
@@ -709,7 +728,10 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
 class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
-        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
+        return MultiModalDataParser(
+            target_sr=feature_extractor.sampling_rate,
+            target_channels=self.info.get_target_channels(),
+        )
 
     @property
     def pad_dummy_encoder_prompt(self) -> bool:
diff --git a/vllm/model_executor/models/whisper_utils.py b/vllm/model_executor/models/whisper_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d41ccde0a23a25d0370af9fc867cb28a812ea301
--- /dev/null
+++ b/vllm/model_executor/models/whisper_utils.py
@@ -0,0 +1,299 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+import functools
+import math
+from dataclasses import replace
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, VllmConfig
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+    CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
+from vllm.v1.attention.backends.utils import (
+    subclass_attention_backend_with_overrides,
+)
+from vllm.v1.attention.selector import get_attn_backend
+from vllm.v1.kv_cache_interface import AttentionSpec
+
+# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
+ISO639_1_SUPPORTED_LANGS = {
+    "af": "Afrikaans",
+    "ar": "Arabic",
+    "hy": "Armenian",
+    "az": "Azerbaijani",
+    "be": "Belarusian",
+    "bs": "Bosnian",
+    "bg": "Bulgarian",
+    "ca": "Catalan",
+    "zh": "Chinese",
+    "hr": "Croatian",
+    "cs": "Czech",
+    "da": "Danish",
+    "nl": "Dutch",
+    "en": "English",
+    "et": "Estonian",
+    "fi": "Finnish",
+    "fr": "French",
+    "gl": "Galician",
+    "de": "German",
+    "el": "Greek",
+    "he": "Hebrew",
+    "hi": "Hindi",
+    "hu": "Hungarian",
+    "is": "Icelandic",
+    "id": "Indonesian",
+    "it": "Italian",
+    "ja": "Japanese",
+    "kn": "Kannada",
+    "kk": "Kazakh",
+    "ko": "Korean",
+    "lv": "Latvian",
+    "lt": "Lithuanian",
+    "mk": "Macedonian",
+    "ms": "Malay",
+    "mr": "Marathi",
+    "mi": "Maori",
+    "ne": "Nepali",
+    "no": "Norwegian",
+    "fa": "Persian",
+    "pl": "Polish",
+    "pt": "Portuguese",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "sr": "Serbian",
+    "sk": "Slovak",
+    "sl": "Slovenian",
+    "es": "Spanish",
+    "sw": "Swahili",
+    "sv": "Swedish",
+    "tl": "Tagalog",
+    "ta": "Tamil",
+    "th": "Thai",
+    "tr": "Turkish",
+    "uk": "Ukrainian",
+    "ur": "Urdu",
+    "vi": "Vietnamese",
+    "cy": "Welsh",
+}
+
+
+def _pad1d(
+    x: torch.Tensor,
+    paddings: tuple[int, int],
+    mode: str = "constant",
+    value: float = 0.0,
+) -> torch.Tensor:
+    """Tiny wrapper around F.pad, just to allow for
+    reflect padding on small input.
+    If this is the case, we insert extra 0 padding
+    to the right before the reflection happen.
+    """
+    length = x.shape[-1]
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    if mode == "reflect":
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            x = F.pad(x, (0, extra_pad))
+        padded = F.pad(x, paddings, mode, value)
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]
+    else:
+        return F.pad(x, paddings, mode, value)
+
+
+class WhisperCausalConv1d(nn.Conv1d):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=bias,
+        )
+        self._stride = self.stride[0]
+        self._effective_kernel_size = (kernel_size - 1) * self.dilation[0] + 1
+        self._padding_total = self._effective_kernel_size - self._stride
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        n_frames = (
+            x.shape[-1] - self._effective_kernel_size + self._padding_total
+        ) / self._stride + 1
+        target_length = (math.ceil(n_frames) - 1) * self._stride + (
+            self._effective_kernel_size - self._padding_total
+        )
+        extra_padding = target_length - x.shape[-1]
+        x = _pad1d(x, (self._padding_total, extra_padding), mode="constant")
+        return super().forward(x)
+
+
+@functools.lru_cache
+def create_whisper_attention_backend_with_block_pooling(
+    underlying_attn_backend: AttentionBackend, block_pool_size: int
+) -> type[AttentionBackend]:
+    prefix = "WhisperAttentionWithBlockPooling_"
+    underlying_builder = underlying_attn_backend.get_builder_cls()
+
+    class WhisperAttentionWithBlockPoolingBuilder(underlying_builder):  # type: ignore
+        def __init__(
+            self,
+            kv_cache_spec: AttentionSpec,
+            layer_names: list[str],
+            vllm_config: VllmConfig,
+            device: torch.device,
+        ):
+            assert kv_cache_spec.num_kv_heads % block_pool_size == 0
+            kv_cache_spec = replace(
+                kv_cache_spec,
+                block_size=kv_cache_spec.block_size * block_pool_size,
+                num_kv_heads=kv_cache_spec.num_kv_heads // block_pool_size,
+            )
+            super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+
+        def build(
+            self,
+            common_prefix_len: int,
+            common_attn_metadata: CommonAttentionMetadata,
+            fast_build: bool = False,
+        ) -> AttentionMetadata:
+            new_common_attn_metadata = copy.deepcopy(common_attn_metadata)
+            new_common_attn_metadata.query_start_loc *= block_pool_size
+            new_common_attn_metadata.query_start_loc_cpu *= block_pool_size
+            new_common_attn_metadata.seq_lens *= block_pool_size
+            new_common_attn_metadata._seq_lens_cpu *= block_pool_size
+            new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
+            new_common_attn_metadata.num_actual_tokens *= block_pool_size
+            new_common_attn_metadata.max_query_len *= block_pool_size
+            new_common_attn_metadata.max_seq_len *= block_pool_size
+            original_slot_mapping = common_attn_metadata.slot_mapping
+            common_prefix_len *= block_pool_size
+            new_common_attn_metadata.slot_mapping = (
+                (
+                    original_slot_mapping.unsqueeze(1) * block_pool_size
+                    + torch.arange(block_pool_size, device=original_slot_mapping.device)
+                )
+                .flatten()
+                .clamp(min=-1)
+            )
+            return super().build(
+                common_prefix_len, new_common_attn_metadata, fast_build
+            )
+
+    if not issubclass(underlying_attn_backend, FlashAttentionBackend):
+        raise NotImplementedError(
+            f"{underlying_attn_backend} is not yet supported."
+            "Contributions to support more backends are much "
+            "appreciated."
+        )
+
+    attn_backend = subclass_attention_backend_with_overrides(
+        name_prefix=prefix,
+        attention_backend_cls=underlying_attn_backend,
+        overrides={
+            "get_builder_cls": lambda: WhisperAttentionWithBlockPoolingBuilder,
+            "get_kv_cache_shape": lambda num_blocks,
+            block_size,
+            num_kv_heads,
+            head_size,
+            cache_dtype_str: (
+                2,
+                num_blocks,
+                # we stretch each block by `block_pool_size`
+                block_size * block_pool_size,
+                num_kv_heads // block_pool_size,
+                head_size,
+            ),  # TODO: generalize to other backends
+        },
+    )
+
+    return attn_backend
+
+
+class WhisperAttentionWithBlockPooling(Attention):
+    """Attention layer with block pooling."""
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int | None = None,
+        alibi_slopes: list[float] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        logits_soft_cap: float | None = None,
+        per_layer_sliding_window: int | None = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: str | None = None,
+        block_pool_size: int = 1,
+        attn_backend: type[AttentionBackend] | None = None,
+        **extra_impl_args,
+    ) -> None:
+        self.block_pool_size = block_pool_size
+        dtype = torch.get_default_dtype()
+
+        if cache_config is not None:
+            kv_cache_dtype = cache_config.cache_dtype
+            block_size = cache_config.block_size
+        else:
+            kv_cache_dtype = "auto"
+            block_size = 16
+
+        underlying_attn_backend = get_attn_backend(
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            attn_type=attn_type,
+        )
+        attn_backend = create_whisper_attention_backend_with_block_pooling(
+            underlying_attn_backend, block_pool_size
+        )
+
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            logits_soft_cap=logits_soft_cap,
+            per_layer_sliding_window=per_layer_sliding_window,
+            prefix=prefix,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            attn_backend=attn_backend,
+            **extra_impl_args,
+        )
+
+    def get_kv_cache_spec(self, vllm_config: VllmConfig):
+        kv_cache_spec = super().get_kv_cache_spec(vllm_config)
+        assert isinstance(kv_cache_spec, AttentionSpec)
+        kv_cache_spec = replace(
+            kv_cache_spec,
+            num_kv_heads=self.block_pool_size * kv_cache_spec.num_kv_heads,
+        )
+        return kv_cache_spec
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index fe157887eea910a3bb5acd3b90f68fa6102eda5c..b5132cd860249bca34f6a17d7519503bbb1cccf8 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -86,7 +86,13 @@ class Zamba2LoRA(nn.Module):
             B_class = MergedColumnParallelLinear
         else:
             B_class = ColumnParallelLinear
-        self.B = B_class(rank, output_dim, bias=False, quant_config=quant_config)
+        self.B = B_class(
+            rank,
+            output_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.B",
+        )
 
     def forward(
         self,
@@ -346,6 +352,7 @@ class Zamba2MLP(nn.Module):
                     config.adapter_rank,
                     2 * [self.intermediate_size],
                     quant_config,
+                    prefix=f"{prefix}.gate_up_proj_adapter_list.{block_idx}",
                 )
             else:
                 gate_up_proj_adapter = nn.Identity()
diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py
index 936f6b1e28ce1784282f9b14c96008a3e1c3a572..2bbc655bd935ff03d990e5424420900d91442a19 100644
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -10,7 +10,7 @@ import torch
 from tqdm import tqdm
 
 import vllm.envs as envs
-from vllm.distributed.parallel_state import get_dp_group
+from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
@@ -175,7 +175,30 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
 FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set()
 
 
-def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, max_tokens: int):
+def _get_fp8_gemm_nt_m_values(w: torch.Tensor, max_tokens: int) -> list[int]:
+    """Get the M values to warmup for a given weight tensor."""
+    n, _ = w.size()
+    device = w.device
+
+    # Use optimal M values only if VLLM_DEEP_GEMM_WARMUP is set to "relax".
+    # Otherwise warmup all token sizes to avoid JIT compilation in hotpath
+    if envs.VLLM_DEEP_GEMM_WARMUP == "relax":
+        return _generate_optimal_warmup_m_values(max_tokens, n, device)
+    else:
+        assert envs.VLLM_DEEP_GEMM_WARMUP == "full", (
+            "Expected "
+            'VLLM_DEEP_GEMM_WARMUP env to be set to "full" but got '
+            f"{envs.VLLM_DEEP_GEMM_WARMUP}"
+        )
+        return list(range(1, max_tokens + 1))
+
+
+def _deepgemm_fp8_gemm_nt_warmup(
+    w: torch.Tensor,
+    ws: torch.Tensor,
+    max_tokens: int,
+    pbar: tqdm | None = None,
+):
     if w.size() in FP8_GEMM_NT_WARMUP_CACHE:
         return
 
@@ -189,27 +212,14 @@ def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, max_tokens:
     )
     out = torch.empty((max_tokens, n), device=device, dtype=torch.bfloat16)
 
-    # Use optimal M values only if VLLM_DEEP_GEMM_WARMUP is set to "relax".
-    # Otherwise warmup all token sizes to avoid JIT compilation in hotpath
-    if envs.VLLM_DEEP_GEMM_WARMUP == "relax":
-        m_values = _generate_optimal_warmup_m_values(max_tokens, n, device)
-        desc = f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()}) [relaxed]"
-    else:
-        assert envs.VLLM_DEEP_GEMM_WARMUP == "full", (
-            "Expected "
-            'VLLM_DEEP_GEMM_WARMUP env to be set to "full" but got '
-            f"{envs.VLLM_DEEP_GEMM_WARMUP}"
-        )
-        m_values = list(range(1, max_tokens + 1))
-        desc = f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()}) [all tokens]"
-
-    pbar = tqdm(total=len(m_values), desc=desc)
+    m_values = _get_fp8_gemm_nt_m_values(w, max_tokens)
 
     for num_tokens in m_values:
         fp8_gemm_nt(
             (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, ws), out[:num_tokens]
         )
-        pbar.update(1)
+        if pbar is not None:
+            pbar.update(1)
 
     FP8_GEMM_NT_WARMUP_CACHE.add(w.size())
 
@@ -217,20 +227,12 @@ def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, max_tokens:
 GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: set[torch.Size] = set()
 
 
-def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
+def _get_grouped_gemm_params(
     w1: torch.Tensor,
     w2: torch.Tensor,
-    w1_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
     num_topk: int,
     max_tokens: int,
-):
-    if (
-        w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
-        and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
-    ):
-        return
-
+) -> tuple[int, int, torch.Tensor]:
     assert w1.size(0) == w2.size(0), "w1 and w2 must have the same number of experts"
 
     block_m = get_mk_alignment_for_contiguous_layout()[0]
@@ -253,6 +255,27 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
     )
     expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0)
 
+    return MAX_M, block_m, expert_ids
+
+
+def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    num_topk: int,
+    max_tokens: int,
+    pbar: tqdm | None = None,
+):
+    if (
+        w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
+        and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
+    ):
+        return
+
+    MAX_M, block_m, expert_ids = _get_grouped_gemm_params(w1, w2, num_topk, max_tokens)
+    device = w1.device
+
     def _warmup(w: torch.Tensor, w_scale: torch.Tensor):
         _, n, k = w.size()
         a1q = torch.empty((MAX_M, k), device=device, dtype=torch.float8_e4m3fn)
@@ -261,15 +284,8 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
         )
         out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16)
 
-        # Generate M values in block_m increments (already optimized for MoE)
         m_values = list(range(block_m, MAX_M + 1, block_m))
 
-        pbar = tqdm(
-            total=len(m_values),
-            desc=f"DeepGemm(m_grouped_fp8_gemm_nt_contiguous) warmup (W={w.size()}) "
-            f"[{len(m_values)} values, block_m={block_m}]",
-        )
-
         for num_tokens in m_values:
             m_grouped_fp8_gemm_nt_contiguous(
                 (a1q[:num_tokens], a1q_scales[:num_tokens]),
@@ -277,7 +293,8 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
                 out[:num_tokens],
                 expert_ids[:num_tokens],
             )
-            pbar.update(1)
+            if pbar is not None:
+                pbar.update(1)
 
     for w, ws in [(w1, w1_scale), (w2, w2_scale)]:
         if w.size() not in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE:
@@ -285,16 +302,18 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
             GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE.add(w.size())
 
 
-def deepgemm_fp8_gemm_nt_warmup(model: torch.nn.Module, max_tokens: int):
+def deepgemm_fp8_gemm_nt_warmup(
+    model: torch.nn.Module, max_tokens: int, pbar: tqdm | None = None
+):
     dg_modules = [m for m in model.modules() if _fp8_linear_may_use_deep_gemm(m)]
 
     for dgm in dg_modules:
         w, ws, _ = _extract_data_from_linear_base_module(dgm)
-        _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens)
+        _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens, pbar=pbar)
 
 
 def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
-    model: torch.nn.Module, max_tokens: int
+    model: torch.nn.Module, max_tokens: int, pbar: tqdm | None = None
 ):
     dg_modules = [
         m for m in model.modules() if _fused_moe_grouped_gemm_may_use_deep_gemm(m)
@@ -305,10 +324,48 @@ def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
             dgm
         )
         _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
-            w13, w2, w13_scale, w2_scale, num_topk, max_tokens
+            w13, w2, w13_scale, w2_scale, num_topk, max_tokens, pbar=pbar
         )
 
 
+def _count_warmup_iterations(model: torch.nn.Module, max_tokens: int) -> int:
+    seen_fp8_sizes: set[torch.Size] = set(FP8_GEMM_NT_WARMUP_CACHE)
+    seen_grouped_sizes: set[torch.Size] = set(
+        GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
+    )
+
+    total = 0
+    for m in model.modules():
+        if _fp8_linear_may_use_deep_gemm(m):
+            w, _, _ = _extract_data_from_linear_base_module(m)
+            if w.size() not in seen_fp8_sizes:
+                total += len(_get_fp8_gemm_nt_m_values(w, max_tokens))
+                seen_fp8_sizes.add(w.size())
+        elif _fused_moe_grouped_gemm_may_use_deep_gemm(m):
+            w13, _, w2, _, num_topk = _extract_data_from_fused_moe_module(m)
+            if w13.size() in seen_grouped_sizes and w2.size() in seen_grouped_sizes:
+                continue
+            MAX_M, block_m, _ = _get_grouped_gemm_params(w13, w2, num_topk, max_tokens)
+            n_values = (MAX_M - block_m) // block_m + 1
+            if w13.size() not in seen_grouped_sizes:
+                total += n_values
+                seen_grouped_sizes.add(w13.size())
+            if w2.size() not in seen_grouped_sizes:
+                total += n_values
+                seen_grouped_sizes.add(w2.size())
+    return total
+
+
 def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int):
-    deepgemm_fp8_gemm_nt_warmup(model, max_tokens)
-    deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens)
+    total = _count_warmup_iterations(model, max_tokens)
+    if total == 0:
+        return
+
+    # Only show progress bar on rank 0 to avoid cluttered output
+    if is_global_first_rank():
+        with tqdm(total=total, desc="DeepGEMM warmup") as pbar:
+            deepgemm_fp8_gemm_nt_warmup(model, max_tokens, pbar)
+            deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens, pbar)
+    else:
+        deepgemm_fp8_gemm_nt_warmup(model, max_tokens, None)
+        deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens, None)
diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index 95f5982bc8c7b18a6b3e092b2bbd5d22c504b24b..98b28d3e5292f1050180a59c1510a0e85a4cadd5 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -49,13 +49,12 @@ def kernel_warmup(worker: "Worker"):
         except NotImplementedError:
             return False
 
-    # NOTE: we add check for empty attn_groups to avoid errors when
-    # deploying models such as E instances and encoder-only models.
-    # As for those models, worker.model_runner.attn_groups is empty.
-    # This change is made during EPD feature development.
     if (
         not worker.model_runner.is_pooling_model
         and worker.model_runner.attn_groups
+        # NOTE: This should be `any` instead of `all` but other hybrid attention
+        # backends don't support this dummy run. Once we remove
+        # `build_for_cudagraph_capture`, we can change it to `any`.
         and all(
             _is_flashinfer_backend(group.backend)
             for groups in worker.model_runner.attn_groups
diff --git a/vllm/model_inspection.py b/vllm/model_inspection.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4c9c40410b6a8d6b4dbcd3b5f34bc161ec3d33d
--- /dev/null
+++ b/vllm/model_inspection.py
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Model inspection utilities for vLLM."""
+
+import torch.nn as nn
+
+
+def _get_module_info(module: nn.Module) -> str:
+    """Get info string for a module."""
+    class_name = type(module).__name__
+    parts = []
+
+    # Add quant_method if present
+    quant_method = getattr(module, "quant_method", None)
+    if quant_method is not None:
+        quant_name = type(quant_method).__name__
+        # For CompressedTensors, show the underlying scheme instead
+        scheme = getattr(module, "scheme", None)
+        if scheme is not None:
+            quant_name = type(scheme).__name__
+        # Skip unquantized methods
+        if "Unquantized" not in quant_name:
+            parts.append(f"quant={quant_name}")
+
+    # If module has extra_repr, use it
+    if hasattr(module, "extra_repr"):
+        parts.append(module.extra_repr().replace("\n", ""))
+
+    if parts:
+        return f"{class_name}({', '.join(parts)})"
+
+    # For unknown modules, use the default PyTorch repr
+    return str(module)
+
+
+def _get_child_signature(child: nn.Module) -> str:
+    """Get a signature for a child module to detect duplicates."""
+    lines = []
+    for name, submodule in child.named_modules():
+        lines.append(f"{name}:{_get_module_info(submodule)}")
+    return "\n".join(lines)
+
+
+def _format_index_ranges(indices: list[int]) -> str:
+    """Format indices into range notation (e.g., [0,1,2,4,5,6] -> '0-2, 4-6')."""
+    indices = sorted(indices)
+    ranges = []
+    start = end = indices[0]
+
+    for idx in indices[1:]:
+        if idx == end + 1:
+            end = idx
+        else:
+            ranges.append(str(start) if start == end else f"{start}-{end}")
+            start = end = idx
+
+    ranges.append(str(start) if start == end else f"{start}-{end}")
+    return ", ".join(ranges)
+
+
+def _format_module_tree(
+    module: nn.Module,
+    name: str = "",
+    indent: int = 0,
+) -> list[str]:
+    """Format a module tree with indentation, grouping identical layers.
+
+    Produces output like:
+        (layers): ModuleList(
+          (0-27, 29-47): 47 x LlamaDecoderLayer(
+            ...
+          )
+          (28, 48): 2 x DifferentDecoderLayer(
+            ...
+          )
+        )
+    """
+    lines = []
+    prefix = "  " * indent
+    children = list(module.named_children())
+
+    # Leaf node - just output the module info
+    if not children:
+        info = _get_module_info(module)
+        lines.append(f"{prefix}({name}): {info}" if name else f"{prefix}{info}")
+        return lines
+
+    # Non-leaf node - output opening line and recurse into children
+    info = _get_module_info(module)
+    lines.append(f"{prefix}({name}): {info}(" if name else f"{prefix}{info}(")
+
+    # Separate numbered children (e.g., "0", "1") from named ones (e.g., "norm")
+    numbered: list[tuple[int, nn.Module]] = []
+    non_numbered: list[tuple[str, nn.Module]] = []
+    for child_name, child_module in children:
+        try:
+            numbered.append((int(child_name), child_module))
+        except ValueError:
+            non_numbered.append((child_name, child_module))
+
+    # Group numbered children by structure signature to collapse identical layers
+    # e.g., layers 0-27 and 29-47 with same structure become "(0-27, 29-47): 47 x"
+    if numbered:
+        sig_to_group: dict[str, list[tuple[int, nn.Module]]] = {}
+        for idx, child_module in numbered:
+            sig = _get_child_signature(child_module)
+            sig_to_group.setdefault(sig, []).append((idx, child_module))
+
+        # Output groups sorted by first index
+        for group in sorted(sig_to_group.values(), key=lambda g: g[0][0]):
+            indices = [idx for idx, _ in group]
+            representative = group[0][1]
+            child_lines = _format_module_tree(representative, "", indent + 1)
+            first_line = child_lines[0].lstrip()
+            child_prefix = "  " * (indent + 1)
+
+            if len(indices) > 1:
+                range_str = _format_index_ranges(indices)
+                child_lines[0] = (
+                    f"{child_prefix}({range_str}): {len(indices)} x {first_line}"
+                )
+            else:
+                child_lines[0] = f"{child_prefix}({indices[0]}): {first_line}"
+            lines.extend(child_lines)
+
+    # Output non-numbered children (e.g., "embed_tokens", "norm")
+    for child_name, child_module in non_numbered:
+        lines.extend(_format_module_tree(child_module, child_name, indent + 1))
+
+    lines.append(f"{prefix})")
+    return lines
+
+
+def format_model_inspection(model: nn.Module) -> str:
+    """Format a model into a transformers-style hierarchical string."""
+    return "\n".join(_format_module_tree(model))
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 41e09a3b50f88436a0da1ee5217caae6c263c585..915d83e9fe5891f9240fb2ef2e128f8bb26ca3fd 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -6,7 +6,6 @@ from .inputs import (
     ModalityData,
     MultiModalDataBuiltins,
     MultiModalDataDict,
-    MultiModalKwargs,
     MultiModalKwargsItems,
     MultiModalPlaceholderDict,
     MultiModalUUIDDict,
@@ -30,7 +29,6 @@ __all__ = [
     "MultiModalDataBuiltins",
     "MultiModalDataDict",
     "MultiModalHasher",
-    "MultiModalKwargs",
     "MultiModalKwargsItems",
     "MultiModalPlaceholderDict",
     "MultiModalUUIDDict",
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 51b8f77f29088af5d1c872be0d3c8057f995ba2b..813725d6d8cb115d60a6eb59cd7167c7ec350243 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import base64
+from dataclasses import dataclass
+from enum import Enum
 from io import BytesIO
 from pathlib import Path
 from typing import Literal
@@ -26,6 +28,142 @@ except ImportError:
     soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
 
 
+try:
+    import scipy.signal as scipy_signal
+except ImportError:
+    scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal")  # type: ignore[assignment]
+
+# ============================================================
+
+
+class ChannelReduction(str, Enum):
+    """Method to reduce multi-channel audio to target channels."""
+
+    MEAN = "mean"  # Average across channels (default, preserves energy balance)
+    FIRST = "first"  # Take first channel only
+    MAX = "max"  # Take max value across channels
+    SUM = "sum"  # Sum across channels
+
+
+@dataclass
+class AudioSpec:
+    """Specification for target audio format.
+
+    This dataclass defines the expected audio format for a model's feature
+    extractor. It is used to normalize audio data before processing.
+
+    Attributes:
+        target_channels: Number of output channels. None means passthrough
+            (no normalization). 1 = mono, 2 = stereo, etc.
+        channel_reduction: Method to reduce channels when input has more
+            channels than target. Only used when reducing channels.
+    """
+
+    target_channels: int | None = 1
+    channel_reduction: ChannelReduction = ChannelReduction.MEAN
+
+    @property
+    def needs_normalization(self) -> bool:
+        """Whether audio normalization is needed."""
+        return self.target_channels is not None
+
+    def __repr__(self) -> str:
+        if self.target_channels is None:
+            return "AudioSpec(passthrough)"
+        return (
+            f"AudioSpec(channels={self.target_channels}, "
+            f"reduction={self.channel_reduction.value})"
+        )
+
+
+# Pre-defined specs for common use cases
+MONO_AUDIO_SPEC = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.MEAN)
+PASSTHROUGH_AUDIO_SPEC = AudioSpec(target_channels=None)
+
+
+def normalize_audio(
+    audio: npt.NDArray[np.floating] | torch.Tensor,
+    spec: AudioSpec,
+) -> npt.NDArray[np.floating] | torch.Tensor:
+    """Normalize audio to the specified format.
+
+    This function handles channel reduction for multi-channel audio,
+    supporting both numpy arrays and torch tensors.
+
+    Args:
+        audio: Input audio data. Can be:
+            - 1D array/tensor: (time,) - already mono
+            - 2D array/tensor: (channels, time) - standard format from torchaudio
+            - 2D array/tensor: (time, channels) - format from soundfile
+              (will be auto-detected and transposed if time > channels)
+        spec: AudioSpec defining the target format.
+
+    Returns:
+        Normalized audio in the same type as input (numpy or torch).
+        For mono output (target_channels=1), returns 1D array/tensor.
+
+    Raises:
+        ValueError: If audio has unsupported dimensions or channel expansion
+            is requested (e.g., mono to stereo).
+    """
+    if not spec.needs_normalization:
+        return audio
+
+    # Handle 1D audio (already mono)
+    if audio.ndim == 1:
+        if spec.target_channels == 1:
+            return audio
+        raise ValueError(f"Cannot expand mono audio to {spec.target_channels} channels")
+
+    # Handle 2D audio
+    if audio.ndim != 2:
+        raise ValueError(f"Unsupported audio shape: {audio.shape}. Expected 1D or 2D.")
+
+    # Auto-detect format: if shape[0] > shape[1], assume (time, channels)
+    # This handles soundfile format where time dimension is typically much larger
+    if audio.shape[0] > audio.shape[1]:
+        # Transpose from (time, channels) to (channels, time)
+        audio = audio.T if isinstance(audio, np.ndarray) else audio.T
+
+    num_channels = audio.shape[0]
+
+    # No reduction needed if already at target
+    if num_channels == spec.target_channels:
+        return audio
+
+    # Cannot expand channels
+    if num_channels < spec.target_channels:
+        raise ValueError(
+            f"Cannot expand {num_channels} channels to {spec.target_channels}"
+        )
+
+    # Reduce channels
+    is_numpy = isinstance(audio, np.ndarray)
+
+    if spec.target_channels == 1:
+        # Reduce to mono
+        if spec.channel_reduction == ChannelReduction.MEAN:
+            result = np.mean(audio, axis=0) if is_numpy else audio.mean(dim=0)
+        elif spec.channel_reduction == ChannelReduction.FIRST:
+            result = audio[0]
+        elif spec.channel_reduction == ChannelReduction.MAX:
+            result = np.max(audio, axis=0) if is_numpy else audio.max(dim=0).values
+        elif spec.channel_reduction == ChannelReduction.SUM:
+            result = np.sum(audio, axis=0) if is_numpy else audio.sum(dim=0)
+        else:
+            raise ValueError(f"Unknown reduction method: {spec.channel_reduction}")
+        return result
+    else:
+        # Reduce to N channels (take first N and apply reduction if needed)
+        # For now, just take first N channels
+        return audio[: spec.target_channels]
+
+
+# ============================================================
+# Audio Resampling
+# ============================================================
+
+
 def resample_audio_librosa(
     audio: npt.NDArray[np.floating],
     *,
@@ -41,13 +179,10 @@ def resample_audio_scipy(
     orig_sr: float,
     target_sr: float,
 ):
-    # lazy import scipy.signal, otherwise it will crash doc build.
-    import scipy.signal
-
     if orig_sr > target_sr:
-        return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
+        return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
     elif orig_sr < target_sr:
-        return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1)
+        return scipy_signal.resample_poly(audio, target_sr // orig_sr, 1)
     return audio
 
 
@@ -111,11 +246,16 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
     def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
         return librosa.load(filepath, sr=None)
 
-    def encode_base64(self, media: tuple[npt.NDArray, int]) -> str:
+    def encode_base64(
+        self,
+        media: tuple[npt.NDArray, int],
+        *,
+        audio_format: str = "WAV",
+    ) -> str:
         audio, sr = media
 
         with BytesIO() as buffer:
-            soundfile.write(buffer, audio, sr, format="WAV")
+            soundfile.write(buffer, audio, sr, format=audio_format)
             data = buffer.getvalue()
 
         return base64.b64encode(data).decode("utf-8")
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 53eb4c591ef99ead02f42cfea7e8745d729a5a63..b8cdb10fda17f8106c60e14980c209100eb4d0f5 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -34,7 +34,11 @@ class MediaWithBytes(Generic[_T]):
 
     def __getattr__(self, name: str):
         """Delegate attribute access to the underlying media object."""
-        # This is only called when the attribute is not found on self
+        # Guard against recursion during unpickling when media isn't set yet.
+        # pickle creates objects without calling __init__, so self.media may
+        # not exist when __getattr__ is called for methods like __setstate__.
+        if "media" not in self.__dict__:
+            raise AttributeError(name)
         return getattr(self.media, name)
 
 
diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index 67bdf5e1557f9dc9f74937fc987dc4abe4fcf4d7..41397a26ef956a912119f6278ce66481186f0e5d 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -20,6 +20,7 @@ from vllm.logger import init_logger
 from vllm.utils.cache import CacheInfo, LRUCache
 from vllm.utils.jsontree import json_count_leaves, json_map_leaves, json_reduce_leaves
 from vllm.utils.mem_constants import GiB_bytes, MiB_bytes
+from vllm.utils.mem_utils import format_gib
 
 from .inputs import (
     MultiModalBatchedField,
@@ -130,9 +131,9 @@ class MultiModalCache:
         if debug:
             leaf_count = json_count_leaves(value)
             logger.debug(
-                "Calculated size of %s to be %.2f GiB (%d leaves)",
+                "Calculated size of %s to be %s GiB (%d leaves)",
                 type(value),
-                size / GiB_bytes,
+                format_gib(size),
                 leaf_count,
             )
 
@@ -634,12 +635,17 @@ class BaseMultiModalReceiverCache(
         Update multimodal features with cached encoder outputs.
         Touch all identifier at first before update to avoid
         item in updated list evict during update.
+
+        Uses mm_hash for cache key to share across LoRAs (falls back to
+        identifier for backward compatibility).
         """
         for feature in mm_features:
-            self.touch_receiver_cache_item(feature.identifier, feature.data)
+            cache_key = feature.mm_hash or feature.identifier
+            self.touch_receiver_cache_item(cache_key, feature.data)
 
         for feature in mm_features:
-            feature.data = self.get_and_update_item(feature.data, feature.identifier)
+            cache_key = feature.mm_hash or feature.identifier
+            feature.data = self.get_and_update_item(feature.data, cache_key)
         return mm_features
 
     @abstractmethod
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index cc50322fed902ebaa5253504bd649d4f22f5f9e3..7f0c6d134929bc5fb4257a02fed4b905eda3ba42 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -98,6 +98,9 @@ class MultiModalHasher:
         key: str,
         obj: object,
     ) -> Iterable[bytes | memoryview]:
+        if obj is None:
+            yield key.encode("utf-8")
+            return
         # Recursive cases
         if isinstance(obj, (list, tuple)):
             for i, elem in enumerate(obj):
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 1506ecb8c7aa0d5a553fa0ab2eda39219916bed4..8e1178bc7ea44c93fd0795956e236fd01ab2ed00 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -8,8 +8,12 @@ import pybase64
 import torch
 from PIL import Image
 
+from vllm.logger import init_logger
+
 from .base import MediaIO, MediaWithBytes
 
+logger = init_logger(__file__)
+
 
 def rescale_image_size(
     image: Image.Image, size_factor: float, transpose: int = -1
@@ -104,8 +108,17 @@ class ImageMediaIO(MediaIO[Image.Image]):
         self,
         media: Image.Image,
         *,
-        image_format: str = "JPEG",
+        image_format: str | None = None,
     ) -> str:
+        if image_format is None:
+            logger.warning_once(
+                "The default format of `ImageMediaIO.encode_base64` will be changed "
+                'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
+                "To continue using the old default, "
+                'pass `format="JPEG"` explicitly to silence this warning.'
+            )
+            image_format = "JPEG"
+
         image = media
 
         with BytesIO() as buffer:
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index fa69818a7b1f83f3bb4c35212034d8e1ee5d9d5b..bd49d719234690c6e7c00835201990f3e5582090 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -20,7 +20,7 @@ from typing import (
 )
 
 import numpy as np
-from typing_extensions import NotRequired, TypeVar, deprecated
+from typing_extensions import NotRequired, TypeVar
 
 from vllm.utils.collection_utils import full_groupby, is_list_of
 from vllm.utils.import_utils import LazyLoader
@@ -171,10 +171,7 @@ class PlaceholderRange:
 
     @cached_property
     def embeds_cumsum(self) -> torch.Tensor | None:
-        if self.is_embed is None:
-            return None
-
-        return self.is_embed.cumsum(dim=0)
+        return None if self.is_embed is None else self.is_embed.cumsum(dim=0)
 
     @cached_property
     def get_num_embeds(self) -> int:
@@ -308,13 +305,7 @@ def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> boo
     Equality check between
     [`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
     """
-    for k in a:
-        if k not in b:
-            return False
-        if not nested_tensors_equal(a[k], b[k]):
-            return False
-
-    return True
+    return all(k in b and nested_tensors_equal(a[k], b[k]) for k in a)
 
 
 @dataclass
@@ -339,6 +330,9 @@ class MultiModalFeatureSpec:
     mm_position: PlaceholderRange
     """e.g., PlaceholderRange(offset=2, length=336)"""
 
+    mm_hash: str | None = None
+    """Base mm_hash for processor cache (without LoRA prefix)."""
+
     @staticmethod
     def gather_kwargs(features: list["MultiModalFeatureSpec"], keys: set[str]):
         kwargs = defaultdict[str, list[NestedTensors]](list)
@@ -356,8 +350,8 @@ class MultiModalFeatureSpec:
 @dataclass
 class MultiModalFieldElem:
     """
-    Represents a keyword argument corresponding to a multi-modal item
-    in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
+    Represents a keyword argument inside a
+    [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem].
     """
 
     modality: str
@@ -369,14 +363,14 @@ class MultiModalFieldElem:
     key: str
     """
     The key of this field in
-    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs],
+    [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem],
     i.e. the name of the keyword argument to be passed to the model.
     """
 
     data: NestedTensors
     """
     The tensor data of this field in
-    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs],
+    [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem],
     i.e. the value of the keyword argument to be passed to the model.
 
     It may be set to `None` if it is determined that the item is cached
@@ -410,9 +404,9 @@ class MultiModalFieldElem:
 @dataclass(frozen=True, kw_only=True)
 class BaseMultiModalField(ABC):
     """
-    Defines how to interpret tensor data belonging to a keyword argument in
-    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] for multiple
-    multi-modal items, and vice versa.
+    Defines how to interpret tensor data belonging to a keyword argument for
+    [`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems],
+    and vice versa.
     """
 
     keep_on_cpu: bool = False
@@ -985,62 +979,6 @@ MultiModalKwargsOptionalItems: TypeAlias = (
 )
 
 
-@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.14.")
-class MultiModalKwargs(UserDict[str, NestedTensors]):
-    """
-    A dictionary that represents the keyword arguments to
-    [`torch.nn.Module.forward`][].
-    """
-
-    @staticmethod
-    @deprecated(
-        "`MultiModalKwargs.from_hf_inputs` is deprecated and "
-        "will be removed in v0.14. "
-        "Please use `MultiModalKwargsItems.from_hf_inputs` and "
-        "access the tensor data using `.get_data()`."
-    )
-    def from_hf_inputs(
-        hf_inputs: "BatchFeature",
-        config_by_key: Mapping[str, MultiModalFieldConfig],
-    ):
-        return MultiModalKwargsItems.from_hf_inputs(hf_inputs, config_by_key).get_data()
-
-    @staticmethod
-    @deprecated(
-        "`MultiModalKwargs.from_items` is deprecated and "
-        "will be removed in v0.14. "
-        "Please use `MultiModalKwargsItems.from_seq` and "
-        "access the tensor data using `.get_data()`."
-    )
-    def from_items(
-        items: Sequence[MultiModalKwargsItem],
-        *,
-        pin_memory: bool = False,
-    ):
-        return MultiModalKwargsItems.from_seq(items).get_data(pin_memory=pin_memory)
-
-    def __getitem__(self, key: str):
-        if key not in self:
-            raise KeyError(
-                f"Keyword argument {key!r} not found. "
-                f"Available keys: {set(self.keys())}"
-            )
-
-        return super().__getitem__(key)
-
-    def __eq__(self, other: object) -> bool:
-        if not isinstance(other, self.__class__):
-            return False
-
-        for k in self:
-            if k not in other:
-                return False
-            if not nested_tensors_equal(self[k], other[k]):
-                return False
-
-        return True
-
-
 MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]]
 """
 A dictionary containing placeholder ranges for each modality.
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index ef181ee2de0f3a8198934b23df1d0778d48ff5b7..f9e94034219a7e5aff966e4a89fc4777814d2c1a 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -22,7 +22,7 @@ from typing_extensions import assert_never
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import LazyLoader
 
-from .audio import AudioResampler
+from .audio import AudioResampler, AudioSpec, normalize_audio
 from .base import MediaWithBytes
 from .inputs import (
     AudioItem,
@@ -126,6 +126,30 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
         return {}
 
 
+def validate_embedding_ndim(
+    tensor: torch.Tensor,
+    modality: str,
+    index: int | None = None,
+) -> None:
+    """Validate tensor ndim for multimodal embeddings.
+
+    Single embeddings should be 2D (seq_len, hidden_size).
+    Batched embeddings should be 3D (batch, seq_len, hidden_size).
+
+    Args:
+        tensor: The tensor to validate.
+        modality: The modality name for error messages (e.g., "image", "audio").
+        index: Optional index for list items, included in error messages.
+    """
+    if tensor.ndim < 2 or tensor.ndim > 3:
+        idx_str = f" [{index}]" if index is not None else ""
+        raise ValueError(
+            f"{modality.capitalize()} embedding{idx_str} must be 2D "
+            f"(seq_len, hidden_size) or 3D (batch, seq_len, hidden_size), "
+            f"got {tensor.ndim}D tensor with shape {tuple(tensor.shape)}"
+        )
+
+
 class EmbeddingItems(
     ModalityDataItems[torch.Tensor | list[torch.Tensor], torch.Tensor]
 ):
@@ -134,6 +158,63 @@ class EmbeddingItems(
     or a list of embedding tensors (one per item).
     """
 
+    def __init__(
+        self,
+        data: torch.Tensor | list[torch.Tensor],
+        modality: str,
+        expected_hidden_size: int | None = None,
+    ) -> None:
+        super().__init__(data, modality)
+
+        # Validate ndim first (before hidden_size which depends on correct ndim)
+        self._validate_ndim()
+
+        # Validate hidden dimension if expected size is provided
+        if expected_hidden_size is not None:
+            self._validate_hidden_size(expected_hidden_size)
+
+    def _validate_ndim(self) -> None:
+        """Validate that embedding tensors have correct ndim (2D or 3D)."""
+        if isinstance(self.data, torch.Tensor):
+            validate_embedding_ndim(self.data, self.modality)
+        else:
+            # List of tensors: each should be 2D (seq_len, hidden_size)
+            for idx, tensor in enumerate(self.data):
+                if tensor.ndim != 2:
+                    raise ValueError(
+                        f"{self.modality.capitalize()} embedding [{idx}] must be "
+                        f"2D (seq_len, hidden_size), got {tensor.ndim}D tensor "
+                        f"with shape {tuple(tensor.shape)}"
+                    )
+
+    def _validate_hidden_size(self, expected_hidden_size: int) -> None:
+        """Validate that embedding hidden dimension matches expected size.
+
+        This validates hidden dimensions to prevent vulnerabilities: Embeddings
+        with correct ndim but wrong hidden dimension could bypass initial
+        checks and cause crashes during model inference when dimensions don't match.
+        """
+        if isinstance(self.data, torch.Tensor):
+            # Batched tensor: shape is (batch, seq_len, hidden_size)
+            actual_hidden_size = self.data.shape[-1]
+            if actual_hidden_size != expected_hidden_size:
+                raise ValueError(
+                    f"{self.modality.capitalize()} embedding hidden dimension "
+                    f"mismatch: got {actual_hidden_size}, but model expects "
+                    f"{expected_hidden_size}. Embedding shape: {tuple(self.data.shape)}"
+                )
+        else:
+            # List of tensors: each has shape (seq_len, hidden_size)
+            for idx, tensor in enumerate(self.data):
+                actual_hidden_size = tensor.shape[-1]
+                if actual_hidden_size != expected_hidden_size:
+                    raise ValueError(
+                        f"{self.modality.capitalize()} embedding [{idx}] hidden "
+                        f"dimension mismatch: got {actual_hidden_size}, but model "
+                        f"expects {expected_hidden_size}. "
+                        f"Embedding shape: {tuple(tensor.shape)}"
+                    )
+
     def _unwrap(
         self, item: torch.Tensor | MediaWithBytes[torch.Tensor]
     ) -> torch.Tensor:
@@ -228,8 +309,12 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
 
 
 class AudioEmbeddingItems(EmbeddingItems):
-    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
-        super().__init__(data, "audio")
+    def __init__(
+        self,
+        data: torch.Tensor | list[torch.Tensor],
+        expected_hidden_size: int | None = None,
+    ) -> None:
+        super().__init__(data, "audio", expected_hidden_size)
 
 
 class ImageSize(NamedTuple):
@@ -256,8 +341,12 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
 
 
 class ImageEmbeddingItems(EmbeddingItems):
-    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
-        super().__init__(data, "image")
+    def __init__(
+        self,
+        data: torch.Tensor | list[torch.Tensor],
+        expected_hidden_size: int | None = None,
+    ) -> None:
+        super().__init__(data, "image", expected_hidden_size)
 
 
 class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
@@ -287,8 +376,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
 
 
 class VideoEmbeddingItems(EmbeddingItems):
-    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
-        super().__init__(data, "video")
+    def __init__(
+        self,
+        data: torch.Tensor | list[torch.Tensor],
+        expected_hidden_size: int | None = None,
+    ) -> None:
+        super().__init__(data, "video", expected_hidden_size)
 
 
 _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
@@ -363,14 +456,23 @@ class MultiModalDataParser:
     Args:
         target_sr (float, optional): Enables automatic resampling of audio
             items to the model's expected sampling rate.
+        target_channels (int, optional): Target number of audio channels.
+            If provided, normalizes audio to this many channels (e.g., 1 for mono).
+            If None, audio channels are passed through unchanged.
+        expected_hidden_size (int, optional): Expected hidden dimension for
+            embedding inputs. If provided, validates that user-supplied
+            embeddings have the correct hidden size to prevent crashes
+            during model inference.
     """
 
     def __init__(
         self,
         *,
         target_sr: float | None = None,
+        target_channels: int | None = None,
         audio_resample_method: Literal["librosa", "scipy"] = "librosa",
         video_needs_metadata: bool = False,
+        expected_hidden_size: int | None = None,
     ) -> None:
         super().__init__()
 
@@ -378,7 +480,9 @@ class MultiModalDataParser:
             target_sr=target_sr,
             method=audio_resample_method,
         )
+        self.target_channels = target_channels
         self.video_needs_metadata = video_needs_metadata
+        self.expected_hidden_size = expected_hidden_size
 
     @classmethod
     def is_embeddings(
@@ -443,7 +547,7 @@ class MultiModalDataParser:
             return None
 
         if self.is_embeddings(data):
-            return AudioEmbeddingItems(data)
+            return AudioEmbeddingItems(data, self.expected_hidden_size)
 
         data_items: list[AudioItem]
         if (
@@ -466,6 +570,11 @@ class MultiModalDataParser:
             else:
                 new_audio = self.audio_resampler.resample(audio, orig_sr=orig_sr)
 
+            # Apply channel normalization if target_channels is set
+            if self.target_channels is not None:
+                spec = AudioSpec(target_channels=self.target_channels)
+                new_audio = normalize_audio(new_audio, spec)
+
             new_audios.append(new_audio)
 
         return AudioProcessorItems(new_audios)
@@ -481,7 +590,7 @@ class MultiModalDataParser:
             return None
 
         if self.is_embeddings(data):
-            return ImageEmbeddingItems(data)
+            return ImageEmbeddingItems(data, self.expected_hidden_size)
 
         if (
             isinstance(data, (PILImage.Image, MediaWithBytes))
@@ -507,7 +616,7 @@ class MultiModalDataParser:
             return None
 
         if self.is_embeddings(data):
-            return VideoEmbeddingItems(data)
+            return VideoEmbeddingItems(data, self.expected_hidden_size)
 
         data_items: list[VideoItem]
         if (
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 0390773783961e9dd0f6c8d9fc5ab3086342b5a1..8e3f32698a6b75142f8fd9097766ea454c57769b 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextvars
+import threading
 import time
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from collections.abc import Callable, Generator, ItemsView, Iterable, Mapping, Sequence
+from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
 from enum import Enum
 from functools import lru_cache
@@ -53,7 +56,7 @@ if TYPE_CHECKING:
     from transformers.feature_extraction_utils import BatchFeature
     from transformers.processing_utils import ProcessorMixin
 
-    from vllm.config import ModelConfig
+    from vllm.config import ModelConfig, ObservabilityConfig
 
     from .cache import BaseMultiModalProcessorCache
     from .profiling import BaseDummyInputsBuilder
@@ -63,6 +66,7 @@ else:
     ProcessorMixin = object
 
     ModelConfig = object
+    ObservabilityConfig = object
 
     BaseMultiModalProcessorCache = object
 
@@ -70,6 +74,127 @@ logger = init_logger(__name__)
 
 _S = TypeVar("_S", str, list[int])
 
+_request_id_context: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+    "_request_id_context", default=None
+)
+
+
+def get_current_request_id() -> str | None:
+    """Get the current request_id from the context, if available."""
+    return _request_id_context.get()
+
+
+@contextmanager
+def set_request_id(request_id: str) -> Generator[None, None, None]:
+    """Context manager to set the request_id for the current context."""
+    token = _request_id_context.set(request_id)
+    try:
+        yield
+    finally:
+        _request_id_context.reset(token)
+
+
+@dataclass
+class MultiModalProcessorTimingStats:
+    """Per-request timing statistics for multimodal processor stages."""
+
+    hf_processor_time: float = 0.0
+    """Time spent in HuggingFace processor calls (seconds)."""
+
+    hashing_time: float = 0.0
+    """Time spent computing multimodal item hashes (seconds)."""
+
+    cache_lookup_time: float = 0.0
+    """Time spent in cache lookups and merges (seconds)."""
+
+    prompt_update_time: float = 0.0
+    """Time spent applying prompt updates and finding placeholders (seconds)."""
+
+    total_time: float = 0.0
+    """Total processing time (seconds)."""
+
+    def to_dict(self) -> dict[str, float]:
+        """Convert stats to a dictionary for JSON serialization."""
+        return {
+            "hf_processor_time": self.hf_processor_time,
+            "hashing_time": self.hashing_time,
+            "cache_lookup_time": self.cache_lookup_time,
+            "prompt_update_time": self.prompt_update_time,
+            "total_time": self.total_time,
+        }
+
+
+def get_timing_stats_from_engine_client(
+    engine_client: Any,
+) -> dict[str, dict[str, float]]:
+    """
+    Get all timing stats from the context associated with the engine client.
+
+    Args:
+        engine_client: The engine client that has input_processor.
+
+    Returns:
+        A dictionary mapping request_id to stats dict.
+    """
+    try:
+        if not engine_client.vllm_config.observability_config.enable_mm_processor_stats:
+            return {}
+    except (AttributeError, RuntimeError):
+        return {}
+
+    try:
+        input_processor = engine_client.input_processor
+        input_preprocessor = input_processor.input_preprocessor
+
+        if hasattr(input_preprocessor, "_get_mm_processor"):
+            mm_processor = input_preprocessor._get_mm_processor()
+            if mm_processor is not None and hasattr(mm_processor, "info"):
+                ctx = mm_processor.info.ctx
+                return ctx.get_all_timing_stats()
+    except (AttributeError, RuntimeError):
+        pass
+
+    return {}
+
+
+@contextmanager
+def _timed_operation(ctx: "InputProcessingContext", stage_name: str):
+    """
+    Context manager to time an operation using the context's timing stats.
+
+    The request_id is automatically retrieved from the context variable,
+    so it doesn't need to be passed as a parameter.
+
+    Args:
+        ctx: The InputProcessingContext containing the timing stats registry.
+        stage_name: Name of the stage being timed.
+    """
+    request_id = get_current_request_id()
+    if ctx is None or request_id is None:
+        yield
+        return
+
+    stats = ctx.get_timing_stats(request_id)
+    if stats is None:
+        yield
+        return
+
+    start_time = time.perf_counter()
+    try:
+        yield
+    finally:
+        elapsed = time.perf_counter() - start_time
+        if stage_name == "hf_processor":
+            stats.hf_processor_time += elapsed
+        elif stage_name == "hashing":
+            stats.hashing_time += elapsed
+        elif stage_name == "cache_lookup":
+            stats.cache_lookup_time += elapsed
+        elif stage_name == "prompt_update":
+            stats.prompt_update_time += elapsed
+        stats.total_time += elapsed
+
+
 PromptSeq: TypeAlias = str | list[int]
 """A token sequence (list of token IDs) or text."""
 
@@ -951,6 +1076,21 @@ class InputProcessingContext:
     tokenizer: TokenizerLike | None
     """The tokenizer used to tokenize the inputs."""
 
+    observability_config: "ObservabilityConfig | None" = field(
+        default=None, compare=False, repr=False
+    )
+    """Configuration for observability features."""
+
+    timing_stats_registry: dict[str, MultiModalProcessorTimingStats] = field(
+        default_factory=dict, compare=False, repr=False
+    )
+    """Registry for storing timing stats keyed by request_id."""
+
+    _timing_stats_registry_lock: threading.Lock = field(
+        default_factory=threading.Lock, compare=False, repr=False
+    )
+    """Lock for thread-safe access to timing_stats_registry."""
+
     def get_tokenizer(self) -> TokenizerLike:
         if self.tokenizer is None:
             raise ValueError(
@@ -1046,10 +1186,16 @@ class InputProcessingContext:
 
             typ = ProcessorMixin
 
+        from vllm.tokenizers.mistral import MistralTokenizer
+
+        tokenizer = self.tokenizer
+        if isinstance(tokenizer, MistralTokenizer):
+            tokenizer = tokenizer.transformers_tokenizer
+
         return cached_processor_from_config(
             self.model_config,
             processor_cls=typ,
-            tokenizer=self.tokenizer,
+            tokenizer=tokenizer,
             **kwargs,
         )
 
@@ -1159,6 +1305,71 @@ class InputProcessingContext:
 
         return self._postprocess_output(output)
 
+    def get_timing_stats(
+        self, request_id: str
+    ) -> MultiModalProcessorTimingStats | None:
+        """
+        Get timing stats for a request.
+        """
+        if (
+            self.observability_config is None
+            or not self.observability_config.enable_mm_processor_stats
+        ):
+            return None
+        with self._timing_stats_registry_lock:
+            return self.timing_stats_registry.get(request_id)
+
+    def create_timing_stats(self, request_id: str) -> MultiModalProcessorTimingStats:
+        """
+        Create and store timing stats in the registry for a request.
+
+        This should be called at the start of processing for a request.
+        The stats object is created immediately and stored in the registry.
+        """
+        if (
+            self.observability_config is None
+            or not self.observability_config.enable_mm_processor_stats
+        ):
+            return MultiModalProcessorTimingStats()
+
+        with self._timing_stats_registry_lock:
+            if request_id in self.timing_stats_registry:
+                raise ValueError(
+                    f"Timing stats already exist for request_id: {request_id}"
+                )
+            stats = MultiModalProcessorTimingStats()
+            self.timing_stats_registry[request_id] = stats
+            return stats
+
+    def clear_timing_stats_registry(self) -> int:
+        """
+        Clear all stats from the registry. Returns the number of stats cleared.
+        """
+        if (
+            self.observability_config is None
+            or not self.observability_config.enable_mm_processor_stats
+        ):
+            return 0
+        with self._timing_stats_registry_lock:
+            count = len(self.timing_stats_registry)
+            self.timing_stats_registry.clear()
+            return count
+
+    def get_all_timing_stats(self) -> dict[str, dict[str, float]]:
+        """
+        Get all timing stats as a dictionary for API endpoints.
+        """
+        if (
+            self.observability_config is None
+            or not self.observability_config.enable_mm_processor_stats
+        ):
+            return {}
+        with self._timing_stats_registry_lock:
+            return {
+                rid: stats.to_dict()
+                for rid, stats in self.timing_stats_registry.items()
+            }
+
 
 class BaseProcessingInfo:
     """Base class to provide the information necessary for data processing."""
@@ -1330,7 +1541,15 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser]
         that has additional subparsers.
         """
-        return MultiModalDataParser()
+        # Get expected hidden size for embedding validation if mm_embeds enabled
+        # This validates hidden dimensions to prevent vulnerabilities: embeddings
+        # with correct ndim but wrong shape could cause crashes at inference time
+        mm_config = self.info.ctx.model_config.get_multimodal_config()
+        expected_hidden_size = None
+        if mm_config.enable_mm_embeds:
+            expected_hidden_size = self.info.ctx.model_config.get_inputs_embeds_size()
+
+        return MultiModalDataParser(expected_hidden_size=expected_hidden_size)
 
     def validate_num_items(
         self,
@@ -1494,11 +1713,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         Call the HF processor on the prompt text and
         associated multi-modal data.
         """
-        return self.info.ctx.call_hf_processor(
-            self.info.get_hf_processor(**mm_kwargs),
-            dict(text=prompt, **mm_data),
-            dict(**mm_kwargs, **tok_kwargs),
-        )
+        with _timed_operation(self.info.ctx, "hf_processor"):
+            return self.info.ctx.call_hf_processor(
+                self.info.get_hf_processor(**mm_kwargs),
+                dict(text=prompt, **mm_data),
+                dict(**mm_kwargs, **tok_kwargs),
+            )
 
     def _hf_processor_applies_updates(
         self,
@@ -1846,12 +2066,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         )
 
         # Use overrides if provided; fallback to data-dependent hashing.
-        mm_hashes = self._hash_mm_items(
-            mm_data_items,
-            hf_processor_mm_kwargs,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
-        )
+        with _timed_operation(self.info.ctx, "hashing"):
+            mm_hashes = self._hash_mm_items(
+                mm_data_items,
+                hf_processor_mm_kwargs,
+                tokenization_kwargs,
+                mm_uuids=mm_uuids,
+            )
 
         mm_prompt_updates = self._get_mm_prompt_updates(
             mm_data_items,
@@ -1892,18 +2113,20 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 mm_uuids=mm_uuids,
             )
 
-        mm_hashes = self._hash_mm_items(
-            mm_data_items,
-            hf_processor_mm_kwargs,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
-        )
+        with _timed_operation(self.info.ctx, "hashing"):
+            mm_hashes = self._hash_mm_items(
+                mm_data_items,
+                hf_processor_mm_kwargs,
+                tokenization_kwargs,
+                mm_uuids=mm_uuids,
+            )
 
-        mm_is_cached, mm_missing_data_items = self._get_cache_missing_items(
-            cache=cache,
-            mm_data_items=mm_data_items,
-            mm_hashes=mm_hashes,
-        )
+        with _timed_operation(self.info.ctx, "cache_lookup"):
+            mm_is_cached, mm_missing_data_items = self._get_cache_missing_items(
+                cache=cache,
+                mm_data_items=mm_data_items,
+                mm_hashes=mm_hashes,
+            )
 
         # NOTE: `prompt` does not correspond to `mm_missing_data_items`,
         # so we can't apply prompt updates until the new multimodal
@@ -1933,13 +2156,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             mm_missing_kwargs,
         )
 
-        mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs(
-            cache,
-            mm_hashes=mm_hashes,
-            mm_is_cached=mm_is_cached,
-            mm_missing_kwargs=mm_missing_kwargs,
-            mm_missing_prompt_updates=mm_missing_prompt_updates,
-        )
+        with _timed_operation(self.info.ctx, "cache_lookup"):
+            mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs(
+                cache,
+                mm_hashes=mm_hashes,
+                mm_is_cached=mm_is_cached,
+                mm_missing_kwargs=mm_missing_kwargs,
+                mm_missing_prompt_updates=mm_missing_prompt_updates,
+            )
 
         mm_info = MultiModalProcessingInfo(
             kwargs=mm_kwargs,
@@ -2121,6 +2345,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         3. Extract information about the placeholder tokens from the
            processed token IDs.
         """
+        request_id = get_current_request_id()
+        if request_id is not None:
+            self.info.ctx.create_timing_stats(request_id)
+
         mm_items = self._to_mm_items(mm_data)
 
         if tokenization_kwargs is None:
@@ -2139,13 +2367,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         )
 
         # NOTE: tokenization_kwargs are not required to init processor
-        prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates(
-            mm_items=mm_items,
-            prompt_ids=prompt_ids,
-            mm_kwargs=mm_info.kwargs,
-            mm_prompt_updates=mm_info.prompt_updates,
-            is_update_applied=is_update_applied,
-        )
+        with _timed_operation(self.info.ctx, "prompt_update"):
+            prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates(
+                mm_items=mm_items,
+                prompt_ids=prompt_ids,
+                mm_kwargs=mm_info.kwargs,
+                mm_prompt_updates=mm_info.prompt_updates,
+                is_update_applied=is_update_applied,
+            )
 
         mm_placeholder_ranges = {
             modality: [item.to_range() for item in placeholders]
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 1e7fe8648ab7146ac34c8a04bc14a91887886d9d..ed6a893288d30325d82fee2b84526334c17c283a 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -5,6 +5,7 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
 
 from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.observability import ObservabilityConfig
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
 
@@ -22,7 +23,7 @@ from .profiling import (
 )
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig
+    from vllm.config import ModelConfig, ObservabilityConfig
     from vllm.model_executor.models.interfaces import SupportsMultiModal
 
 logger = init_logger(__name__)
@@ -148,6 +149,7 @@ class MultiModalRegistry:
         *,
         cache: BaseMultiModalProcessorCache | None = None,
         profiler_limits: Mapping[str, int] | None = None,
+        observability_config: ObservabilityConfig | None = None,
     ) -> Mapping[str, int]:
         """
         Get the maximum number of tokens per data item from each modality based
@@ -156,7 +158,9 @@ class MultiModalRegistry:
         if not model_config.is_multimodal_model:
             return {}
 
-        processor = self.create_processor(model_config, cache=cache)
+        processor = self.create_processor(
+            model_config, observability_config, cache=cache
+        )
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         seq_len = model_config.max_model_len
@@ -174,6 +178,7 @@ class MultiModalRegistry:
         model_config: "ModelConfig",
         *,
         cache: BaseMultiModalProcessorCache | None = None,
+        observability_config: ObservabilityConfig | None = None,
     ) -> Mapping[str, int]:
         """
         Get the maximum number of multi-modal input instances for each modality
@@ -182,7 +187,9 @@ class MultiModalRegistry:
         if not model_config.is_multimodal_model:
             return {}
 
-        processor = self.create_processor(model_config, cache=cache)
+        processor = self.create_processor(
+            model_config, observability_config, cache=cache
+        )
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
         return profiler.get_mm_limits()
 
@@ -231,27 +238,32 @@ class MultiModalRegistry:
     def _create_processing_ctx(
         self,
         model_config: "ModelConfig",
+        observability_config: "ObservabilityConfig | None" = None,
         tokenizer: TokenizerLike | None = None,
     ) -> InputProcessingContext:
         if tokenizer is None and not model_config.skip_tokenizer_init:
             tokenizer = cached_tokenizer_from_config(model_config)
 
-        return InputProcessingContext(model_config, tokenizer)
+        return InputProcessingContext(
+            model_config, tokenizer, observability_config=observability_config
+        )
 
     def _create_processing_info(
         self,
         model_config: "ModelConfig",
+        observability_config: "ObservabilityConfig | None" = None,
         *,
         tokenizer: TokenizerLike | None = None,
     ) -> BaseProcessingInfo:
         model_cls = self._get_model_cls(model_config)
         factories = model_cls._processor_factory
-        ctx = self._create_processing_ctx(model_config, tokenizer)
+        ctx = self._create_processing_ctx(model_config, observability_config, tokenizer)
         return factories.info(ctx)
 
     def create_processor(
         self,
         model_config: "ModelConfig",
+        observability_config: "ObservabilityConfig | None" = None,
         *,
         tokenizer: TokenizerLike | None = None,
         cache: BaseMultiModalProcessorCache | None = None,
@@ -265,7 +277,7 @@ class MultiModalRegistry:
         model_cls = self._get_model_cls(model_config)
         factories = model_cls._processor_factory
 
-        ctx = self._create_processing_ctx(model_config, tokenizer)
+        ctx = self._create_processing_ctx(model_config, observability_config, tokenizer)
 
         return factories.build_processor(ctx, cache=cache)
 
@@ -276,13 +288,16 @@ class MultiModalRegistry:
         mm_counts: Mapping[str, int] | None = None,
         *,
         cache: BaseMultiModalProcessorCache | None = None,
+        observability_config: ObservabilityConfig | None = None,
     ) -> DummyDecoderData:
         """
         Create dummy data for profiling the memory usage of a model.
 
         The model is identified by `model_config`.
         """
-        processor = self.create_processor(model_config, cache=cache)
+        processor = self.create_processor(
+            model_config, observability_config, cache=cache
+        )
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         # Extract configurable options from multimodal config.
@@ -309,13 +324,16 @@ class MultiModalRegistry:
         mm_counts: Mapping[str, int] | None = None,
         *,
         cache: BaseMultiModalProcessorCache | None = None,
+        observability_config: ObservabilityConfig | None = None,
     ) -> DummyEncoderData:
         """
         Create dummy data for profiling the memory usage of a model.
 
         The model is identified by `model_config`.
         """
-        processor = self.create_processor(model_config, cache=cache)
+        processor = self.create_processor(
+            model_config, observability_config, cache=cache
+        )
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         # Extract configurable options from multimodal config.
@@ -349,8 +367,8 @@ class MultiModalRegistry:
             # than whisper.
             return 0
         assert len(max_tokens) == 1, (
-            "Encoder-decoder models are expected \
-            to implement the multimodal interface with at most one modality."
+            "Encoder-decoder models are expected "
+            "to implement the multimodal interface with at most one modality."
         )
 
         first_modality = next(iter(max_tokens))
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 7fd05af583b0ac794bbe8f5f6ccad24859f83d7f..07165430b2c9e6246f092852b532cb27d872464a 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -3,7 +3,8 @@
 
 import asyncio
 import atexit
-from collections.abc import Generator, Set
+import mimetypes
+from collections.abc import Generator
 from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
@@ -357,17 +358,31 @@ class MediaConnector:
 def encode_audio_base64(
     audio: np.ndarray,
     sampling_rate: int,
+    *,
+    format: str = "WAV",
 ) -> str:
     """Encode audio as base64."""
     audio_io = AudioMediaIO()
-    return audio_io.encode_base64((audio, sampling_rate))
+    return audio_io.encode_base64((audio, sampling_rate), audio_format=format)
+
+
+def encode_audio_url(
+    audio: np.ndarray,
+    sampling_rate: int,
+    *,
+    format: str = "WAV",
+) -> str:
+    """Encode audio as a data URL."""
+    audio_b64 = encode_audio_base64(audio, sampling_rate, format=format)
+    mimetype = mimetypes.types_map.get("." + format.lower(), "audio")
+    return f"data:{mimetype};base64,{audio_b64}"
 
 
 def encode_image_base64(
     image: Image.Image,
     *,
     image_mode: str = "RGB",
-    format: str = "JPEG",
+    format: str | None = None,
 ) -> str:
     """
     Encode a pillow image to base64 format.
@@ -378,10 +393,45 @@ def encode_image_base64(
     return image_io.encode_base64(image, image_format=format)
 
 
-def encode_video_base64(frames: npt.NDArray) -> str:
+def encode_image_url(
+    image: Image.Image,
+    *,
+    image_mode: str = "RGB",
+    format: str = "PNG",
+) -> str:
+    """
+    Encode a pillow image as a data URL.
+
+    By default, the image is converted into RGB format before being encoded.
+    """
+    image_b64 = encode_image_base64(image, image_mode=image_mode, format=format)
+    mimetype = mimetypes.types_map.get("." + format.lower(), "image")
+    return f"data:{mimetype};base64,{image_b64}"
+
+
+def encode_video_base64(
+    frames: npt.NDArray,
+    *,
+    format: str = "JPEG",
+) -> str:
     image_io = ImageMediaIO()
     video_io = VideoMediaIO(image_io)
-    return video_io.encode_base64(frames)
+    return video_io.encode_base64(frames, video_format=format)
+
+
+def encode_video_url(
+    frames: npt.NDArray,
+    *,
+    format: str = "JPEG",
+) -> str:
+    video_b64 = encode_video_base64(frames, format=format)
+
+    if format.lower() == "jpeg":
+        mimetype = "video/jpeg"
+    else:
+        mimetype = mimetypes.types_map.get("." + format.lower(), "video")
+
+    return f"data:{mimetype};base64,{video_b64}"
 
 
 def argsort_mm_positions(
@@ -412,8 +462,6 @@ def group_mm_kwargs_by_modality(
     *,
     device: torch.types.Device = None,
     pin_memory: bool = False,
-    merge_by_field_config: bool | None = None,
-    multimodal_cpu_fields: Set[str] | None = None,
 ) -> Generator[tuple[str, int, BatchedTensorInputs], None, None]:
     """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same
     modality together into the same `MultiModalKwargs` instance.
@@ -426,17 +474,6 @@ def group_mm_kwargs_by_modality(
     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.
     """
-    if merge_by_field_config is not None:
-        logger.warning_once(
-            "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
-            "is deprecated and will be removed in v0.14."
-        )
-    if multimodal_cpu_fields is not None:
-        logger.warning_once(
-            "The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` "
-            "is deprecated and will be removed in v0.14."
-        )
-
     from vllm.multimodal.inputs import MultiModalKwargsItems
 
     for modality, items in groupby(mm_kwargs, key=lambda item: item.modality):
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 024252799cf749691abcec7a65b98ae7d092f717..8204cdfbc5de7297897dc4e26955d57f680051d3 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -6,12 +6,15 @@ from abc import abstractmethod
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import numpy.typing as npt
 from PIL import Image
 
+if TYPE_CHECKING:
+    import cv2
+
 from vllm import envs
 from vllm.logger import init_logger
 from vllm.utils.registry import ExtensionManager
@@ -63,6 +66,127 @@ class VideoLoader:
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         raise NotImplementedError
 
+    @staticmethod
+    def _can_use_for_recovery(
+        idx: int,
+        failed_frames: list[int],
+        next_target_map: dict[int, int],
+        total_frames: int,
+    ) -> bool:
+        """Check if current frame can recover the oldest failed frame."""
+        if not failed_frames:
+            return False
+        oldest_failed = failed_frames[0]
+        limit = next_target_map.get(oldest_failed, total_frames)
+        return idx < limit
+
+    @staticmethod
+    def _read_frames_with_recovery(
+        cap: "cv2.VideoCapture",
+        frame_indices: list[int],
+        total_frames: int,
+    ) -> tuple[npt.NDArray, list[int], dict[int, int]]:
+        """
+        Read frames with dynamic window forward-scan recovery.
+
+        When a target frame fails to load, the next successfully grabbed
+        frame (before the next target frame) will be used to recover it.
+
+        Args:
+            cap: OpenCV VideoCapture object
+            frame_indices: Sorted list of target frame indices to load
+            total_frames: Total number of frames in the video
+
+        Returns:
+            Tuple of (frames_array, valid_frame_indices, recovered_map)
+            - frames_array: Array of loaded frames
+            - valid_frame_indices: List of frame indices that were loaded
+            - recovered_map: Dict mapping recovered_idx -> source_idx
+        """
+        import cv2
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        assert width > 0 and height > 0, (
+            f"Invalid video frame size: width={width}, height={height}"
+        )
+
+        frame_idx_set = set(frame_indices)
+        max_frame_idx = frame_indices[-1] if frame_indices else 0
+
+        # Build map: target_idx -> next_target_idx (for recovery window)
+        next_target_map: dict[int, int] = {}
+        for k in range(len(frame_indices) - 1):
+            next_target_map[frame_indices[k]] = frame_indices[k + 1]
+        next_target_map[frame_indices[-1]] = total_frames
+
+        frames_list: list[npt.NDArray] = []
+        valid_frame_indices: list[int] = []
+        failed_frames_idx: list[int] = []
+        recovered_map: dict[int, int] = {}
+
+        i = 0
+        for idx in range(max_frame_idx + 1):
+            is_target_frame = idx in frame_idx_set
+
+            # Attempt to grab the current frame
+            ok = cap.grab()
+
+            if not ok:
+                if is_target_frame:
+                    logger.warning(
+                        "Failed to grab frame %d during video loading.",
+                        idx,
+                    )
+                    failed_frames_idx.append(idx)
+                continue
+
+            # Check if we should retrieve: target frame OR can recover a failed one
+            can_recover = VideoLoader._can_use_for_recovery(
+                idx, failed_frames_idx, next_target_map, total_frames
+            )
+
+            if is_target_frame or can_recover:
+                ret, frame = cap.retrieve()
+
+                if ret and frame is not None and frame.size > 0:
+                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    frames_list.append(rgb_frame)
+                    valid_frame_indices.append(idx)
+                    i += 1
+
+                    if can_recover:
+                        recovered_idx = failed_frames_idx.pop(0)
+                        recovered_map[recovered_idx] = idx
+                        logger.info(
+                            "Recovered frame %d using frame %d (delay: %d)",
+                            recovered_idx,
+                            idx,
+                            idx - recovered_idx,
+                        )
+                elif is_target_frame:
+                    logger.warning(
+                        "Failed to retrieve frame %d during video loading.",
+                        idx,
+                    )
+                    failed_frames_idx.append(idx)
+
+        # Log any remaining failed frames
+        for failed_idx in failed_frames_idx:
+            logger.warning(
+                "Frame %d could not be recovered (end of video).",
+                failed_idx,
+            )
+
+        # Stack frames
+        if frames_list:
+            frames = np.stack(frames_list)
+        else:
+            frames = np.empty((0, height, width, 3), dtype=np.uint8)
+
+        return frames, valid_frame_indices, recovered_map
+
     @staticmethod
     def _read_frames(
         cap,
@@ -142,8 +266,23 @@ class OpenCVVideoBackend(VideoLoader):
         data: bytes,
         num_frames: int = -1,
         fps: int = -1,
+        max_duration: int = 300,
+        frame_recovery: bool = False,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """
+        Load video frames from bytes.
+
+        Args:
+            data: Raw video bytes
+            num_frames: Target number of frames to sample (-1 for all)
+            fps: Target FPS for sampling (-1 for original)
+            max_duration: Maximum duration (unused in base backend)
+            frame_recovery: Enable forward-scan recovery for failed frames
+
+        Returns:
+            Tuple of (frames_array, metadata_dict)
+        """
         import cv2
 
         backend = cls().get_cv2_video_api()
@@ -172,11 +311,22 @@ class OpenCVVideoBackend(VideoLoader):
             )
             frame_idx = uniform_sampled_frames.tolist()
 
-        # Convert to set for O(1) lookup performance
-        frame_idx_set = set(frame_idx)
-        frames, valid_num_frames, valid_frame_indices = cls._read_frames(
-            cap, frame_idx_set, num_frames_to_sample, max(frame_idx)
-        )
+        if frame_recovery:
+            frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery(
+                cap, frame_idx, total_frames_num
+            )
+            valid_num_frames = len(valid_frame_indices)
+
+            if recovered_map:
+                logger.info(
+                    "Frame recovery: %d frames recovered using forward scan.",
+                    len(recovered_map),
+                )
+        else:
+            frame_idx_set = set(frame_idx)
+            frames, valid_num_frames, valid_frame_indices = cls._read_frames(
+                cap, frame_idx_set, num_frames_to_sample, max(frame_idx)
+            )
 
         # Use transformers transformers.video_utils.VideoMetadata format
         # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
@@ -204,8 +354,22 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
         num_frames: int = -1,
         fps: int = 2,
         max_duration: int = 300,
+        frame_recovery: bool = False,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """
+        Load video frames with dynamic sampling based on duration.
+
+        Args:
+            data: Raw video bytes
+            num_frames: Not used in dynamic backend
+            fps: Target FPS for sampling (default: 2)
+            max_duration: Maximum video duration to process (default: 300s)
+            frame_recovery: Enable forward-scan recovery for failed frames
+
+        Returns:
+            Tuple of (frames_array, metadata_dict)
+        """
         import cv2
 
         backend = cls().get_cv2_video_api()
@@ -245,14 +409,22 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
                     }
                 )
 
-        # Convert to set for O(1) lookup performance
-        frame_indices_set = set(frame_indices_list)
-        frames, valid_num_frames, valid_frame_indices = cls._read_frames(
-            cap,
-            frame_indices_set,
-            len(frame_indices_list),
-            total_frames_num - 1,
-        )
+        if frame_recovery:
+            frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery(
+                cap, frame_indices_list, total_frames_num
+            )
+            valid_num_frames = len(valid_frame_indices)
+
+            if recovered_map:
+                logger.info(
+                    "Frame recovery: %d frames recovered using forward scan.",
+                    len(recovered_map),
+                )
+        else:
+            frame_indices_set = set(frame_indices_list)
+            frames, valid_num_frames, valid_frame_indices = cls._read_frames(
+                cap, frame_indices_set, len(frame_indices_list), total_frames_num - 1
+            )
 
         # Use transformers transformers.video_utils.VideoMetadata format
         metadata = {
diff --git a/vllm/outputs.py b/vllm/outputs.py
index cdfe06f1c7faecbd16a2c96e7341054fece3d799..cf23745c447d4aa55c9b35b9a4022670fb85a252 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -6,6 +6,7 @@ from collections.abc import Sequence as GenericSequence
 from dataclasses import dataclass
 from typing import Any, Generic
 
+import numpy as np
 import torch
 from typing_extensions import TypeVar
 
@@ -13,7 +14,6 @@ from vllm.logger import init_logger
 from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalPlaceholderDict
-from vllm.sequence import RequestMetrics
 from vllm.v1.metrics.stats import RequestStateStats
 
 logger = init_logger(__name__)
@@ -43,6 +43,7 @@ class CompletionOutput:
     token_ids: GenericSequence[int]
     cumulative_logprob: float | None
     logprobs: SampleLogprobs | None
+    routed_experts: np.ndarray | None = None  # [seq_len,layer_num,topk]
     finish_reason: str | None = None
     stop_reason: int | str | None = None
     lora_request: LoRARequest | None = None
@@ -55,6 +56,7 @@ class CompletionOutput:
             f"CompletionOutput(index={self.index}, "
             f"text={self.text!r}, "
             f"token_ids={self.token_ids}, "
+            f"routed_experts={self.routed_experts}, "
             f"cumulative_logprob={self.cumulative_logprob}, "
             f"logprobs={self.logprobs}, "
             f"finish_reason={self.finish_reason}, "
@@ -113,7 +115,7 @@ class RequestOutput:
         prompt_logprobs: PromptLogprobs | None,
         outputs: list[CompletionOutput],
         finished: bool,
-        metrics: RequestMetrics | RequestStateStats | None = None,
+        metrics: RequestStateStats | None = None,
         lora_request: LoRARequest | None = None,
         encoder_prompt: str | None = None,
         encoder_prompt_token_ids: list[int] | None = None,
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index e1b461d79a6551471117be3d5448b000b0aa0605..949e9f41e39ee778b9e18bb525e6fae129df7576 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -15,16 +15,16 @@ import regex as re
 import torch
 
 from vllm import envs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interface import CpuArchEnum, Platform, PlatformEnum
 
 logger = init_logger(__name__)
 
 if TYPE_CHECKING:
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 else:
     VllmConfig = None
 
@@ -140,6 +140,7 @@ class CpuPlatform(Platform):
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         from vllm.utils.mem_constants import GiB_bytes
+        from vllm.utils.mem_utils import format_gib
 
         kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
         node_dir = "/sys/devices/system/node"
@@ -153,10 +154,9 @@ class CpuPlatform(Platform):
             free_cpu_memory = psutil.virtual_memory().total // num_numa_nodes
             DEFAULT_CPU_MEM_UTILIZATION = 0.5
             kv_cache_space = int(free_cpu_memory * DEFAULT_CPU_MEM_UTILIZATION)
-            kv_cache_space_gib = kv_cache_space / GiB_bytes
             logger.warning_once(
-                "VLLM_CPU_KVCACHE_SPACE not set. Using "
-                f"{kv_cache_space_gib:.2f} GiB for KV cache."
+                "VLLM_CPU_KVCACHE_SPACE not set. Using %s GiB for KV cache.",
+                format_gib(kv_cache_space),
             )
         else:
             kv_cache_space *= GiB_bytes
@@ -193,6 +193,8 @@ class CpuPlatform(Platform):
             )
 
         scheduler_config = vllm_config.scheduler_config
+        # async scheduling is not required on CPU
+        scheduler_config.async_scheduling = False
         if (
             scheduler_config.enable_chunked_prefill
             or cache_config.enable_prefix_caching
@@ -388,7 +390,7 @@ class CpuPlatform(Platform):
         if env_key in os.environ and os.environ[env_key] != "":
             visible_nodes = [int(s) for s in os.environ[env_key].split(",")]
             allowed_numa_nodes_list = [
-                x for x in visible_nodes if x in allowed_cpu_id_list
+                x for x in sorted(list(set(visible_nodes))) if x in allowed_numa_nodes
             ]
 
         return allowed_numa_nodes_list, logical_cpu_list
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2dc4ba5d70cacb28cbaf5521120e120925431163..47d634416ae5fe71e84cbeaa573202d20e89d4e6 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -14,17 +14,17 @@ from typing_extensions import ParamSpec
 
 # import custom ops, trigger op registration
 import vllm._C  # noqa
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.utils.import_utils import import_pynvml
 from vllm.utils.torch_utils import cuda_device_count_stateless
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interface import DeviceCapability, Platform, PlatformEnum
 
 if TYPE_CHECKING:
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
     from vllm.config.cache import CacheDType
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 else:
     VllmConfig = None
     CacheDType = None
@@ -148,7 +148,7 @@ class CudaPlatformBase(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
-        from vllm.attention.backends.registry import AttentionBackendEnum
+        from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
         parallel_config = vllm_config.parallel_config
         model_config = vllm_config.model_config
@@ -200,7 +200,7 @@ class CudaPlatformBase(Platform):
                 use_cutlass_mla = backend == AttentionBackendEnum.CUTLASS_MLA
                 use_flashinfer_mla = backend == AttentionBackendEnum.FLASHINFER_MLA
 
-            from vllm.attention.ops.flashmla import is_flashmla_dense_supported
+            from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
 
             if (
                 use_flashmla
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index d4b40045df384846524b159a9b10104a921def72..f86abd712f6f1f0404914522291082d72a606ff3 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -11,19 +11,20 @@ from typing import TYPE_CHECKING, Any, NamedTuple, Optional
 
 import numpy as np
 import torch
+from typing_extensions import deprecated
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 if TYPE_CHECKING:
     from torch.distributed import PrefixStore, ProcessGroup
 
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
     from vllm.inputs import ProcessorInputs, PromptType
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
     from vllm.utils.argparse_utils import FlexibleArgumentParser
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 else:
     FlexibleArgumentParser = object
 
@@ -365,6 +366,10 @@ class Platform:
         return torch.inference_mode(mode=True)
 
     @classmethod
+    @deprecated(
+        "`seed_everything` is deprecated. It will be removed in v0.15.0 or later. "
+        "Please use `vllm.utils.torch_utils.set_random_seed` instead."
+    )
     def seed_everything(cls, seed: int | None = None) -> None:
         """
         Set the seed of each random module.
@@ -689,6 +694,13 @@ class Platform:
         """
         return max_model_len
 
+    @classmethod
+    def set_additional_forward_context(cls, *args, **kwargs) -> dict[str, Any]:
+        """
+        Set some additional forward context for the current platform if needs.
+        """
+        return {}
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 2cf307084fa7daec027aa167760676fb56bad9d8..9da9d99b9ed890db42fa8bb59f008d6198d8c30f 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -8,9 +8,9 @@ from typing import TYPE_CHECKING, Optional
 import torch
 
 import vllm.envs as envs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.utils.torch_utils import cuda_device_count_stateless
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interface import DeviceCapability, Platform, PlatformEnum
 
@@ -22,8 +22,8 @@ from .interface import DeviceCapability, Platform, PlatformEnum
 #     os.environ['MOE_NN'] = '0'
 
 if TYPE_CHECKING:
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 
 logger = init_logger(__name__)
 
@@ -141,41 +141,38 @@ def use_rocm_custom_paged_attention(
     alibi_slopes: torch.Tensor | None = None,
     sinks: torch.Tensor | None = None,
 ) -> bool:
-    from vllm._aiter_ops import rocm_aiter_ops
-
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
     ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950", "gfx928", "gfx936"])
     ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"])
 
     # custom paged attn always supported on V0. On V1, requires sliding window
     # disabled due to observed numerical discrepancy.
-    # if ON_GFX9:
-    #     return (
-    #         (sliding_window == 0 or sliding_window == (-1, -1))
-    #         and (qtype == torch.half or qtype == torch.bfloat16)
-    #         and (head_size == 64 or head_size == 128)
-    #         and (block_size == 16 or block_size == 32)
-    #         and (gqa_ratio >= 1 and gqa_ratio <= 16)
-    #         and max_seq_len <= 128 * 1024
-    #         and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
-    #         and not (rocm_aiter_ops.is_pa_attn_enabled())
-    #         and sinks is None
-    #     )
-
-    # else:
-    #     return (
-    #         ON_GFX11_GFX12
-    #         and (sliding_window == 0 or sliding_window == (-1, -1))
-    #         and (qtype == torch.half or qtype == torch.bfloat16)
-    #         and head_size == 128
-    #         and block_size == 16
-    #         and (gqa_ratio >= 3 and gqa_ratio <= 16)
-    #         and max_seq_len <= 128 * 1024
-    #         and alibi_slopes is None
-    #         and kv_cache_dtype == "auto"
-    #         and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN
-    #         and sinks is None
-    #     )
+    if ON_GFX9:
+        return (
+            (sliding_window == 0 or sliding_window == (-1, -1))
+            and (qtype == torch.half or qtype == torch.bfloat16)
+            and (head_size == 64 or head_size == 128)
+            and (block_size == 16 or block_size == 32)
+            and (gqa_ratio >= 1 and gqa_ratio <= 16)
+            and max_seq_len <= 128 * 1024
+            and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
+            and sinks is None
+        )
+
+    else:
+        return (
+            ON_GFX11_GFX12
+            and (sliding_window == 0 or sliding_window == (-1, -1))
+            and (qtype == torch.half or qtype == torch.bfloat16)
+            and head_size == 128
+            and block_size == 16
+            and (gqa_ratio >= 3 and gqa_ratio <= 16)
+            and max_seq_len <= 128 * 1024
+            and alibi_slopes is None
+            and kv_cache_dtype == "auto"
+            and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN
+            and sinks is None
+        )
     return False
 
 
@@ -191,7 +188,9 @@ class RocmPlatform(Platform):
 
     supported_quantization: list[str] = [
         "awq",
+        "awq_marlin",  # will be overwritten with awq
         "gptq",
+        "gptq_marlin",  # will be overwritten with gptq
         "fp8",
         "compressed-tensors",
         "fbgemm_fp8",
@@ -231,7 +230,7 @@ class RocmPlatform(Platform):
             assert block_size == 1, (
                 "Sparse MLA backend on ROCm only supports block size 1 for now."
             )
-            logger.info_once("Using Sparse MLA backend on V1 engine.")
+            logger.info_once("Using Sparse MLA backend.")
             return AttentionBackendEnum.ROCM_AITER_MLA_SPARSE.get_path()
                 
         if attn_selector_config.use_mla:
@@ -277,16 +276,16 @@ class RocmPlatform(Platform):
             return AttentionBackendEnum.FLEX_ATTENTION.get_path()
 
         if selected_backend == AttentionBackendEnum.TRITON_ATTN:
-            logger.info("Using Triton Attention backend on V1 engine.")
+            logger.info("Using Triton Attention backend.")
             return AttentionBackendEnum.TRITON_ATTN.get_path()
 
         if selected_backend == AttentionBackendEnum.ROCM_ATTN:
-            logger.info("Using Rocm Attention backend on V1 engine.")
+            logger.info("Using Rocm Attention backend.")
             return AttentionBackendEnum.ROCM_ATTN.get_path()
 
         if selected_backend == AttentionBackendEnum.ROCM_AITER_FA:
             if on_gfx9():
-                logger.info("Using Aiter Flash Attention backend on V1 engine.")
+                logger.info("Using Aiter Flash Attention backend.")
                 return AttentionBackendEnum.ROCM_AITER_FA.get_path()
             else:
                 raise ValueError(
@@ -295,18 +294,39 @@ class RocmPlatform(Platform):
                 )
 
         if selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
-            logger.info("Using Aiter Unified Attention backend on V1 engine.")
+            logger.info("Using Aiter Unified Attention backend.")
             return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
 
         # Handle automatic backend selection based on environment variables
         if selected_backend is None:
+            # Priority 1: Check for AITER Unified Attention (must check before MHA)
+            # if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION:
+            #     logger.info("Using Aiter Unified Attention backend.")
+            #     return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
+
+            # Priority 2: Check for AITER MHA (Flash Attention)
+            # Only use if explicitly enabled (not just VLLM_ROCM_USE_AITER=1)
+            # if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
+            #     logger.info("Using Aiter Flash Attention backend.")
+            #     return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+
             # Priority 3: Check for ROCM_ATTN (prefill-decode split)
             if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
-                logger.info("Using Rocm Attention backend on V1 engine.")
+                logger.info("Using Rocm Attention backend.")
                 return AttentionBackendEnum.ROCM_ATTN.get_path()
 
+            # Priority 4: Check for AITER enabled without specific flags
+            # This defaults to AITER FA only if MHA is not explicitly disabled
+            # if (
+            #     envs.VLLM_ROCM_USE_AITER
+            #     and on_gfx9()
+            #     and envs.VLLM_ROCM_USE_AITER_MHA is not False
+            # ):
+            #     logger.info("Using Aiter Flash Attention backend.")
+            #     return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+
             # Default: Triton Unified Attention
-            logger.info("Using Triton Attention backend on V1 engine.")
+            logger.info("Using Triton Attention backend.")
             return AttentionBackendEnum.TRITON_ATTN.get_path()
 
         raise RuntimeError(
@@ -341,14 +361,19 @@ class RocmPlatform(Platform):
 
         from vllm._aiter_ops import rocm_aiter_ops
 
-        if rocm_aiter_ops.is_mha_enabled():
-            # Note: AITER FA is only supported for Qwen-VL models.
-            # TODO: Add support for other VL models in their model class.
+        if rocm_aiter_ops.is_enabled():
+            logger.info_once("Using AITER Flash Attention backend for ViT model.")
             return AttentionBackendEnum.ROCM_AITER_FA
 
-        if on_gfx9() and find_spec("flash_attn") is not None:
+        if (
+            on_gfx9()
+            and find_spec("flash_attn") is not None
+            and (dtype == torch.float16 or dtype == torch.bfloat16)
+        ):
+            logger.info_once("Using Flash Attention backend for ViT model.")
             return AttentionBackendEnum.FLASH_ATTN
 
+        logger.info_once("Using Torch SDPA backend for ViT model.")
         return AttentionBackendEnum.TORCH_SDPA
 
     @classmethod
@@ -412,8 +437,10 @@ class RocmPlatform(Platform):
         compilation_config = vllm_config.compilation_config
         parallel_config = vllm_config.parallel_config
         is_eager_execution = compilation_config == CUDAGraphMode.NONE
+        use_aiter_fused_moe = rocm_aiter_ops.is_fused_moe_enabled()
         use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
-        use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled()
+        use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enabled()
+        use_aiter_fused_se = rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
 
         if compilation_config.cudagraph_mode.has_full_cudagraphs():
             # decode context parallel does not support full cudagraphs
@@ -463,6 +490,22 @@ class RocmPlatform(Platform):
         if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops:
             compilation_config.custom_ops.append("+quant_fp8")
 
+        if use_aiter_fused_se and "-grouped_topk" in compilation_config.custom_ops:
+            logger.warning_once(
+                "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled, which "
+                "requires the 'grouped_topk' custom op. Overriding the "
+                "user-provided '-grouped_topk'."
+            )
+            compilation_config.custom_ops.remove("-grouped_topk")
+        # Ensure grouped_topk is always enabled when using AITER if
+        # its not disabled by user
+        if (
+            use_aiter_fused_moe
+            and "+grouped_topk" not in compilation_config.custom_ops
+            and "-grouped_topk" not in compilation_config.custom_ops
+        ):
+            compilation_config.custom_ops.append("+grouped_topk")
+
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
         if model_arch in _ROCM_UNSUPPORTED_MODELS:
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 7c479bf2b6a0e57658223fa2d750363668e8c071..455aceb3269eb13cf6328727095c6485074c47cf 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -1,287 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import contextlib
-from typing import TYPE_CHECKING, Optional, cast
-
-import torch
-from tpu_info import device
-
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.inputs import ProcessorInputs, PromptType
 from vllm.logger import init_logger
 
-from .interface import Platform, PlatformEnum
-
-if TYPE_CHECKING:
-    from typing import TypeAlias
-
-    from vllm.attention.selector import AttentionSelectorConfig
-    from vllm.config import VllmConfig
-    from vllm.config.cache import BlockSize
-    from vllm.pooling_params import PoolingParams
-    from vllm.sampling_params import SamplingParams
-
-    ParamsType: TypeAlias = SamplingParams | PoolingParams
-else:
-    BlockSize = None
-    VllmConfig = None
-    PoolingParams = None
-    ParamsType = None
-
 logger = init_logger(__name__)
 
-USE_TPU_INFERENCE = False
-
-
-class TpuPlatform(Platform):
-    _enum = PlatformEnum.TPU
-    device_name: str = "tpu"
-    device_type: str = "tpu"
-    dispatch_key: str = "XLA"
-    ray_device_key: str = "TPU"
-    dist_backend: str = "gloo"
-    device_control_env_var: str = "TPU_VISIBLE_CHIPS"
-    simple_compile_backend: str = "openxla"
-
-    supported_quantization: list[str] = ["fp8", "tpu_int8", "compressed-tensors"]
-
-    additional_env_vars: list[str] = ["TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"]
-
-    @classmethod
-    def import_kernels(cls) -> None:
-        # Do not import vllm._C
-        with contextlib.suppress(ImportError):
-            import vllm._moe_C  # noqa: F401
-
-    @classmethod
-    def get_attn_backend_cls(
-        cls,
-        selected_backend: "AttentionBackendEnum",
-        attn_selector_config: "AttentionSelectorConfig",
-    ) -> str:
-        if attn_selector_config.use_sparse:
-            raise NotImplementedError("Sparse Attention is not supported on TPU.")
-        if selected_backend != AttentionBackendEnum.PALLAS:
-            logger.info("Cannot use %s backend on TPU.", selected_backend)
-
-        logger.info("Using Pallas V1 backend.")
-        return AttentionBackendEnum.PALLAS.get_path()
-
-    @classmethod
-    def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]:
-        return [
-            AttentionBackendEnum.PALLAS,
-        ]
-
-    @classmethod
-    def get_vit_attn_backend(
-        cls,
-        head_size: int,
-        dtype: torch.dtype,
-        backend: Optional["AttentionBackendEnum"] = None,
-    ) -> "AttentionBackendEnum":
-        if backend is not None:
-            assert backend in cls.get_supported_vit_attn_backends(), (
-                f"Backend {backend} is not supported for vit attention"
-                f"Supported backends are: {cls.get_supported_vit_attn_backends()}."
-            )
-            logger.info_once(f"Using backend {backend} for vit attention.")
-            return backend
-
-        logger.info_once(
-            f"Using default backend {AttentionBackendEnum.PALLAS} for vit attention."
-        )
-        return AttentionBackendEnum.PALLAS
-
-    @classmethod
-    def set_device(cls, device: torch.device) -> None:
-        """
-        Set the device for the current platform.
-        """
-        torch.tpu.set_device(device)
-
-    @classmethod
-    def get_device_name(cls, device_id: int = 0) -> str:
-        chip_type, _ = device.get_local_chips()
-        return f"TPU {chip_type.name}"
-
-    @classmethod
-    def get_device_total_memory(cls, device_id: int = 0) -> int:
-        raise NotImplementedError
-
-    @classmethod
-    def get_punica_wrapper(cls) -> str:
-        return "vllm.lora.punica_wrapper.punica_tpu.PunicaWrapperTPU"
-
-    @classmethod
-    def get_infinity_values(cls, dtype: torch.dtype) -> tuple[float, float]:
-        return torch.finfo(dtype).min, torch.finfo(dtype).max
-
-    @classmethod
-    def can_update_inplace(cls):
-        return False
-
-    @classmethod
-    def get_lora_vocab_padding_size(cls) -> int:
-        return 1
-
-    @classmethod
-    def inference_mode(cls):
-        return torch.no_grad()
-
-    @classmethod
-    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        from vllm.config import CompilationMode, CUDAGraphMode
-
-        cache_config = vllm_config.cache_config
-        # For v0, the default block size is 16.
-        if cache_config and cache_config.block_size is None:
-            cache_config.block_size = cast(BlockSize, 16)
-        compilation_config = vllm_config.compilation_config
-
-        # TPU only supports DYNAMO_TRACE_ONCE compilation mode
-        if compilation_config.mode != CompilationMode.DYNAMO_TRACE_ONCE:
-            logger.info(
-                "[TPU] Forcing DYNAMO_TRACE_ONCE compilation mode, and\
-                disabling cudagraph."
-            )
-            compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
-
-        if (
-            compilation_config.cudagraph_mode is None
-            or compilation_config.cudagraph_mode.max_cudagraph_mode()
-            != CUDAGraphMode.NONE
-        ):
-            logger.info(
-                "[TPU] CUDA graph is not supported on TPU, disabling cudagraphs."
-            )
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-
-        if compilation_config.backend == "":
-            compilation_config.backend = "openxla"
-
-        assert vllm_config.speculative_config is None, (
-            "TPU does not support speculative decoding"
-        )
-
-        model_config = vllm_config.model_config
-        if model_config is not None and model_config.dtype in (
-            torch.float16,
-            torch.float32,
-        ):
-            logger.warning(
-                "The TPU backend currently does not support %s. "
-                "Using bfloat16 instead.",
-                model_config.dtype,
-            )
-            model_config.dtype = torch.bfloat16
-
-        from vllm.v1.attention.backends.pallas import PallasAttentionBackend
-
-        cache_config.block_size = PallasAttentionBackend.get_page_size(vllm_config)  # type: ignore[assignment]
-
-        parallel_config = vllm_config.parallel_config
-        scheduler_config = vllm_config.scheduler_config
-        if parallel_config.worker_cls == "auto":
-            parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker"
-
-        assert not vllm_config.speculative_config, (
-            "Speculative decoding is not yet supported for TPU backend"
-        )
-
-        if (
-            scheduler_config.is_multimodal_model
-            and not scheduler_config.disable_chunked_mm_input
-        ):
-            logger.warning(
-                "TPU does not support running Multimodal models"
-                " without setting `--disable_chunked_mm_input`. "
-                "Forcing --disable_chunked_mm_input."
-            )
-            scheduler_config.disable_chunked_mm_input = True
-
-        if model_config and model_config.use_mla:
-            logger.info(
-                "MLA is enabled on a non-GPU platform; forcing chunked "
-                "prefill and prefix caching to be disabled."
-            )
-            vllm_config.scheduler_config.enable_chunked_prefill = False
-            vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.model_config.max_model_len,
-                vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
-            )
-
-    @classmethod
-    def is_pin_memory_available(cls):
-        logger.warning("Pin memory is not supported on TPU.")
-        return False
-
-    @classmethod
-    def get_device_communicator_cls(cls) -> str:
-        return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator"  # noqa
-
-    @classmethod
-    def validate_request(
-        cls,
-        prompt: PromptType,
-        params: ParamsType,
-        processed_inputs: ProcessorInputs,
-    ) -> None:
-        """Raises if this request is unsupported on this platform"""
-        from vllm.sampling_params import SamplingParams, SamplingType
-
-        if (
-            isinstance(params, SamplingParams)
-            and params.sampling_type == SamplingType.RANDOM_SEED
-        ):
-            raise ValueError("Torch XLA does not support per-request seed.")
-
-    @classmethod
-    @torch.compile(backend="openxla")
-    def insert_blocks_to_device(
-        cls,
-        src_cache: torch.Tensor,
-        dst_cache: torch.Tensor,
-        src_block_indices: torch.Tensor,
-        dst_block_indices: torch.Tensor,
-    ) -> None:
-        torch.ops.xla.dynamo_set_buffer_donor_(dst_cache, True)
-        dst_cache[dst_block_indices] = src_cache[src_block_indices].to(dst_cache.device)
-
-    @classmethod
-    @torch.compile(backend="openxla")
-    def swap_out_blocks_to_host(
-        cls,
-        src_cache: torch.Tensor,
-        dst_cache: torch.Tensor,
-        src_block_indices: torch.Tensor,
-        dst_block_indices: torch.Tensor,
-    ) -> None:
-        """tpu blocks to cpu blocks"""
-        torch.ops.xla.dynamo_set_buffer_donor_(src_cache, True)
-        dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()
-
-    @classmethod
-    def use_sync_weight_loader(cls) -> bool:
-        return True
-
-    @classmethod
-    def check_max_model_len(cls, max_model_len: int) -> int:
-        """
-        Check max_model_len for the current platform.
-        """
-        logger.warning(
-            "--max-model-len is not specified, "
-            "it's currently using model's default length %d, "
-            "which might be too large."
-            "Please input with --max-model-len based on your "
-            "request input length and output length, to avoid "
-            "unnecessary degradation.",
-            max_model_len,
-        )
-        return max_model_len
-
 
 try:
     from tpu_inference.platforms import (
@@ -291,5 +14,7 @@ try:
     TpuPlatform = TpuInferencePlatform  # type: ignore
     USE_TPU_INFERENCE = True
 except ImportError:
-    logger.info("tpu_inference not found, using vLLM's TpuPlatform")
+    logger.error(
+        "tpu_inference not found, please install tpu_inference to run vllm on TPU"
+    )
     pass
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index af8979af366430b72fc8e0f46cf035a8539f03b0..b2d7bf38dd48070153b596d542648f6d65eeb5f9 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -7,15 +7,14 @@ from typing import TYPE_CHECKING, Optional
 
 import torch
 
-import vllm.envs as envs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interface import DeviceCapability, Platform, PlatformEnum
 
 if TYPE_CHECKING:
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 else:
     VllmConfig = None
 
@@ -53,11 +52,18 @@ class XPUPlatform(Platform):
             "only NHD layout is supported by XPU attention kernels."
         )
 
+        dtype = attn_selector_config.dtype
         if attn_selector_config.use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on XPU.")
         if selected_backend == AttentionBackendEnum.TRITON_ATTN:
             logger.info_once("Using Triton backend.")
             return AttentionBackendEnum.TRITON_ATTN.get_path()
+        elif dtype == torch.float32:
+            logger.warning_once(
+                "Flash Attention on XPU does not support float32 dtype. "
+                "Falling back to Triton Attention backend."
+            )
+            return AttentionBackendEnum.TRITON_ATTN.get_path()
         elif selected_backend == AttentionBackendEnum.FLASH_ATTN:
             logger.info_once("Using Flash Attention backend.")
             return AttentionBackendEnum.FLASH_ATTN.get_path()
@@ -156,39 +162,18 @@ class XPUPlatform(Platform):
 
         if vllm_config.lora_config is not None:
             compilation_config.mode = CompilationMode.NONE
-
+        # decrease triton kernel compilation scratch space for speculative decoding
+        if vllm_config.speculative_config is not None:
+            os.environ["IGC_ForceOCLSIMDWidth"] = "16"  # noqa: SIM112
         # check and update parallel config
         parallel_config = vllm_config.parallel_config
-        parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"
+        # Only override worker_cls if it's still the default "auto"
+        # This allows custom workers (like vllm-omni workers) to be used on XPU
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"
         if vllm_config.kv_transfer_config is not None:
             vllm_config.kv_transfer_config.enable_permute_local_kv = True
 
-        if parallel_config.distributed_executor_backend is None:
-            if parallel_config.world_size > 1:
-                parallel_config.distributed_executor_backend = "ray"
-            else:
-                parallel_config.distributed_executor_backend = "uni"
-        elif parallel_config.distributed_executor_backend == "mp":
-            # FIXME(kunshang):
-            # spawn needs calling `if __name__ == '__main__':`
-            # fork is not supported for xpu start new process.
-            if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
-                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-                logger.warning(
-                    "Please use spawn as start method if you want to use mp."
-                )
-        elif (
-            parallel_config.distributed_executor_backend != "ray"
-            and parallel_config.distributed_executor_backend != "uni"
-            and parallel_config.distributed_executor_backend != "external_launcher"
-        ):
-            logger.warning(
-                "%s is not supported on XPU, fallback to ray distributed"
-                " executor backend.",
-                parallel_config.distributed_executor_backend,
-            )
-            parallel_config.distributed_executor_backend = "ray"
-
         if model_config and model_config.use_mla:
             logger.info(
                 "MLA is enabled on a non-GPU platform; forcing chunked "
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 4a5caa7e27fc79f2eee3f7d842eec42d091ab7c2..84101e1ae76595f94845d8f2d260ee4e0e03507e 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -26,9 +26,9 @@ class PoolingParams(
             Set to None to disable truncation.
         dimensions: Reduce the dimensions of embeddings
             if model support matryoshka representation.
-        normalize: Whether to normalize the embeddings outputs.
-        softmax: softmax will be deprecated, please use use_activation instead.
-        activation: activation will be deprecated, please use use_activation instead.
+        normalize: Deprecated, please use use_activation instead.
+        softmax: Deprecated, please use use_activation instead.
+        activation: Deprecated, please use use_activation instead.
         use_activation: Whether to apply activation function to
             the classification outputs.
     """
@@ -63,15 +63,15 @@ class PoolingParams(
 
     @property
     def all_parameters(self) -> list[str]:
-        return ["dimensions", "normalize", "use_activation"]
+        return ["dimensions", "use_activation"]
 
     @property
     def valid_parameters(self):
         return {
-            "embed": ["dimensions", "normalize"],
+            "embed": ["dimensions", "use_activation"],
             "classify": ["use_activation"],
             "score": ["use_activation"],
-            "token_embed": ["dimensions", "normalize"],
+            "token_embed": ["dimensions", "use_activation"],
             "token_classify": ["use_activation"],
         }
 
@@ -140,7 +140,7 @@ class PoolingParams(
         self, pooler_config: "PoolerConfig", valid_parameters: list[str]
     ):
         step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
-        if pooler_config.pooling_type != "STEP":
+        if pooler_config.tok_pooling_type != "STEP":
             invalid_parameters = []
             for k in step_pooling_parameters:
                 if getattr(self, k, None) is not None:
@@ -162,8 +162,8 @@ class PoolingParams(
 
     def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
         if self.task in ["embed", "token_embed"]:
-            if self.normalize is None:
-                self.normalize = True
+            if self.use_activation is None:
+                self.use_activation = True
 
             if self.dimensions is not None and model_config is not None:
                 if not model_config.is_matryoshka:
@@ -213,7 +213,6 @@ class PoolingParams(
         return (
             f"PoolingParams("
             f"task={self.task}, "
-            f"normalize={self.normalize}, "
             f"dimensions={self.dimensions}, "
             f"use_activation={self.use_activation}, "
             f"step_tag_id={self.step_tag_id}, "
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index 6604f70badbcf4c38d07f0aaaaaf8d03c2c6f34a..efb080276e4618122c4902408a5f34608deabdda 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -24,8 +24,10 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
     def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
         super().__init__(tokenizer, *args, **kwargs)
 
-        chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}
-        thinking = bool(chat_kwargs.pop("thinking", False))
+        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+        thinking = bool(chat_kwargs.get("thinking", False))
+        enable_thinking = bool(chat_kwargs.get("enable_thinking", False))
+        thinking = thinking or enable_thinking
 
         if thinking:
             self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
index e66acfe6e91d913b429b57980ae9b1eb6b6e224a..466819f8b45b954468d4dbf19974c7ea63192a48 100644
--- a/vllm/reasoning/glm4_moe_reasoning_parser.py
+++ b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -1,171 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Sequence
+from vllm.reasoning.holo2_reasoning_parser import Holo2ReasoningParser
 
-from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
-from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser
-
-logger = init_logger(__name__)
-
-
-class Glm4MoeModelReasoningParser(ReasoningParser):
+class Glm4MoeModelReasoningParser(Holo2ReasoningParser):
     """
-    Reasoning parser for the Glm4MoeModel model.
-    
-    The Glm4MoeModel model uses <think>...</think> tokens to denote reasoning
-    text within its output. The model provides a strict switch to disable
-    reasoning output via the 'enable_thinking=False' parameter. This parser
-    extracts the reasoning content enclosed by <think> and </think> tokens
-    from the model's output.
+    Reasoning parser for the Glm4MoeModel model,which inherits from
+    `Holo2ReasoningParser`.
     """
 
-    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
-        super().__init__(tokenizer, *args, **kwargs)
-        self.think_start_token = "<think>"
-        self.think_end_token = "</think>"
-        self.assistant_token = "<|assistant|>"
-
-        if not self.model_tokenizer:
-            raise ValueError(
-                "The model tokenizer must be passed to the ReasoningParser "
-                "constructor during construction."
-            )
-
-        self.think_start_token_id = self.vocab.get(self.think_start_token)
-        self.think_end_token_id = self.vocab.get(self.think_end_token)
-        self.assistant_token_id = self.vocab.get(self.assistant_token)
-        if (
-            self.think_start_token_id is None
-            or self.think_end_token_id is None
-            or self.assistant_token_id is None
-        ):
-            raise RuntimeError(
-                "Glm4MoeModel reasoning parser could not locate "
-                "think start/end or assistant tokens in the tokenizer!"
-            )
-
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
-        """
-        GLM's chat template has <think></think> tokens after every
-        <|assistant|> token. Thus, we need to check if </think> is
-        after the most recent <|assistant|> token (if present).
-        """
-        for token_id in input_ids[::-1]:
-            if token_id == self.think_end_token_id:
-                return True
-            elif token_id == self.assistant_token_id:
-                return False
-        return False
-
-    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
-        """
-        Extract the content after the end tokens
-        """
-        if self.think_end_token_id not in input_ids[:-1]:
-            return []
-        else:
-            return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
-
-    def extract_reasoning_streaming(
-        self,
-        previous_text: str,
-        current_text: str,
-        delta_text: str,
-        previous_token_ids: Sequence[int],
-        current_token_ids: Sequence[int],
-        delta_token_ids: Sequence[int],
-    ) -> DeltaMessage | None:
-        """
-        Extract reasoning content from a delta message.
-        Handles streaming output where previous + delta = current.
-        Uses token IDs for faster processing.
-        For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning
-        - 'xyz' goes to content
-        """
-        # Skip single special tokens
-        if len(delta_token_ids) == 1 and (
-            delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id]
-        ):
-            return None
-
-        if self.think_start_token_id in previous_token_ids:
-            if self.think_end_token_id in delta_token_ids:
-                # <think> in previous, </think> in delta,
-                # extract reasoning content
-                end_index = delta_text.find(self.think_end_token)
-                reasoning = delta_text[:end_index]
-                content = delta_text[end_index + len(self.think_end_token) :]
-                return DeltaMessage(
-                    reasoning=reasoning,
-                    content=content if content else None,
-                )
-            elif self.think_end_token_id in previous_token_ids:
-                # <think> in previous, </think> in previous,
-                # reasoning content continues
-                return DeltaMessage(content=delta_text)
-            else:
-                # <think> in previous, no </think> in previous or delta,
-                # reasoning content continues
-                return DeltaMessage(reasoning=delta_text)
-        elif self.think_start_token_id in delta_token_ids:
-            if self.think_end_token_id in delta_token_ids:
-                # <think> in delta, </think> in delta, extract reasoning content
-                start_index = delta_text.find(self.think_start_token)
-                end_index = delta_text.find(self.think_end_token)
-                reasoning = delta_text[
-                    start_index + len(self.think_start_token) : end_index
-                ]
-                content = delta_text[end_index + len(self.think_end_token) :]
-                return DeltaMessage(
-                    reasoning=reasoning,
-                    content=content if content else None,
-                )
-            else:
-                # <think> in delta, no </think> in delta,
-                # reasoning content continues
-                return DeltaMessage(reasoning=delta_text)
-        else:
-            # thinking is disabled, just content
-            return DeltaMessage(content=delta_text)
-
-    def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[str | None, str | None]:
-        """
-        Extract reasoning content from the model output.
-
-        For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning
-        - 'xyz' goes to content
-
-        Returns:
-            tuple[Optional[str], Optional[str]]: reasoning content and content
-        """
-
-        # Check if the model output contains the <think> and </think> tokens.
-        if (
-            self.think_start_token not in model_output
-            or self.think_end_token not in model_output
-        ):
-            return None, model_output
-        # Check if the <think> is present in the model output, remove it
-        # if it is present.
-        model_output_parts = model_output.partition(self.think_start_token)
-        model_output = (
-            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
-        )
-        # Check if the model output contains the </think> tokens.
-        # If the end token is not found, return the model output as is.
-        if self.think_end_token not in model_output:
-            return None, model_output
-
-        # Extract reasoning content from the model output.
-        reasoning, _, content = model_output.partition(self.think_end_token)
-
-        final_content = content or None
-        return reasoning, final_content
+    pass
diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py
index f80190d28d6aa5e1ea346fc508718dfb991d7721..3b5bfd838017f97c42ad332ebd1fb179f0ebd6b8 100644
--- a/vllm/reasoning/holo2_reasoning_parser.py
+++ b/vllm/reasoning/holo2_reasoning_parser.py
@@ -46,9 +46,10 @@ class Holo2ReasoningParser(ReasoningParser):
         # all requests in the structured output manager. So it is important that without
         # user specified chat template args, the default thinking is True.
 
-        enable_thinking = bool(chat_kwargs.get("thinking", True))
-
-        if enable_thinking:
+        thinking = bool(chat_kwargs.get("thinking", True))
+        enable_thinking = bool(chat_kwargs.get("enable_thinking", True))
+        thinking = thinking and enable_thinking
+        if thinking:
             self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
         else:
             self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)
diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py
index de3d1296ec73484825cc0302598c2d230c8fcbfc..48a36b4c6634caf789f3855f806455545b44bb26 100644
--- a/vllm/reasoning/mistral_reasoning_parser.py
+++ b/vllm/reasoning/mistral_reasoning_parser.py
@@ -104,7 +104,7 @@ class MistralReasoningParser(BaseThinkingReasoningParser):
         # 3. Both BOT and EOT have been outputted.
         elif has_bot_token and has_eot_token:
             return input_ids[:bot_token_index] + input_ids[eot_token_index + 1 :]
-        # 4. Only EOT has been outputted => this should not have occured for a model
+        # 4. Only EOT has been outputted => this should not have occurred for a model
         #    well prompted and trained.
         else:
             return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :]
diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py
index f635758a92c0dd6ac0637021b47f8e96de18458c..b85bc826572f3ec7ac69402a4d096815cdfd9831 100644
--- a/vllm/reasoning/step3_reasoning_parser.py
+++ b/vllm/reasoning/step3_reasoning_parser.py
@@ -100,6 +100,12 @@ class Step3ReasoningParser(ReasoningParser):
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids
 
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        end_token_id = self.think_end_token_id
+        return end_token_id in delta_ids
+
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         if self.think_end_token_id not in input_ids[:-1]:
             return []
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 453100f2e51359c836bd02a2aa97efdad6915622..00542830a5d4f1912410e4aa7569f96d0cf862af 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -11,6 +11,7 @@ from typing import Annotated, Any
 import msgspec
 from pydantic.dataclasses import dataclass
 
+from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger
 from vllm.logits_process import LogitsProcessor
 from vllm.tokenizers import TokenizerLike
@@ -211,6 +212,12 @@ class SamplingParams(
     set to an integer k, will use only the last k tokens from the prompt
     (i.e., left truncation). If set to `None`, truncation is disabled."""
     output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE
+    skip_clone: bool = False
+    """Internal flag indicating that this SamplingParams instance is safe to
+    reuse without cloning. When True, clone() will return self without
+    performing a deep copy. This should only be set when the params object
+    is guaranteed to be dedicated to a single request and won't be modified
+    in ways that would affect other uses."""
 
     # The below fields are not supposed to be used as an input.
     # They are set in post_init.
@@ -270,6 +277,7 @@ class SamplingParams(
         logit_bias: dict[int, float] | dict[str, float] | None = None,
         allowed_token_ids: list[int] | None = None,
         extra_args: dict[str, Any] | None = None,
+        skip_clone: bool = False,
     ) -> "SamplingParams":
         if logit_bias is not None:
             # Convert token_id to integer
@@ -310,6 +318,7 @@ class SamplingParams(
             logit_bias=logit_bias,
             allowed_token_ids=allowed_token_ids,
             extra_args=extra_args,
+            skip_clone=skip_clone,
         )
 
     def __post_init__(self) -> None:
@@ -385,11 +394,17 @@ class SamplingParams(
                 f"{self.repetition_penalty}."
             )
         if self.temperature < 0.0:
-            raise ValueError(
-                f"temperature must be non-negative, got {self.temperature}."
+            raise VLLMValidationError(
+                f"temperature must be non-negative, got {self.temperature}.",
+                parameter="temperature",
+                value=self.temperature,
             )
         if not 0.0 < self.top_p <= 1.0:
-            raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
+            raise VLLMValidationError(
+                f"top_p must be in (0, 1], got {self.top_p}.",
+                parameter="top_p",
+                value=self.top_p,
+            )
         # quietly accept -1 as disabled, but prefer 0
         if self.top_k < -1:
             raise ValueError(
@@ -402,7 +417,11 @@ class SamplingParams(
         if not 0.0 <= self.min_p <= 1.0:
             raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
         if self.max_tokens is not None and self.max_tokens < 1:
-            raise ValueError(f"max_tokens must be at least 1, got {self.max_tokens}.")
+            raise VLLMValidationError(
+                f"max_tokens must be at least 1, got {self.max_tokens}.",
+                parameter="max_tokens",
+                value=self.max_tokens,
+            )
         if self.min_tokens < 0:
             raise ValueError(
                 f"min_tokens must be greater than or equal to 0, got {self.min_tokens}."
@@ -413,24 +432,30 @@ class SamplingParams(
                 f"max_tokens={self.max_tokens}, got {self.min_tokens}."
             )
         if self.logprobs is not None and self.logprobs != -1 and self.logprobs < 0:
-            raise ValueError(
-                f"logprobs must be non-negative or -1, got {self.logprobs}."
+            raise VLLMValidationError(
+                f"logprobs must be non-negative or -1, got {self.logprobs}.",
+                parameter="logprobs",
+                value=self.logprobs,
             )
         if (
             self.prompt_logprobs is not None
             and self.prompt_logprobs != -1
             and self.prompt_logprobs < 0
         ):
-            raise ValueError(
+            raise VLLMValidationError(
                 f"prompt_logprobs must be non-negative or -1, got "
-                f"{self.prompt_logprobs}."
+                f"{self.prompt_logprobs}.",
+                parameter="prompt_logprobs",
+                value=self.prompt_logprobs,
             )
         if self.truncate_prompt_tokens is not None and (
             self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1
         ):
-            raise ValueError(
+            raise VLLMValidationError(
                 f"truncate_prompt_tokens must be an integer >= 1 or -1, "
-                f"got {self.truncate_prompt_tokens}"
+                f"got {self.truncate_prompt_tokens}",
+                parameter="truncate_prompt_tokens",
+                value=self.truncate_prompt_tokens,
             )
         assert isinstance(self.stop_token_ids, list)
         if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
@@ -508,12 +533,14 @@ class SamplingParams(
             if token_id < 0 or token_id > tokenizer.max_token_id
         ]
         if len(invalid_token_ids) > 0:
-            raise ValueError(
+            raise VLLMValidationError(
                 f"The model vocabulary size is {tokenizer.max_token_id + 1},"
                 f" but the following tokens"
                 f" were specified as bad: {invalid_token_ids}."
                 f" All token id values should be integers satisfying:"
-                f" 0 <= token_id <= {tokenizer.max_token_id}."
+                f" 0 <= token_id <= {tokenizer.max_token_id}.",
+                parameter="bad_words",
+                value=self.bad_words,
             )
 
     @cached_property
@@ -540,8 +567,13 @@ class SamplingParams(
         data that is expensive to copy. However, if not copied, the processor
         needs to support parallel decoding for multiple sequences
         See https://github.com/vllm-project/vllm/issues/3087
+
+        If skip_clone is True, uses shallow copy instead of deep copy.
         """
 
+        if self.skip_clone:
+            return copy.copy(self)
+
         logit_processor_refs = (
             None
             if self.logits_processors is None
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 6d20ca9aac225d6ecc9365cb9e817d44aac4dc13..3e12f148b22ecd794d1005838b7000331d947e6a 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -12,40 +12,6 @@ if TYPE_CHECKING:
 else:
     KVConnectorOutput = Any
 
-VLLM_TOKEN_ID_ARRAY_TYPE = "l"
-
-VLLM_INVALID_TOKEN_ID = -1
-
-
-@dataclass
-class RequestMetrics:
-    """Metrics associated with a request.
-
-    Attributes:
-        arrival_time: The time when the request arrived.
-        first_scheduled_time: The time when the request was first scheduled.
-        first_token_time: The time when the first token was generated.
-        time_in_queue: The time the request spent in the queue.
-        finished_time: The time when the request was finished.
-        scheduler_time: The time spent in the scheduler when this request was
-                        being considered by the scheduler.
-        model_forward_time: The time spent in the model forward pass when this
-                            request was in the batch.
-        model_execute_time: The time spent in the model execute function. This
-                            will include model forward, block/sync across
-                            workers, cpu-gpu sync time and sampling time.
-    """
-
-    arrival_time: float
-    last_token_time: float
-    first_scheduled_time: float | None
-    first_token_time: float | None
-    time_in_queue: float | None
-    finished_time: float | None = None
-    scheduler_time: float | None = None
-    model_forward_time: float | None = None
-    model_execute_time: float | None = None
-
 
 # cannot use msgspec.Struct here because Dynamo does not support it
 @dataclass
diff --git a/vllm/tasks.py b/vllm/tasks.py
index b02cde74c12a9763341c9dfab2940c8528647669..bd3e5af779388987eeacf9840c6e056bab486f31 100644
--- a/vllm/tasks.py
+++ b/vllm/tasks.py
@@ -3,11 +3,11 @@
 from typing import Literal, get_args
 
 GenerationTask = Literal["generate", "transcription"]
-GENERATION_TASKS = get_args(GenerationTask)
+GENERATION_TASKS: tuple[GenerationTask, ...] = get_args(GenerationTask)
 
 PoolingTask = Literal[
     "embed", "classify", "score", "token_embed", "token_classify", "plugin"
 ]
-POOLING_TASKS = get_args(PoolingTask)
+POOLING_TASKS: tuple[PoolingTask, ...] = get_args(PoolingTask)
 
 SupportedTask = Literal[GenerationTask, PoolingTask]
diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py
index 31e74b1a16e2039fe1b460e0011c53f9783a768f..2daba409881f2408a4cf2afc0e456ce7a68f408d 100644
--- a/vllm/tokenizers/__init__.py
+++ b/vllm/tokenizers/__init__.py
@@ -7,7 +7,6 @@ from .registry import (
     cached_get_tokenizer,
     cached_tokenizer_from_config,
     get_tokenizer,
-    init_tokenizer_from_config,
 )
 
 __all__ = [
@@ -16,5 +15,4 @@ __all__ = [
     "cached_get_tokenizer",
     "get_tokenizer",
     "cached_tokenizer_from_config",
-    "init_tokenizer_from_config",
 ]
diff --git a/vllm/tokenizers/deepseek_v32.py b/vllm/tokenizers/deepseek_v32.py
index bf279a5cf67c58b4ccb66a3f04a95d5fbd029106..4402054c9a5d4f8ba472bc6952f04aecf05ea388 100644
--- a/vllm/tokenizers/deepseek_v32.py
+++ b/vllm/tokenizers/deepseek_v32.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from pathlib import Path
-from typing import Any
+from typing import Any, overload
 
 from transformers import BatchEncoding
 
@@ -50,6 +50,8 @@ class DeepseekV32Tokenizer(CachedHfTokenizer):
         **kwargs,
     ) -> str | list[int]:
         thinking = kwargs.get("thinking", False)
+        enable_thinking = kwargs.get("enable_thinking", False)
+        thinking = thinking or enable_thinking
         thinking_mode = "thinking"
         if not thinking:
             thinking_mode = "chat"
@@ -63,6 +65,7 @@ class DeepseekV32Tokenizer(CachedHfTokenizer):
         drop_thinking = messages[-1]["role"] == "user"
 
         encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking)
+
         prompt_str = encode_messages(messages, **encode_config)  # type: ignore
 
         if kwargs.get("tokenize", True):
@@ -159,6 +162,15 @@ class DeepseekV32Tokenizer(CachedHfTokenizer):
             add_special_tokens=add_special_tokens,
         )
 
+    @overload
+    def convert_tokens_to_ids(self, tokens: str) -> int: ...
+
+    @overload
+    def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
+
+    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
+        return self.tokenizer.convert_tokens_to_ids(tokens)
+
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
         return self.tokenizer.convert_tokens_to_string(tokens)
 
diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py
index 521bd9295931259d1a7eec3a1bde39692a1feecd..0c42699e570397b4e90509294dad4c6788ec2f9f 100644
--- a/vllm/tokenizers/deepseek_v32_encoding.py
+++ b/vllm/tokenizers/deepseek_v32_encoding.py
@@ -169,6 +169,7 @@ def render_message(
     response_format = msg.get("response_format")
     tool_calls = msg.get("tool_calls")
     reasoning_content = msg.get("reasoning") or msg.get("reasoning_content")
+    is_prefix = msg.get("prefix", False)
 
     if tools:
         tools = tools_from_openai_format(tools)
@@ -273,11 +274,14 @@ def render_message(
                 + thinking_end_token
             )
 
-        prompt += assistant_msg_template.format(
-            reasoning=thinking_part,
-            content=summary_content,
-            tool_calls=tool_calls_content,
-        )
+        if not tool_calls and is_prefix:
+            prompt += summary_content
+        else:
+            prompt += assistant_msg_template.format(
+                reasoning=thinking_part,
+                content=summary_content,
+                tool_calls=tool_calls_content,
+            )
     else:
         raise NotImplementedError(f"Unknown role: {role}")
 
diff --git a/vllm/tokenizers/grok2.py b/vllm/tokenizers/grok2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4071908d8fb73df47d11988284a128c073c1fa4
--- /dev/null
+++ b/vllm/tokenizers/grok2.py
@@ -0,0 +1,443 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tokenizer for Grok-2 .tok.json format."""
+
+import functools
+import json
+from collections.abc import Collection, Set
+from pathlib import Path
+from typing import Any, Literal, overload
+
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    HfHubHTTPError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+)
+from transformers import BatchEncoding
+from transformers.utils import chat_template_utils as hf_chat_utils
+
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.logger import init_logger
+
+from .protocol import TokenizerLike
+
+logger = init_logger(__name__)
+
+PAD = "<|pad|>"
+EOS = "<|eos|>"
+SEP = "<|separator|>"
+RESERVED_TOKEN_TEXTS = [f"<|reserved_{i}|>" for i in range(3, 128)]
+CONTROL_TOKEN_TEXTS = [f"<|control{i}|>" for i in range(1, 705)]
+DEFAULT_SPECIAL_TOKENS = [PAD, SEP, EOS]
+DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": SEP, "eos": EOS}
+DEFAULT_CHAT_TEMPLATE = (
+    "{% for message in messages %}"
+    "{% if message['role'] == 'user' %}"
+    "{{ 'Human: ' + message['content'].strip() + '<|separator|>\\n\\n' }}"
+    "{% elif message['role'] == 'system' %}"
+    "{{ 'System: ' + message['content'].strip() + '<|separator|>\\n\\n' }}"
+    "{% elif message['role'] == 'assistant' %}"
+    "{{ 'Assistant: ' + message['content'] + '<|separator|>\\n\\n' }}"
+    "{% endif %}"
+    "{% endfor %}"
+    "{% if add_generation_prompt %}"
+    "{{ 'Assistant:' }}"
+    "{% endif %}"
+)
+
+# Default + separate each single digit.
+PAT_STR_B = (
+    r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}|"""
+    r""" ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+)
+
+
+def _maybe_load_tokenizer_config(
+    model_path: Path,
+    *,
+    repo_id: str | None,
+    revision: str | None,
+    download_dir: str | None,
+) -> dict[str, Any]:
+    config_path = model_path / "tokenizer_config.json"
+    if config_path.is_file():
+        with config_path.open("r", encoding="utf-8") as f:
+            return json.load(f)
+
+    if repo_id is None:
+        return {}
+
+    try:
+        config_file = hf_hub_download(
+            repo_id=repo_id,
+            filename="tokenizer_config.json",
+            revision=revision,
+            cache_dir=download_dir,
+        )
+    except (RepositoryNotFoundError, RevisionNotFoundError, EntryNotFoundError):
+        # If the repo, revision, or file does not exist, fall back silently.
+        return {}
+    except HfHubHTTPError as exc:
+        logger.warning(
+            "Failed to download tokenizer_config.json from %s. "
+            "This may be due to a network or authentication issue. "
+            "The default chat template will be used. Error: %s",
+            repo_id,
+            exc,
+        )
+        return {}
+
+    try:
+        with Path(config_file).open("r", encoding="utf-8") as f:
+            return json.load(f)
+    except json.JSONDecodeError as exc:
+        logger.warning(
+            "Failed to parse tokenizer_config.json. "
+            "The default chat template will be used. Error: %s",
+            exc,
+        )
+        return {}
+    except OSError as exc:
+        logger.warning(
+            "Failed to open tokenizer_config.json. "
+            "The default chat template will be used. Error: %s",
+            exc,
+        )
+        return {}
+
+
+def _load_tiktoken_encoding(
+    vocab_file: Path,
+) -> tuple[Any, dict[str, int]]:
+    try:
+        import tiktoken
+    except ImportError as exc:
+        raise ImportError("Grok-2 tokenizer requires the `tiktoken` package.") from exc
+
+    with vocab_file.open("rb") as f:
+        xtok_dict = json.load(f)
+
+    mergeable_ranks = {
+        bytes(item["bytes"]): item["token"]
+        for item in xtok_dict.get("regular_tokens", [])
+    }
+    special_tokens = {
+        bytes(item["bytes"]).decode("utf-8", errors="replace"): item["token"]
+        for item in xtok_dict.get("special_tokens", [])
+    }
+
+    if xtok_dict.get("word_split") == "V1":
+        pat_str = PAT_STR_B
+    else:
+        raise ValueError(f"Unknown word_split: {xtok_dict.get('word_split')!r}")
+
+    pat_str = xtok_dict.get("pat_str", pat_str)
+
+    kwargs = {
+        "name": str(vocab_file),
+        "pat_str": pat_str,
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+    if "vocab_size" in xtok_dict:
+        kwargs["explicit_n_vocab"] = xtok_dict["vocab_size"]
+
+    tokenizer = tiktoken.Encoding(**kwargs)
+
+    default_allowed_special: set[str] | None = None
+    if "default_allowed_special" in xtok_dict:
+        default_allowed_special = {
+            bytes(bytes_list).decode("utf-8", errors="replace")
+            for bytes_list in xtok_dict["default_allowed_special"]
+        }
+
+    tokenizer._default_allowed_special = default_allowed_special or set()
+    tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS
+
+    def encode_patched(
+        self,
+        text: str,
+        *,
+        allowed_special: Literal["all"] | Set[str] = set(),
+        disallowed_special: Literal["all"] | Collection[str] = "all",
+    ) -> list[int]:
+        del disallowed_special
+        if isinstance(allowed_special, set):
+            allowed_special |= self._default_allowed_special
+        return tiktoken.Encoding.encode(
+            self,
+            text,
+            allowed_special=allowed_special,
+            disallowed_special=(),
+        )
+
+    tokenizer.encode = functools.partial(encode_patched, tokenizer)
+    tokenizer._default_allowed_special |= set(DEFAULT_CONTROL_TOKENS.values())
+    tokenizer._default_allowed_special |= set(
+        CONTROL_TOKEN_TEXTS + RESERVED_TOKEN_TEXTS
+    )
+
+    return tokenizer, special_tokens
+
+
+class Grok2Tokenizer(TokenizerLike):
+    @classmethod
+    def from_pretrained(
+        cls,
+        path_or_repo_id: str | Path,
+        *args,
+        trust_remote_code: bool = False,
+        revision: str | None = None,
+        download_dir: str | None = None,
+        **kwargs,
+    ) -> "Grok2Tokenizer":
+        if args:
+            logger.debug_once("Ignoring extra positional args for Grok2Tokenizer.")
+
+        path = Path(path_or_repo_id)
+        if path.is_file():
+            vocab_file = path
+            model_path = path.parent
+            repo_id = None
+        elif path.is_dir():
+            vocab_file = path / "tokenizer.tok.json"
+            model_path = path
+            repo_id = None
+        else:
+            vocab_file = Path(
+                hf_hub_download(
+                    repo_id=str(path_or_repo_id),
+                    filename="tokenizer.tok.json",
+                    revision=revision,
+                    cache_dir=download_dir,
+                )
+            )
+            model_path = vocab_file.parent
+            repo_id = str(path_or_repo_id)
+
+        if not vocab_file.is_file():
+            raise FileNotFoundError(f"tokenizer.tok.json not found at {vocab_file}.")
+
+        config = _maybe_load_tokenizer_config(
+            model_path,
+            repo_id=repo_id,
+            revision=revision,
+            download_dir=download_dir,
+        )
+
+        return cls(
+            vocab_file=vocab_file,
+            name_or_path=str(path_or_repo_id),
+            truncation_side=kwargs.get("truncation_side", "left"),
+            chat_template=config.get("chat_template"),
+            init_kwargs=config,
+        )
+
+    def __init__(
+        self,
+        *,
+        vocab_file: Path,
+        name_or_path: str,
+        truncation_side: str,
+        chat_template: str | None,
+        init_kwargs: dict[str, Any] | None = None,
+    ) -> None:
+        super().__init__()
+        self.name_or_path = name_or_path
+        self._truncation_side = truncation_side
+        self.init_kwargs = init_kwargs or {}
+        self._chat_template = chat_template or DEFAULT_CHAT_TEMPLATE
+
+        self._tokenizer, self._special_tokens = _load_tiktoken_encoding(vocab_file)
+
+        self._token_to_id: dict[str, int] = {}
+        self._id_to_token: dict[int, str] = {}
+        for token, token_id in self._tokenizer._mergeable_ranks.items():
+            token_str = token.decode("utf-8", errors="replace")
+            self._token_to_id[token_str] = token_id
+            self._id_to_token[token_id] = token_str
+
+        for token, token_id in self._special_tokens.items():
+            self._token_to_id[token] = token_id
+            self._id_to_token[token_id] = token
+
+        bos_token_id = self._special_tokens.get(SEP)
+        if bos_token_id is None:
+            bos_token_id = self._special_tokens.get(PAD)
+        if bos_token_id is None:
+            bos_token_id = self._special_tokens.get(EOS)
+        if bos_token_id is None:
+            bos_token_id = 0
+        self._bos_token_id = bos_token_id
+
+        self._eos_token_id = self._special_tokens.get(EOS, self._bos_token_id)
+        self._pad_token_id = self._special_tokens.get(PAD, self._eos_token_id)
+        self._unk_token_id = self._pad_token_id
+
+    def num_special_tokens_to_add(self) -> int:
+        return 0
+
+    @property
+    def all_special_tokens(self) -> list[str]:
+        return list(self._special_tokens.keys())
+
+    @property
+    def all_special_ids(self) -> list[int]:
+        return list(self._special_tokens.values())
+
+    @property
+    def bos_token_id(self) -> int:
+        return self._bos_token_id
+
+    @property
+    def eos_token_id(self) -> int:
+        return self._eos_token_id
+
+    @property
+    def pad_token_id(self) -> int:
+        return self._pad_token_id
+
+    @property
+    def is_fast(self) -> bool:
+        return False
+
+    @property
+    def vocab_size(self) -> int:
+        return self._tokenizer.n_vocab
+
+    @property
+    def max_token_id(self) -> int:
+        return self._tokenizer.n_vocab - 1
+
+    @property
+    def truncation_side(self) -> str:
+        return self._truncation_side
+
+    def get_vocab(self) -> dict[str, int]:
+        return dict(self._token_to_id)
+
+    def get_added_vocab(self) -> dict[str, int]:
+        return dict(self._special_tokens)
+
+    def _maybe_truncate(self, tokens: list[int], max_length: int | None) -> list[int]:
+        if max_length is None or len(tokens) <= max_length:
+            return tokens
+        if self.truncation_side == "left":
+            return tokens[-max_length:]
+        return tokens[:max_length]
+
+    def encode(
+        self,
+        text: str,
+        truncation: bool | None = None,
+        max_length: int | None = None,
+        add_special_tokens: bool = True,
+    ) -> list[int]:
+        del add_special_tokens
+        tokens = self._tokenizer.encode(text)
+        if truncation:
+            tokens = self._maybe_truncate(tokens, max_length)
+        return tokens
+
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
+        if isinstance(ids, int):
+            ids = [ids]
+        if skip_special_tokens:
+            ids = [
+                token_id
+                for token_id in ids
+                if token_id not in self._special_tokens.values()
+            ]
+        return self._tokenizer.decode(ids)
+
+    @overload
+    def convert_tokens_to_ids(self, tokens: str) -> int: ...
+
+    @overload
+    def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
+
+    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
+        if isinstance(tokens, str):
+            return self._token_to_id.get(tokens, self._unk_token_id)
+        return [self._token_to_id.get(token, self._unk_token_id) for token in tokens]
+
+    def convert_ids_to_tokens(
+        self, ids: list[int], skip_special_tokens: bool = False
+    ) -> list[str]:
+        tokens = []
+        for token_id in ids:
+            if skip_special_tokens and token_id in self._special_tokens.values():
+                continue
+            tokens.append(self._id_to_token.get(token_id, "<|unk|>"))
+        return tokens
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        token_ids = self.convert_tokens_to_ids(tokens)
+        return self.decode(token_ids, skip_special_tokens=False)
+
+    def __call__(
+        self,
+        text: str | list[str],
+        text_pair: str | None = None,
+        add_special_tokens: bool = True,
+        truncation: bool = False,
+        max_length: int | None = None,
+    ) -> BatchEncoding:
+        if text_pair is not None:
+            raise NotImplementedError("text_pair is not supported for Grok2Tokenizer.")
+
+        if isinstance(text, list):
+            input_ids_batch: list[list[int]] = [
+                self.encode(
+                    item,
+                    truncation=truncation,
+                    max_length=max_length,
+                    add_special_tokens=add_special_tokens,
+                )
+                for item in text
+            ]
+            attention_mask_batch = [[1] * len(ids) for ids in input_ids_batch]
+            return BatchEncoding(
+                {"input_ids": input_ids_batch, "attention_mask": attention_mask_batch}
+            )
+
+        input_ids = self.encode(
+            text,
+            truncation=truncation,
+            max_length=max_length,
+            add_special_tokens=add_special_tokens,
+        )
+        attention_mask = [1] * len(input_ids)
+        return BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask})
+
+    def get_chat_template(
+        self, chat_template: str | None, tools: list[dict[str, Any]] | None = None
+    ) -> str | None:
+        del tools
+        return chat_template or self._chat_template
+
+    def apply_chat_template(
+        self,
+        messages: list[ChatCompletionMessageParam],
+        tools: list[dict[str, Any]] | None = None,
+        chat_template: str | None = None,
+        tokenize: bool = False,
+        **kwargs,
+    ) -> str | list[int]:
+        template = self.get_chat_template(chat_template, tools=tools)
+        if template is None:
+            raise ValueError(
+                "No chat template available. Provide `chat_template` explicitly."
+            )
+        prompt = hf_chat_utils.apply_chat_template(
+            conversation=messages,
+            chat_template=template,
+            tools=tools,
+            **kwargs,
+        )
+        if tokenize:
+            return self.encode(prompt, add_special_tokens=False)
+        return prompt
diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
index 534b0da484a5d9ddc29fe55e6ab5862f9069cbaf..35a11e95b8bd04b273837461a6ed650c969342d5 100644
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -1,7 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, cast, overload
+
+from mistral_common.protocol.instruct.request import (
+    ChatCompletionRequest as MistralChatCompletionRequest,
+)
+from mistral_common.protocol.instruct.tool_calls import Function, Tool
+from mistral_common.protocol.instruct.validator import ValidationMode
+from mistral_common.tokens.tokenizers.base import (
+    SpecialTokenPolicy,
+    SpecialTokens,
+)
+from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13
+from mistral_common.tokens.tokenizers.sentencepiece import (
+    SentencePieceTokenizer,
+)
+from mistral_common.tokens.tokenizers.tekken import Tekkenizer
 
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
@@ -10,10 +25,6 @@ from vllm.logger import init_logger
 from .protocol import TokenizerLike
 
 if TYPE_CHECKING:
-    from mistral_common.protocol.instruct.request import (
-        ChatCompletionRequest as MistralChatCompletionRequest,
-    )
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
     from transformers import BatchEncoding
 
     try:
@@ -101,8 +112,6 @@ def _prepare_apply_chat_template_tools_and_messages(
     continue_final_message: bool = False,
     add_generation_prompt: bool = False,
 ) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]:
-    from mistral_common.protocol.instruct.tool_calls import Function, Tool
-
     if add_generation_prompt and continue_final_message:
         raise ValueError(
             "Cannot set both `add_generation_prompt` and "
@@ -181,8 +190,6 @@ def validate_request_params(request: "ChatCompletionRequest"):
 
 
 def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-
     assert isinstance(tokenizer, Tekkenizer), type(tokenizer)
 
     t_bytes = t.encode("utf-8") if not isinstance(t, bytes) else t
@@ -210,8 +217,6 @@ class MistralTokenizer(TokenizerLike):
         download_dir: str | None = None,
         **kwargs,
     ) -> "MistralTokenizer":
-        from mistral_common.protocol.instruct.validator import ValidationMode
-
         try:
             # Transformers v5
             from transformers.tokenization_mistral_common import MistralCommonBackend
@@ -235,12 +240,6 @@ class MistralTokenizer(TokenizerLike):
     def __init__(self, tokenizer: "MistralCommonBackend") -> None:
         super().__init__()
 
-        from mistral_common.protocol.instruct.validator import ValidationMode
-        from mistral_common.tokens.tokenizers.sentencepiece import (
-            SentencePieceTokenizer,
-        )
-        from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-
         self.transformers_tokenizer = tokenizer
         self.mistral = tokenizer.tokenizer
         self.instruct = self.mistral.instruct_tokenizer
@@ -270,37 +269,20 @@ class MistralTokenizer(TokenizerLike):
         # Sort the dict for convenience
         self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1]))
 
+        # Vocab sorted by token id.
+        self._vocab = self.tokenizer.vocab()
+        self._max_token_id = self.vocab_size - 1
+
         # Cache special tokens for faster access.
         self._special_token_ids = self._get_special_token_ids()
         self._special_token_ids_set = set(self._special_token_ids)
         self._special_tokens = self._get_special_tokens(self._special_token_ids)
         self._special_tokens_set = set(self._special_tokens)
 
-        # Vocab sorted by token id.
-        self._vocab = self.tokenizer._vocab
-        self._max_token_id = self.vocab_size - 1
-
     def _get_special_token_ids(self) -> list[int]:
-        from mistral_common.tokens.tokenizers.sentencepiece import (
-            SentencePieceTokenizer,
-        )
-        from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-
-        if self.is_tekken:
-            assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
-            special_ids = {t["rank"] for t in self.tokenizer._all_special_tokens}
-        elif self.is_spm:
-            assert isinstance(self.tokenizer, SentencePieceTokenizer), type(
-                self.tokenizer
-            )
-            special_ids = self.tokenizer._control_tokens
-        else:
-            raise ValueError(f"Unknown tokenizer type: {type(self.tokenizer)}")
-        return sorted(special_ids)
+        return [i for i in range(len(self._vocab)) if self.tokenizer.is_special(i)]
 
     def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]:
-        from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
-
         return [
             self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP)
             for i in all_special_ids
@@ -459,16 +441,16 @@ class MistralTokenizer(TokenizerLike):
             ids, skip_special_tokens=skip_special_tokens
         )
 
-    def convert_tokens_to_string(self, tokens: list[str]) -> str:
-        from mistral_common.tokens.tokenizers.base import (
-            SpecialTokenPolicy,
-            SpecialTokens,
-        )
-        from mistral_common.tokens.tokenizers.sentencepiece import (
-            SentencePieceTokenizer,
-        )
-        from mistral_common.tokens.tokenizers.tekken import Tekkenizer
+    @overload
+    def convert_tokens_to_ids(self, tokens: str) -> int: ...
+
+    @overload
+    def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
+
+    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
+        return self.transformers_tokenizer.convert_tokens_to_ids(tokens)
 
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
         to_decode_special_tokens = {SpecialTokens.tool_calls}
         if self.is_tekken:
             assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
@@ -523,12 +505,6 @@ class MistralTokenizer(TokenizerLike):
         ids: list[int],
         skip_special_tokens: bool = False,
     ) -> list[str]:
-        from mistral_common.tokens.tokenizers.base import (
-            SpecialTokenPolicy,
-            SpecialTokens,
-        )
-        from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13
-
         if not skip_special_tokens:
             return [self.tokenizer.id_to_piece(token_id) for token_id in ids]
 
diff --git a/vllm/tokenizers/protocol.py b/vllm/tokenizers/protocol.py
index 28754f9e10d00e226f4071eae61a08bf465eac4d..21e5b3a7bbdd8bc164f4742510866be1cefcb6dd 100644
--- a/vllm/tokenizers/protocol.py
+++ b/vllm/tokenizers/protocol.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Protocol
+from typing import TYPE_CHECKING, Any, Protocol, overload
 
 if TYPE_CHECKING:
     from transformers import BatchEncoding
@@ -100,6 +100,15 @@ class TokenizerLike(Protocol):
     ) -> str | list[int]:
         raise NotImplementedError
 
+    @overload
+    def convert_tokens_to_ids(self, tokens: str) -> int: ...
+
+    @overload
+    def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
+
+    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
+        raise NotImplementedError
+
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
         raise NotImplementedError
 
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index 72447ef04e87c474f036ab2538513e17db62c30f..b5088a116629b174b2f6c77cec57f54a1df388a1 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING
 
 import huggingface_hub
-from typing_extensions import TypeVar, assert_never, deprecated
+from typing_extensions import TypeVar, assert_never
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -31,6 +31,7 @@ logger = init_logger(__name__)
 
 _VLLM_TOKENIZERS = {
     "deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
+    "grok2": ("grok2", "Grok2Tokenizer"),
     "hf": ("hf", "CachedHfTokenizer"),
     "mistral": ("mistral", "MistralTokenizer"),
 }
@@ -151,6 +152,17 @@ def resolve_tokenizer_args(
         if len(files_list) > 0:
             tokenizer_mode = "mistral"
 
+    # Try to use Grok2 tiktoken tokenizer if possible
+    if tokenizer_mode == "auto":
+        allow_patterns = ["tokenizer.tok.json"]
+        files_list = list_filtered_repo_files(
+            model_name_or_path=str(tokenizer_name),
+            allow_patterns=allow_patterns,
+            revision=revision,
+        )
+        if len(files_list) > 0:
+            tokenizer_mode = "grok2"
+
     # Fallback to HF tokenizer
     if tokenizer_mode == "auto":
         tokenizer_mode = "hf"
@@ -224,10 +236,3 @@ def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
         trust_remote_code=model_config.trust_remote_code,
         **kwargs,
     )
-
-
-@deprecated(
-    "Renamed to `cached_tokenizer_from_config`. The old name will be removed in v0.14."
-)
-def init_tokenizer_from_config(model_config: "ModelConfig"):
-    return cached_tokenizer_from_config(model_config)
diff --git a/vllm/tool_parsers/__init__.py b/vllm/tool_parsers/__init__.py
index 181d8bcba95538042f231f65343416cd0d062bc5..b26638c0959bf27ef29d7132f0717ffaf69b2d1d 100644
--- a/vllm/tool_parsers/__init__.py
+++ b/vllm/tool_parsers/__init__.py
@@ -42,6 +42,10 @@ _TOOL_PARSERS_TO_REGISTER = {
         "glm4_moe_tool_parser",
         "Glm4MoeModelToolParser",
     ),
+    "glm47": (
+        "glm47_moe_tool_parser",
+        "Glm47MoeModelToolParser",
+    ),
     "granite-20b-fc": (
         "granite_20b_fc_tool_parser",
         "Granite20bFCToolParser",
@@ -138,6 +142,10 @@ _TOOL_PARSERS_TO_REGISTER = {
         "gigachat3_tool_parser",
         "GigaChat3ToolParser",
     ),
+    "functiongemma": (
+        "functiongemma_tool_parser",
+        "FunctionGemmaToolParser",
+    ),
 }
 
 
diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py
index e2ccb1dad9907a358e46f9e246a46dd4aaf68650..b7cac3454dab66c2d3633547289d59a90b35391e 100644
--- a/vllm/tool_parsers/abstract_tool_parser.py
+++ b/vllm/tool_parsers/abstract_tool_parser.py
@@ -67,6 +67,7 @@ class ToolParser:
                 # tool_choice: "Forced Function" or "required" will override
                 # structured output json settings to make tool calling work correctly
                 request.structured_outputs.json = json_schema_from_tool
+                request.response_format = None
             if isinstance(request, ResponsesRequest):
                 request.text = ResponseTextConfig()
                 request.text.format = ResponseFormatTextJSONSchemaConfig(
diff --git a/vllm/tool_parsers/functiongemma_tool_parser.py b/vllm/tool_parsers/functiongemma_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..9be78b0a0691c36e7f5b7e89b65c7d9dafe2ea40
--- /dev/null
+++ b/vllm/tool_parsers/functiongemma_tool_parser.py
@@ -0,0 +1,321 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+
+import regex as re
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.logger import init_logger
+from vllm.tokenizers import TokenizerLike
+from vllm.tool_parsers.abstract_tool_parser import ToolParser
+
+logger = init_logger(__name__)
+
+
+class FunctionGemmaToolParser(ToolParser):
+    """
+    Tool parser for Google's FunctionGemma model (google/functiongemma-270m-it).
+
+    Handles the FunctionGemma function call format:
+    <start_function_call>call:func_name{param:<escape>value<escape>}<end_function_call>
+    """
+
+    def __init__(self, tokenizer: TokenizerLike):
+        super().__init__(tokenizer)
+
+        # Streaming state
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = []
+
+        # FunctionGemma tokens
+        self.tool_call_start_token: str = "<start_function_call>"
+        self.tool_call_end_token: str = "<end_function_call>"
+
+        # Regex patterns
+        self.tool_call_regex = re.compile(
+            r"<start_function_call>call:(\w+)\{(.*?)\}<end_function_call>"
+            r"|<start_function_call>call:(\w+)\{(.*)",
+            re.DOTALL,
+        )
+        self.arg_regex = re.compile(
+            r"(\w+):<escape>(.*?)<escape>",
+            re.DOTALL,
+        )
+
+        if self.model_tokenizer:
+            self.tool_call_start_token_ids = self.model_tokenizer.encode(
+                self.tool_call_start_token, add_special_tokens=False
+            )
+            self.tool_call_end_token_ids = self.model_tokenizer.encode(
+                self.tool_call_end_token, add_special_tokens=False
+            )
+        else:
+            self.tool_call_start_token_ids = []
+            self.tool_call_end_token_ids = []
+
+        self.buffered_delta_text = ""
+
+    def _parse_arguments(self, args_str: str) -> dict:
+        """Parse FunctionGemma argument string into a dictionary."""
+        arguments = {}
+        if not args_str:
+            return arguments
+
+        matches = self.arg_regex.findall(args_str)
+        for key, value in matches:
+            try:
+                parsed_value = json.loads(value)
+                arguments[key] = parsed_value
+            except json.JSONDecodeError:
+                arguments[key] = value
+
+        return arguments
+
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
+            request.skip_special_tokens = False
+        return request
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        if self.tool_call_start_token not in model_output:
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+        try:
+            matches = self.tool_call_regex.findall(model_output)
+
+            if not matches:
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
+
+            tool_calls: list[ToolCall] = []
+
+            for match in matches:
+                func_name = match[0] if match[0] else match[2]
+                args_str = match[1] if match[1] else match[3]
+
+                if not func_name:
+                    continue
+
+                arguments = self._parse_arguments(args_str)
+
+                tool_calls.append(
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=func_name,
+                            arguments=json.dumps(arguments, ensure_ascii=False),
+                        ),
+                    )
+                )
+
+            if tool_calls:
+                content_end = model_output.find(self.tool_call_start_token)
+                content = (
+                    model_output[:content_end].strip() if content_end > 0 else None
+                )
+
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=tool_calls,
+                    content=content if content else None,
+                )
+
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+        except Exception:
+            logger.exception("Error extracting tool calls from FunctionGemma response")
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+    def _buffer_delta_text(self, delta_text: str) -> str:
+        """Buffer incoming delta text to handle multi-token special sequences."""
+        potential_start = "<start_function_call>"
+        potential_end = "<end_function_call>"
+
+        combined = self.buffered_delta_text + delta_text
+
+        if combined.endswith(potential_start) or combined.endswith(potential_end):
+            self.buffered_delta_text = ""
+            return combined
+
+        for tag in [potential_start, potential_end]:
+            for i in range(1, len(tag)):
+                if combined.endswith(tag[:i]):
+                    self.buffered_delta_text = combined[-(i):]
+                    return combined[:-i]
+
+        self.buffered_delta_text = ""
+        return combined
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        delta_text = self._buffer_delta_text(delta_text)
+        current_text = previous_text + delta_text
+
+        if self.tool_call_start_token not in current_text:
+            if delta_text:
+                return DeltaMessage(content=delta_text)
+            return None
+
+        try:
+            start_count = current_text.count(self.tool_call_start_token)
+            end_count = current_text.count(self.tool_call_end_token)
+            prev_start_count = previous_text.count(self.tool_call_start_token)
+            prev_end_count = previous_text.count(self.tool_call_end_token)
+
+            if self.tool_call_start_token not in current_text:
+                return DeltaMessage(content=delta_text)
+
+            # Starting a new function call
+            if start_count > prev_start_count and start_count > end_count:
+                self.current_tool_id += 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                self.prev_tool_call_arr.append({})
+                logger.debug("Starting new tool call %d", self.current_tool_id)
+                return None
+
+            # In the middle of a function call
+            if start_count > end_count:
+                last_start = current_text.rfind(self.tool_call_start_token)
+                partial_call = current_text[
+                    last_start + len(self.tool_call_start_token) :
+                ]
+
+                if partial_call.startswith("call:"):
+                    func_part = partial_call[5:]
+
+                    if "{" in func_part:
+                        func_name = func_part.split("{")[0]
+                        args_part = (
+                            func_part.split("{", 1)[1] if "{" in func_part else ""
+                        )
+
+                        if not self.current_tool_name_sent and func_name:
+                            self.current_tool_name_sent = True
+                            self.prev_tool_call_arr[self.current_tool_id] = {
+                                "name": func_name,
+                                "arguments": {},
+                            }
+                            return DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=self.current_tool_id,
+                                        type="function",
+                                        id=make_tool_call_id(),
+                                        function=DeltaFunctionCall(
+                                            name=func_name
+                                        ).model_dump(exclude_none=True),
+                                    )
+                                ]
+                            )
+
+                        if self.current_tool_name_sent and args_part:
+                            current_args = self._parse_arguments(args_part)
+                            if current_args:
+                                current_args_json = json.dumps(
+                                    current_args, ensure_ascii=False
+                                )
+                                prev_streamed = self.streamed_args_for_tool[
+                                    self.current_tool_id
+                                ]
+
+                                if len(current_args_json) > len(prev_streamed):
+                                    diff = current_args_json[len(prev_streamed) :]
+                                    self.streamed_args_for_tool[
+                                        self.current_tool_id
+                                    ] = current_args_json
+                                    self.prev_tool_call_arr[self.current_tool_id][
+                                        "arguments"
+                                    ] = current_args
+
+                                    return DeltaMessage(
+                                        tool_calls=[
+                                            DeltaToolCall(
+                                                index=self.current_tool_id,
+                                                function=DeltaFunctionCall(
+                                                    arguments=diff
+                                                ).model_dump(exclude_none=True),
+                                            )
+                                        ]
+                                    )
+
+                return None
+
+            # Function call just ended
+            if end_count > prev_end_count:
+                if self.current_tool_id >= 0 and self.current_tool_id < len(
+                    self.prev_tool_call_arr
+                ):
+                    all_calls = self.tool_call_regex.findall(current_text)
+                    args = {}
+                    if self.current_tool_id < len(all_calls):
+                        match = all_calls[self.current_tool_id]
+                        if match[0]:
+                            args_str = match[1]
+                            args = self._parse_arguments(args_str)
+                            self.prev_tool_call_arr[self.current_tool_id][
+                                "arguments"
+                            ] = args
+
+                    if args:
+                        args_json = json.dumps(args, ensure_ascii=False)
+                        prev_streamed = self.streamed_args_for_tool[
+                            self.current_tool_id
+                        ]
+                        if len(args_json) > len(prev_streamed):
+                            diff = args_json[len(prev_streamed) :]
+                            self.streamed_args_for_tool[self.current_tool_id] = (
+                                args_json
+                            )
+                            return DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=self.current_tool_id,
+                                        function=DeltaFunctionCall(
+                                            arguments=diff
+                                        ).model_dump(exclude_none=True),
+                                    )
+                                ]
+                            )
+                return None
+
+            if delta_text:
+                return DeltaMessage(content=delta_text)
+            return None
+
+        except Exception:
+            logger.exception("Error in streaming tool call extraction")
+            return None
diff --git a/vllm/tool_parsers/glm47_moe_tool_parser.py b/vllm/tool_parsers/glm47_moe_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae42a640d9413046bb2f0935846ed92d9b6311eb
--- /dev/null
+++ b/vllm/tool_parsers/glm47_moe_tool_parser.py
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import regex as re
+
+from vllm.logger import init_logger
+from vllm.tokenizers import TokenizerLike
+from vllm.tool_parsers.glm4_moe_tool_parser import Glm4MoeModelToolParser
+
+logger = init_logger(__name__)
+
+
+class Glm47MoeModelToolParser(Glm4MoeModelToolParser):
+    def __init__(self, tokenizer: TokenizerLike):
+        super().__init__(tokenizer)
+        self.func_detail_regex = re.compile(
+            r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>", re.DOTALL
+        )
+        self.func_arg_regex = re.compile(
+            r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
+            re.DOTALL,
+        )
diff --git a/vllm/tool_parsers/glm4_moe_tool_parser.py b/vllm/tool_parsers/glm4_moe_tool_parser.py
index d254fcb5240a5da1edaaba37c7109b23acdaa948..6ad7d7cb460cd30057f1fed8989e70ec4ecbbf05 100644
--- a/vllm/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/tool_parsers/glm4_moe_tool_parser.py
@@ -56,6 +56,20 @@ class Glm4MoeModelToolParser(ToolParser):
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
         self._buffer = ""
 
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        """
+        Adjust request parameters to ensure tool call tokens are not skipped
+        during tokenizer decoding.
+        """
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
+            # Ensure tool call tokens (<tool_call>, </tool_call>) are not skipped
+            # during decoding. Even though they are not marked as special tokens,
+            # setting skip_special_tokens=False ensures proper handling in
+            # transformers 5.x where decoding behavior may have changed.
+            request.skip_special_tokens = False
+        return request
+
     def extract_tool_calls(
         self,
         model_output: str,
@@ -114,7 +128,8 @@ class Glm4MoeModelToolParser(ToolParser):
                     ToolCall(
                         type="function",
                         function=FunctionCall(
-                            name=tc_name, arguments=json.dumps(arg_dct)
+                            name=tc_name,
+                            arguments=json.dumps(arg_dct, ensure_ascii=False),
                         ),
                     )
                 )
diff --git a/vllm/tool_parsers/kimi_k2_tool_parser.py b/vllm/tool_parsers/kimi_k2_tool_parser.py
index c215b7978854eed2f11f765b7e18147060a4649f..96630504f0681df8576bd399467cb383d5ee7cda 100644
--- a/vllm/tool_parsers/kimi_k2_tool_parser.py
+++ b/vllm/tool_parsers/kimi_k2_tool_parser.py
@@ -122,7 +122,6 @@ class KimiK2ToolParser(ToolParser):
             if variant in cleaned:
                 cleaned = cleaned.replace(variant, "")
                 found_end = True
-
         return cleaned, found_begin, found_end
 
     def _reset_section_state(self) -> None:
@@ -238,6 +237,7 @@ class KimiK2ToolParser(ToolParser):
             self.in_tool_section = True
             self.token_buffer = buffered_text  # Use cleaned buffer
             self.section_char_count = 0  # Reset counter for new section
+
         if found_section_end and self.in_tool_section:
             logger.debug("Detected section end marker")
             # CRITICAL: Don't exit early if tool_call_end is in this chunk.
@@ -252,13 +252,18 @@ class KimiK2ToolParser(ToolParser):
             else:
                 # No tool call ending, safe to exit immediately
                 logger.debug("Exiting tool section")
-                remaining = buffered_text
                 self._reset_section_state()
-                # Return remaining text as reasoning content if non-empty
-                if remaining.strip():
-                    return DeltaMessage(content=remaining)
-                # Return empty delta to maintain function contract
-                # (always returns DeltaMessage)
+                # Extract any content AFTER the section end marker in delta_text
+                # (don't use buffered_text as it contains tool call data)
+                post_section_content = ""
+                for variant in self.tool_calls_end_token_variants:
+                    if variant in delta_text:
+                        parts = delta_text.split(variant, 1)
+                        if len(parts) > 1:
+                            post_section_content = parts[1]
+                        break
+                if post_section_content.strip():
+                    return DeltaMessage(content=post_section_content)
                 return DeltaMessage(content="")
         else:
             self.token_buffer = buffered_text
@@ -316,12 +321,12 @@ class KimiK2ToolParser(ToolParser):
                 and prev_tool_end_count == cur_tool_end_count
                 and self.tool_call_end_token not in delta_text
             ):
-                # CRITICAL FIX: Suppress content if in tool section but
-                # no tool calls started
+                # Suppress content between section begin and first tool begin
+                # (header noise). Don't suppress content between tools to avoid
+                # breaking potential delimiter characters.
                 if self.in_tool_section and cur_tool_start_count == 0:
                     logger.debug(
-                        "In tool section but no tool calls started yet. "
-                        "Suppressing: %s",
+                        "In tool section before first tool, suppressing: %s",
                         delta_text,
                     )
                     # Return empty delta to maintain iterator contract
@@ -488,6 +493,9 @@ class KimiK2ToolParser(ToolParser):
             if tool_call_portion is None:
                 # if there's text but not tool calls, send that -
                 # otherwise None to skip chunk
+                # CRITICAL: Never return content if we're in a tool section
+                if self.in_tool_section:
+                    return None
                 delta = (
                     DeltaMessage(content=delta_text)
                     if text_portion is not None
diff --git a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py
index dcb2b64f6e73c4371e2e7880e37a8a5c7bad6fc5..67bd0e61620da8b60f874ee2e0c2a5d7b5563c62 100644
--- a/vllm/tool_parsers/minimax_m2_tool_parser.py
+++ b/vllm/tool_parsers/minimax_m2_tool_parser.py
@@ -122,6 +122,8 @@ class MinimaxM2ToolParser(ToolParser):
         self.streaming_request = None
         # Clear previous tool call history to avoid state pollution
         self.prev_tool_call_arr.clear()
+        # Reset streamed args tracking
+        self.streamed_args_for_tool.clear()
 
     def _extract_name(self, name_str: str) -> str:
         """Extract name from quoted string."""
@@ -136,37 +138,167 @@ class MinimaxM2ToolParser(ToolParser):
         return name_str
 
     def _convert_param_value(self, value: str, param_type: str) -> Any:
-        """Convert parameter value to the correct type."""
+        """Convert parameter value to the correct type (legacy single-type version)."""
+        return self._convert_param_value_with_types(value, [param_type])
+
+    def _extract_types_from_schema(self, schema: Any) -> list[str]:
+        """
+        Extract all possible types from a JSON schema definition.
+        Handles anyOf, oneOf, allOf, type arrays, and enum fields.
+
+        Args:
+            schema: The JSON schema definition for a parameter
+
+        Returns:
+            List of type strings (e.g., ["string", "integer", "null"])
+        """
+        if schema is None:
+            return ["string"]
+
+        if not isinstance(schema, dict):
+            return ["string"]
+
+        types: set[str] = set()
+
+        # Handle direct "type" field
+        if "type" in schema:
+            type_value = schema["type"]
+            if isinstance(type_value, str):
+                types.add(type_value)
+            elif isinstance(type_value, list):
+                for t in type_value:
+                    if isinstance(t, str):
+                        types.add(t)
+
+        # Handle enum - infer types from enum values
+        if "enum" in schema and isinstance(schema["enum"], list) and schema["enum"]:
+            for value in schema["enum"]:
+                if value is None:
+                    types.add("null")
+                elif isinstance(value, bool):
+                    types.add("boolean")
+                elif isinstance(value, int):
+                    types.add("integer")
+                elif isinstance(value, float):
+                    types.add("number")
+                elif isinstance(value, str):
+                    types.add("string")
+                elif isinstance(value, list):
+                    types.add("array")
+                elif isinstance(value, dict):
+                    types.add("object")
+
+        # Handle anyOf, oneOf, allOf - recursively extract types
+        for choice_field in ("anyOf", "oneOf", "allOf"):
+            if choice_field in schema and isinstance(schema[choice_field], list):
+                for choice in schema[choice_field]:
+                    extracted = self._extract_types_from_schema(choice)
+                    types.update(extracted)
+
+        # If no types found, default to string
+        if not types:
+            return ["string"]
+
+        return list(types)
+
+    def _convert_param_value_with_types(
+        self, value: str, param_types: list[str]
+    ) -> Any:
+        """
+        Convert parameter value to the correct type based on a list of possible types.
+        Tries each type in order until one succeeds.
+
+        Args:
+            value: The string value to convert
+            param_types: List of possible type strings
+
+        Returns:
+            The converted value
+        """
         if value.lower() == "null":
             return None
 
-        param_type = param_type.lower()
-        if param_type in ["string", "str", "text"]:
-            return value
-        elif param_type in ["integer", "int"]:
-            try:
-                return int(value)
-            except (ValueError, TypeError):
-                return value
-        elif param_type in ["number", "float"]:
-            try:
-                val = float(value)
-                return val if val != int(val) else int(val)
-            except (ValueError, TypeError):
-                return value
-        elif param_type in ["boolean", "bool"]:
-            return value.lower() in ["true", "1"]
-        elif param_type in ["object", "array"]:
-            try:
-                return json.loads(value)
-            except json.JSONDecodeError:
-                return value
-        else:
-            # Try JSON parse first, fallback to string
-            try:
-                return json.loads(value)
-            except json.JSONDecodeError:
+        # Normalize types
+        normalized_types = [t.lower() for t in param_types]
+
+        # Try null first if it's in the list
+        if "null" in normalized_types or value.lower() in ("null", "none", "nil"):
+            return None
+
+        # Try each type in order of preference (most specific first, string as fallback)
+        # Priority: integer > number > boolean > object > array > string
+        type_priority = [
+            "integer",
+            "int",
+            "number",
+            "float",
+            "boolean",
+            "bool",
+            "object",
+            "array",
+            "string",
+            "str",
+            "text",
+        ]
+
+        for param_type in type_priority:
+            if param_type not in normalized_types:
+                continue
+
+            if param_type in ["string", "str", "text"]:
                 return value
+            elif param_type in ["integer", "int"]:
+                try:
+                    return int(value)
+                except (ValueError, TypeError):
+                    continue
+            elif param_type in ["number", "float"]:
+                try:
+                    val = float(value)
+                    return val if val != int(val) else int(val)
+                except (ValueError, TypeError):
+                    continue
+            elif param_type in ["boolean", "bool"]:
+                lower_val = value.lower().strip()
+                if lower_val in ["true", "1", "yes", "on"]:
+                    return True
+                elif lower_val in ["false", "0", "no", "off"]:
+                    return False
+                continue
+            elif param_type in ["object", "array"]:
+                try:
+                    return json.loads(value)
+                except json.JSONDecodeError:
+                    continue
+
+        # Fallback: try JSON parse, then return as string
+        try:
+            return json.loads(value)
+        except json.JSONDecodeError:
+            return value
+
+    def _get_param_types_from_config(
+        self, param_name: str, param_config: dict
+    ) -> list[str]:
+        """
+        Get parameter types from parameter configuration.
+        Handles anyOf, oneOf, allOf, and direct type definitions.
+
+        Args:
+            param_name: The name of the parameter
+            param_config: The properties dict from the tool schema
+
+        Returns:
+            List of type strings
+        """
+        if param_name not in param_config:
+            return ["string"]
+
+        param_schema = param_config[param_name]
+        if not isinstance(param_schema, dict):
+            return ["string"]
+
+        return self._extract_types_from_schema(param_schema)
 
     def _parse_single_invoke(
         self, invoke_str: str, tools: list | None
@@ -205,17 +337,11 @@ class MinimaxM2ToolParser(ToolParser):
                 if param_value.endswith("\n"):
                     param_value = param_value[:-1]
 
-                # Get parameter type
-                param_type = "string"
-                if (
-                    param_name in param_config
-                    and isinstance(param_config[param_name], dict)
-                    and "type" in param_config[param_name]
-                ):
-                    param_type = param_config[param_name]["type"]
+                # Get parameter types (supports anyOf/oneOf/allOf)
+                param_type = self._get_param_types_from_config(param_name, param_config)
 
                 # Convert value
-                param_dict[param_name] = self._convert_param_value(
+                param_dict[param_name] = self._convert_param_value_with_types(
                     param_value, param_type
                 )
 
@@ -421,9 +547,12 @@ class MinimaxM2ToolParser(ToolParser):
                         self.prev_tool_call_arr.append(
                             {
                                 "name": self.current_function_name,
-                                "arguments": "{}",  # Placeholder, will be updated later
+                                "arguments": {},  # Placeholder, will be updated later
                             }
                         )
+                        # Initialize streamed_args_for_tool for this tool call
+                        if len(self.streamed_args_for_tool) <= self.current_tool_index:
+                            self.streamed_args_for_tool.append("")
 
                     # Send header with function info
                     return DeltaMessage(
@@ -445,6 +574,9 @@ class MinimaxM2ToolParser(ToolParser):
             # Send opening brace if not sent yet
             if self.in_function and not self.json_started:
                 self.json_started = True
+                # Update streamed_args_for_tool for opening brace
+                if self.current_tool_index < len(self.streamed_args_for_tool):
+                    self.streamed_args_for_tool[self.current_tool_index] += "{"
                 return DeltaMessage(
                     tool_calls=[
                         DeltaToolCall(
@@ -493,7 +625,7 @@ class MinimaxM2ToolParser(ToolParser):
                                 args = parsed_tool.function.arguments
                                 self.prev_tool_call_arr[self.current_tool_index][
                                     "arguments"
-                                ] = args
+                                ] = json.loads(args)
                         except Exception:
                             pass  # Ignore parsing errors during streaming
 
@@ -505,7 +637,9 @@ class MinimaxM2ToolParser(ToolParser):
                             )
                         ]
                     )
-
+                    # Update streamed_args_for_tool for closing brace
+                    if self.current_tool_index < len(self.streamed_args_for_tool):
+                        self.streamed_args_for_tool[self.current_tool_index] += "}"
                     # Reset state for next tool
                     self.json_closed = True
                     self.in_function = False
@@ -583,7 +717,7 @@ class MinimaxM2ToolParser(ToolParser):
                         # Store raw value for later processing
                         self.accumulated_params[self.current_param_name] = param_value
 
-                        # Get parameter configuration for type conversion
+                        # Get parameter configuration with anyOf support
                         param_config = {}
                         if self.streaming_request and self.streaming_request.tools:
                             for tool in self.streaming_request.tools:
@@ -600,17 +734,12 @@ class MinimaxM2ToolParser(ToolParser):
                                         param_config = params["properties"]
                                     break
 
-                        # Get parameter type
-                        param_type = "string"
-                        if (
-                            self.current_param_name in param_config
-                            and isinstance(param_config[self.current_param_name], dict)
-                            and "type" in param_config[self.current_param_name]
-                        ):
-                            param_type = param_config[self.current_param_name]["type"]
+                        # Get parameter types (supports anyOf/oneOf/allOf)
+                        param_type = self._get_param_types_from_config(
+                            self.current_param_name, param_config
+                        )
 
-                        # Convert param value to appropriate type
-                        converted_value = self._convert_param_value(
+                        converted_value = self._convert_param_value_with_types(
                             param_value, param_type
                         )
 
@@ -630,7 +759,11 @@ class MinimaxM2ToolParser(ToolParser):
                             )
 
                         self.param_count += 1
-
+                        # Update streamed_args_for_tool for this tool call
+                        if self.current_tool_index < len(self.streamed_args_for_tool):
+                            self.streamed_args_for_tool[self.current_tool_index] += (
+                                json_fragment
+                            )
                         return DeltaMessage(
                             tool_calls=[
                                 DeltaToolCall(
diff --git a/vllm/tool_parsers/mistral_tool_parser.py b/vllm/tool_parsers/mistral_tool_parser.py
index 49a175f69f434ac33d50a224602dc90ecf539105..35b853b0ad7e1b44b0a51b2f8a4287c7022559bb 100644
--- a/vllm/tool_parsers/mistral_tool_parser.py
+++ b/vllm/tool_parsers/mistral_tool_parser.py
@@ -131,78 +131,105 @@ class MistralToolParser(ToolParser):
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
         """
-        Extract the tool calls from a complete model response. Requires
-        find-and-replacing single quotes with double quotes for JSON parsing,
-        make sure your tool call arguments don't ever include quotes!
+        Extract the tool calls from a complete model response.
+
+        Content and tool calls formatting depends on the Mistral's tokenizer version
+        used to train the model:
+
+        - < v11: `content[BOT] [{tool_call1},{tool_call2}]`
+        - >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}`
+
+        with [BOT] the tool call token.
+
+        Note:
+            For tokenizer versions >= v11, tool calls with arguments wrongly formatted
+            are still returned as tool calls. This is to allow the model to know it
+            tried to make a tool call. It reduces chance of another failure and
+            prevents that the context is filled with tool calls wrongly placed in
+            assistant message contents.
         """
 
-        # case -- if a tool call token is not present, return a text response
+        # If the tool call token is not present, return a text response
         if self.bot_token not in model_output:
             return ExtractedToolCallInformation(
                 tools_called=False, tool_calls=[], content=model_output
             )
 
-        # first remove the BOT token
-        tool_content = model_output.replace(self.bot_token, "").strip()
+        content_and_raw_tool_calls = model_output.split(self.bot_token)
+        content = content_and_raw_tool_calls[0]
+        raw_tool_calls = content_and_raw_tool_calls[1:]
+
+        # >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}
+        if not self._is_pre_v11:
+            tool_calls = []
+            for raw_tool_call in raw_tool_calls:
+                if "{" not in raw_tool_call:
+                    continue
+
+                end_name = raw_tool_call.find("{")
+                tool_name, args = (
+                    raw_tool_call[:end_name],
+                    raw_tool_call[end_name:],
+                )
 
-        try:
+                tool_calls.append({"name": tool_name, "arguments": args})
+
+        # < v11: content[BOT] [{tool_call1},{tool_call2}]
+        else:
+            if len(raw_tool_calls) != 1:
+                raise ValueError(
+                    "Only one BOT token should have been outputted, "
+                    f"but got {model_output}."
+                )
+            stringified_tool_calls = raw_tool_calls[0].strip()
             try:
-                if not self._is_pre_v11:
-                    function_call_arr = []
-                    for single_tool_content in model_output.split(self.bot_token):
-                        if "{" not in single_tool_content:
-                            continue
-
-                        end_name = single_tool_content.find("{")
-                        fn_name, args = (
-                            single_tool_content[:end_name],
-                            single_tool_content[end_name:],
-                        )
-
-                        # fn_name is encoded outside serialized json dump
-                        # only arguments are serialized
-                        function_call_arr.append(
-                            {"name": fn_name, "arguments": json.loads(args)}
-                        )
-                else:
-                    function_call_arr = json.loads(tool_content)
+                tool_calls = json.loads(stringified_tool_calls)
             except json.JSONDecodeError:
                 # use a regex to find the part corresponding to the tool call.
                 # NOTE: This use case should not happen if the model is trained
                 # correctly. It's an easy possible fix so it's included, but
                 # can be brittle for very complex / highly nested tool calls
-                raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
-                function_call_arr = json.loads(raw_tool_call)
-
-            # Tool Call
-            tool_calls: list[MistralToolCall] = [
-                MistralToolCall(
-                    type="function",
-                    function=FunctionCall(
-                        name=raw_function_call["name"],
-                        # function call args are JSON but as a string
-                        arguments=json.dumps(
-                            raw_function_call["arguments"], ensure_ascii=False
+                try:
+                    raw_tool_call = self.tool_call_regex.findall(
+                        stringified_tool_calls
+                    )[0]
+                    tool_calls = json.loads(raw_tool_call)
+                except (IndexError, json.JSONDecodeError):
+                    logger.exception("Error in extracting tool call from response: {e}")
+                    # If raw decoding and decoding post regex rule fails, then just
+                    # return content.
+                    return ExtractedToolCallInformation(
+                        tools_called=False,
+                        tool_calls=[],
+                        content=stringified_tool_calls,
+                    )
+            else:
+                tool_calls = [
+                    {
+                        "name": tool_call["name"],
+                        "arguments": json.dumps(
+                            tool_call["arguments"], ensure_ascii=False
                         ),
-                    ),
-                )
-                for raw_function_call in function_call_arr
-            ]
+                    }
+                    for tool_call in tool_calls
+                ]
 
-            # get any content before  the tool call
-            content = model_output.split(self.bot_token)[0]
-            return ExtractedToolCallInformation(
-                tools_called=True,
-                tool_calls=tool_calls,
-                content=content if len(content) > 0 else None,
+        mistral_tool_calls: list[MistralToolCall] = [
+            MistralToolCall(
+                type="function",
+                function=FunctionCall(
+                    name=tool_call["name"],
+                    arguments=tool_call["arguments"],
+                ),
             )
+            for tool_call in tool_calls
+        ]
 
-        except Exception:
-            logger.exception("Error in extracting tool call from response.")
-            # return information to just treat the tool call as regular JSON
-            return ExtractedToolCallInformation(
-                tools_called=False, tool_calls=[], content=tool_content
-            )
+        return ExtractedToolCallInformation(
+            tools_called=True,
+            tool_calls=mistral_tool_calls,
+            content=content if len(content) > 0 else None,
+        )
 
     def extract_tool_calls_streaming(
         self,
diff --git a/vllm/tool_parsers/openai_tool_parser.py b/vllm/tool_parsers/openai_tool_parser.py
index db92ea8982d70cfc629708dd856b444a80b27b50..da1a9c773f78f94b039a816370556def8e4cd756 100644
--- a/vllm/tool_parsers/openai_tool_parser.py
+++ b/vllm/tool_parsers/openai_tool_parser.py
@@ -79,6 +79,15 @@ class OpenAIToolParser(ToolParser):
                 elif msg.channel == "commentary" and not msg.recipient:
                     commentary_content = msg_text
 
+        # Extract partial content from the parser state if the generation was truncated
+        if parser.current_content:
+            if parser.current_channel == "final":
+                final_content = parser.current_content
+            elif (
+                parser.current_channel == "commentary" and not parser.current_recipient
+            ):
+                commentary_content = parser.current_content
+
         return ExtractedToolCallInformation(
             tools_called=len(tool_calls) > 0,
             tool_calls=tool_calls,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 887f936a2d8ae5b81331dee4a790d29332691b01..0a7cfffd4778ecaa3f26cd201f06c902f1ccffc6 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -10,12 +10,9 @@ from pathlib import Path
 from typing import Any, Literal, TypeAlias
 
 import huggingface_hub
-from huggingface_hub import (
-    get_safetensors_metadata,
-)
+from huggingface_hub import get_safetensors_metadata
 from packaging.version import Version
 from transformers import GenerationConfig, PretrainedConfig
-from transformers.configuration_utils import ALLOWED_LAYER_TYPES
 from transformers.models.auto.image_processing_auto import get_image_processor_config
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -44,6 +41,16 @@ from .repo_utils import (
     with_retry,
 )
 
+try:
+    # Transformers v5
+    from transformers.configuration_utils import ALLOWED_ATTENTION_LAYER_TYPES
+except ImportError:
+    # Transformers v4
+    from transformers.configuration_utils import (
+        ALLOWED_LAYER_TYPES as ALLOWED_ATTENTION_LAYER_TYPES,
+    )
+
+
 if envs.VLLM_USE_MODELSCOPE:
     from modelscope import AutoConfig
 else:
@@ -72,6 +79,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     deepseek_v32="DeepseekV3Config",
     flex_olmo="FlexOlmoConfig",
     hunyuan_vl="HunYuanVLConfig",
+    isaac="IsaacConfig",
     kimi_linear="KimiLinearConfig",
     kimi_vl="KimiVLConfig",
     RefinedWeb="RWConfig",  # For tiiuae/falcon-40b(-instruct)
@@ -104,6 +112,14 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
 }
 
 
+def is_rope_parameters_nested(rope_parameters: dict[str, Any]) -> bool:
+    """Check if rope_parameters is nested by layer types."""
+    # Cannot be nested if rope_parameters is empty
+    if not rope_parameters:
+        return False
+    return set(rope_parameters.keys()).issubset(ALLOWED_ATTENTION_LAYER_TYPES)
+
+
 class HFConfigParser(ConfigParserBase):
     def parse(
         self,
@@ -118,6 +134,7 @@ class HFConfigParser(ConfigParserBase):
             model,
             revision=revision,
             code_revision=code_revision,
+            trust_remote_code=trust_remote_code,
             token=_get_hf_token(),
             **kwargs,
         )
@@ -139,6 +156,7 @@ class HFConfigParser(ConfigParserBase):
                 model,
                 revision=revision,
                 code_revision=code_revision,
+                trust_remote_code=trust_remote_code,
                 token=_get_hf_token(),
                 **kwargs,
             )
@@ -313,19 +331,25 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
     rope_theta = getattr_iter(config, names, None, warn=True)
     names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"]
     partial_rotary_factor = getattr_iter(config, names, None, warn=True)
+    ompe = getattr(config, "original_max_position_embeddings", None)
 
     if Version(version("transformers")) < Version("5.0.0.dev0"):
         # Transformers v4 installed, legacy config fields may be present
         if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
             config.rope_parameters = rope_scaling
         if (
-            rope_theta is not None or partial_rotary_factor is not None
+            rope_theta is not None
+            or partial_rotary_factor is not None
+            or ompe is not None
         ) and not getattr(config, "rope_parameters", None):
             config.rope_parameters = {"rope_type": "default"}
+        # Patch legacy fields into rope_parameters
         if rope_theta is not None:
             config.rope_parameters["rope_theta"] = rope_theta
         if partial_rotary_factor is not None:
             config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+        if ompe is not None:
+            config.rope_parameters["original_max_position_embeddings"] = ompe
     elif rope_theta is not None or getattr(config, "rope_parameters", None):
         # Transformers v5 installed
         # Patch these fields in case they used non-standard names
@@ -341,12 +365,8 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
     if getattr(config, "rope_parameters", None) is None:
         return
 
-    # Add original_max_position_embeddings if present
-    if ompe := getattr(config, "original_max_position_embeddings", None):
-        config.rope_parameters["original_max_position_embeddings"] = ompe
-
     # Handle nested rope_parameters in interleaved sliding attention models
-    if set(config.rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
+    if is_rope_parameters_nested(config.rope_parameters):
         for rope_parameters_layer_type in config.rope_parameters.values():
             patch_rope_parameters_dict(rope_parameters_layer_type)
     else:
@@ -720,7 +740,10 @@ def get_config(
 
 
 @cache
-def get_pooling_config(model: str, revision: str | None = "main") -> dict | None:
+def get_pooling_config(
+    model: str,
+    revision: str | None = "main",
+) -> dict[str, Any] | None:
     """
     This function gets the pooling and normalize
     config from the model - only applies to
@@ -771,38 +794,40 @@ def get_pooling_config(model: str, revision: str | None = "main") -> dict | None
     )
 
     if pooling:
-        pooling_file_name = "{}/config.json".format(pooling["path"])
-        pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
-        pooling_type_name = next(
-            (item for item, val in pooling_dict.items() if val is True), None
-        )
+        from vllm.config.pooler import SEQ_POOLING_TYPES, TOK_POOLING_TYPES
 
-        if pooling_type_name is not None:
-            pooling_type_name = get_pooling_config_name(pooling_type_name)
+        pooling_file_name = "{}/config.json".format(pooling["path"])
+        pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision) or {}
 
         logger.info("Found pooling configuration.")
-        return {"pooling_type": pooling_type_name, "normalize": normalize}
+
+        config: dict[str, Any] = {"use_activation": normalize}
+        for key, val in pooling_dict.items():
+            if val is True:
+                pooling_type = parse_pooling_type(key)
+                if pooling_type in SEQ_POOLING_TYPES:
+                    config["seq_pooling_type"] = pooling_type
+                elif pooling_type in TOK_POOLING_TYPES:
+                    config["tok_pooling_type"] = pooling_type
+                else:
+                    logger.debug("Skipping unrelated field: %r=%r", key, val)
+
+        return config
 
     return None
 
 
-def get_pooling_config_name(pooling_name: str) -> str | None:
+def parse_pooling_type(pooling_name: str):
     if "pooling_mode_" in pooling_name:
         pooling_name = pooling_name.replace("pooling_mode_", "")
 
     if "_" in pooling_name:
-        pooling_name = pooling_name.split("_")[0]
+        pooling_name = pooling_name.split("_", 1)[0]
 
     if "lasttoken" in pooling_name:
         pooling_name = "last"
 
-    supported_pooling_types = ["LAST", "ALL", "CLS", "STEP", "MEAN"]
-    pooling_type_name = pooling_name.upper()
-
-    if pooling_type_name in supported_pooling_types:
-        return pooling_type_name
-
-    raise NotImplementedError(f"Pooling type {pooling_type_name} not supported")
+    return pooling_name.upper()
 
 
 @cache
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index ec3b86c3278be09417c6a9a25473c55299364294..03eac7a779905ef4eda7db2b0784747afed4e807 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -26,6 +26,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
     "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "IsaacConfig": "vllm.transformers_utils.configs.isaac",
     # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
     # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
     # `FalconConfig` class from the official HuggingFace transformers library.
@@ -42,6 +43,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
     "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
     "Olmo3Config": "vllm.transformers_utils.configs.olmo3",
     "OvisConfig": "vllm.transformers_utils.configs.ovis",
+    "PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac",
     "RadioConfig": "vllm.transformers_utils.configs.radio",
     "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
     "UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
@@ -66,6 +68,7 @@ __all__ = [
     "HunYuanVLConfig",
     "HunYuanVLTextConfig",
     "HunYuanVLVisionConfig",
+    "IsaacConfig",
     "RWConfig",
     "JAISConfig",
     "Lfm2MoeConfig",
@@ -79,6 +82,7 @@ __all__ = [
     "NemotronHConfig",
     "Olmo3Config",
     "OvisConfig",
+    "PixelShuffleSiglip2VisionConfig",
     "RadioConfig",
     "SpeculatorsConfig",
     "UltravoxConfig",
diff --git a/vllm/transformers_utils/configs/isaac.py b/vllm/transformers_utils/configs/isaac.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed36d19ebf667e3c9e6a58e5b92c023b15f3fd65
--- /dev/null
+++ b/vllm/transformers_utils/configs/isaac.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from transformers import Qwen3Config
+from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig
+
+
+class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig):
+    """Vision configuration for Isaac with Pixel Shuffle support.
+
+    Extends Siglip2VisionConfig with additional fields for pixel shuffle.
+    """
+
+    model_type = "pixel_shuffle_siglip2"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        pixel_shuffle_scale_factor: int = 1,
+        num_patches: int = 256,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Add our custom fields
+        self.pixel_shuffle_scale_factor = pixel_shuffle_scale_factor
+        self.num_patches = num_patches
+
+
+class IsaacConfig(Qwen3Config):
+    """Configuration class for Isaac multimodal model."""
+
+    model_type = "isaac"
+    sub_configs = {
+        "vision_config": PixelShuffleSiglip2VisionConfig,
+        "text_config": Qwen3Config,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        vision_patch_size: int = 16,
+        vision_max_num_patches: int = 256,
+        vision_min_num_patches: int | None = None,
+        pixel_shuffle_scale: int = 1,
+        max_sequence_length: int = 16384,
+        vision_token: str = "<image>",
+        vision_attn_implementation: str | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if isinstance(text_config, dict):
+            # from HF config
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init text config.
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+        else:
+            # from Qwen3Config
+            self.text_config = text_config
+
+        # EventStreamProcessor parameters (for backward compatibility)
+        self.video_patch_size = vision_patch_size
+        self.vision_max_num_patches = vision_max_num_patches
+        self.vision_min_num_patches = vision_min_num_patches
+        self.pixel_shuffle_scale = pixel_shuffle_scale
+
+        # Processing parameters
+        self.max_sequence_length = max_sequence_length
+        self.vision_token = vision_token
+
+        # Handle vision config - PixelShuffleSiglip2VisionConfig instance
+        if isinstance(vision_config, dict):
+            self.vision_config = PixelShuffleSiglip2VisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = PixelShuffleSiglip2VisionConfig()
+        else:
+            self.vision_config = vision_config
+
+        # Ensure compatibility with pretrained checkpoints
+        self.vision_config.pixel_shuffle_scale_factor = getattr(
+            self.vision_config,
+            "pixel_shuffle_scale_factor",
+            pixel_shuffle_scale,
+        )
+        self.vision_config.num_patches = getattr(
+            self.vision_config,
+            "num_patches",
+            vision_max_num_patches,
+        )
+        self.vision_attn_implementation = vision_attn_implementation
+
+
+__all__ = [
+    "IsaacConfig",
+    "PixelShuffleSiglip2VisionConfig",
+]
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index d59169d95f0c9a7cab4c56f4078ad168d6aa7f25..4776c892eb7224f076fc121a9127c0547e15721c 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -184,18 +184,42 @@ def _remap_mistral_audio_args(config: dict) -> dict:
     whisper_args = config["multimodal"].pop("whisper_model_args")
     encoder_args = whisper_args["encoder_args"]
     downsample_args = whisper_args["downsample_args"]
+    downsample_factor = downsample_args["downsample_factor"]
+
+    # make sure that k/v blocks can be allocated with
+    # unified k/v cache class and pool whisper k/v cache blocks
+    # with downsample_factor:1 ratio
+    if encoder_args.get("causal"):
+        block_pool_size = downsample_factor
+        config["projection_size"] = downsample_factor * encoder_args["dim"]
+    else:
+        block_pool_size = 1
+
+    _maybe_sliding_window = encoder_args.get("ragged_attention", None)
+    if _maybe_sliding_window is None:
+        sliding_window = None
+    elif _maybe_sliding_window.isdigit():
+        sliding_window = int(_maybe_sliding_window)
+    else:
+        raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}")
+
+    architecture = (
+        "VoxtralStreamingGeneration"
+        if encoder_args.get("causal")
+        else "VoxtralForConditionalGeneration"
+    )
 
     quant_config = config.get("quantization_config")
     config = {
-        "model_type": "whixtral",
-        "architectures": ["VoxtralForConditionalGeneration"],
+        "model_type": "voxtral",
+        "architectures": [architecture],
         "text_config": PretrainedConfig.from_dict(config),
         "audio_config": WhisperConfig(
             num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
             window_size=encoder_args["audio_encoding_args"]["window_size"],
             sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
             hop_length=encoder_args["audio_encoding_args"]["hop_length"],
-            downsample_factor=downsample_args["downsample_factor"],
+            downsample_factor=downsample_factor,
             d_model=encoder_args["dim"],
             encoder_layers=encoder_args["n_layers"],
             encoder_ffn_dim=encoder_args["hidden_dim"],
@@ -203,6 +227,10 @@ def _remap_mistral_audio_args(config: dict) -> dict:
             vocab_size=encoder_args["vocab_size"],
             max_source_positions=encoder_args["max_source_positions"],
             is_encoder_decoder=False,  # Override WhisperConfig default
+            is_causal=encoder_args.get("causal", False),
+            sliding_window=sliding_window,
+            block_pool_size=block_pool_size,
+            pos_embed=encoder_args.get("pos_embed", "sinusoidal"),
         ),
     }
     if quant_config:
diff --git a/vllm/transformers_utils/configs/radio.py b/vllm/transformers_utils/configs/radio.py
index 2b6544fb273c2b62ef9488b0adf28268fd73fb54..ddd72db1aedd060af6c4796ed04cc95e3dd31066 100644
--- a/vllm/transformers_utils/configs/radio.py
+++ b/vllm/transformers_utils/configs/radio.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Radio vision model configuration"""
 
+from typing import Any
+
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
@@ -36,12 +38,15 @@ class RadioConfig(PretrainedConfig):
         layer_norm_eps: The epsilon used by the layer normalization layers.
         initializer_factor: A factor for initializing all weight matrices.
         hidden_act: The non-linear activation function in the encoder.
-        max_img_size: Maximum image size for position embeddings.
+        cpe_max_size: Maximum image size for position embeddings.
         norm_mean: Mean values for image normalization (RGB channels).
             Defaults to (0.48145466, 0.4578275, 0.40821073)).
         norm_std: Standard deviation values for image normalization
             (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
-        reg_tokens: Number of register tokens to use.
+        register_multiple: Number of register tokens to use.
+        teachers: A list of teacher model configurations. Each teacher configuration is
+            a dict with keys like "name" and some may have "use_summary".
+        cls_token_per_teacher: Whether to use a separate CLS token for each teacher.
     """
 
     model_type = "radio"
@@ -57,10 +62,12 @@ class RadioConfig(PretrainedConfig):
         layer_norm_eps: float = 1e-6,
         initializer_factor: float = 1.0,
         hidden_act: str = "gelu",
-        max_img_size: int = 2048,
+        cpe_max_size: int = 2048,
         norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
         norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
-        reg_tokens: int | None = None,
+        register_multiple: int | None = None,
+        teachers: list[dict[str, Any]] | None = None,
+        cls_token_per_teacher: bool = False,
         **kwargs,
     ):
         self.model_name = model_name
@@ -78,12 +85,14 @@ class RadioConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.initializer_factor = initializer_factor
         self.hidden_act = hidden_act
-        self.max_img_size = max_img_size
+        self.cpe_max_size = cpe_max_size
         self.norm_mean = (
             list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
         )
         self.norm_std = (
             list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
         )
-        self.reg_tokens = reg_tokens
+        self.register_multiple = register_multiple
+        self.teachers = teachers if teachers is not None else []
+        self.cls_token_per_teacher = cls_token_per_teacher
         super().__init__(**kwargs)
diff --git a/vllm/transformers_utils/dynamic_module.py b/vllm/transformers_utils/dynamic_module.py
index 24ead83785f7192c3385b870788a8516d8d19a54..f0702b15bb7e72fa8825754d31496adbd916f899 100644
--- a/vllm/transformers_utils/dynamic_module.py
+++ b/vllm/transformers_utils/dynamic_module.py
@@ -2,7 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
-from transformers.dynamic_module_utils import get_class_from_dynamic_module
+from transformers.dynamic_module_utils import (
+    get_class_from_dynamic_module,
+    resolve_trust_remote_code,
+)
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -13,6 +16,7 @@ logger = init_logger(__name__)
 def try_get_class_from_dynamic_module(
     class_reference: str,
     pretrained_model_name_or_path: str,
+    trust_remote_code: bool,
     cache_dir: str | os.PathLike | None = None,
     force_download: bool = False,
     resume_download: bool | None = None,
@@ -30,6 +34,13 @@ def try_get_class_from_dynamic_module(
     but ignoring any errors.
     """
     try:
+        resolve_trust_remote_code(
+            trust_remote_code,
+            pretrained_model_name_or_path,
+            has_local_code=False,
+            has_remote_code=True,
+        )
+
         return get_class_from_dynamic_module(
             class_reference,
             pretrained_model_name_or_path,
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc067a09419b72b7f65de8f8a8c4c46becc92966
--- /dev/null
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -0,0 +1,402 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import final
+
+import torch
+from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
+from transformers import PretrainedConfig
+
+from vllm import envs
+from vllm.config.model_arch import (
+    ModelArchitectureConfig,
+)
+from vllm.config.utils import getattr_iter
+from vllm.logger import init_logger
+from vllm.transformers_utils.config import (
+    try_get_safetensors_metadata,
+)
+from vllm.utils.torch_utils import common_broadcastable_dtype
+
+logger = init_logger(__name__)
+
+
+class ModelArchConfigConvertorBase:
+    def __init__(self, hf_config: PretrainedConfig, hf_text_config: PretrainedConfig):
+        self.hf_config = hf_config
+        self.hf_text_config = hf_text_config
+
+    def get_architectures(self) -> list[str]:
+        return getattr(self.hf_config, "architectures", [])
+
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_hidden_layers", 0)
+
+    def get_total_num_attention_heads(self) -> int:
+        return getattr(self.hf_text_config, "num_attention_heads", 0)
+
+    def get_vocab_size(self) -> int:
+        return getattr(self.hf_text_config, "vocab_size", 0)
+
+    def get_hidden_size(self) -> int:
+        return getattr(self.hf_text_config, "hidden_size", 0)
+
+    def get_head_size(self) -> int:
+        if self.is_deepseek_mla():
+            qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0)
+            if not envs.VLLM_MLA_DISABLE:
+                return self.hf_text_config.kv_lora_rank + qk_rope_head_dim
+            else:
+                qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0)
+                if qk_rope_head_dim and qk_nope_head_dim:
+                    return qk_rope_head_dim + qk_nope_head_dim
+
+        # NOTE: Some configs may set head_dim=None in the config
+        if getattr(self.hf_text_config, "head_dim", None) is not None:
+            return self.hf_text_config.head_dim
+
+        # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head`
+        if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None:
+            return self.hf_text_config.hidden_size_per_head
+
+        # FIXME(woosuk): This may not be true for all models.
+        return (
+            self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads
+        )
+
+    def get_total_num_kv_heads(self) -> int:
+        attributes = [
+            # For Falcon:
+            "n_head_kv",
+            "num_kv_heads",
+            # For LLaMA-2:
+            "num_key_value_heads",
+            # For ChatGLM:
+            "multi_query_group_num",
+        ]
+        # For non-grouped-query attention models, the number of KV heads is
+        # equal to the number of attention heads.
+        default_factory = lambda: self.hf_text_config.num_attention_heads
+        return getattr_iter(
+            self.hf_text_config, attributes, default_factory=default_factory
+        )
+
+    def get_num_experts(self) -> int:
+        """Returns the number of experts in the model."""
+        num_expert_names = [
+            "num_experts",  # Jamba
+            "moe_num_experts",  # Dbrx
+            "n_routed_experts",  # DeepSeek
+            "num_local_experts",  # Mixtral
+        ]
+        num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0)
+        if isinstance(num_experts, list):
+            # Ernie VL's remote code uses list[int]...
+            # The values are always the same so we just take the first one.
+            return num_experts[0]
+        # Coerce to 0 if explicitly set to None
+        return num_experts or 0
+
+    @final
+    @classmethod
+    def get_torch_dtype(
+        cls, hf_config: PretrainedConfig, model_id: str, revision: str | None
+    ):
+        # NOTE: getattr(config, "dtype", torch.float32) is not correct
+        # because config.dtype can be None.
+        config_dtype = getattr(hf_config, "dtype", None)
+
+        # Fallbacks for multi-modal models if the root config
+        # does not define dtype
+        if config_dtype is None:
+            config_dtype = getattr(hf_config.get_text_config(), "dtype", None)
+        if config_dtype is None and hasattr(hf_config, "vision_config"):
+            config_dtype = getattr(hf_config.vision_config, "dtype", None)
+        if config_dtype is None and hasattr(hf_config, "encoder_config"):
+            config_dtype = getattr(hf_config.encoder_config, "dtype", None)
+
+        # Try to read the dtype of the weights if they are in safetensors format
+        if config_dtype is None:
+            repo_mt = try_get_safetensors_metadata(model_id, revision=revision)
+
+            if repo_mt and (files_mt := repo_mt.files_metadata):
+                param_dtypes: set[torch.dtype] = {
+                    _SAFETENSORS_TO_TORCH_DTYPE[dtype_str]
+                    for file_mt in files_mt.values()
+                    for dtype_str in file_mt.parameter_count
+                    if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE
+                }
+
+                if param_dtypes:
+                    return common_broadcastable_dtype(param_dtypes)
+
+        if config_dtype is None:
+            config_dtype = torch.float32
+
+        return config_dtype
+
+    def _normalize_quantization_config(self, config: PretrainedConfig):
+        quant_cfg = getattr(config, "quantization_config", None)
+        if quant_cfg is None:
+            # compressed-tensors uses a "compression_config" key
+            quant_cfg = getattr(config, "compression_config", None)
+
+        else:
+            # Set quant_method for ModelOpt models.
+            producer_name = quant_cfg.get("producer", {}).get("name")
+            if producer_name == "modelopt":
+                quant_algo = quant_cfg.get("quantization", {}).get("quant_algo")
+                if quant_algo is not None:
+                    quant_algo_upper = str(quant_algo).upper()
+                    if quant_algo_upper in {
+                        "FP8",
+                        "FP8_PER_CHANNEL_PER_TOKEN",
+                        "FP8_PB_WO",
+                    }:
+                        quant_cfg["quant_method"] = "modelopt"
+                    elif quant_algo_upper == "NVFP4":
+                        quant_cfg["quant_method"] = "modelopt_fp4"
+                    else:
+                        raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}")
+
+        if quant_cfg is not None:
+            # Use the community standard 'quant_method'
+            quant_method = quant_cfg.get("quant_method", "").lower()
+
+            # Normalize library names
+            quant_method = quant_method.replace(
+                "compressed_tensors", "compressed-tensors"
+            )
+
+            quant_cfg["quant_method"] = quant_method
+
+        return quant_cfg
+
+    def get_quantization_config(self):
+        quant_cfg = self._normalize_quantization_config(self.hf_config)
+        if quant_cfg is None and (
+            text_config := getattr(self.hf_config, "text_config", None)
+        ):
+            # Check the text config as well for multi-modal models.
+            quant_cfg = self._normalize_quantization_config(text_config)
+        return quant_cfg
+
+    def is_deepseek_mla(self) -> bool:
+        if not hasattr(self.hf_text_config, "model_type"):
+            return False
+        elif self.hf_text_config.model_type in (
+            "deepseek_v2",
+            "deepseek_v3",
+            "deepseek_v32",
+            "deepseek_mtp",
+            "kimi_k2",
+            "kimi_linear",
+            "longcat_flash",
+            "pangu_ultra_moe",
+            "pangu_ultra_moe_mtp",
+        ):
+            return self.hf_text_config.kv_lora_rank is not None
+        elif self.hf_text_config.model_type == "eagle":
+            # if the model is an EAGLE module, check for the
+            # underlying architecture
+            return (
+                self.hf_text_config.model.model_type
+                in ("deepseek_v2", "deepseek_v3", "deepseek_v32")
+                and self.hf_text_config.kv_lora_rank is not None
+            )
+        return False
+
+    def derive_max_model_len_and_key(self) -> tuple[float, str | None]:
+        derived_max_model_len = float("inf")
+        possible_keys = [
+            # OPT
+            "max_position_embeddings",
+            # GPT-2
+            "n_positions",
+            # MPT
+            "max_seq_len",
+            # ChatGLM2
+            "seq_length",
+            # Command-R
+            "model_max_length",
+            # Whisper
+            "max_target_positions",
+            # Others
+            "max_sequence_length",
+            "max_seq_length",
+            "seq_len",
+        ]
+        # Choose the smallest "max_length" from the possible keys
+        max_len_key = None
+        for key in possible_keys:
+            max_len = getattr(self.hf_text_config, key, None)
+            if max_len is not None:
+                if max_len < derived_max_model_len:
+                    max_len_key = key
+                derived_max_model_len = min(derived_max_model_len, max_len)
+
+        # For Command-R / Cohere, Cohere2 / Aya Vision models
+        if tmp_max_len := getattr(self.hf_text_config, "model_max_length", None):
+            max_len_key = "model_max_length"
+            derived_max_model_len = tmp_max_len
+        return derived_max_model_len, max_len_key
+
+    def convert(self) -> ModelArchitectureConfig:
+        model_arch_config = ModelArchitectureConfig(
+            architectures=self.get_architectures(),
+            model_type=self.hf_config.model_type,
+            text_model_type=getattr(self.hf_text_config, "model_type", None),
+            hidden_size=self.get_hidden_size(),
+            total_num_hidden_layers=self.get_num_hidden_layers(),
+            total_num_attention_heads=self.get_total_num_attention_heads(),
+            head_size=self.get_head_size(),
+            vocab_size=self.get_vocab_size(),
+            total_num_kv_heads=self.get_total_num_kv_heads(),
+            num_experts=self.get_num_experts(),
+            quantization_config=self.get_quantization_config(),
+            is_deepseek_mla=self.is_deepseek_mla(),
+            derived_max_model_len_and_key=self.derive_max_model_len_and_key(),
+        )
+
+        return model_arch_config
+
+
+class MambaModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_head_size(self) -> int:
+        return 0
+
+    def get_total_num_kv_heads(self) -> int:
+        return 0
+
+
+class TerratorchModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_head_size(self) -> int:
+        return 0
+
+    def get_total_num_kv_heads(self) -> int:
+        return 0
+
+
+class MedusaModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_head_size(self) -> int:
+        return 0
+
+    def get_total_num_kv_heads(self) -> int:
+        return 0
+
+
+class Zamba2ModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_head_size(self) -> int:
+        return getattr(self.hf_text_config, "attention_head_dim", 0)
+
+
+class FalconModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_total_num_kv_heads(self) -> int:
+        # NOTE: for falcon, when new_decoder_architecture is True, the
+        # multi_query flag is ignored and we use n_head_kv for the number of
+        # KV heads.
+        new_decoder_arch_falcon = getattr(
+            self.hf_text_config, "new_decoder_architecture", False
+        )
+
+        if not new_decoder_arch_falcon and getattr(
+            self.hf_text_config, "multi_query", False
+        ):
+            # Multi-query attention, only one KV head.
+            return 1
+
+        # Use the base implementation which checks n_head_kv, num_kv_heads, etc.
+        return super().get_total_num_kv_heads()
+
+
+class MPTModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_total_num_kv_heads(self) -> int:
+        if "kv_n_heads" in self.hf_text_config.attn_config:
+            return self.hf_text_config.attn_config["kv_n_heads"]
+        return self.hf_text_config.num_attention_heads
+
+
+class DbrxModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_total_num_kv_heads(self) -> int:
+        return getattr(
+            self.hf_text_config.attn_config,
+            "kv_n_heads",
+            self.hf_text_config.num_attention_heads,
+        )
+
+
+class NemotronNasModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_total_num_kv_heads(self) -> int:
+        for block in self.hf_text_config.block_configs:
+            if not block.attention.no_op:
+                return (
+                    self.hf_text_config.num_attention_heads
+                    // block.attention.n_heads_in_group
+                )
+        raise RuntimeError(
+            "Could not determine the number of key-value attention heads "
+            "from model configuration. "
+            f"Architecture: {self.get_architectures()}. "
+            "This usually indicates an unsupported model architecture or "
+            "missing configuration. "
+            "Please check if your model is supported at: "
+            "https://docs.vllm.ai/en/latest/models/supported_models.html"
+        )
+
+
+class DeepSeekMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
+
+
+class MimoMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
+
+
+class GLM4MoeMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
+
+
+class ErnieMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
+
+
+class Qwen3NextMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
+
+
+class PanguUltraMoeMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
+
+
+class LongCatFlashMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_nextn_predict_layers", 1)
+
+
+# hf_config.model_type -> convertor class
+MODEL_ARCH_CONFIG_CONVERTORS = {
+    "mamba": MambaModelArchConfigConvertor,
+    "falcon_mamba": MambaModelArchConfigConvertor,
+    "timm_wrapper": TerratorchModelArchConfigConvertor,
+    "medusa": MedusaModelArchConfigConvertor,
+    "zamba2": Zamba2ModelArchConfigConvertor,
+    "mpt": MPTModelArchConfigConvertor,
+    "dbrx": DbrxModelArchConfigConvertor,
+    "falcon": FalconModelArchConfigConvertor,
+    "RefinedWeb": FalconModelArchConfigConvertor,
+    "RefinedWebModel": FalconModelArchConfigConvertor,
+    "nemotron-nas": NemotronNasModelArchConfigConvertor,
+    "deepseek_mtp": DeepSeekMTPModelArchConfigConvertor,
+    "qwen3_next_mtp": Qwen3NextMTPModelArchConfigConvertor,
+    "mimo_mtp": MimoMTPModelArchConfigConvertor,
+    "glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor,
+    "ernie_mtp": ErnieMTPModelArchConfigConvertor,
+    "pangu_ultra_moe_mtp": PanguUltraMoeMTPModelArchConfigConvertor,
+    "longcat_flash_mtp": LongCatFlashMTPModelArchConfigConvertor,
+}
diff --git a/vllm/transformers_utils/processors/bagel.py b/vllm/transformers_utils/processors/bagel.py
index 850e64f2fad1ef4235d17bb0b608f790a8568d22..7f7a0fd9e11bb7aa4b3dea47744edfc7966e746b 100644
--- a/vllm/transformers_utils/processors/bagel.py
+++ b/vllm/transformers_utils/processors/bagel.py
@@ -4,6 +4,7 @@
 """BAGEL processor for image and text inputs."""
 
 from transformers import AutoProcessor
+from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
@@ -44,12 +45,16 @@ class BagelProcessor(ProcessorMixin):
         text_inputs = self.tokenizer(text, **kwargs) if text is not None else None
 
         if pixel_values is not None and text_inputs is not None:
-            text_inputs["pixel_values"] = pixel_values["pixel_values"]
-            return text_inputs
+            # Combine text and image inputs into BatchFeature
+            combined = dict(text_inputs)
+            combined["pixel_values"] = pixel_values["pixel_values"]
+            return BatchFeature(combined)
         elif pixel_values is not None:
             return pixel_values
+        elif text_inputs is not None:
+            return BatchFeature(dict(text_inputs))
         else:
-            return text_inputs
+            return BatchFeature({})
 
     def batch_decode(self, *args, **kwargs):
         """
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 90af573535d3b6cab2e707f8e93ef1113aee77c9..212f1dccc4df2415877c7562102f463434f12355 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,127 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import warnings
-from typing import Any
-
-from typing_extensions import deprecated
-
-from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike
-
-logger = init_logger(__name__)
 
 
 def __getattr__(name: str):
-    if name == "AnyTokenizer":
-        warnings.warn(
-            "`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
-            "`vllm.tokenizers.TokenizerLike`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return TokenizerLike
+    # Keep until lm-eval is updated
     if name == "get_tokenizer":
         from vllm.tokenizers import get_tokenizer
 
         warnings.warn(
             "`vllm.transformers_utils.tokenizer.get_tokenizer` "
             "has been moved to `vllm.tokenizers.get_tokenizer`. "
-            "The old name will be removed in v0.14.",
+            "The old name will be removed in a future version.",
             DeprecationWarning,
             stacklevel=2,
         )
 
         return get_tokenizer
-    if name == "cached_get_tokenizer":
-        from vllm.tokenizers import cached_get_tokenizer
-
-        warnings.warn(
-            "`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
-            "has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return cached_get_tokenizer
-    if name == "cached_tokenizer_from_config":
-        from vllm.tokenizers import cached_tokenizer_from_config
-
-        warnings.warn(
-            "`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
-            "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return cached_tokenizer_from_config
-    if name == "init_tokenizer_from_configs":
-        from vllm.tokenizers import cached_tokenizer_from_config
-
-        warnings.warn(
-            "`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
-            "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return cached_tokenizer_from_config
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
-
-
-@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.")
-def decode_tokens(
-    tokenizer: TokenizerLike,
-    token_ids: list[int],
-    *,
-    skip_special_tokens: bool | None = None,
-) -> str:
-    """
-    Backend-agnostic equivalent of HF's
-    `tokenizer.decode(token_ids, ...)`.
-
-    `skip_special_tokens=None` means to use the backend's default
-    settings.
-    """
-    kw_args: dict[str, Any] = {}
-
-    if skip_special_tokens is not None:
-        kw_args["skip_special_tokens"] = skip_special_tokens
-
-    return tokenizer.decode(token_ids, **kw_args)
-
-
-@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.")
-def encode_tokens(
-    tokenizer: TokenizerLike,
-    text: str,
-    *,
-    truncation: bool | None = None,
-    max_length: int | None = None,
-    add_special_tokens: bool | None = None,
-) -> list[int]:
-    """
-    Backend-agnostic equivalent of HF's
-    `tokenizer.encode(text, ...)`.
-
-    `add_special_tokens=None` means to use the backend's default
-    settings.
-    """
-
-    kw_args: dict[str, Any] = {}
-    if max_length is not None:
-        kw_args["max_length"] = max_length
-
-    if truncation is not None:
-        kw_args["truncation"] = truncation
-
-    if add_special_tokens is not None:
-        kw_args["add_special_tokens"] = add_special_tokens
-
-    return tokenizer.encode(text, **kw_args)
diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py
deleted file mode 100644
index 3dfd4b4f2f6c118cc2717d8d9224c389485327ac..0000000000000000000000000000000000000000
--- a/vllm/transformers_utils/tokenizer_base.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-
-def __getattr__(name: str):
-    if name == "TokenizerBase":
-        from vllm.tokenizers import TokenizerLike
-
-        warnings.warn(
-            "`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
-            "moved to `vllm.tokenizers.TokenizerLike`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return TokenizerLike
-    if name == "TokenizerRegistry":
-        from vllm.tokenizers import TokenizerRegistry
-
-        warnings.warn(
-            "`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
-            "moved to `vllm.tokenizers.TokenizerRegistry`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return TokenizerRegistry
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 69226763aafe6c22086c80595af692154e0b7de8..b0886bba8a22a041a0b800fa8e18e833275f9e3f 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -186,20 +186,6 @@ class UsageMessage:
         except Exception:
             return False
 
-    def _report_torch_xla_usage(self) -> bool:
-        try:
-            import torch_xla
-
-            self.gpu_count = torch_xla.runtime.world_size()
-            self.gpu_type = torch_xla.tpu.get_tpu_type()
-            self.gpu_memory_per_device = torch_xla.core.xla_model.get_memory_info()[
-                "bytes_limit"
-            ]
-            self.cuda_runtime = "torch_xla"
-            return True
-        except Exception:
-            return False
-
     def _report_usage_once(
         self,
         model_architecture: str,
@@ -217,9 +203,7 @@ class UsageMessage:
         if current_platform.is_cuda():
             self.cuda_runtime = torch.version.cuda
         if current_platform.is_tpu():  # noqa: SIM102
-            if (not self._report_tpu_inference_usage()) and (
-                not self._report_torch_xla_usage()
-            ):
+            if not self._report_tpu_inference_usage():
                 logger.exception("Failed to collect TPU information")
         self.provider = _detect_cloud_provider()
         self.architecture = platform.machine()
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index ad99e1eb1dffea3fdeb2de95220a7ca6e38cb536..13698deb23be464f752f62a02d8238ced32570da 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -4,43 +4,9 @@
 import os
 import json
 import uuid
-import warnings
-from typing import Any
-from vllm import envs
 
 import torch
 
-_DEPRECATED_MAPPINGS = {
-    "cprofile": "profiling",
-    "cprofile_context": "profiling",
-    # Used by lm-eval
-    "get_open_port": "network_utils",
-}
-
-GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
-SUPPORT_MOE_MARLIN_W16A16 = any(arch in GPU_ARCH for arch in ["gfx936"])
-
-
-def __getattr__(name: str) -> Any:  # noqa: D401 - short deprecation docstring
-    """Module-level getattr to handle deprecated utilities."""
-    if name in _DEPRECATED_MAPPINGS:
-        submodule_name = _DEPRECATED_MAPPINGS[name]
-        warnings.warn(
-            f"vllm.utils.{name} is deprecated and will be removed in a future version. "
-            f"Use vllm.utils.{submodule_name}.{name} instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        module = __import__(f"vllm.utils.{submodule_name}", fromlist=[submodule_name])
-        return getattr(module, name)
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
-
-
-def __dir__() -> list[str]:
-    # expose deprecated names in dir() for better UX/tab-completion
-    return sorted(list(globals().keys()) + list(_DEPRECATED_MAPPINGS.keys()))
-
-
 MASK_64_BITS = (1 << 64) - 1
 
 
diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py
index 87ee6f54c0c9b587f27944b4f5f950f5b07b2614..9c2cec876ee39763f8af81dbcfaa4f1c32742c12 100644
--- a/vllm/utils/argparse_utils.py
+++ b/vllm/utils/argparse_utils.py
@@ -399,8 +399,7 @@ class FlexibleArgumentParser(ArgumentParser):
         index = args.index("--config")
         if index == len(args) - 1:
             raise ValueError(
-                "No config file specified! \
-                             Please check your command-line arguments."
+                "No config file specified! Please check your command-line arguments."
             )
 
         file_path = args[index + 1]
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 3d4f8449ad3b64c0f863561bcff50e53e5924dfa..84e0fbb449fbe507a9730b1c6e7e66f7eec69660 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -16,6 +16,9 @@ import torch
 
 import vllm.envs as envs
 from vllm.logger import logger
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    get_fp8_min_max,
+)
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_deep_gemm
 from vllm.utils.math_utils import cdiv
@@ -32,16 +35,35 @@ class DeepGemmQuantScaleFMT(Enum):
     # element contains 4 scale values.
     UE8M0 = 2
 
-    @staticmethod
-    def from_oracle() -> "DeepGemmQuantScaleFMT":
-        if not is_deep_gemm_e8m0_used():
-            return DeepGemmQuantScaleFMT.FLOAT32
-        return (
-            DeepGemmQuantScaleFMT.UE8M0
+    @classmethod
+    def init_oracle_cache(cls) -> None:
+        """Initialize the oracle decision and store it in the class cache"""
+        cached = getattr(cls, "_oracle_cache", None)
+        if cached is not None:
+            return
+
+        use_e8m0 = (
+            envs.VLLM_USE_DEEP_GEMM_E8M0
+            and is_deep_gemm_supported()
+            and (_fp8_gemm_nt_impl is not None)
+        )
+        if not use_e8m0:
+            cls._oracle_cache = cls.FLOAT32  # type: ignore
+            return
+
+        cls._oracle_cache = (  # type: ignore
+            cls.UE8M0
             if current_platform.is_device_capability_family(100)
-            else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
+            else cls.FLOAT32_CEIL_UE8M0
         )
 
+    @classmethod
+    def from_oracle(cls) -> "DeepGemmQuantScaleFMT":
+        """Return the pre-initialized oracle decision"""
+        cached = getattr(cls, "_oracle_cache", None)
+        assert cached is not None, "DeepGemmQuantScaleFMT oracle cache not initialized"
+        return cached
+
 
 @functools.cache
 def is_deep_gemm_supported() -> bool:
@@ -149,6 +171,7 @@ def _lazy_init() -> None:
     _transform_sf_into_required_layout_impl = getattr(
         _dg, "transform_sf_into_required_layout", None
     )
+    DeepGemmQuantScaleFMT.init_oracle_cache()
 
 
 def get_num_sms() -> int:
@@ -335,7 +358,8 @@ def per_block_cast_to_fp8(
     x_padded[:m, :n] = x
     x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n)
     x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    sf = x_amax / 224.0 if current_platform.is_fp8_fnuz() else x_amax / 448.0
+    _, fp8_max = get_fp8_min_max()
+    sf = x_amax / fp8_max
     sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf
     x_scaled = (x_view * (1.0 / sf)).to(fp8_dtype)
     return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
@@ -369,7 +393,7 @@ def should_use_deepgemm_for_fp8_linear(
 
     # Verify DeepGEMM N/K dims requirements
     # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
-    # test inside kernels/quatization/test_block_fp8.py
+    # test inside kernels/quantization/test_block_fp8.py
     N_MULTIPLE = 64
     K_MULTIPLE = 128
 
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 5019b771f4a14ad276088d41b6d4ffba43ab46ed..3da8be098fbd0c3b5aa3afcab72b6284b5999e6e 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -184,6 +184,23 @@ def has_flashinfer_cutedsl() -> bool:
     )
 
 
+@functools.cache
+def has_flashinfer_trtllm_fused_moe() -> bool:
+    """Return `True` if FlashInfer TRTLLM fused MoE is available."""
+    if not has_flashinfer_moe():
+        return False
+    required_functions = [
+        ("flashinfer.fused_moe", "trtllm_fp8_block_scale_moe"),
+        ("flashinfer.fused_moe", "trtllm_fp8_per_tensor_scale_moe"),
+        ("flashinfer.fused_moe", "trtllm_fp4_block_scale_moe"),
+    ]
+    for module_name, attr_name in required_functions:
+        mod = _get_submodule(module_name)
+        if not mod or not hasattr(mod, attr_name):
+            return False
+    return True
+
+
 @functools.cache
 def has_flashinfer_cutlass_fused_moe() -> bool:
     """Return `True` if FlashInfer CUTLASS fused MoE is available."""
@@ -503,6 +520,59 @@ def flashinfer_scaled_fp8_mm(
     return output
 
 
+flashinfer_fp8_blockscale_gemm = _lazy_import_wrapper(
+    "flashinfer.gemm", "fp8_blockscale_gemm_sm90"
+)
+
+
+@functools.cache
+def has_flashinfer_fp8_blockscale_gemm() -> bool:
+    """Return `True` if FlashInfer block-scale FP8 GEMM is available."""
+    return (
+        has_flashinfer()
+        and current_platform.is_device_capability(90)
+        and hasattr(_get_submodule("flashinfer.gemm"), "fp8_blockscale_gemm_sm90")
+    )
+
+
+@functools.cache
+def is_flashinfer_fp8_blockscale_gemm_supported() -> bool:
+    """Return `True` if FlashInfer block-scale FP8 GEMM is supported."""
+    return (
+        envs.VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER
+        and has_flashinfer_fp8_blockscale_gemm()
+    )
+
+
+def should_use_flashinfer_for_blockscale_fp8_gemm(
+    is_flashinfer_supported: bool,
+    output_dtype: torch.dtype,
+    input: torch.Tensor,
+    weight: torch.Tensor,
+):
+    if not is_flashinfer_supported:
+        return False
+
+    # Verify DeepGEMM N/K dims requirements
+    # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
+    # test inside kernels/quatization/test_block_fp8.py
+    N_MULTIPLE = 64
+    K_MULTIPLE = 128
+
+    weight_dtype = weight.dtype
+    input_dtype = input.dtype
+
+    should_use_flashinfer = (
+        output_dtype == torch.bfloat16
+        and input_dtype == torch.bfloat16
+        and weight_dtype == torch.float8_e4m3fn
+        and weight.shape[0] % N_MULTIPLE == 0
+        and weight.shape[1] % K_MULTIPLE == 0
+    )
+
+    return should_use_flashinfer
+
+
 __all__ = [
     "has_flashinfer",
     "flashinfer_trtllm_fp8_block_scale_moe",
@@ -519,10 +589,14 @@ __all__ = [
     "has_flashinfer_all2all",
     "has_flashinfer_cutlass_fused_moe",
     "has_flashinfer_cutedsl_grouped_gemm_nt_masked",
+    "has_flashinfer_fp8_blockscale_gemm",
     "has_nvidia_artifactory",
     "supports_trtllm_attention",
     "can_use_trtllm_attention",
     "use_trtllm_attention",
     "flashinfer_scaled_fp4_mm",
     "flashinfer_scaled_fp8_mm",
+    "flashinfer_fp8_blockscale_gemm",
+    "should_use_flashinfer_for_blockscale_fp8_gemm",
+    "is_flashinfer_fp8_blockscale_gemm_supported",
 ]
diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py
index ff0f0350fd941ff57807fe92441906303d4d505f..192ac69efa26296b3156c6e85e9c31a95cdb05ab 100644
--- a/vllm/utils/import_utils.py
+++ b/vllm/utils/import_utils.py
@@ -23,17 +23,6 @@ from vllm.logger import init_logger
 logger = init_logger(__name__)
 
 
-# TODO: This function can be removed if transformer_modules classes are
-# serialized by value when communicating between processes
-def init_cached_hf_modules() -> None:
-    """
-    Lazy initialization of the Hugging Face modules.
-    """
-    from transformers.dynamic_module_utils import init_hf_modules
-
-    init_hf_modules()
-
-
 def import_pynvml():
     """
     Historical comments:
diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index e2517b935bf287f7ededc525e8dc10dfdbf05d0f..12d1541adafa09bf3e12b883398a0bf6135b1651 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -11,7 +11,15 @@ import psutil
 import torch
 import torch.types
 
-from .mem_constants import GiB_bytes
+from .mem_constants import GiB_bytes, MiB_bytes
+
+
+def format_mib(b: int) -> str:
+    return f"{round(b / MiB_bytes, 2)}"
+
+
+def format_gib(b: int) -> str:
+    return f"{round(b / GiB_bytes, 2)}"
 
 
 @cache
@@ -22,7 +30,7 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     max_shared_mem = ops.get_max_shared_memory_per_block_device_attribute(gpu)
     # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
     # will fail
-    assert max_shared_mem > 0, "max_shared_mem can not be zero"
+    assert max_shared_mem > 0, "max_shared_mem cannot be zero"
     return int(max_shared_mem)
 
 
@@ -66,27 +74,43 @@ class MemorySnapshot:
     torch_memory: int = 0
     non_torch_memory: int = 0
     timestamp: float = 0.0
+
+    device: torch.types.Device = None
     auto_measure: bool = True
 
     def __post_init__(self) -> None:
+        if self.device is None:
+            from vllm.platforms import current_platform
+
+            device_fn = current_platform.current_device
+            assert device_fn is not None
+            self.device_ = torch.device(device_fn())
+        else:
+            self.device_ = torch.device(self.device)
+
         if self.auto_measure:
             self.measure()
 
     def measure(self) -> None:
         from vllm.platforms import current_platform
 
+        device = self.device_
+
         # we measure the torch peak memory usage via allocated_bytes,
         # rather than `torch.cuda.memory_reserved()` .
         # After `torch.cuda.reset_peak_memory_stats()`,
         # `torch.cuda.memory_reserved()` will keep growing, and only shrink
         # when we call `torch.cuda.empty_cache()` or OOM happens.
-        self.torch_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)
+        self.torch_peak = torch.cuda.memory_stats(device).get(
+            "allocated_bytes.all.peak", 0
+        )
 
-        self.free_memory, self.total_memory = torch.cuda.mem_get_info()
+        self.free_memory, self.total_memory = torch.cuda.mem_get_info(device)
         shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
         if (
             current_platform.is_cuda()
-            and current_platform.get_device_capability() in shared_sysmem_device_mem_sms
+            and current_platform.get_device_capability(device.index)
+            in shared_sysmem_device_mem_sms
         ):
             # On UMA (Orin, Thor and Spark) platform,
             # where both CPU and GPU rely on system memory,
@@ -106,12 +130,18 @@ class MemorySnapshot:
         # torch.cuda.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
         # this is used to measure the non-torch memory usage
-        self.torch_memory = torch.cuda.memory_reserved()
+        self.torch_memory = torch.cuda.memory_reserved(device)
 
         self.non_torch_memory = self.cuda_memory - self.torch_memory
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
+        if self.device_ != other.device_:
+            raise ValueError(
+                "The two snapshots should be from the same device! "
+                f"Found: {self.device_} vs. {other.device_}"
+            )
+
         return MemorySnapshot(
             torch_peak=self.torch_peak - other.torch_peak,
             free_memory=self.free_memory - other.free_memory,
@@ -120,9 +150,22 @@ class MemorySnapshot:
             torch_memory=self.torch_memory - other.torch_memory,
             non_torch_memory=self.non_torch_memory - other.non_torch_memory,
             timestamp=self.timestamp - other.timestamp,
+            device=self.device_,
             auto_measure=False,
         )
 
+    def __repr__(self) -> str:
+        return (
+            f"torch_peak={format_gib(self.torch_peak)}GiB, "
+            f"free_memory={format_gib(self.free_memory)}GiB, "
+            f"total_memory={format_gib(self.total_memory)}GiB, "
+            f"cuda_memory={format_gib(self.cuda_memory)}GiB, "
+            f"torch_memory={format_gib(self.torch_memory)}GiB, "
+            f"non_torch_memory={format_gib(self.non_torch_memory)}GiB, "
+            f"timestamp={self.timestamp}, "
+            f"auto_measure={self.auto_measure}"
+        )
+
 
 @dataclass
 class MemoryProfilingResult:
@@ -131,30 +174,37 @@ class MemoryProfilingResult:
     non_kv_cache_memory: int = 0
     torch_peak_increase: int = 0
     non_torch_increase: int = 0
-    weights_memory: float = 0
+    weights_memory: int = 0
     before_create: MemorySnapshot = field(default_factory=MemorySnapshot)
-    before_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
-    after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
     profile_time: float = 0.0
 
+    def __post_init__(self) -> None:
+        device = self.before_create.device_
+
+        self.before_profile = MemorySnapshot(device=device, auto_measure=False)
+        self.after_profile = MemorySnapshot(device=device, auto_measure=False)
+
     def __repr__(self) -> str:
         return (
             f"Memory profiling takes {self.profile_time:.2f} seconds. "
             f"Total non KV cache memory: "
-            f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
+            f"{format_gib(self.non_kv_cache_memory)}GiB; "
             f"torch peak memory increase: "
-            f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
+            f"{format_gib(self.torch_peak_increase)}GiB; "
             f"non-torch forward increase memory: "
-            f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
-            f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB."
+            f"{format_gib(self.non_torch_increase)}GiB; "
+            f"weights memory: {format_gib(self.weights_memory)}GiB."
         )
 
 
 @contextlib.contextmanager
 def memory_profiling(
-    baseline_snapshot: MemorySnapshot, weights_memory: int
+    baseline_snapshot: MemorySnapshot,
+    weights_memory: int = 0,
 ) -> Generator[MemoryProfilingResult, None, None]:
-    """Memory profiling context manager.
+    """
+    Memory profiling context manager.
+
     baseline_snapshot: the memory snapshot before the current vLLM instance.
     weights_memory: memory used by PyTorch when loading the model weights.
         Note that, before loading the model weights, we also initialize the device
@@ -194,21 +244,24 @@ def memory_profiling(
     b. 2 GiB reserved for the peak activation tensors (category 2)
     c. 1 GiB used by non-torch components (category 3)
 
-    The memory used for loading weights (a.) is directly given from the argument `weights_memory`.
+    The memory used for loading weights (a.) is directly given from the
+    argument `weights_memory`.
 
-    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.).
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]`
+    during profiling gives (b.).
 
-    The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.).
-    """  # noqa
+    The increase of `non_torch_memory` from creating the current vLLM instance
+    until after profiling to get (c.).
+    """
     gc.collect()
     torch.cuda.empty_cache()
-    torch.cuda.reset_peak_memory_stats()
-
-    result = MemoryProfilingResult()
+    torch.cuda.reset_peak_memory_stats(baseline_snapshot.device_)
 
-    result.before_create = baseline_snapshot
-    # the part of memory used for holding the model weights
-    result.weights_memory = weights_memory
+    result = MemoryProfilingResult(
+        before_create=baseline_snapshot,
+        # the part of memory used for holding the model weights
+        weights_memory=weights_memory,
+    )
 
     result.before_profile.measure()
 
@@ -229,4 +282,4 @@ def memory_profiling(
     peak_activation_memory = result.torch_peak_increase
     result.non_kv_cache_memory = (
         non_torch_memory + peak_activation_memory + result.weights_memory
-    )  # noqa
+    )
diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py
index 76cac59c1809804371b7dfbcd48003b5a4b6b0ec..aa14b3951644dca5c39f8a9dfdcdddd63f3e3df4 100644
--- a/vllm/utils/system_utils.py
+++ b/vllm/utils/system_utils.py
@@ -267,3 +267,30 @@ def set_ulimit(target_soft_limit: int = 65535):
                 current_soft,
                 e,
             )
+
+
+def find_loaded_library(lib_name: str) -> str | None:
+    """
+    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which includes the
+    shared libraries loaded by the process. We can use this file to find the path of the
+    a loaded library.
+    """  # noqa
+    found_line = None
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found_line = line
+                break
+    if found_line is None:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    start = found_line.index("/")
+    path = found_line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(lib_name), (
+        f"Unexpected filename: {filename} for library {lib_name}"
+    )
+    return path
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index c97efce312b56825e484c281ba2598ddac67d309..f0c7e9366f11b8b3a3d04bcf700e58420b8d9beb 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -3,6 +3,7 @@
 import contextlib
 import importlib.metadata
 import os
+import random
 import threading
 from collections.abc import Callable, Collection
 from functools import lru_cache
@@ -24,6 +25,10 @@ else:
     ModelConfig = object
     IntermediateTensors = object
 
+import logging
+
+logger = logging.getLogger(__name__)
+
 
 STR_DTYPE_TO_TORCH_DTYPE = {
     "float32": torch.float32,
@@ -49,9 +54,46 @@ TORCH_DTYPE_TO_NUMPY_DTYPE = {
 }
 
 
+MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP = {
+    # TODO: Add more modelopt kv cache dtype
+    # mappings here when it supported by some attention backend
+    # (for example supports nvfp4).
+    "fp8": "fp8_e4m3",
+}
+
 T = TypeVar("T")
 
 
+def is_strictly_contiguous(t: torch.Tensor) -> bool:
+    """
+    Check if tensor is contiguous AND has no degenerate strides.
+
+    A degenerate stride occurs when a dimension has size 1 but the stride
+    doesn't match the canonical contiguous layout. This can cause issues
+    in some CUDA kernels that rely on stride values for memory access.
+
+    For a C-contiguous tensor of shape (d0, d1, ..., dn), the expected
+    strides are: stride[i] = product(shape[i+1:]) for all i, with stride[-1]=1.
+
+    Example with torch.Size([16, 1, 8, 32]):
+        - Canonical strides: (256, 256, 32, 1)
+        - Degenerate strides: (256, 1, 32, 1)  # dim=1 has size=1, allowing
+                                                  # non-canonical stride in dim=0
+    """
+    if not t.is_contiguous():
+        return False
+
+    # Check that strides match canonical contiguous layout
+    shape = t.shape
+    strides = t.stride()
+    expected_stride = 1
+    for i in range(len(shape) - 1, -1, -1):
+        if strides[i] != expected_stride:
+            return False
+        expected_stride *= shape[i]
+    return True
+
+
 @contextlib.contextmanager
 def set_default_torch_dtype(dtype: torch.dtype):
     """Sets the default torch dtype to the given dtype."""
@@ -194,6 +236,89 @@ def get_kv_cache_torch_dtype(
     return torch_dtype
 
 
+def get_kv_cache_quant_algo_string(quant_cfg: dict[str, Any]) -> str | None:
+    """Get the KV cache quantization algorithm string from the quantization config.
+
+    Maps various FP8 format names to vLLM's standard cache dtype strings.
+    Returns None if no kv_cache_quant_algo is specified.
+    Returns "auto" if the value is not recognized/supported.
+    """
+    # Mapping from model config values to vLLM cache_dtype strings
+
+    quant_method = quant_cfg.get("quant_method", "")
+    if quant_method.startswith("modelopt"):
+        quantization_inner = quant_cfg.get("quantization", quant_cfg)
+        # Check if quant config is specified and use kv cache quant algo
+        kv_algo = (
+            quantization_inner.get("kv_cache_scheme")
+            or quant_cfg.get("kv_cache_scheme")
+            or quantization_inner.get("kv_cache_quant_algo")
+            or quant_cfg.get("kv_cache_quant_algo")
+        )
+        if isinstance(kv_algo, dict):
+            if (
+                kv_algo.get("dynamic") is False
+                and kv_algo.get("num_bits") == 8
+                and kv_algo.get("type") == "float"
+            ):
+                kv_algo = "fp8"
+            else:
+                # Unknown/unsupported format - return "auto" as safe fallback
+                logger.warning(
+                    "WARNING: Unknown kv_cache_quant_algo '%s' in model "
+                    "config. Supported values: %s. Falling back to 'auto'.",
+                    f"{kv_algo}",
+                    list(MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP.keys()),
+                )
+                return "auto"
+        if isinstance(kv_algo, str):
+            kv_algo_lower = kv_algo.lower()
+
+            # Try to map to vLLM's standard format
+            if kv_algo_lower in MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP:
+                return MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP[kv_algo_lower]
+            else:
+                # Unknown/unsupported format - return "auto" as safe fallback
+                logger.warning(
+                    "WARNING: Unknown kv_cache_quant_algo '%s' in model "
+                    "config. Supported values: %s. Falling back to 'auto'.",
+                    kv_algo,
+                    list(MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP.keys()),
+                )
+                return "auto"
+    return None
+
+
+def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None:
+    """Get the KV cache quantization algorithm dtype from the quantization config."""
+    kv_algo_str = get_kv_cache_quant_algo_string(quant_cfg)
+    if kv_algo_str is not None and kv_algo_str != "auto":
+        # Only convert if we have a valid dtype string (not "auto" fallback)
+        return STR_DTYPE_TO_TORCH_DTYPE[kv_algo_str]
+    return None
+
+
+def resolve_kv_cache_dtype_string(
+    kv_cache_dtype: str, model_config: ModelConfig
+) -> str:
+    """Resolve 'auto' kv_cache_dtype to the actual string value from model config.
+    Returns the resolved cache_dtype string.
+    """
+    if kv_cache_dtype != "auto":
+        return kv_cache_dtype
+
+    hf_cfg = getattr(model_config, "hf_config", None)
+    if hf_cfg is not None:
+        quant_cfg = getattr(hf_cfg, "quantization_config", None)
+        if quant_cfg is not None:
+            kv_algo_str = get_kv_cache_quant_algo_string(quant_cfg)
+            if kv_algo_str is not None:
+                return kv_algo_str
+
+    # Default to auto (will be handled by downstream code)
+    return "auto"
+
+
 def kv_cache_dtype_str_to_dtype(
     kv_cache_dtype: str, model_config: ModelConfig
 ) -> torch.dtype:
@@ -203,6 +328,13 @@ def kv_cache_dtype_str_to_dtype(
     return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
 
 
+def set_random_seed(seed: int | None) -> None:
+    if seed is not None:
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+
+
 def create_kv_caches_with_random_flash(
     num_blocks: int,
     block_size: int,
@@ -215,9 +347,7 @@ def create_kv_caches_with_random_flash(
     device: str | None = "cuda",
     cache_layout: str | None = "NHD",
 ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
-    from vllm.platforms import current_platform
-
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
     generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
@@ -260,9 +390,8 @@ def create_kv_caches_with_random(
         raise ValueError(
             f"Does not support key cache of type fp8 with head_size {head_size}"
         )
-    from vllm.platforms import current_platform
 
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
 
@@ -390,9 +519,13 @@ def current_stream() -> torch.cuda.Stream:
         # when this function is called before any stream is set,
         # we return the default stream.
         # On ROCm using the default 0 stream in combination with RCCL
-        # is hurting performance. Therefore creating a dedicated stream
-        # per process
-        if current_platform.is_rocm():
+        # is hurting performance.
+        # On CUDA, we capture and replay cudagraph on the same stream,
+        # so we need to avoid using the default stream as well. The default
+        # stream cannot be used for cudagraph capture, see
+        # https://github.com/pytorch/pytorch/blob/42ad9edfb754743fdae3276ade43de000beb4f60/aten/src/ATen/cuda/CUDAGraph.cpp#L77
+        # for more details. Therefore, we create a dedicated stream per process.
+        if current_platform.is_rocm() or current_platform.is_cuda():
             # torch.cuda.set_stream here is the alias of _pathed_set_stream
             torch.cuda.set_stream(torch.cuda.Stream())
         elif current_platform.is_cpu():
diff --git a/vllm/attention/backends/abstract.py b/vllm/v1/attention/backend.py
similarity index 56%
rename from vllm/attention/backends/abstract.py
rename to vllm/v1/attention/backend.py
index 025ede1eb0a4e833412441639bfe10d8b2b874fa..0fd3d6eb3fbb360e49bb7211df2671656a5e79a6 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/v1/attention/backend.py
@@ -2,19 +2,25 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from enum import Enum
 from typing import TYPE_CHECKING, ClassVar, Generic, Protocol, TypeVar, get_args
 
+import numpy as np
 import torch
+from typing_extensions import deprecated
 
 if TYPE_CHECKING:
+    from vllm.config import VllmConfig
     from vllm.config.cache import CacheDType
     from vllm.model_executor.layers.linear import ColumnParallelLinear
     from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
     from vllm.platforms.interface import DeviceCapability
     from vllm.v1.attention.backends.utils import KVCacheLayoutType
+    from vllm.v1.kv_cache_interface import AttentionSpec
 
 
-class AttentionType:
+class AttentionType(str, Enum):
     """
     Attention type.
     Use string to be compatible with `torch.compile`.
@@ -193,7 +199,7 @@ class AttentionBackend(ABC):
         head_size: int,
         dtype: torch.dtype,
         kv_cache_dtype: "CacheDType | None",
-        block_size: int | None,
+        block_size: int,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -207,7 +213,7 @@ class AttentionBackend(ABC):
         head_size: int,
         dtype: torch.dtype,
         kv_cache_dtype: "CacheDType | None",
-        block_size: int | None,
+        block_size: int,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -270,6 +276,288 @@ class AttentionMetadata:
 T = TypeVar("T", bound=AttentionMetadata)
 
 
+@dataclass
+class CommonAttentionMetadata:
+    """
+    Per-batch attention metadata, shared across layers and backends.
+    AttentionMetadataBuilder instances use it to construct per-layer metadata.
+
+    For many of the tensors we keep both GPU and CPU versions.
+    """
+
+    query_start_loc: torch.Tensor
+    query_start_loc_cpu: torch.Tensor
+    """(batch_size + 1,), the start location of each request in query Tensor"""
+
+    seq_lens: torch.Tensor
+    """(batch_size,), the number of computed tokens for each request"""
+
+    num_reqs: int
+    """Number of requests"""
+    # TODO(lucas): rename to num_tokens since it may be padded and this is misleading
+    num_actual_tokens: int
+    """Total number of tokens in batch"""
+    max_query_len: int
+    """Longest query in batch"""
+    max_seq_len: int
+    """Longest context length (may be an upper bound)"""
+
+    block_table_tensor: torch.Tensor
+    slot_mapping: torch.Tensor
+
+    causal: bool = True
+
+    # Needed by FastPrefillAttentionBuilder
+    logits_indices_padded: torch.Tensor | None = None
+    num_logits_indices: int | None = None
+
+    # Needed by CrossAttentionBuilder
+    encoder_seq_lens: torch.Tensor | None = None
+    encoder_seq_lens_cpu: np.ndarray | None = None
+
+    dcp_local_seq_lens: torch.Tensor | None = None
+    dcp_local_seq_lens_cpu: torch.Tensor | None = None
+    """Sequence lengths of the local rank in decode context parallelism world"""
+
+    # WARNING: Deprecated fields. Will be removed in a future release (v0.15.0)
+    _seq_lens_cpu: torch.Tensor | None = None
+    _num_computed_tokens_cpu: torch.Tensor | None = None
+
+    _num_computed_tokens_cache: torch.Tensor | None = None
+
+    @property
+    @deprecated(
+        """
+    Prefer using device seq_lens directly to avoid implicit H<>D sync.
+    If a CPU copy is needed, use `seq_lens.cpu()` instead.
+    Will be removed in a future release (v0.15.0)
+    """
+    )
+    def seq_lens_cpu(self) -> torch.Tensor:
+        if self._seq_lens_cpu is None:
+            self._seq_lens_cpu = self.seq_lens.to("cpu")
+        return self._seq_lens_cpu
+
+    @property
+    @deprecated(
+        """
+    Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
+    async scheduling. If a CPU copy is needed, it can be derived from 
+    query_start_loc_cpu and seq_lens.
+    Will be removed in a future release (v0.15.0)
+    """
+    )
+    def num_computed_tokens_cpu(self) -> torch.Tensor:
+        if self._num_computed_tokens_cpu is None:
+            query_seq_lens = (
+                self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1]
+            )
+            self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens
+        return self._num_computed_tokens_cpu
+
+    def compute_num_computed_tokens(self) -> torch.Tensor:
+        """Compute num_computed_tokens on device (seq_lens - query_lens)."""
+        if self._num_computed_tokens_cache is None:
+            query_lens = self.query_start_loc[1:] - self.query_start_loc[:-1]
+            self._num_computed_tokens_cache = self.seq_lens - query_lens
+        return self._num_computed_tokens_cache
+
+    # TODO(lucas): remove once we have FULL-CG spec-decode support
+    def unpadded(
+        self, num_actual_tokens: int, num_actual_reqs: int
+    ) -> "CommonAttentionMetadata":
+        maybe_slice_reqs = lambda x: x[:num_actual_reqs] if x is not None else None
+        return CommonAttentionMetadata(
+            query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
+            query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
+            seq_lens=self.seq_lens[:num_actual_reqs],
+            _seq_lens_cpu=self._seq_lens_cpu[:num_actual_reqs]
+            if self._seq_lens_cpu is not None
+            else None,
+            _num_computed_tokens_cpu=self._num_computed_tokens_cpu[:num_actual_reqs]
+            if self._num_computed_tokens_cpu is not None
+            else None,
+            num_reqs=num_actual_reqs,
+            num_actual_tokens=num_actual_tokens,
+            max_query_len=self.max_query_len,
+            max_seq_len=self.max_seq_len,
+            block_table_tensor=self.block_table_tensor[:num_actual_reqs],
+            slot_mapping=self.slot_mapping[:num_actual_tokens],
+            causal=self.causal,
+            logits_indices_padded=self.logits_indices_padded,
+            num_logits_indices=self.num_logits_indices,
+            encoder_seq_lens=maybe_slice_reqs(self.encoder_seq_lens),
+            encoder_seq_lens_cpu=maybe_slice_reqs(self.encoder_seq_lens_cpu),
+            dcp_local_seq_lens=maybe_slice_reqs(self.dcp_local_seq_lens),
+            dcp_local_seq_lens_cpu=maybe_slice_reqs(self.dcp_local_seq_lens_cpu),
+        )
+
+
+M = TypeVar("M")
+
+
+class AttentionCGSupport(Enum):
+    """Constants for the cudagraph support of the attention backend
+    Here we do not consider the cascade attention, as currently
+    it is never cudagraph supported."""
+
+    ALWAYS = 3
+    """Cudagraph always supported; supports mixed-prefill-decode"""
+    UNIFORM_BATCH = 2
+    """Cudagraph supported for batches the only contain query lengths that are
+    the same, this can be used for spec-decode
+        i.e. "decodes" are 1 + num_speculative_tokens"""
+    UNIFORM_SINGLE_TOKEN_DECODE = 1
+    """Cudagraph supported for batches the only contain query_len==1 decodes"""
+    NEVER = 0
+    """NO cudagraph support"""
+
+
+class AttentionMetadataBuilder(ABC, Generic[M]):
+    # Does this backend/builder support CUDA Graphs for attention (default: no).
+    # Do not access directly. Call get_cudagraph_support() instead.
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER
+    # Does this backend/builder reorder the batch?
+    # If not, set this to None. Otherwise set it to the query
+    # length that will be pulled into the front of the batch.
+    reorder_batch_threshold: int | None = None
+    # Does this backend/builder support updating the block table in existing
+    # metadata
+    supports_update_block_table: bool = False
+
+    @abstractmethod
+    def __init__(
+        self,
+        kv_cache_spec: "AttentionSpec",
+        layer_names: list[str],
+        vllm_config: "VllmConfig",
+        device: torch.device,
+    ):
+        self.kv_cache_spec = kv_cache_spec
+        self.layer_names = layer_names
+        self.vllm_config = vllm_config
+        self.device = device
+
+    @classmethod
+    def get_cudagraph_support(
+        cls: type["AttentionMetadataBuilder"],
+        vllm_config: "VllmConfig",
+        kv_cache_spec: "AttentionSpec",
+    ) -> AttentionCGSupport:
+        """Get the cudagraph support level of this builder class."""
+        return cls._cudagraph_support
+
+    def _init_reorder_batch_threshold(
+        self,
+        reorder_batch_threshold: int | None = 1,
+        supports_spec_as_decode: bool = False,
+        supports_dcp_with_varlen: bool = False,
+    ) -> None:
+        self.reorder_batch_threshold = reorder_batch_threshold
+        if self.reorder_batch_threshold is not None and supports_spec_as_decode:
+            # If the backend supports spec-as-decode kernels, then we can set
+            # the reorder_batch_threshold based on the number of speculative
+            # tokens from the config.
+            speculative_config = self.vllm_config.speculative_config
+            if (
+                speculative_config is not None
+                and speculative_config.num_speculative_tokens is not None
+            ):
+                self.reorder_batch_threshold = max(
+                    self.reorder_batch_threshold,
+                    1 + speculative_config.num_speculative_tokens,
+                )
+
+        if (
+            self.vllm_config.parallel_config.decode_context_parallel_size > 1
+            and not supports_dcp_with_varlen
+        ):
+            self.reorder_batch_threshold = 1
+
+    @abstractmethod
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> M:
+        """
+        Central method that builds attention metadata.
+        Some builders (MLA) require reorder_batch to be called prior to build.
+
+        Args:
+            common_prefix_len: The length of the common prefix of the batch.
+            common_attn_metadata: The common attention metadata.
+            fast_build: The meta-data will prioritize speed of building over
+                then speed at execution. Can be used for spec-decode where the
+                result of a build call may only be used for few layers/iters.
+        """
+        raise NotImplementedError
+
+    def update_block_table(
+        self,
+        metadata: M,
+        blk_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
+    ) -> M:
+        """
+        Update the block table for the attention metadata.
+        Faster when theres multiple kv-cache groups that create virtually the
+        same metadata but just with different block tables.
+
+        Only needs to be implemented if supports_update_block_table is True.
+        """
+        raise NotImplementedError
+
+    def build_for_cudagraph_capture(
+        self, common_attn_metadata: CommonAttentionMetadata
+    ) -> M:
+        """
+        Build attention metadata for CUDA graph capture. Uses build by default.
+        Subclasses that override this method should call self.build or
+        super().build_for_cudagraph_capture.
+        """
+        return self.build(
+            common_prefix_len=0, common_attn_metadata=common_attn_metadata
+        )
+
+    def build_for_drafting(
+        self,
+        common_attn_metadata: CommonAttentionMetadata,
+        draft_index: int,
+    ) -> M:
+        """
+        Build attention metadata for draft model. Uses build by default.
+
+        Args:
+            common_attn_metadata: The common attention metadata.
+            draft_index: The index of the current draft operation.
+                When speculating a chain of tokens, this index refers to the
+                draft attempt for the i-th token.
+                For tree-based attention, this index instead refers to the
+                draft attempt for the i-th level in the tree of tokens.
+        """
+        return self.build(
+            common_prefix_len=0,
+            common_attn_metadata=common_attn_metadata,
+            fast_build=True,
+        )
+
+    def use_cascade_attention(
+        self,
+        common_prefix_len: int,
+        query_lens: np.ndarray,
+        num_query_heads: int,
+        num_kv_heads: int,
+        use_alibi: bool,
+        use_sliding_window: bool,
+        use_local_attention: bool,
+        num_sms: int,
+        dcp_world_size: int,
+    ) -> bool:
+        return False
+
+
 class AttentionLayer(Protocol):
     _q_scale: torch.Tensor
     _k_scale: torch.Tensor
@@ -290,6 +578,11 @@ class AttentionLayer(Protocol):
 
 
 class AttentionImpl(ABC, Generic[T]):
+    # Required attributes that all impls should have
+    num_heads: int
+    head_size: int
+    scale: float
+
     # Whether the attention impl can return the softmax lse for decode.
     # Some features like decode context parallelism require the softmax lse.
     can_return_lse_for_decode: bool = False
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index 394d0c2f67136ed302a2ba9d43bac5f2f996efb8..3eb9b478230c113b0bea3fc9fc2fcb52373322a5 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -6,19 +6,19 @@ from typing import ClassVar
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionImpl,
     AttentionLayer,
+    AttentionMetadataBuilder,
     AttentionType,
+    CommonAttentionMetadata,
     is_quantized_kv_cache,
 )
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.platforms import CpuArchEnum, current_platform
 from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder,
-    CommonAttentionMetadata,
     split_decodes_and_prefills,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec
@@ -42,7 +42,7 @@ class CPUAttentionBackend(AttentionBackend):
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
-        return [32, 64, 96, 128, 160, 192, 224, 256]
+        return [32, 64, 80, 96, 112, 128, 160, 192, 224, 256]
 
     @staticmethod
     def get_name() -> str:
@@ -137,7 +137,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
         if self.window_size is None:
             self.window_size = -1
         self.block_size = vllm_config.cache_config.block_size
-        self.isa = _get_attn_isa(self.dtype, self.block_size)
+        self.isa = _get_attn_isa(self.dtype, self.block_size, self.head_dim)
         self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)
 
     def build(
@@ -484,7 +484,11 @@ def _make_sliding_window_bias(
     return attn_biases
 
 
-def _get_attn_isa(dtype: torch.dtype, block_size: int) -> str:
+def _get_attn_isa(
+    dtype: torch.dtype, block_size: int, head_size: int | None = None
+) -> str:
+    if head_size is not None and head_size % 32 != 0 and head_size % 16 == 0:
+        return "vec16"
     supports_amx = torch._C._cpu._is_amx_tile_supported()
     if supports_amx and dtype in (torch.bfloat16,) and block_size % 32 == 0:
         return "amx"
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py
similarity index 75%
rename from vllm/attention/utils/fa_utils.py
rename to vllm/v1/attention/backends/fa_utils.py
index 9e84e66487519ff0686480813d043361fcd61a40..10879860de48400a615923504085eae3f60102c0 100644
--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/v1/attention/backends/fa_utils.py
@@ -8,16 +8,18 @@ import torch
 logger = init_logger(__name__)
 
 if current_platform.is_cuda():
-    from vllm import _custom_ops as ops
-    
-    reshape_and_cache_flash = ops.reshape_and_cache_flash
-    from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata
+    from vllm._custom_ops import reshape_and_cache_flash
+    from vllm.vllm_flash_attn import (  # type: ignore[attr-defined]
+        flash_attn_varlen_func,
+        get_scheduler_metadata,
+    )
 elif current_platform.is_xpu():
-    from vllm._ipex_ops import ipex_ops as ops
+    from vllm._ipex_ops import ipex_ops
+
+    reshape_and_cache_flash = ipex_ops.reshape_and_cache_flash
+    flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func
+    get_scheduler_metadata = ipex_ops.get_scheduler_metadata
 
-    reshape_and_cache_flash = ops.reshape_and_cache_flash
-    flash_attn_varlen_func = ops.flash_attn_varlen_func
-    get_scheduler_metadata = ops.get_scheduler_metadata
 elif current_platform.is_rocm():
     try:
         # from flash_attn import flash_attn_varlen_func  # noqa: F401
@@ -37,6 +39,9 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
 
     if current_platform.is_xpu():
         return 2
+    if current_platform.is_rocm():
+        # ROCm doesn't use vllm_flash_attn; return None to skip fa_version arg
+        return None
     try:
         from vllm.vllm_flash_attn.flash_attn_interface import (
             fa_version_unsupported_reason,
@@ -53,16 +58,19 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
         )
 
         # 2. override if passed by environment or config
-        from vllm.config import get_current_vllm_config
+        from vllm.config import get_current_vllm_config_or_none
 
-        vllm_config = get_current_vllm_config()
-        if vllm_config.attention_config.flash_attn_version is not None:
+        vllm_config = get_current_vllm_config_or_none()
+        if (
+            vllm_config is not None
+            and vllm_config.attention_config.flash_attn_version is not None
+        ):
             fa_version = vllm_config.attention_config.flash_attn_version
 
         # 3. fallback for unsupported combinations
         if device_capability.major == 10 and fa_version == 3:
             logger.warning_once(
-                "Cannot use FA version 3 on Blackwell platform "
+                "Cannot use FA version 3 on Blackwell platform, "
                 "defaulting to FA version 2."
             )
             fa_version = 2
@@ -91,7 +99,7 @@ def flash_attn_supports_fp8() -> bool:
         return True
     return (
         get_flash_attn_version() == 3
-        and current_platform.get_device_capability().major == 9
+        and current_platform.is_device_capability_family(90)
     )
 
 
@@ -111,10 +119,9 @@ def flash_attn_supports_mla():
                 is_fa_version_supported,
             )
 
-            return (
-                is_fa_version_supported(3)
-                and current_platform.get_device_capability()[0] == 9
-            )
+            return is_fa_version_supported(
+                3
+            ) and current_platform.is_device_capability_family(90)
         except (ImportError, AssertionError):
             pass
     return False
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index d906c8f5dd55ff40598eb5ff967bcc987ccce533..2fbff11139e149019f735a01830301c8a4404134 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -2,43 +2,45 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashAttention."""
 
+import copy
 from dataclasses import dataclass
 from typing import ClassVar
 
 import numpy as np
 import torch
 
-from vllm.attention.backends.abstract import (
+from vllm.attention.layer import Attention
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionImpl,
     AttentionType,
     MultipleOf,
     is_quantized_kv_cache,
 )
-from vllm.attention.layer import Attention
-from vllm.attention.ops.common import cp_lse_ag_out_rs
-from vllm.attention.ops.merge_attn_states import merge_attn_states
-from vllm.attention.utils.fa_utils import (
+from vllm.v1.attention.backends.fa_utils import (
     flash_attn_supports_fp8,
     get_flash_attn_version,
     is_flash_attn_varlen_func_available,
 )
+from vllm.v1.attention.ops.common import cp_lse_ag_out_rs
+from vllm.v1.attention.ops.merge_attn_states import merge_attn_states
 
 from vllm.platforms import current_platform
 if is_flash_attn_varlen_func_available():
     if not current_platform.is_rocm():
-        from vllm.attention.utils.fa_utils import (
+        from vllm.v1.attention.backends.fa_utils import (
             flash_attn_supports_sinks,
             flash_attn_varlen_func,
             get_scheduler_metadata,
             reshape_and_cache_flash,
         )
     else:
-        from vllm.attention.utils.fa_utils import (
+        from vllm.v1.attention.backends.fa_utils import (
             flash_attn_supports_sinks,
             vllm_flash_attn_varlen_func,
-            vllm_flash_attn_varlen_func,
+            reshape_and_cache_cuda,
         )
+
 from vllm.config import VllmConfig, get_current_vllm_config, get_layers_from_vllm_config
 from vllm.config.cache import CacheDType
 from vllm.distributed.parallel_state import get_dcp_group
@@ -48,10 +50,12 @@ from vllm.model_executor.layers.batch_invariant import (
 )
 from vllm.platforms.interface import DeviceCapability
 from vllm.utils.math_utils import cdiv
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.utils import (
     get_dcp_local_seq_lens,
     get_kv_cache_layout,
 )
@@ -216,7 +220,7 @@ class FlashAttentionBackend(AttentionBackend):
         head_size: int,
         dtype: torch.dtype,
         kv_cache_dtype: CacheDType | None,
-        block_size: int,
+        block_size: int | None,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -300,6 +304,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
         if get_flash_attn_version() == 3 or current_platform.is_rocm()
         else AttentionCGSupport.UNIFORM_BATCH
     )
+    supports_update_block_table: bool = True
 
     def __init__(
         self,
@@ -402,7 +407,11 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
                     aot_schedule = False
 
         max_num_splits = 0  # 0 means use FA3's heuristics, not CG compatible
-        if self.use_full_cuda_graph and num_actual_tokens <= self.max_cudagraph_size:
+        if (
+            self.use_full_cuda_graph
+            and self.max_cudagraph_size is not None
+            and num_actual_tokens <= self.max_cudagraph_size
+        ):
             # NOTE(woosuk): Setting num_splits > 1 may increase the memory
             # usage, because the intermediate buffers of size [num_splits,
             # num_heads, num_tokens, head_size] are allocated. Therefore,
@@ -543,6 +552,17 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
         )
         return attn_metadata
 
+    def update_block_table(
+        self,
+        metadata: FlashAttentionMetadata,
+        blk_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
+    ) -> FlashAttentionMetadata:
+        new_metadata = copy.copy(metadata)
+        new_metadata.block_table = blk_table
+        new_metadata.slot_mapping = slot_mapping
+        return new_metadata
+
     def use_cascade_attention(self, *args, **kwargs) -> bool:
         return use_cascade_attention(*args, **kwargs)
 
@@ -637,6 +657,9 @@ class FlashAttentionImpl(AttentionImpl):
               We use torch's .expand() to avoid duplicating values
         """
         assert output is not None, "Output tensor must be provided."
+        assert self.vllm_flash_attn_version is not None, (
+            "FlashAttention version not detected."
+        )
 
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
@@ -747,7 +770,8 @@ class FlashAttentionImpl(AttentionImpl):
             block_table = attn_metadata.block_table
             scheduler_metadata = attn_metadata.scheduler_metadata
 
-            descale_shape = (cu_seqlens_q.shape[0] - 1, self.num_kv_heads)
+            if not current_platform.is_rocm():
+                descale_shape = (cu_seqlens_q.shape[0] - 1, self.num_kv_heads)
 
             if self.dcp_world_size > 1:
                 self._forward_with_dcp(
@@ -764,6 +788,11 @@ class FlashAttentionImpl(AttentionImpl):
                 )
                 return output
             else:
+                sliding_window_size = (
+                    list(self.sliding_window)
+                    if self.sliding_window is not None
+                    else None
+                )
                 if not current_platform.is_rocm():
                     flash_attn_varlen_func(
                         q=query[:num_actual_tokens],
@@ -777,7 +806,7 @@ class FlashAttentionImpl(AttentionImpl):
                         softmax_scale=self.scale,
                         causal=attn_metadata.causal,
                         alibi_slopes=self.alibi_slopes,
-                        window_size=self.sliding_window,
+                        window_size=sliding_window_size,
                         block_table=block_table,
                         softcap=self.logits_soft_cap,
                         scheduler_metadata=scheduler_metadata,
@@ -806,7 +835,7 @@ class FlashAttentionImpl(AttentionImpl):
                         softmax_scale=self.scale,
                         causal=attn_metadata.causal,
                         alibi_slopes=self.alibi_slopes,
-                        window_size=self.sliding_window,
+                        window_size=sliding_window_size,
                         block_table=block_table,
                         softcap=self.logits_soft_cap,
                         scheduler_metadata=scheduler_metadata,
@@ -814,14 +843,17 @@ class FlashAttentionImpl(AttentionImpl):
                         # q_descale=layer._q_scale.expand(descale_shape),
                         # k_descale=layer._k_scale.expand(descale_shape),
                         # v_descale=layer._v_scale.expand(descale_shape),
+                        q_descale=None,
+                        k_descale=layer._k_scale,
+                        v_descale=layer._v_scale,
                         # num_splits=attn_metadata.max_num_splits,
                         s_aux=self.sinks,
                         is_prefix_cache=True,
                     )
-            return output
- 
-        if not current_platform.is_rocm():
+                return output
+
         # Cascade attention (rare case).
+        if not current_platform.is_rocm():
             cascade_attention(
                 output[:num_actual_tokens],
                 query[:num_actual_tokens],
@@ -872,6 +904,9 @@ class FlashAttentionImpl(AttentionImpl):
                 # q_descale=layer._q_scale,
                 # k_descale=layer._k_scale,
                 # v_descale=layer._v_scale,
+                q_descale=None,
+                k_descale=layer._k_scale,
+                v_descale=layer._v_scale,
                 s_aux=self.sinks,
             )
         return output
@@ -889,12 +924,19 @@ class FlashAttentionImpl(AttentionImpl):
         k_descale: torch.Tensor | None = None,
         v_descale: torch.Tensor | None = None,
     ) -> torch.Tensor:
+        assert self.vllm_flash_attn_version is not None, (
+            "FlashAttention version not detected."
+        )
+
         cu_seqlens_q = attn_metadata.query_start_loc
         max_seqlen_q = attn_metadata.max_query_len
         block_table = attn_metadata.block_table
 
         query = query.contiguous()
         query_across_dcp = get_dcp_group().all_gather(query, dim=1)
+        sliding_window_size = (
+            list(self.sliding_window) if self.sliding_window is not None else None
+        )
         context_attn_out, context_lse = flash_attn_varlen_func(
             q=query_across_dcp,
             k=key_cache,
@@ -907,7 +949,7 @@ class FlashAttentionImpl(AttentionImpl):
             softmax_scale=self.scale,
             causal=False,
             alibi_slopes=self.alibi_slopes,
-            window_size=self.sliding_window,
+            window_size=sliding_window_size,
             block_table=block_table,
             softcap=self.logits_soft_cap,
             return_softmax_lse=True,
@@ -938,7 +980,7 @@ class FlashAttentionImpl(AttentionImpl):
             softmax_scale=self.scale,
             causal=attn_metadata.causal,
             alibi_slopes=self.alibi_slopes,
-            window_size=self.sliding_window,
+            window_size=sliding_window_size,
             softcap=self.logits_soft_cap,
             return_softmax_lse=True,
             fa_version=self.vllm_flash_attn_version,
@@ -975,6 +1017,10 @@ class FlashAttentionImpl(AttentionImpl):
             attn_metadata: Encoder attention metadata
             layer: The attention layer
         """
+        assert self.vllm_flash_attn_version is not None, (
+            "FlashAttention version not detected."
+        )
+
         # For encoder attention, process FP8 quantization if needed
         if self.kv_cache_dtype.startswith("fp8"):
             raise NotImplementedError(
@@ -993,6 +1039,9 @@ class FlashAttentionImpl(AttentionImpl):
         )
 
         # Call flash attention directly on Q, K, V tensors
+        sliding_window_size = (
+            list(self.sliding_window) if self.sliding_window is not None else None
+        )
         if not current_platform.is_rocm():
             flash_attn_varlen_func(
                 q=query,
@@ -1006,7 +1055,7 @@ class FlashAttentionImpl(AttentionImpl):
                 softmax_scale=self.scale,
                 causal=False,  # Encoder attention is bidirectional
                 alibi_slopes=self.alibi_slopes,
-                window_size=self.sliding_window,
+                window_size=sliding_window_size,
                 softcap=self.logits_soft_cap,
                 fa_version=self.vllm_flash_attn_version,
                 q_descale=layer._q_scale.expand(descale_shape),
@@ -1027,12 +1076,15 @@ class FlashAttentionImpl(AttentionImpl):
                 softmax_scale=self.scale,
                 causal=False,  # Encoder attention is bidirectional
                 alibi_slopes=self.alibi_slopes,
-                window_size=self.sliding_window,
+                window_size=sliding_window_size,
                 softcap=self.logits_soft_cap,
                 # fa_version=self.vllm_flash_attn_version,
                 # q_descale=layer._q_scale.expand(descale_shape),
                 # k_descale=layer._k_scale.expand(descale_shape),
                 # v_descale=layer._v_scale.expand(descale_shape),
+                q_descale=None,
+                k_descale=layer._k_scale,
+                v_descale=layer._v_scale,
                 # num_splits=1 if self.batch_invariant_enabled else 0,
                 is_prefix_cache=False,
             )
@@ -1169,7 +1221,7 @@ def cascade_attention(
             max_seqlen_k=common_prefix_len,
             softmax_scale=softmax_scale,
             causal=False,
-            window_size=sliding_window,
+            window_size=list(sliding_window),
             block_table=block_table[:1],
             softcap=logits_soft_cap,
             return_softmax_lse=True,
@@ -1194,18 +1246,18 @@ def cascade_attention(
             max_seqlen_k=common_prefix_len,
             softmax_scale=softmax_scale,
             causal=False,
-            window_size=sliding_window,
+            window_size=list(sliding_window),
             block_table=block_table[:1],
             softcap=logits_soft_cap,
             return_softmax_lse=True,
             scheduler_metadata=prefix_scheduler_metadata,
             # fa_version=fa_version,
-            # q_descale=q_descale.expand(descale_shape) if q_descale is not None else None,
-            # k_descale=k_descale.expand(descale_shape) if k_descale is not None else None,
-            # v_descale=v_descale.expand(descale_shape) if v_descale is not None else None,
-            # # s_aux is incorporated into prefix_lse inside the GPU kernel,
-            # # enabling its effect during the final attention merge.
-            # s_aux=s_aux,
+            q_descale=q_descale.expand(descale_shape) if q_descale is not None else None,
+            k_descale=k_descale.expand(descale_shape) if k_descale is not None else None,
+            v_descale=v_descale.expand(descale_shape) if v_descale is not None else None,
+            # s_aux is incorporated into prefix_lse inside the GPU kernel,
+            # enabling its effect during the final attention merge.
+            s_aux=s_aux,
             # num_splits=1 if vllm_is_batch_invariant() else max_num_splits,
             is_prefix_cache=True,
         )
@@ -1224,7 +1276,7 @@ def cascade_attention(
             max_seqlen_k=max_kv_len - common_prefix_len,
             softmax_scale=softmax_scale,
             causal=True,
-            window_size=sliding_window,
+            window_size=list(sliding_window),
             block_table=block_table[:, num_common_kv_blocks:],
             softcap=logits_soft_cap,
             return_softmax_lse=True,
@@ -1246,15 +1298,15 @@ def cascade_attention(
             max_seqlen_k=max_kv_len - common_prefix_len,
             softmax_scale=softmax_scale,
             causal=True,
-            window_size=sliding_window,
+            window_size=list(sliding_window),
             block_table=block_table[:, num_common_kv_blocks:],
             softcap=logits_soft_cap,
             return_softmax_lse=True,
             scheduler_metadata=suffix_scheduler_metadata,
             # fa_version=fa_version,
-            # q_descale=q_descale.expand(descale_shape) if q_descale is not None else None,
-            # k_descale=k_descale.expand(descale_shape) if k_descale is not None else None,
-            # v_descale=v_descale.expand(descale_shape) if v_descale is not None else None,
+            q_descale=q_descale.expand(descale_shape) if q_descale is not None else None,
+            k_descale=k_descale.expand(descale_shape) if k_descale is not None else None,
+            v_descale=v_descale.expand(descale_shape) if v_descale is not None else None,
             # num_splits=1 if vllm_is_batch_invariant() else max_num_splits,
             is_prefix_cache=True,
         )
diff --git a/vllm/v1/attention/backends/flash_attn_diffkv.py b/vllm/v1/attention/backends/flash_attn_diffkv.py
new file mode 100644
index 0000000000000000000000000000000000000000..5305cc1b8c1265fef4cb84863068afad27e3a329
--- /dev/null
+++ b/vllm/v1/attention/backends/flash_attn_diffkv.py
@@ -0,0 +1,277 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Attention layer with FlashAttention."""
+
+import torch
+
+from vllm.v1.attention.backend import AttentionType
+from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash_diffkv,
+)
+
+if is_flash_attn_varlen_func_available():
+    from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.utils import get_kv_cache_layout
+
+from .flash_attn import (
+    FlashAttentionBackend,
+    FlashAttentionImpl,
+    FlashAttentionMetadata,
+    cascade_attention,
+)
+
+logger = init_logger(__name__)
+
+
+class FlashAttentionDiffKVBackend(FlashAttentionBackend):
+    # Default to 128 for this backend
+    head_size_v: int = 128
+
+    @classmethod
+    def set_head_size_v(cls, head_size_v: int) -> None:
+        cls.head_size_v = head_size_v
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLASH_ATTN_DIFFKV"
+
+    @staticmethod
+    def get_impl_cls() -> type["FlashAttentionImpl"]:
+        return FlashAttentionDiffKVImpl
+
+    # Do not modify the interface of get_kv_cache_shape,
+    # but consider head_size_v when returning result.
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+        cache_dtype_str: str = "auto",
+    ) -> tuple[int, ...]:
+        if block_size % 16 != 0:
+            raise ValueError("Block size must be a multiple of 16.")
+        return (
+            num_blocks,
+            block_size,
+            num_kv_heads,
+            head_size + FlashAttentionDiffKVBackend.head_size_v,
+        )
+
+    @staticmethod
+    def get_kv_cache_stride_order(
+        include_num_layers_dimension: bool = False,
+    ) -> tuple[int, ...]:
+        # `stride_order` indicates the permutation that gets
+        # us from `get_kv_cache_shape` to the actual memory layout we want.
+        cache_layout = get_kv_cache_layout()
+        if cache_layout == "NHD" and include_num_layers_dimension:
+            # (num_blocks, num_layers, block_size,
+            # num_kv_heads, head_size + head_size_v)
+            return (1, 0, 2, 3, 4)
+        elif cache_layout == "NHD":
+            stride_order = (0, 1, 2, 3)
+        elif cache_layout == "HND" and include_num_layers_dimension:
+            # (num_blocks, num_kv_heads, num_layers,
+            # block_size, head_size + head_size_v)
+            return (1, 3, 0, 2, 4)
+        elif cache_layout == "HND":
+            stride_order = (0, 2, 1, 3)
+        else:
+            raise ValueError(f"Unknown cache layout format {cache_layout}.")
+        return stride_order
+
+
+class FlashAttentionDiffKVImpl(FlashAttentionImpl):
+    def forward(
+        self,
+        layer: torch.nn.Module,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: FlashAttentionMetadata,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Forward pass with FlashAttention.
+
+        Args:
+            query: shape = [num_tokens, num_heads, head_size]
+            key: shape = [num_tokens, num_kv_heads, head_size]
+            value: shape = [num_tokens, num_kv_heads, head_size_v]
+            kv_cache: shape =
+                [num_blocks, block_size, num_kv_heads, head_size + head_size_v]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [num_tokens, num_heads * head_size_v]
+        NOTE: FP8 quantization, flash-attn expect the size of
+              {q,k,v}_descale to be (num_sequences, num_kv_heads).
+              We use torch's .expand() to avoid duplicating values
+        """
+        assert output is not None, "Output tensor must be provided."
+        assert self.vllm_flash_attn_version is not None, (
+            "FlashAttention version not detected."
+        )
+
+        if output_scale is not None or output_block_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported for FlashAttentionImpl"
+            )
+
+        if attn_metadata is None:
+            # Profiling run.
+            return output.fill_(0)
+
+        attn_type = self.attn_type
+
+        # IMPORTANT!
+        # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
+        # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
+        # in this method. For example, `view` and `slice` (or `[:n]`) operations
+        # are surprisingly slow even in the case they do not invoke any GPU ops.
+        # Minimize the PyTorch ops in this method as much as possible.
+        # Whenever making a change in this method, please benchmark the
+        # performance to make sure it does not introduce any overhead.
+
+        num_actual_tokens = attn_metadata.num_actual_tokens
+
+        # Handle encoder attention differently - no KV cache needed
+        if attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            # For encoder attention,
+            # we use direct Q, K, V tensors without caching
+            return self._forward_encoder_attention(
+                query[:num_actual_tokens],
+                key[:num_actual_tokens],
+                value[:num_actual_tokens],
+                output[:num_actual_tokens],
+                attn_metadata,
+                layer,
+            )
+
+        # For decoder and cross-attention, use KV cache as before
+        # Different head_size for K and V
+        key_cache = kv_cache[..., : self.head_size]
+        value_cache = kv_cache[..., self.head_size :]
+
+        # key and value may be None in the case of cross attention. They are
+        # calculated once based on the output from the encoder and then cached
+        # in KV cache.
+        if (
+            self.kv_sharing_target_layer_name is None
+            and key is not None
+            and value is not None
+        ):
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
+            # not padded. However, we don't need to do key[:num_actual_tokens]
+            # and value[:num_actual_tokens] because the reshape_and_cache_flash
+            # op uses the slot_mapping's shape to determine the number of
+            # actual tokens.
+
+            # kv_cache update for different head_size K and V
+            triton_reshape_and_cache_flash_diffkv(
+                key,
+                value,
+                kv_cache,
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
+
+        if self.kv_cache_dtype.startswith("fp8"):
+            # queries are quantized in the attention layer
+            dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn(
+                self.kv_cache_dtype
+            )
+            key_cache = key_cache.view(dtype)
+            value_cache = value_cache.view(dtype)
+
+        if not attn_metadata.use_cascade:
+            cu_seqlens_q = attn_metadata.query_start_loc
+            seqused_k = attn_metadata.seq_lens
+            max_seqlen_q = attn_metadata.max_query_len
+            max_seqlen_k = attn_metadata.max_seq_len
+            block_table = attn_metadata.block_table
+            scheduler_metadata = attn_metadata.scheduler_metadata
+
+            descale_shape = (cu_seqlens_q.shape[0] - 1, self.num_kv_heads)
+
+            if self.dcp_world_size > 1:
+                self._forward_with_dcp(
+                    query[:num_actual_tokens],
+                    key[:num_actual_tokens],
+                    value[:num_actual_tokens],
+                    key_cache,
+                    value_cache,
+                    output[:num_actual_tokens],
+                    attn_metadata,
+                    q_descale=layer._q_scale.expand(descale_shape),
+                    k_descale=layer._k_scale.expand(descale_shape),
+                    v_descale=layer._v_scale.expand(descale_shape),
+                )
+                return output
+            else:
+                sliding_window_size = (
+                    list(self.sliding_window)
+                    if self.sliding_window is not None
+                    else None
+                )
+                flash_attn_varlen_func(
+                    q=query[:num_actual_tokens],
+                    k=key_cache,
+                    v=value_cache,
+                    out=output[:num_actual_tokens],
+                    cu_seqlens_q=cu_seqlens_q,
+                    max_seqlen_q=max_seqlen_q,
+                    seqused_k=seqused_k,
+                    max_seqlen_k=max_seqlen_k,
+                    softmax_scale=self.scale,
+                    causal=attn_metadata.causal,
+                    alibi_slopes=self.alibi_slopes,
+                    window_size=sliding_window_size,
+                    block_table=block_table,
+                    softcap=self.logits_soft_cap,
+                    scheduler_metadata=scheduler_metadata,
+                    fa_version=self.vllm_flash_attn_version,
+                    q_descale=layer._q_scale.expand(descale_shape),
+                    k_descale=layer._k_scale.expand(descale_shape),
+                    v_descale=layer._v_scale.expand(descale_shape),
+                    num_splits=attn_metadata.max_num_splits,
+                    s_aux=self.sinks,
+                )
+                return output
+
+        # Cascade attention (rare case).
+        cascade_attention(
+            output[:num_actual_tokens],
+            query[:num_actual_tokens],
+            key_cache,
+            value_cache,
+            cu_query_lens=attn_metadata.query_start_loc,
+            max_query_len=attn_metadata.max_query_len,
+            cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens,
+            prefix_kv_lens=attn_metadata.prefix_kv_lens,
+            suffix_kv_lens=attn_metadata.suffix_kv_lens,
+            max_kv_len=attn_metadata.max_seq_len,
+            softmax_scale=self.scale,
+            alibi_slopes=self.alibi_slopes,
+            sliding_window=self.sliding_window,
+            logits_soft_cap=self.logits_soft_cap,
+            block_table=attn_metadata.block_table,
+            common_prefix_len=attn_metadata.common_prefix_len,
+            max_num_splits=attn_metadata.max_num_splits,
+            fa_version=self.vllm_flash_attn_version,
+            prefix_scheduler_metadata=attn_metadata.prefix_scheduler_metadata,
+            suffix_scheduler_metadata=attn_metadata.scheduler_metadata,
+            q_descale=layer._q_scale,
+            k_descale=layer._k_scale,
+            v_descale=layer._v_scale,
+            s_aux=self.sinks,
+        )
+        return output
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 2740a6916fd975bd4bd5f25760b2e309053e8ca4..9892c360d3d62e30c04db96f8292eace1e369b75 100644
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -16,16 +16,9 @@ from flashinfer import (
 from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache
 from flashinfer.prefill import trtllm_batch_context_with_kv_cache
 from flashinfer.utils import FP4Tensor
+from typing_extensions import override
 
 from vllm import envs
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    MultipleOf,
-)
-from vllm.attention.ops.common import cp_lse_ag_out_rs
-from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.config import CUDAGraphMode, VllmConfig, get_current_vllm_config
 from vllm.config.cache import CacheDType
 from vllm.distributed.parallel_state import get_dcp_group
@@ -47,10 +40,17 @@ from vllm.utils.flashinfer import (
 )
 from vllm.utils.math_utils import cdiv
 from vllm.utils.platform_utils import is_pin_memory_available
-from vllm.v1.attention.backends.utils import (
+from vllm.utils.torch_utils import is_strictly_contiguous
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
+    AttentionImpl,
     AttentionMetadataBuilder,
+    AttentionType,
     CommonAttentionMetadata,
+    MultipleOf,
+)
+from vllm.v1.attention.backends.utils import (
     KVCacheLayoutType,
     get_dcp_local_seq_lens,
     get_kv_cache_layout,
@@ -58,7 +58,10 @@ from vllm.v1.attention.backends.utils import (
     infer_global_hyperparameters,
     split_decodes_and_prefills,
 )
+from vllm.v1.attention.ops.common import cp_lse_ag_out_rs
+from vllm.v1.attention.ops.merge_attn_states import merge_attn_states
 from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.utils import CpuGpuBuffer
 
 FLASHINFER_WORKSPACE_BUFFER_SIZE_BATCH_INVARIANT = 2048 * 1024 * 1024
 
@@ -181,7 +184,6 @@ class BatchDCPPrefillWrapper:
         paged_kv_indptr_cpu: torch.Tensor,
         paged_kv_indices: torch.Tensor,
         paged_kv_last_page_len_cpu: torch.Tensor,
-        prefill_start: int,
         page_size: int,
         num_qo_heads: int,
         dcp_world_size: int,
@@ -200,7 +202,7 @@ class BatchDCPPrefillWrapper:
             qo_indptr_cpu,
             paged_kv_indptr_cpu,
             paged_kv_indices,
-            paged_kv_last_page_len_cpu[prefill_start:],
+            paged_kv_last_page_len_cpu,
             num_qo_heads * dcp_world_size,
             num_kv_heads,
             head_dim,
@@ -380,40 +382,103 @@ class FlashInferBackend(AttentionBackend):
 
 
 @dataclass
-class FlashInferMetadata:
-    num_actual_tokens: int  # Number of tokens excluding padding.
+class FIPrefill:
+    """Metadata for the native FlashInfer prefill pathway (non-TRTLLM)."""
 
-    # The data type of the query
-    q_data_type: torch.dtype
+    wrapper: BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper
 
-    slot_mapping: torch.Tensor
 
-    # For flashinfer trtllm batch decode
+@dataclass
+class FIDecode:
+    """Metadata for the native FlashInfer decode pathway (non-TRTLLM)."""
+
+    wrapper: BatchDecodeWithPagedKVCacheWrapper
+
+
+@dataclass
+class TRTLLMPrefill:
+    """Metadata for the TRTLLM prefill pathway."""
+
+    block_tables: torch.Tensor
+    """
+    The slice of the block table tensor corresponding *only* to prefill requests.
+    Shape: [num_prefills, max_num_blocks_per_seq]
+    """
+
+    seq_lens: torch.Tensor
+    """
+    The slice of the sequence lengths tensor corresponding *only* to prefill requests.
+    Shape: [num_prefills]
+    """
+
+    cum_seq_lens_q: torch.Tensor
+    cum_seq_lens_kv: torch.Tensor
+
     max_q_len: int
-    max_q_len_prefill: int
+    """
+    The maximum query length *among prefill requests*. 
+    """
+
     max_seq_len: int
+    """The maximum sequence length for KV Cache."""
+
+
+@dataclass
+class TRTLLMDecode:
+    """Metadata for the TRTLLM decode pathway."""
+
+    block_tables: torch.Tensor
+    """
+    The slice of the block table tensor corresponding *only* to decode requests.
+    Shape: [num_decodes, max_num_blocks_per_seq]
+    """
+
     seq_lens: torch.Tensor
-    block_table_tensor: torch.Tensor
-    prefill_use_trtllm: bool
-    decode_use_trtllm: bool
+    """
+    The slice of the sequence lengths tensor corresponding *only* to decode requests.
+    Shape: [num_decodes]
+    """
+
+    max_seq_len: int
+    """The maximum sequence length for KV Cache."""
+
+
+@dataclass
+class FlashInferMetadata:
+    num_actual_tokens: int
+    """Total number of tokens in the batch (excluding padding)."""
+
+    slot_mapping: torch.Tensor
+    """Tensor for writing K/V to the cache. Shape: [num_actual_tokens]"""
+
+    q_data_type: torch.dtype
 
-    # For handling prefill decode split
     num_decodes: int
     num_decode_tokens: int
     num_prefills: int
     num_prefill_tokens: int
 
-    # For cascade attention (CPU for planning).
-    use_cascade: bool
+    prefill: FIPrefill | TRTLLMPrefill | None
+    """
+    Holds the metadata for the prefill portion of the batch.
+    Will be `None` if `num_prefill_tokens == 0`.
+    """
+
+    decode: FIDecode | TRTLLMDecode | None
+    """
+    Holds the metadata for the decode portion of the batch.
+    Will be `None` if `num_decode_tokens == 0`.
+    """
 
-    prefill_wrapper: (
-        BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
-    ) = None
-    decode_wrapper: BatchDecodeWithPagedKVCacheWrapper | None = None
-    cascade_wrapper: MultiLevelCascadeAttentionWrapper | None = None
+    # --- Special Case: Cascade Attention ---
+
+    use_cascade: bool
+    """
+    If True, the entire batch is a cascade attention call, and the
+    `prefill` and `decode` fields will both be None.
+    """
 
-    qo_indptr_gpu: torch.Tensor | None = None
-    paged_kv_indptr_gpu: torch.Tensor | None = None
+    cascade_wrapper: MultiLevelCascadeAttentionWrapper | None
 
 
 class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
@@ -466,11 +531,12 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self._decode_wrappers_cudagraph: dict[
                 int, BatchDecodeWithPagedKVCacheWrapper
             ] = {}
-            self._decode_cudagraph_max_bs = min(
-                (1 + num_spec_tokens) * max_num_reqs,
-                self.compilation_config.max_cudagraph_capture_size,
-            )
-
+            self._decode_cudagraph_max_bs = (1 + num_spec_tokens) * max_num_reqs
+            if self.compilation_config.max_cudagraph_capture_size is not None:
+                self._decode_cudagraph_max_bs = min(
+                    self._decode_cudagraph_max_bs,
+                    self.compilation_config.max_cudagraph_capture_size,
+                )
         try:
             self.dcp_world_size = get_dcp_group().world_size
             self.dcp_rank = get_dcp_group().rank_in_group
@@ -482,6 +548,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.dcp_world_size = 1
             self.dcp_rank = 0
             self.dcp_kv_cache_interleave_size = 1
+        self.use_dcp = self.dcp_world_size > 1
 
         self.num_qo_heads = self.model_config.get_num_attention_heads(
             self.vllm_config.parallel_config
@@ -535,34 +602,14 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 "sinks, please use trtllm on blackwell or flash attention on "
                 "earlier GPUs."
             )
-        # Preparing persistent buffers (device-side)
-        self.paged_kv_indptr = torch.zeros(
-            max_num_reqs + 1, dtype=torch.int32, device=self.device
-        )
-        self.paged_kv_indices = torch.zeros(
-            max_num_pages,  # max num pages possible
-            dtype=torch.int32,
-            device=self.device,
-        )
-        self.paged_kv_last_page_len = torch.zeros(
-            max_num_reqs, dtype=torch.int32, device=self.device
-        )
-        # host-side buffer
-        pin_memory = is_pin_memory_available()
-        self.paged_kv_indptr_cpu = torch.zeros(
-            max_num_reqs + 1, dtype=torch.int32, device="cpu", pin_memory=pin_memory
-        )
-        self.paged_kv_indptr_np = self.paged_kv_indptr_cpu.numpy()
-        self.paged_kv_indptr_buffer = torch.zeros_like(
-            self.paged_kv_indptr_cpu, pin_memory=pin_memory
-        )
-        self.paged_kv_indices_cpu = torch.zeros(
-            max_num_pages, dtype=torch.int32, device="cpu", pin_memory=pin_memory
-        )
-        self.paged_kv_last_page_len_cpu = torch.zeros(
-            max_num_reqs, dtype=torch.int32, device="cpu", pin_memory=pin_memory
-        )
-        self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy()
+        # Preparing persistent buffers
+        self.pin_memory = is_pin_memory_available()
+        self.paged_kv_indptr = self._make_buffer(max_num_reqs + 1)
+        self.paged_kv_indptr_cpu_buffer = torch.zeros_like(
+            self.paged_kv_indptr.cpu, pin_memory=self.pin_memory
+        )  # Extra buffer for mutable paged_kv_indptr.cpu in cuda graph mode
+        self.paged_kv_indices = self._make_buffer(max_num_pages)
+        self.paged_kv_last_page_len = self._make_buffer(max_num_reqs)
 
         if self.head_dim == 256 and current_platform.is_device_capability_family(100):
             # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
@@ -573,6 +620,18 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 "passing --block-size 32 or --block-size 64."
             )
 
+    def _make_buffer(
+        self, *size: int | torch.SymInt, dtype: torch.dtype = torch.int32
+    ) -> CpuGpuBuffer:
+        return CpuGpuBuffer(
+            *size,
+            dtype=dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            with_numpy=True,
+        )
+
+    @override  # type: ignore[misc]
     @classmethod
     def get_cudagraph_support(
         cls: type["FlashInferMetadataBuilder"],
@@ -607,7 +666,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         self,
     ) -> BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper:
         if self._prefill_wrapper is None:
-            if self.dcp_world_size > 1:
+            if self.use_dcp:
                 self._prefill_wrapper = BatchDCPPrefillWrapper(
                     workspace_buffer=self._get_workspace_buffer(),
                 )
@@ -626,9 +685,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 
         if decode_wrapper is None:
             if use_cudagraph:
-                paged_kv_indptr = self.paged_kv_indptr[: batch_size + 1]
-                paged_kv_indices = self.paged_kv_indices
-                paged_kv_last_page_len = self.paged_kv_last_page_len[:batch_size]
+                paged_kv_indptr = self.paged_kv_indptr.gpu[: batch_size + 1]
+                paged_kv_indices = self.paged_kv_indices.gpu
+                paged_kv_last_page_len = self.paged_kv_last_page_len.gpu[:batch_size]
             else:
                 paged_kv_indptr = None
                 paged_kv_indices = None
@@ -661,99 +720,43 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             )
         return self._cascade_wrapper
 
-    def build(
+    def _compute_flashinfer_kv_metadata(
         self,
-        common_prefix_len: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        fast_build: bool = False,
-    ) -> FlashInferMetadata:
-        num_reqs = common_attn_metadata.num_reqs
-        num_actual_tokens = common_attn_metadata.num_actual_tokens
-        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(
-                common_attn_metadata,
-                decode_threshold=self.reorder_batch_threshold,
-                require_uniform=True,
-            )
-        )
-
-        page_size = self.page_size
-        max_q_len = common_attn_metadata.max_query_len
-        max_seq_len = common_attn_metadata.max_seq_len
-        seq_lens = common_attn_metadata.seq_lens
-        seq_lens_cpu = common_attn_metadata.seq_lens_cpu
-        block_table_tensor = common_attn_metadata.block_table_tensor
-        qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu
-
-        if self.dcp_world_size > 1:
-            if num_prefills > 0:
-                qo_indptr_prefill_cpu = (
-                    qo_indptr_cpu[num_decodes:] - qo_indptr_cpu[num_decodes]
-                )
-                query_lens_prefill_cpu = (
-                    qo_indptr_prefill_cpu[1:] - qo_indptr_prefill_cpu[:-1]
-                )
-                seq_lens_cpu[num_decodes:] = (
-                    seq_lens_cpu[num_decodes:] - query_lens_prefill_cpu
-                )
-
-            seq_lens_cpu = get_dcp_local_seq_lens(
-                seq_lens_cpu,
-                self.dcp_world_size,
-                self.dcp_rank,
-                self.dcp_kv_cache_interleave_size,
-            )
-
-        seq_lens_np = seq_lens_cpu.numpy()
-        num_blocks_np = (seq_lens_np + (page_size - 1)) // page_size
-
-        use_cascade = common_prefix_len > 0
-        if use_cascade:
-            # Grab the blocks of the shared prefix from the first request.
-            assert common_prefix_len % page_size == 0
-            num_common_kv_blocks = common_prefix_len // page_size
-
-            # Create CPU versions directly for cascade (no GPU versions needed)
-            shared_qo_indptr_cpu = torch.tensor(
-                [0, num_actual_tokens], dtype=torch.int32, device="cpu"
-            )
-            shared_kv_page_indptr_cpu = torch.tensor(
-                [0, num_common_kv_blocks], dtype=torch.int32, device="cpu"
-            )
-            shared_kv_page_indices_cpu = block_table_tensor[0, :num_common_kv_blocks]
-            shared_kv_last_page_len_cpu = torch.tensor(
-                [page_size], dtype=torch.int32, device="cpu"
-            )
+        num_blocks_np: np.ndarray,
+        seq_lens_np: np.ndarray,
+        block_table_tensor: torch.Tensor,
+        num_reqs: int,
+        page_size: int,
+    ) -> torch.Tensor:
+        """
+        Compute paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len for FlashInfer
+        attention.
 
-            # Remove the blocks of the shared prefix from all requests.
-            block_table_tensor = block_table_tensor[:, num_common_kv_blocks:]
-            num_blocks_np -= num_common_kv_blocks
-        else:
-            shared_qo_indptr_cpu = None
-            shared_kv_page_indptr_cpu = None
-            shared_kv_page_indices_cpu = None
-            shared_kv_last_page_len_cpu = None
+        Results are stored in self.paged_kv_indptr,
+        self.paged_kv_indices, self.paged_kv_last_page_len buffers.
 
+        Returns paged_kv_indices, a GPU tensor with shape [num_actual_pages].
+        """
         # write self.paged_kv_indptr_cpu inplace (0-index is always 0)
         np.cumsum(
             num_blocks_np,
             dtype=np.int32,
-            out=self.paged_kv_indptr_np[1 : num_reqs + 1],
+            out=self.paged_kv_indptr.np[1 : num_reqs + 1],
         )
         # NOTE(woosuk): Because self.paged_kv_indptr_cpu can be modified
         # after this line (e.g., for cuda graphs), we need to copy the data to
         # self.paged_kv_indptr_buffer to avoid race condition.
-        self.paged_kv_indptr_buffer[: num_reqs + 1] = self.paged_kv_indptr_cpu[
+        self.paged_kv_indptr_cpu_buffer[: num_reqs + 1] = self.paged_kv_indptr.cpu[
             : num_reqs + 1
         ]
-        paged_kv_indptr = self.paged_kv_indptr[: num_reqs + 1]
+        paged_kv_indptr = self.paged_kv_indptr.gpu[: num_reqs + 1]
         paged_kv_indptr.copy_(
-            self.paged_kv_indptr_buffer[: num_reqs + 1], non_blocking=True
+            self.paged_kv_indptr_cpu_buffer[: num_reqs + 1], non_blocking=True
         )
 
         # write self.paged_kv_indices inplace
-        num_actual_pages = self.paged_kv_indptr_np[num_reqs]
-        paged_kv_indices = self.paged_kv_indices[:num_actual_pages]
+        num_actual_pages = self.paged_kv_indptr.np[num_reqs]
+        paged_kv_indices = self.paged_kv_indices.gpu[:num_actual_pages]
         _copy_page_indices_kernel[(num_reqs,)](
             paged_kv_indices,
             block_table_tensor,
@@ -764,12 +767,41 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 
         # write self.paged_kv_last_page_len_cpu inplace
         paged_kv_last_page_len_np = seq_lens_np % page_size
-        self.paged_kv_last_page_len_np[:num_reqs] = np.where(
+        self.paged_kv_last_page_len.np[:num_reqs] = np.where(
             (paged_kv_last_page_len_np == 0) & (seq_lens_np != 0),
             page_size,
             paged_kv_last_page_len_np,
         )
+        return paged_kv_indices
+
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> FlashInferMetadata:
+        num_reqs = common_attn_metadata.num_reqs
+        num_actual_tokens = common_attn_metadata.num_actual_tokens
+        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
+            split_decodes_and_prefills(
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold,
+                require_uniform=True,
+            )
+        )
 
+        page_size = self.page_size
+        max_seq_len = common_attn_metadata.max_seq_len
+        seq_lens = common_attn_metadata.seq_lens
+        block_table_tensor = common_attn_metadata.block_table_tensor
+        qo_indptr = common_attn_metadata.query_start_loc
+        qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu
+
+        # Step 1: Decide which dispatch modes to use:
+        # - Cascade attention (distinct mode)
+        # - Prefill (FI native or TRTLLM)
+        # - Decode (FI native or TRTLLM)
+        use_cascade = common_prefix_len > 0
         uses_spec_reorder = self.reorder_batch_threshold > 1
         prefill_use_trtllm = use_trtllm_attention(
             self.num_qo_heads,
@@ -788,7 +820,14 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.use_trtllm_decode_attention and self.dcp_world_size <= 1
         )
 
-        if not (prefill_use_trtllm and decode_use_trtllm):
+        all_uses_trtllm = (num_prefills == 0 or prefill_use_trtllm) and (
+            num_decodes == 0 or decode_use_trtllm
+        )
+        is_only_trtllm_decode = num_prefills == 0 and (
+            num_decodes > 0 and decode_use_trtllm
+        )
+
+        if not all_uses_trtllm:
             if self.has_sinks:
                 raise NotImplementedError(
                     "FlashInfer backend currently does not support attention "
@@ -813,28 +852,104 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             # fall back to model dtype.
             self.q_data_type = self.model_config.dtype
 
+        # Step 2: Initialize the output metadata
+        # Leave prefill/decode/cascade_wrapper empty, to be populated
+        # case by case depending on the batch contents and backend selection.
         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
-            q_data_type=self.q_data_type,
             slot_mapping=common_attn_metadata.slot_mapping,
-            max_q_len=max_q_len,
-            max_q_len_prefill=max_q_len,
-            max_seq_len=max_seq_len,
-            seq_lens=seq_lens,
-            block_table_tensor=block_table_tensor,
-            prefill_use_trtllm=prefill_use_trtllm,
-            decode_use_trtllm=decode_use_trtllm,
+            q_data_type=self.q_data_type,
             num_decodes=num_decodes,
             num_decode_tokens=num_decode_tokens,
             num_prefills=num_prefills,
             num_prefill_tokens=num_prefill_tokens,
             use_cascade=use_cascade,
+            prefill=None,
+            decode=None,
+            cascade_wrapper=None,
         )
 
-        paged_kv_indptr_cpu = self.paged_kv_indptr_cpu[: 1 + num_reqs]
-        paged_kv_last_page_len_cpu = self.paged_kv_last_page_len_cpu[:num_reqs]
+        # Guard access to seq_lens_cpu, which may not always be needed
+        # and can be expensive to retrieve in async mode.
+        needs_seq_lens_cpu = self.use_dcp or use_cascade or not is_only_trtllm_decode
+        seq_lens_cpu = (
+            common_attn_metadata.seq_lens.cpu() if needs_seq_lens_cpu else None
+        )
+        seq_lens_np = seq_lens_cpu.numpy() if seq_lens_cpu is not None else None
+        num_blocks_np = (
+            (seq_lens_np + (page_size - 1)) // page_size
+            if seq_lens_np is not None
+            else None
+        )
+
+        # Adjust seq_lens_cpu for DCP
+        if self.use_dcp:
+            assert seq_lens_cpu is not None
+            if num_prefills > 0:
+                qo_indptr_prefill_cpu = (
+                    qo_indptr_cpu[num_decodes:] - qo_indptr_cpu[num_decodes]
+                )
+                query_lens_prefill_cpu = (
+                    qo_indptr_prefill_cpu[1:] - qo_indptr_prefill_cpu[:-1]
+                )
+                seq_lens_cpu[num_decodes:] = (
+                    seq_lens_cpu[num_decodes:] - query_lens_prefill_cpu
+                )
+
+            seq_lens_cpu = get_dcp_local_seq_lens(
+                seq_lens_cpu,
+                self.dcp_world_size,
+                self.dcp_rank,
+                self.dcp_kv_cache_interleave_size,
+            )
+
+        # Adjust num_block_np for cascade attention
+        if use_cascade:
+            assert num_blocks_np is not None
+            assert common_prefix_len % page_size == 0
+            num_common_kv_blocks = common_prefix_len // page_size
+            num_blocks_np -= num_common_kv_blocks
+
+        # Compute paged_kv_indices if necessary
+        needs_paged_kv_indices = use_cascade or not is_only_trtllm_decode
+        if needs_paged_kv_indices:
+            assert num_blocks_np is not None
+            assert seq_lens_np is not None
+            paged_kv_indices = self._compute_flashinfer_kv_metadata(
+                num_blocks_np,
+                seq_lens_np,
+                block_table_tensor,
+                num_reqs,
+                page_size,
+            )
+        else:
+            paged_kv_indices = None
+
+        # Early-out for cascade attention
+        if use_cascade:
+            # Grab the blocks of the shared prefix from the first request.
+            num_common_kv_blocks = common_prefix_len // page_size
+
+            # Create CPU versions directly for cascade (no GPU versions needed)
+            shared_qo_indptr_cpu = torch.tensor(
+                [0, num_actual_tokens], dtype=torch.int32, device="cpu"
+            )
+            shared_kv_page_indptr_cpu = torch.tensor(
+                [0, num_common_kv_blocks], dtype=torch.int32, device="cpu"
+            )
+            shared_kv_page_indices_cpu = block_table_tensor[0, :num_common_kv_blocks]
+            shared_kv_last_page_len_cpu = torch.tensor(
+                [page_size], dtype=torch.int32, device="cpu"
+            )
+
+            # Remove the blocks of the shared prefix from all requests.
+            block_table_tensor = block_table_tensor[:, num_common_kv_blocks:]
+            num_blocks_np -= num_common_kv_blocks
+
+            assert paged_kv_indices is not None
+            paged_kv_indptr_cpu = self.paged_kv_indptr.cpu[: 1 + num_reqs]
+            paged_kv_last_page_len_cpu = self.paged_kv_last_page_len.cpu[:num_reqs]
 
-        if attn_metadata.use_cascade:
             attn_metadata.cascade_wrapper = self._get_cascade_wrapper()
             attn_metadata.cascade_wrapper.plan(
                 [shared_qo_indptr_cpu, qo_indptr_cpu],
@@ -852,91 +967,107 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 q_data_type=self.q_data_type,
                 kv_data_type=self.kv_cache_dtype,
             )
-        else:
-            # Regular attention (common case).
-            # Decodes are at the front and prefills are at the back.
-            num_prefills = attn_metadata.num_prefills
-            num_decodes = attn_metadata.num_decodes
-            if num_prefills > 0:
-                # Decodes are first so prefills start after the last decode
-                prefill_start = num_decodes
-                attn_metadata.prefill_wrapper = self._get_prefill_wrapper()
-                assert qo_indptr_cpu[prefill_start:].shape[0] == num_prefills + 1
-                assert paged_kv_indptr_cpu[prefill_start:].shape[0] == num_prefills + 1
-                assert (
-                    paged_kv_last_page_len_cpu[prefill_start:].shape[0] == num_prefills
+            return attn_metadata
+
+        # Step 3: Handle prefill and decode pathways case by case
+        ## PREFILL PATHWAY
+        if num_prefills > 0:
+            # Slices for shared prefill metadata
+            prefill_start = num_decodes
+            qo_indptr_prefill_cpu = (
+                qo_indptr_cpu[prefill_start:] - qo_indptr_cpu[prefill_start]
+            )
+            assert qo_indptr_prefill_cpu.shape[0] == num_prefills + 1
+
+            if prefill_use_trtllm:
+                # Create GPU versions
+                qo_indptr_prefill_gpu = (
+                    qo_indptr[prefill_start:] - qo_indptr[prefill_start]
+                )
+                paged_kv_indptr_prefill_gpu = self.paged_kv_indptr.gpu[
+                    prefill_start : num_reqs + 1
+                ]
+                # Compute max_q_len for prefill requests
+                query_lens_prefill_cpu = (
+                    qo_indptr_prefill_cpu[1:] - qo_indptr_prefill_cpu[:-1]
                 )
-                # Since prefill_wrapper.run() will be called with
-                # query[num_decode_tokens:] we need to adjust the qo_indptr
-                # to be relative to the start of the prefill queries.
-                qo_indptr_cpu = (
-                    qo_indptr_cpu[prefill_start:] - qo_indptr_cpu[prefill_start]
+                max_q_len_prefill = int(query_lens_prefill_cpu.max().item())
+                attn_metadata.prefill = TRTLLMPrefill(
+                    block_tables=block_table_tensor[prefill_start:],
+                    seq_lens=seq_lens[prefill_start:],
+                    cum_seq_lens_q=qo_indptr_prefill_gpu,
+                    cum_seq_lens_kv=paged_kv_indptr_prefill_gpu,
+                    max_q_len=max_q_len_prefill,
+                    max_seq_len=max_seq_len,
                 )
-                paged_kv_indptr_cpu = paged_kv_indptr_cpu[prefill_start:]
-
-                # Recompute max_q_len for the slice of requests we are using
-                # for prefills. This can be different from max_q_len when
-                # we have a non-uniform batch with some short decodes offloaded
-                # to the prefill pathway
-                query_lens_prefill = qo_indptr_cpu[1:] - qo_indptr_cpu[:-1]
-                attn_metadata.max_q_len_prefill = int(query_lens_prefill.max().item())
-
-                if not attn_metadata.prefill_use_trtllm:
-                    if self.dcp_world_size > 1:
-                        assert isinstance(
-                            attn_metadata.prefill_wrapper, BatchDCPPrefillWrapper
-                        )
-                        attn_metadata.prefill_wrapper.plan(
-                            qo_indptr_cpu=qo_indptr_cpu,
-                            paged_kv_indptr_cpu=paged_kv_indptr_cpu,
-                            paged_kv_indices=paged_kv_indices,
-                            paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu,
-                            prefill_start=prefill_start,
-                            page_size=self.page_size,
-                            num_qo_heads=self.num_qo_heads,
-                            dcp_world_size=self.dcp_world_size,
-                            num_kv_heads=self.num_kv_heads,
-                            head_dim=self.head_dim,
-                            sm_scale=self.sm_scale,
-                            window_left=self.window_left,
-                            logits_soft_cap=self.logits_soft_cap,
-                            q_data_type=self.q_data_type,
-                            kv_cache_dtype=self.kv_cache_dtype,
-                            prefill_fixed_split_size=self.prefill_fixed_split_size,
-                            disable_split_kv=self.disable_split_kv,
-                        )
-                    else:
-                        assert isinstance(
-                            attn_metadata.prefill_wrapper,
-                            BatchPrefillWithPagedKVCacheWrapper,
-                        )
-                        attn_metadata.prefill_wrapper.plan(
-                            qo_indptr_cpu,
-                            paged_kv_indptr_cpu,
-                            paged_kv_indices,
-                            paged_kv_last_page_len_cpu[prefill_start:],
-                            self.num_qo_heads,
-                            self.num_kv_heads,
-                            self.head_dim,
-                            self.page_size,
-                            causal=True,
-                            sm_scale=self.sm_scale,
-                            window_left=self.window_left,
-                            logits_soft_cap=self.logits_soft_cap,
-                            q_data_type=self.q_data_type,
-                            kv_data_type=self.kv_cache_dtype,
-                            fixed_split_size=self.prefill_fixed_split_size,
-                            disable_split_kv=self.disable_split_kv,
-                        )
+            else:
+                prefill_wrapper = self._get_prefill_wrapper()
+                # Slicing CPU buffers that are only needed for FI native prefills
+                paged_kv_last_page_len_prefill_cpu = self.paged_kv_last_page_len.cpu[
+                    prefill_start:num_reqs
+                ]
+                assert paged_kv_last_page_len_prefill_cpu.shape[0] == num_prefills
+                paged_kv_indptr_prefill_cpu = self.paged_kv_indptr.cpu[
+                    prefill_start : num_reqs + 1
+                ]
+                assert paged_kv_indptr_prefill_cpu.shape[0] == num_prefills + 1
+                if self.use_dcp:
+                    assert isinstance(prefill_wrapper, BatchDCPPrefillWrapper)
+                    prefill_wrapper.plan(
+                        qo_indptr_cpu=qo_indptr_prefill_cpu,
+                        paged_kv_indptr_cpu=paged_kv_indptr_prefill_cpu,
+                        paged_kv_indices=paged_kv_indices,
+                        paged_kv_last_page_len_cpu=paged_kv_last_page_len_prefill_cpu,
+                        page_size=self.page_size,
+                        num_qo_heads=self.num_qo_heads,
+                        dcp_world_size=self.dcp_world_size,
+                        num_kv_heads=self.num_kv_heads,
+                        head_dim=self.head_dim,
+                        sm_scale=self.sm_scale,
+                        window_left=self.window_left,
+                        logits_soft_cap=self.logits_soft_cap,
+                        q_data_type=self.q_data_type,
+                        kv_cache_dtype=self.kv_cache_dtype,
+                        prefill_fixed_split_size=self.prefill_fixed_split_size,
+                        disable_split_kv=self.disable_split_kv,
+                    )
                 else:
-                    attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(
-                        self.device, non_blocking=True
+                    assert isinstance(
+                        prefill_wrapper,
+                        BatchPrefillWithPagedKVCacheWrapper,
                     )
-                    attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to(
-                        self.device, non_blocking=True
+                    prefill_wrapper.plan(
+                        qo_indptr_prefill_cpu,
+                        paged_kv_indptr_prefill_cpu,
+                        paged_kv_indices,
+                        paged_kv_last_page_len_prefill_cpu,
+                        self.num_qo_heads,
+                        self.num_kv_heads,
+                        self.head_dim,
+                        self.page_size,
+                        causal=True,
+                        sm_scale=self.sm_scale,
+                        window_left=self.window_left,
+                        logits_soft_cap=self.logits_soft_cap,
+                        q_data_type=self.q_data_type,
+                        kv_data_type=self.kv_cache_dtype,
+                        fixed_split_size=self.prefill_fixed_split_size,
+                        disable_split_kv=self.disable_split_kv,
                     )
+                attn_metadata.prefill = FIPrefill(wrapper=prefill_wrapper)
 
-            if num_decodes > 0:
+        ## DECODE PATHWAY
+        if num_decodes > 0:
+            if decode_use_trtllm:
+                assert num_decode_tokens % num_decodes == 0, (
+                    "TRTLLM decode requires uniform query lengths per request."
+                )
+                attn_metadata.decode = TRTLLMDecode(
+                    block_tables=block_table_tensor[:num_decodes],
+                    seq_lens=seq_lens[:num_decodes],
+                    max_seq_len=max_seq_len,
+                )
+            else:
                 pure_decode = num_prefills == 0
                 use_cudagraph = (
                     self.enable_cuda_graph
@@ -945,33 +1076,33 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 )
                 num_input_tokens = num_decode_tokens
 
-                attn_metadata.decode_wrapper = self._get_decode_wrapper(
+                decode_wrapper = self._get_decode_wrapper(
                     num_input_tokens, use_cudagraph
                 )
-                if not attn_metadata.decode_use_trtllm:
-                    # Use the persistent buffer with padding length,
-                    # instead of the same address but chunked version
-                    # in atten_metadata when using cudagraph.
-                    fast_plan_decode(
-                        attn_metadata.decode_wrapper,
-                        self.paged_kv_indptr_cpu[: num_input_tokens + 1],
-                        paged_kv_indices,
-                        self.paged_kv_last_page_len_cpu[:num_input_tokens],
-                        seq_lens_cpu[:num_input_tokens],
-                        self.num_qo_heads * self.dcp_world_size,
-                        self.num_kv_heads,
-                        self.head_dim,
-                        self.page_size,
-                        # Disable flashinfer's pos encoding and use vllm's rope.
-                        pos_encoding_mode="NONE",
-                        sm_scale=self.sm_scale,
-                        window_left=self.window_left,
-                        logits_soft_cap=self.logits_soft_cap,
-                        q_data_type=self.q_data_type,
-                        kv_data_type=self.kv_cache_dtype,
-                        fixed_split_size=self.decode_fixed_split_size,
-                        disable_split_kv=self.disable_split_kv,
-                    )
+                # Use the persistent buffer with padding length,
+                # instead of the same address but chunked version
+                # in atten_metadata when using cudagraph.
+                fast_plan_decode(
+                    decode_wrapper,
+                    self.paged_kv_indptr.cpu[: num_input_tokens + 1],
+                    paged_kv_indices,
+                    self.paged_kv_last_page_len.cpu[:num_input_tokens],
+                    seq_lens_cpu[:num_input_tokens],
+                    self.num_qo_heads * self.dcp_world_size,
+                    self.num_kv_heads,
+                    self.head_dim,
+                    self.page_size,
+                    # Disable flashinfer's pos encoding and use vllm's rope.
+                    pos_encoding_mode="NONE",
+                    sm_scale=self.sm_scale,
+                    window_left=self.window_left,
+                    logits_soft_cap=self.logits_soft_cap,
+                    q_data_type=self.q_data_type,
+                    kv_data_type=self.kv_cache_dtype,
+                    fixed_split_size=self.decode_fixed_split_size,
+                    disable_split_kv=self.disable_split_kv,
+                )
+                attn_metadata.decode = FIDecode(wrapper=decode_wrapper)
         return attn_metadata
 
     def use_cascade_attention(self, *args, **kwargs) -> bool:
@@ -1104,6 +1235,9 @@ class FlashInferImpl(AttentionImpl):
         if self.bmm2_scale is None:
             self.bmm2_scale = layer._v_scale_float
 
+        prefill_use_trtllm = isinstance(attn_metadata.prefill, TRTLLMPrefill)
+        decode_use_trtllm = isinstance(attn_metadata.decode, TRTLLMDecode)
+
         # The attn+quant fusion happens when output_scale is provided.
         if output_scale is None:
             assert output_block_scale is None, (
@@ -1113,8 +1247,8 @@ class FlashInferImpl(AttentionImpl):
             assert attn_metadata.q_data_type == FP8_DTYPE, (
                 "Query must be FP8 when attn+quant fusion happened."
             )
-            assert (
-                attn_metadata.prefill_use_trtllm and attn_metadata.decode_use_trtllm
+            assert (attn_metadata.num_prefills == 0 or prefill_use_trtllm) and (
+                attn_metadata.num_decodes == 0 or decode_use_trtllm
             ), "Must use TRT-LLM attn"
 
             if output.dtype == FP8_DTYPE:
@@ -1191,22 +1325,25 @@ class FlashInferImpl(AttentionImpl):
 
         # When using spec decoding, num_decodes can be < num_decode_tokens
         # because some decode requests may have more than one query token.
-        num_decodes = attn_metadata.num_decodes
         num_decode_tokens = attn_metadata.num_decode_tokens
         num_prefill_tokens = attn_metadata.num_prefill_tokens
 
         stride_order = FlashInferBackend.get_kv_cache_stride_order()
         kv_cache_permute = kv_cache.permute(*stride_order)
+
+        use_dcp = self.dcp_world_size > 1
+
         # Regular attention (common case).
         # Decodes are at the front and prefills are at the back.
         if num_prefill_tokens > 0:
-            prefill_wrapper = attn_metadata.prefill_wrapper
             prefill_query = query[num_decode_tokens:]
             assert prefill_query.shape[0] == num_prefill_tokens
-            assert prefill_wrapper is not None
 
-            if not attn_metadata.prefill_use_trtllm:
-                if self.dcp_world_size > 1:
+            if not prefill_use_trtllm:
+                assert isinstance(attn_metadata.prefill, FIPrefill)
+                prefill_wrapper = attn_metadata.prefill.wrapper
+                assert prefill_wrapper is not None
+                if use_dcp:
                     assert isinstance(prefill_wrapper, BatchDCPPrefillWrapper)
                     assert prefill_wrapper._context._window_left == self.window_left
                     assert prefill_wrapper._context._logits_soft_cap == (
@@ -1247,19 +1384,20 @@ class FlashInferImpl(AttentionImpl):
                         out=output[num_decode_tokens:],
                     )
             else:
+                assert isinstance(attn_metadata.prefill, TRTLLMPrefill)
                 # prefill_query may be non-contiguous
                 prefill_query = prefill_query.contiguous()
                 workspace_buffer = _get_trtllm_gen_workspace_buffer()
-                block_tables_prefill = attn_metadata.block_table_tensor[num_decodes:]
-                seq_lens_prefill = attn_metadata.seq_lens[num_decodes:]
+                block_tables_prefill = attn_metadata.prefill.block_tables
+                seq_lens_prefill = attn_metadata.prefill.seq_lens
 
                 # This path needs to be enabled with VLLM_KV_CACHE_LAYOUT = HND
                 assert get_kv_cache_layout() == "HND"
-                assert prefill_query.is_contiguous()
-                assert kv_cache_permute.is_contiguous()
-                assert workspace_buffer.is_contiguous()
-                assert block_tables_prefill.is_contiguous()
-                assert seq_lens_prefill.is_contiguous()
+                assert is_strictly_contiguous(prefill_query)
+                assert is_strictly_contiguous(kv_cache_permute)
+                assert is_strictly_contiguous(workspace_buffer)
+                assert is_strictly_contiguous(block_tables_prefill)
+                assert is_strictly_contiguous(seq_lens_prefill)
 
                 if output.dtype == FP4_DTYPE:
                     assert self.o_sf_scale is not None
@@ -1298,13 +1436,13 @@ class FlashInferImpl(AttentionImpl):
                     workspace_buffer=workspace_buffer,
                     block_tables=mock_block_table,
                     seq_lens=seq_lens_prefill,
-                    max_q_len=attn_metadata.max_q_len_prefill,
-                    max_kv_len=attn_metadata.max_seq_len,
+                    max_q_len=attn_metadata.prefill.max_q_len,
+                    max_kv_len=attn_metadata.prefill.max_seq_len,
                     bmm1_scale=self.bmm1_scale,
                     bmm2_scale=self.bmm2_scale,
                     batch_size=attn_metadata.num_prefills,
-                    cum_seq_lens_q=attn_metadata.qo_indptr_gpu,
-                    cum_seq_lens_kv=attn_metadata.paged_kv_indptr_gpu,
+                    cum_seq_lens_q=attn_metadata.prefill.cum_seq_lens_q,
+                    cum_seq_lens_kv=attn_metadata.prefill.cum_seq_lens_kv,
                     window_left=self.window_left,
                     sinks=self.sinks,
                     o_sf_scale=self.o_sf_scale,
@@ -1312,17 +1450,18 @@ class FlashInferImpl(AttentionImpl):
                 )
 
         if num_decode_tokens > 0:
-            decode_wrapper = attn_metadata.decode_wrapper
             decode_query = query[:num_decode_tokens]
             assert decode_query.shape[0] == num_decode_tokens
-            assert decode_wrapper is not None
 
-            if not attn_metadata.decode_use_trtllm:
+            if not decode_use_trtllm:
+                assert isinstance(attn_metadata.decode, FIDecode)
+                decode_wrapper = attn_metadata.decode.wrapper
+                assert decode_wrapper is not None
                 assert decode_wrapper._window_left == self.window_left
                 assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0)
                 assert decode_wrapper._sm_scale == self.scale
 
-                if self.dcp_world_size > 1:
+                if use_dcp:
                     decode_query = get_dcp_group().all_gather(
                         decode_query.contiguous(), dim=-2
                     )
@@ -1357,20 +1496,19 @@ class FlashInferImpl(AttentionImpl):
                     )
             else:
                 # decode_query may be non-contiguous
+                assert isinstance(attn_metadata.decode, TRTLLMDecode)
                 decode_query = decode_query.contiguous()
                 workspace_buffer = _get_trtllm_gen_workspace_buffer()
-                block_tables_decode = attn_metadata.block_table_tensor[
-                    :num_decode_tokens
-                ]
-                seq_lens_decode = attn_metadata.seq_lens[:num_decode_tokens]
+                block_tables_decode = attn_metadata.decode.block_tables
+                seq_lens_decode = attn_metadata.decode.seq_lens
 
                 # This path needs to be enabled with VLLM_KV_CACHE_LAYOUT = HND
                 assert get_kv_cache_layout() == "HND"
-                assert decode_query.is_contiguous()
-                assert kv_cache_permute.is_contiguous()
-                assert workspace_buffer.is_contiguous()
-                assert block_tables_decode.is_contiguous()
-                assert seq_lens_decode.is_contiguous()
+                assert is_strictly_contiguous(decode_query)
+                assert is_strictly_contiguous(kv_cache_permute)
+                assert is_strictly_contiguous(workspace_buffer)
+                assert is_strictly_contiguous(block_tables_decode)
+                assert is_strictly_contiguous(seq_lens_decode)
 
                 if output.dtype == FP4_DTYPE:
                     assert self.o_sf_scale is not None
@@ -1397,7 +1535,7 @@ class FlashInferImpl(AttentionImpl):
                     workspace_buffer=workspace_buffer,
                     block_tables=block_tables_decode,
                     seq_lens=seq_lens_decode,
-                    max_seq_len=attn_metadata.max_seq_len,
+                    max_seq_len=attn_metadata.decode.max_seq_len,
                     bmm1_scale=self.bmm1_scale,
                     bmm2_scale=self.bmm2_scale,
                     window_left=self.window_left,
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index 8193c05c2b1ab890a7f3dea3e7b937a7fce76517..48c8ac6a820b65661e9d73f1d5d548d9f18bf833 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -20,12 +20,6 @@ from torch.nn.attention.flex_attention import (
     or_masks,
 )
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    is_quantized_kv_cache,
-)
 from vllm.config import VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
@@ -35,9 +29,13 @@ from vllm.model_executor.layers.batch_invariant import (
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import is_torch_equal_or_newer
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
     AttentionMetadataBuilder,
+    AttentionType,
     CommonAttentionMetadata,
+    is_quantized_kv_cache,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
 
@@ -215,7 +213,7 @@ def physical_to_logical_mapping(
     )
 
     # Only process valid blocks to avoid garbage values
-    num_blocks_per_seq = cdiv(seq_lens, block_size)
+    num_blocks_per_seq: torch.Tensor = cdiv(seq_lens, block_size)
     mask = (
         torch.arange(max_num_blocks, device=device)[None, :]
         < num_blocks_per_seq[:, None]
@@ -727,9 +725,7 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
             block_table_tensor, seq_lens, block_size, num_gpu_blocks
         )
 
-        offset_tensor = common_attn_metadata.num_computed_tokens_cpu.to(
-            self.device, non_blocking=True
-        )
+        offset_tensor = common_attn_metadata.compute_num_computed_tokens()
 
         out = FlexAttentionMetadata(
             causal=common_attn_metadata.causal,
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index ace2cbb0564c8f82243aad284a6d8dbefcfd8de0..426c17689ee0be35d79f01163d176b164ca52eca 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -6,13 +6,15 @@ from dataclasses import dataclass
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.utils import (
+    PAD_SLOT_ID,
     compute_causal_conv1d_metadata,
     split_decodes_and_prefills,
 )
@@ -75,8 +77,10 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
         self.compilation_config = vllm_config.compilation_config
         self.speculative_config = vllm_config.speculative_config
         self.kv_cache_spec = kv_cache_spec
+
         if self.speculative_config:
-            self.num_spec = self.speculative_config.num_speculative_tokens
+            assert self.speculative_config.num_speculative_tokens is not None
+            self.num_spec: int = self.speculative_config.num_speculative_tokens
         else:
             self.num_spec = 0
         self.use_spec_decode = self.num_spec > 0
@@ -85,10 +89,15 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
         self.use_full_cuda_graph = (
             self.compilation_config.cudagraph_mode.has_full_cudagraphs()
         )
-        self.decode_cudagraph_max_bs = min(
-            self.vllm_config.scheduler_config.max_num_seqs * (self.num_spec + 1),
-            self.compilation_config.max_cudagraph_capture_size,
+
+        self.decode_cudagraph_max_bs = (
+            self.vllm_config.scheduler_config.max_num_seqs * (self.num_spec + 1)
         )
+        if self.compilation_config.max_cudagraph_capture_size is not None:
+            self.decode_cudagraph_max_bs = min(
+                self.decode_cudagraph_max_bs,
+                self.compilation_config.max_cudagraph_capture_size,
+            )
 
         self.spec_state_indices_tensor = torch.empty(
             (self.decode_cudagraph_max_bs, self.num_spec + 1),
@@ -142,8 +151,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
         m = common_attn_metadata
 
         query_start_loc = m.query_start_loc
-        context_lens = m.num_computed_tokens_cpu
-        context_lens_tensor = context_lens.to(query_start_loc.device)
+        context_lens_tensor = m.compute_num_computed_tokens()
         nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None
 
         if (
@@ -370,6 +378,5 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
 
         num_accepted_tokens = torch.diff(m.query_start_loc)
         num_decode_draft_tokens_cpu = (num_accepted_tokens - 1).cpu()
-        m._num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
 
         return self.build(0, m, num_accepted_tokens, num_decode_draft_tokens_cpu)
diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py
index 004baa2d09cde5b6ab00341d6b52546909975236..4ef5656916dc2e21f487361c04690e4cbdc08bc5 100644
--- a/vllm/v1/attention/backends/linear_attn.py
+++ b/vllm/v1/attention/backends/linear_attn.py
@@ -4,14 +4,14 @@ from dataclasses import dataclass
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
-    split_decodes_and_prefills,
 )
+from vllm.v1.attention.backends.utils import split_decodes_and_prefills
 from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
 
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index fcda6134016ba17d0a635a339b6d30ee67e2b4a7..9d4a37576dd440acb2645321121ea3fbea52a1df 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -3,17 +3,11 @@
 
 from dataclasses import dataclass
 
-import torch
-
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.backends.utils import PAD_SLOT_ID
-from vllm.config import VllmConfig
-from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
-from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
-    split_decodes_and_prefills,
+from vllm.v1.attention.backend import AttentionBackend
+from vllm.v1.attention.backends.mamba_attn import (
+    BaseMambaAttentionMetadata,
+    BaseMambaAttentionMetadataBuilder,
 )
-from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
 
 class Mamba1AttentionBackend(AttentionBackend):
@@ -23,137 +17,12 @@ class Mamba1AttentionBackend(AttentionBackend):
 
 
 @dataclass
-class Mamba1AttentionMetadata:
-    query_start_loc_p: torch.Tensor
-    state_indices_tensor: torch.Tensor
-    has_initial_states_p: torch.Tensor | None
-    num_prefills: int
-    num_prefill_tokens: int
-    num_decodes: int
-    num_decode_tokens: int
-
-    block_idx_last_scheduled_token: torch.Tensor  # shape: [batch,]
-    block_idx_first_scheduled_token_p: torch.Tensor  # shape: [batch,]
-    block_idx_last_computed_token: torch.Tensor  # shape: [batch,]
-    num_computed_tokens_p: torch.Tensor  # shape: [batch,]
+class Mamba1AttentionMetadata(BaseMambaAttentionMetadata):
+    pass
 
 
 class Mamba1AttentionMetadataBuilder(
     BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]
 ):
-    def __init__(
-        self,
-        kv_cache_spec: AttentionSpec,
-        layer_names: list[str],
-        vllm_config: VllmConfig,
-        device: torch.device,
-    ):
-        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
-        assert isinstance(kv_cache_spec, MambaSpec)
-
-    def build(
-        self,
-        common_prefix_len: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        fast_build: bool = False,
-    ) -> Mamba1AttentionMetadata:
-        num_reqs = common_attn_metadata.num_reqs
-
-        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(
-                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
-            )
-        )
-
-        has_initial_states_p = None
-        query_start_loc_p = None
-        num_computed_tokens, num_computed_tokens_p = None, None
-        block_idx_first_scheduled_token = None
-        block_idx_first_scheduled_token_p = None
-
-        # TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here.
-        # We should consolidate this code
-        if self.vllm_config.cache_config.enable_prefix_caching:
-            # Return a tensor of shape (#requests, #max blocks)
-            state_indices_tensor = common_attn_metadata.block_table_tensor
-            mamba_block_size = self.kv_cache_spec.block_size
-            num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
-                self.device
-            )
-            (
-                block_idx_last_computed_token,
-                block_idx_first_scheduled_token,
-                block_idx_last_scheduled_token,
-            ) = self._compute_prefix_caching_block_indices(
-                common_attn_metadata, mamba_block_size
-            )
-        else:
-            # Always return just a single block per each request:
-            state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
-            block_idx_last_scheduled_token = None
-            block_idx_last_computed_token = None
-
-        if num_prefills > 0:
-            query_start_loc_p = (
-                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
-                - num_decode_tokens
-            )
-            has_initial_states_cpu = (
-                common_attn_metadata.num_computed_tokens_cpu[
-                    num_reqs - num_prefills : num_reqs
-                ]
-                > 0
-            )
-            has_initial_states_p = has_initial_states_cpu.to(
-                common_attn_metadata.query_start_loc.device
-            )
-
-            if self.vllm_config.cache_config.enable_prefix_caching:
-                assert num_computed_tokens is not None
-                num_computed_tokens_p = num_computed_tokens[
-                    num_reqs - num_prefills : num_reqs
-                ]
-                assert block_idx_first_scheduled_token is not None
-                block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
-                    num_reqs - num_prefills : num_reqs
-                ]
-
-        elif (
-            num_decodes > 0
-            and num_decodes <= self.decode_cudagraph_max_bs
-            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
-        ):
-            self.state_indices_tensor[:num_decodes].copy_(
-                state_indices_tensor, non_blocking=True
-            )
-            state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
-            state_indices_tensor[num_decodes:] = PAD_SLOT_ID
-
-            if self.vllm_config.cache_config.enable_prefix_caching:
-                self.block_idx_last_scheduled_token[:num_decodes].copy_(
-                    block_idx_last_scheduled_token, non_blocking=True
-                )
-                block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
-                    :num_decode_tokens
-                ]
-
-                self.block_idx_last_computed_token[:num_decodes].copy_(
-                    block_idx_last_computed_token, non_blocking=True
-                )
-                block_idx_last_computed_token = self.block_idx_last_computed_token[
-                    :num_decode_tokens
-                ]
-
-        return Mamba1AttentionMetadata(
-            query_start_loc_p=query_start_loc_p,
-            has_initial_states_p=has_initial_states_p,
-            state_indices_tensor=state_indices_tensor,
-            num_prefills=num_prefills,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decodes=num_decodes,
-            num_decode_tokens=num_decode_tokens,
-            block_idx_last_scheduled_token=block_idx_last_scheduled_token,
-            block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
-            block_idx_last_computed_token=block_idx_last_computed_token,
-            num_computed_tokens_p=num_computed_tokens_p,
-        )
+    metadata_cls = Mamba1AttentionMetadata
+    supports_update_block_table: bool = False
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index bf1d8f09ab0accd1f01327478ddd12a88ea63fa0..f45315f1e310d5ffb01ffe3469eeea5230f83b23 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -1,18 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
 from vllm.utils.math_utils import cdiv
-from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
-from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
-    compute_causal_conv1d_metadata,
-    split_decodes_and_prefills,
+from vllm.v1.attention.backend import AttentionBackend, CommonAttentionMetadata
+from vllm.v1.attention.backends.mamba_attn import (
+    BaseMambaAttentionMetadata,
+    BaseMambaAttentionMetadataBuilder,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
 
@@ -93,47 +91,27 @@ class Mamba2AttentionBackend(AttentionBackend):
 
 
 @dataclass
-class Mamba2AttentionMetadata:
-    num_prefills: int
-    num_prefill_tokens: int
-    num_decodes: int
-    num_decode_tokens: int
-    query_start_loc_p: torch.Tensor
-    seq_lens: torch.Tensor
-
-    prep_initial_states: bool
-    chunk_size: int
-
-    # The following tensors only contain prefill requests and will be None if
-    # the batch has no prefill request.
-    has_initial_states_p: torch.Tensor | None
-    seq_idx_p: torch.Tensor | None
+class Mamba2AttentionMetadata(BaseMambaAttentionMetadata):
+    prep_initial_states: bool = False
+    chunk_size: int = 0
 
+    # Chunk-related metadata (only for prefill)
+    seq_idx_p: torch.Tensor | None = None
     # cu_chunk_seqlen_p is a tensor of shape (nchunks+1,) that contains, for
     # each chunk, its offests into the varlen sequence dimension. It is defined
     # such that the i-th chunk contains tokens from cu_chunk_seqlen_p[i] to
     # cu_chunk_seqlen_p[i+1].
-    cu_chunk_seqlen_p: torch.Tensor | None
-
+    cu_chunk_seqlen_p: torch.Tensor | None = None
     # last_chunk_indices_p is a tensor of shape (batch,) that contains the
     # index of the last chunk for every sequence in the (prefill) batch.
-    last_chunk_indices_p: torch.Tensor | None
-
-    state_indices_tensor: torch.Tensor  # shape: [batch,]
-    block_idx_last_scheduled_token: torch.Tensor  # shape: [batch,]
-    block_idx_first_scheduled_token_p: torch.Tensor  # shape: [batch,]
-    block_idx_last_computed_token: torch.Tensor  # shape: [batch,]
-    num_computed_tokens_p: torch.Tensor  # shape: [batch,]
-
-    # The following attributes are for triton implementation of causal_conv1d
-    nums_dict: dict | None = None
-    batch_ptr: torch.Tensor | None = None
-    token_chunk_offset_ptr: torch.Tensor | None = None
+    last_chunk_indices_p: torch.Tensor | None = None
 
 
 class Mamba2AttentionMetadataBuilder(
     BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]
 ):
+    metadata_cls = Mamba2AttentionMetadata
+
     def __init__(
         self,
         kv_cache_spec: AttentionSpec,
@@ -142,10 +120,73 @@ class Mamba2AttentionMetadataBuilder(
         device: torch.device,
     ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
-        self.chunk_size = vllm_config.model_config.get_mamba_chunk_size()
-        assert self.chunk_size is not None, (
+        chunk_size = vllm_config.model_config.get_mamba_chunk_size()
+        assert chunk_size is not None, (
             "chunk_size needs to be set in the model config for Mamba2 models"
         )
+        self.chunk_size: int = chunk_size
+
+    def _compute_chunk_metadata(
+        self,
+        num_prefills: int,
+        num_computed_tokens_p_cpu: torch.Tensor,
+        query_start_loc_p_cpu: torch.Tensor,
+    ) -> tuple[list[int], list[int], list[int]]:
+        """
+        Compute chunk-specific metadata for Mamba2.
+
+        The code below carefully constructs the chunks such that:
+        1. Chunks contain tokens from a *single* sequence only.
+        2. For every sequence, we are guaranteed that we can
+           retrieve the mamba state *every* chunk_size tokens.
+        Constraint (1) dramatically simplifies the mamba2 kernels.
+        Constraint (2) dramatically simplifies the implementation
+        of prefix caching for mamba2 (wip). We need to take care
+        of the interaction with chunked prefill in order to
+        satisfy constraint (2).
+        """
+        # TODO (tdoublep): This code could probably be optimized.
+        cu_chunk_seqlen = []
+        seq_idx = []
+        last_chunk_indices = []
+        seqlen_pos = 0
+
+        for req_idx in range(num_prefills):
+            this_num_computed = num_computed_tokens_p_cpu[req_idx].item()
+            this_new_tokens = (
+                query_start_loc_p_cpu[req_idx + 1].item()
+                - query_start_loc_p_cpu[req_idx].item()
+            )
+
+            # if computed tokens are not chunk-aligned, use the first
+            # chunk to finish it off
+            if this_num_computed % self.chunk_size != 0:
+                seq_idx.append(req_idx)
+                cu_chunk_seqlen.append(seqlen_pos)
+                # how many tokens to finish the chunk?
+                chunk_len = (
+                    cdiv(this_num_computed, self.chunk_size) * self.chunk_size
+                    - this_num_computed
+                )
+                # we can only use at most this_new_tokens
+                chunk_len = min(chunk_len, this_new_tokens)
+                seqlen_pos += chunk_len
+                this_new_tokens -= chunk_len
+
+            n_chunks = cdiv(this_new_tokens, self.chunk_size)
+            for chunk in range(n_chunks):
+                seq_idx.append(req_idx)
+                cu_chunk_seqlen.append(seqlen_pos)
+                chunk_len = min(self.chunk_size, this_new_tokens)
+                seqlen_pos += chunk_len
+                this_new_tokens -= chunk_len
+
+            assert this_new_tokens == 0
+            last_chunk_indices.append(len(cu_chunk_seqlen) - 1)
+
+        cu_chunk_seqlen.append(seqlen_pos)
+
+        return cu_chunk_seqlen, seq_idx, last_chunk_indices
 
     def build(
         self,
@@ -153,82 +194,29 @@ class Mamba2AttentionMetadataBuilder(
         common_attn_metadata: CommonAttentionMetadata,
         fast_build: bool = False,
     ) -> Mamba2AttentionMetadata:
-        num_reqs = common_attn_metadata.num_reqs
-        seq_lens = common_attn_metadata.seq_lens
+        common = self._compute_common_metadata(common_attn_metadata)
 
-        query_start_loc_p = None
         seq_idx_p = None
         cu_chunk_seqlen_p = None
         last_chunk_indices_p = None
-
-        # Need flags to indicate if there are initial states
-        has_initial_states_p = None
         prep_initial_states = False
 
-        # for causal_conv1d
-        nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None
-
-        num_computed_tokens, num_computed_tokens_p = None, None
-        block_idx_first_scheduled_token = None
-        block_idx_first_scheduled_token_p = None
-
-        if self.vllm_config.cache_config.enable_prefix_caching:
-            # Return a tensor of shape (#requests, #max blocks)
-            state_indices_tensor = common_attn_metadata.block_table_tensor
-            # Additional cache-related varaiables:
-            mamba_block_size = self.kv_cache_spec.block_size
-            num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
-                self.device
-            )
-            (
-                block_idx_last_computed_token,
-                block_idx_first_scheduled_token,
-                block_idx_last_scheduled_token,
-            ) = self._compute_prefix_caching_block_indices(
-                common_attn_metadata, mamba_block_size
-            )
-        else:
-            # Always return just a single block per each request:
-            state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
-            # Additional cache-related varaiables:
-            block_idx_last_scheduled_token = None
-            block_idx_last_computed_token = None
-
-        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(
-                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
-            )
-        )
-
         # Compute seq_idx for prefill only
-        if num_prefills > 0:
-            # [batch,]
-            has_initial_states_cpu = (
-                common_attn_metadata.num_computed_tokens_cpu[
-                    num_reqs - num_prefills : num_reqs
-                ]
-                > 0
-            )
-            prep_initial_states = torch.any(has_initial_states_cpu).item()
-            has_initial_states_p = has_initial_states_cpu.to(
-                common_attn_metadata.query_start_loc.device
+        if common.num_prefills > 0:
+            prep_initial_states = (
+                torch.any(common.has_initial_states_p).item()
+                if common.has_initial_states_p is not None
+                else False
             )
 
-            query_start_loc_p = (
-                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
-                - num_decode_tokens
-            )
+            num_reqs = common.num_reqs
+            num_prefills = common.num_prefills
+            num_decode_tokens = common.num_decode_tokens
 
-            if self.vllm_config.cache_config.enable_prefix_caching:
-                assert num_computed_tokens is not None
-                num_computed_tokens_p = num_computed_tokens[
-                    num_reqs - num_prefills : num_reqs
-                ]
-                assert block_idx_first_scheduled_token is not None
-                block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
-                    num_reqs - num_prefills : num_reqs
-                ]
-            num_computed_tokens_p_cpu = common_attn_metadata.num_computed_tokens_cpu[
+            num_computed_tokens_cpu = (
+                common_attn_metadata.compute_num_computed_tokens().cpu()
+            )
+            num_computed_tokens_p_cpu = num_computed_tokens_cpu[
                 num_reqs - num_prefills : num_reqs
             ]
             query_start_loc_p_cpu = (
@@ -236,113 +224,33 @@ class Mamba2AttentionMetadataBuilder(
                 - num_decode_tokens
             )
 
-            # The code below carefully constructs the chunks such that:
-            # 1. Chunks contain tokens from a *single* sequence only.
-            # 2. For every sequence, we are guaranteed that we can
-            #    retrieve the mamba state *every* chunk_size tokens.
-            # Constraint (1) dramatically simplifies the mamba2 kernels.
-            # Constraint (2) dramatically simplifies the implementation
-            # of prefix caching for mamba2 (wip). We need to take care
-            # of the interaction with chunked prefill in order to
-            # satisfy constraint (2).
-            # TODO (tdoublep): This code could probably be optimized.
-            cu_chunk_seqlen = []
-            seq_idx = []
-            last_chunk_indices = []
-            seqlen_pos = 0
-            for req_idx in range(num_prefills):
-                this_num_computed = num_computed_tokens_p_cpu[req_idx].item()
-                this_new_tokens = (
-                    query_start_loc_p_cpu[req_idx + 1].item()
-                    - query_start_loc_p_cpu[req_idx].item()
-                )
-
-                # if computed tokens are not chunk-aligned, use the first
-                # chunk to finish it off
-                if this_num_computed % self.chunk_size != 0:
-                    seq_idx.append(req_idx)
-                    cu_chunk_seqlen.append(seqlen_pos)
-                    # how many tokens to finish the chunk?
-                    chunk_len = (
-                        cdiv(this_num_computed, self.chunk_size) * self.chunk_size
-                        - this_num_computed
-                    )
-                    # we can only use at most this_new_tokens
-                    chunk_len = min(chunk_len, this_new_tokens)
-                    seqlen_pos += chunk_len
-                    this_new_tokens -= chunk_len
-
-                n_chunks = cdiv(this_new_tokens, self.chunk_size)
-                for chunk in range(n_chunks):
-                    seq_idx.append(req_idx)
-                    cu_chunk_seqlen.append(seqlen_pos)
-                    chunk_len = min(self.chunk_size, this_new_tokens)
-                    seqlen_pos += chunk_len
-                    this_new_tokens -= chunk_len
-
-                assert this_new_tokens == 0
-                last_chunk_indices.append(len(cu_chunk_seqlen) - 1)
-
-            cu_chunk_seqlen.append(seqlen_pos)
+            cu_chunk_seqlen, seq_idx, last_chunk_indices = self._compute_chunk_metadata(
+                num_prefills,
+                num_computed_tokens_p_cpu,
+                query_start_loc_p_cpu,
+            )
 
             seq_idx_p = torch.as_tensor(
-                seq_idx, device=query_start_loc_p.device, dtype=torch.int32
+                seq_idx,
+                device=common_attn_metadata.query_start_loc.device,
+                dtype=torch.int32,
             )
             cu_chunk_seqlen_p = torch.as_tensor(
-                cu_chunk_seqlen, device=query_start_loc_p.device, dtype=torch.int32
+                cu_chunk_seqlen,
+                device=common_attn_metadata.query_start_loc.device,
+                dtype=torch.int32,
             )
             last_chunk_indices_p = torch.as_tensor(
-                last_chunk_indices, device=query_start_loc_p.device, dtype=torch.int32
-            )
-
-            nums_dict, batch_ptr, token_chunk_offset_ptr = (
-                compute_causal_conv1d_metadata(query_start_loc_p)
-            )
-
-        elif (
-            num_decodes <= self.decode_cudagraph_max_bs
-            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
-        ):
-            self.state_indices_tensor[:num_decodes].copy_(
-                state_indices_tensor, non_blocking=True
+                last_chunk_indices,
+                device=common_attn_metadata.query_start_loc.device,
+                dtype=torch.int32,
             )
-            state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
-
-            if self.vllm_config.cache_config.enable_prefix_caching:
-                self.block_idx_last_scheduled_token[:num_decodes].copy_(
-                    block_idx_last_scheduled_token, non_blocking=True
-                )
-                block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
-                    :num_decode_tokens
-                ]
 
-                self.block_idx_last_computed_token[:num_decodes].copy_(
-                    block_idx_last_computed_token, non_blocking=True
-                )
-                block_idx_last_computed_token = self.block_idx_last_computed_token[
-                    :num_decode_tokens
-                ]
-
-        attn_metadata = Mamba2AttentionMetadata(
-            num_prefills=num_prefills,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decodes=num_decodes,
-            num_decode_tokens=num_decode_tokens,
-            query_start_loc_p=query_start_loc_p,
-            seq_lens=seq_lens,
+        return replace(
+            common,
             prep_initial_states=prep_initial_states,
             chunk_size=self.chunk_size,
-            has_initial_states_p=has_initial_states_p,
             seq_idx_p=seq_idx_p,
-            state_indices_tensor=state_indices_tensor,
             cu_chunk_seqlen_p=cu_chunk_seqlen_p,
             last_chunk_indices_p=last_chunk_indices_p,
-            nums_dict=nums_dict,
-            batch_ptr=batch_ptr,
-            token_chunk_offset_ptr=token_chunk_offset_ptr,
-            block_idx_last_scheduled_token=block_idx_last_scheduled_token,
-            block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
-            block_idx_last_computed_token=block_idx_last_computed_token,
-            num_computed_tokens_p=num_computed_tokens_p,
         )
-        return attn_metadata
diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py
index a9705db59f19de6b96a1f43d3e67c32d78e0fe6f..0c55877a567582c6661f6f70ba33b7d4aeba321d 100644
--- a/vllm/v1/attention/backends/mamba_attn.py
+++ b/vllm/v1/attention/backends/mamba_attn.py
@@ -2,27 +2,63 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import abc
+import copy
+from dataclasses import dataclass
 from typing import ClassVar, TypeVar
 
 import torch
 
 from vllm.config import VllmConfig
 from vllm.utils.math_utils import cdiv
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
 )
+from vllm.v1.attention.backends.utils import (
+    PAD_SLOT_ID,
+    compute_causal_conv1d_metadata,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
-M = TypeVar("M")
+M = TypeVar("M", bound="BaseMambaAttentionMetadata")
+
+
+@dataclass
+class BaseMambaAttentionMetadata:
+    num_prefills: int
+    num_prefill_tokens: int
+    num_decodes: int
+    num_decode_tokens: int
+    num_reqs: int
+
+    # The following tensors only contain prefill requests and will be None if
+    # the batch has no prefill request.
+    has_initial_states_p: torch.Tensor | None
+    query_start_loc_p: torch.Tensor | None
+    num_computed_tokens_p: torch.Tensor | None
+
+    state_indices_tensor: torch.Tensor
+
+    # The following tensors are only used for prefix caching and are None if disabled
+    block_idx_last_scheduled_token: torch.Tensor | None
+    block_idx_first_scheduled_token_p: torch.Tensor | None
+    block_idx_last_computed_token: torch.Tensor | None
+
+    # The following attributes are for triton implementation of causal_conv1d
+    nums_dict: dict | None = None
+    batch_ptr: torch.Tensor | None = None
+    token_chunk_offset_ptr: torch.Tensor | None = None
 
 
 class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
+    metadata_cls: type[M]
     reorder_batch_threshold: int = 1
     _cudagraph_support: ClassVar[AttentionCGSupport] = (
         AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
     )
+    supports_update_block_table: bool = True
 
     def __init__(
         self,
@@ -35,10 +71,12 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
 
         assert isinstance(kv_cache_spec, MambaSpec)
         self.compilation_config = vllm_config.compilation_config
-        self.decode_cudagraph_max_bs = min(
-            self.vllm_config.scheduler_config.max_num_seqs,
-            self.compilation_config.max_cudagraph_capture_size,
-        )
+        self.decode_cudagraph_max_bs = self.vllm_config.scheduler_config.max_num_seqs
+        if self.compilation_config.max_cudagraph_capture_size is not None:
+            self.decode_cudagraph_max_bs = min(
+                self.decode_cudagraph_max_bs,
+                self.compilation_config.max_cudagraph_capture_size,
+            )
 
         if self.vllm_config.cache_config.enable_prefix_caching:
             self.state_indices_tensor = torch.empty(
@@ -87,14 +125,24 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
 
         return self.build(0, m)
 
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> M:
+        """
+        Default build implementation for Mamba-like attention backends.
+        Subclasses (e.g., Mamba2) can override to add additional metadata.
+        """
+        return self._compute_common_metadata(common_attn_metadata)
+
     def _compute_prefix_caching_block_indices(
         self,
         common_attn_metadata: CommonAttentionMetadata,
         mamba_block_size: int,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
-            self.device
-        )
+        num_computed_tokens = common_attn_metadata.compute_num_computed_tokens()
         # Block index of the last computed token
         block_idx_last_computed_token = cdiv(num_computed_tokens, mamba_block_size) - 1
         # which is <= block index for the first scheduled token
@@ -106,12 +154,160 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
             cdiv(common_attn_metadata.seq_lens, mamba_block_size) - 1
         )
         # -1 in case it's non-computed and causes later issues with indexing
-        block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0)
+        block_idx_last_computed_token = torch.clamp(
+            block_idx_last_computed_token, min=0
+        )
         # -1 in the case we have a padded request (0 seq-len)
-        block_idx_last_scheduled_token = block_idx_last_scheduled_token.clamp(min=0)
+        block_idx_last_scheduled_token = torch.clamp(
+            block_idx_last_scheduled_token, min=0
+        )
 
         return (
             block_idx_last_computed_token,
             block_idx_first_scheduled_token,
             block_idx_last_scheduled_token,
         )
+
+    def _compute_common_metadata(
+        self,
+        common_attn_metadata: CommonAttentionMetadata,
+    ) -> M:
+        """
+        Compute metadata common to both Mamba1 and Mamba2.
+        """
+        num_reqs = common_attn_metadata.num_reqs
+
+        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
+            split_decodes_and_prefills(
+                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
+            )
+        )
+
+        # Need flags to indicate if there are initial states
+        has_initial_states_p = None
+        query_start_loc_p = None
+        num_computed_tokens = None
+        num_computed_tokens_p = None
+
+        # for prefix caching
+        block_idx_first_scheduled_token = None
+        block_idx_first_scheduled_token_p = None
+        block_idx_last_computed_token = None
+        block_idx_last_scheduled_token = None
+
+        # for causal_conv1d
+        nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None
+
+        if self.vllm_config.cache_config.enable_prefix_caching:
+            num_computed_tokens = common_attn_metadata.compute_num_computed_tokens()
+
+            # Return a tensor of shape (#requests, #max blocks)
+            state_indices_tensor = common_attn_metadata.block_table_tensor
+            # Additional cache-related varaiables:
+            mamba_block_size = self.kv_cache_spec.block_size
+            (
+                block_idx_last_computed_token,
+                block_idx_first_scheduled_token,
+                block_idx_last_scheduled_token,
+            ) = self._compute_prefix_caching_block_indices(
+                common_attn_metadata, mamba_block_size
+            )
+        else:
+            # Always return just a single block per each request:
+            state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
+
+        if num_prefills > 0:
+            if num_computed_tokens is None:
+                num_computed_tokens = common_attn_metadata.compute_num_computed_tokens()
+            num_computed_tokens_cpu = num_computed_tokens.cpu()
+
+            query_start_loc_p = (
+                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
+                - num_decode_tokens
+            )
+            has_initial_states_cpu = (
+                num_computed_tokens_cpu[num_reqs - num_prefills : num_reqs] > 0
+            )
+            has_initial_states_p = has_initial_states_cpu.to(
+                common_attn_metadata.query_start_loc.device
+            )
+
+            nums_dict, batch_ptr, token_chunk_offset_ptr = (
+                compute_causal_conv1d_metadata(query_start_loc_p)
+            )
+
+            if self.vllm_config.cache_config.enable_prefix_caching:
+                assert num_computed_tokens is not None
+                num_computed_tokens_p = num_computed_tokens[
+                    num_reqs - num_prefills : num_reqs
+                ]
+                assert block_idx_first_scheduled_token is not None
+                block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
+                    num_reqs - num_prefills : num_reqs
+                ]
+        elif (
+            num_decodes <= self.decode_cudagraph_max_bs
+            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
+            self.state_indices_tensor[:num_decodes].copy_(
+                state_indices_tensor, non_blocking=True
+            )
+            state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
+            state_indices_tensor[num_decodes:] = PAD_SLOT_ID
+
+            if self.vllm_config.cache_config.enable_prefix_caching:
+                self.block_idx_last_scheduled_token[:num_decodes].copy_(
+                    block_idx_last_scheduled_token, non_blocking=True
+                )
+                block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
+                    :num_decode_tokens
+                ]
+
+                self.block_idx_last_computed_token[:num_decodes].copy_(
+                    block_idx_last_computed_token, non_blocking=True
+                )
+                block_idx_last_computed_token = self.block_idx_last_computed_token[
+                    :num_decode_tokens
+                ]
+
+        return self.metadata_cls(
+            num_prefills=num_prefills,
+            num_prefill_tokens=num_prefill_tokens,
+            num_decodes=num_decodes,
+            num_decode_tokens=num_decode_tokens,
+            query_start_loc_p=query_start_loc_p,
+            has_initial_states_p=has_initial_states_p,
+            state_indices_tensor=state_indices_tensor,
+            block_idx_last_scheduled_token=block_idx_last_scheduled_token,
+            block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
+            block_idx_last_computed_token=block_idx_last_computed_token,
+            num_computed_tokens_p=num_computed_tokens_p,
+            num_reqs=num_reqs,
+            nums_dict=nums_dict,
+            batch_ptr=batch_ptr,
+            token_chunk_offset_ptr=token_chunk_offset_ptr,
+        )
+
+    def update_block_table(
+        self,
+        metadata: M,
+        blk_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
+    ) -> M:
+        new_metadata = copy.copy(metadata)
+        prefix_caching = self.vllm_config.cache_config.enable_prefix_caching
+        state_indices_t = blk_table if prefix_caching else blk_table[:, 0]
+        num_reqs = blk_table.shape[0]
+
+        # For CUDA graphs, copy to persistent buffer
+        if (
+            metadata.num_prefills == 0
+            and num_reqs <= self.decode_cudagraph_max_bs
+            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
+            persistent_state_indices_t = self.state_indices_tensor[:num_reqs]
+            persistent_state_indices_t.copy_(state_indices_t, non_blocking=True)
+            state_indices_t = persistent_state_indices_t
+
+        new_metadata.state_indices_tensor = state_indices_t
+        return new_metadata
diff --git a/vllm/v1/attention/backends/mla/aiter_triton_mla.py b/vllm/v1/attention/backends/mla/aiter_triton_mla.py
index 8a92152a0ca53bfaf400ddfb6033901a589d6823..b164bb7b2ecdb862895bea96e9df30e486188b97 100644
--- a/vllm/v1/attention/backends/mla/aiter_triton_mla.py
+++ b/vllm/v1/attention/backends/mla/aiter_triton_mla.py
@@ -1,13 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.v1.attention.backends.mla.common import MLACommonBackend
-from vllm.v1.attention.backends.mla.rocm_aiter_mla import (
-    AiterMLAImpl,
-    AiterMLAMetadataBuilder,
-)
+from vllm.v1.attention.backends.mla.rocm_aiter_mla import AiterMLABackend, AiterMLAImpl
 
 
-class AiterTritonMLABackend(MLACommonBackend):
+class AiterTritonMLABackend(AiterMLABackend):
     @staticmethod
     def get_name() -> str:
         return "AITER_TRITON_MLA"
@@ -16,10 +12,6 @@ class AiterTritonMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["AiterTritonMLAImpl"]:
         return AiterTritonMLAImpl
 
-    @staticmethod
-    def get_builder_cls() -> type["AiterMLAMetadataBuilder"]:
-        return AiterMLAMetadataBuilder
-
 
 class AiterTritonMLAImpl(AiterMLAImpl):
     def __init__(
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index aa8f8ffdb7ef75f3a80c4a2f60e2023b0a0859ab..488e6ef0d9d3ce0a47696c1f398631436c1b6421 100644
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -199,16 +199,7 @@ from tqdm import tqdm
 from vllm import _custom_ops as ops
 from vllm import envs
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionLayer,
-    MLAAttentionImpl,
-)
-from vllm.attention.backends.utils import get_mla_dims
-from vllm.attention.ops.common import cp_lse_ag_out_rs
-from vllm.attention.ops.merge_attn_states import merge_attn_states
-from vllm.attention.utils.fa_utils import get_flash_attn_version
-from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.config import ModelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank
 from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
@@ -222,14 +213,23 @@ from vllm.model_executor.layers.linear import (
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_nvidia_artifactory
 from vllm.utils.math_utils import cdiv, round_down
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionLayer,
+    AttentionMetadata,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+    MLAAttentionImpl,
+)
+from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
+from vllm.v1.attention.backends.utils import (
     get_dcp_local_seq_lens,
     get_per_layer_parameters,
     infer_global_hyperparameters,
     split_decodes_and_prefills,
 )
+from vllm.v1.attention.ops.common import cp_lse_ag_out_rs
+from vllm.v1.attention.ops.merge_attn_states import merge_attn_states
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 
@@ -251,13 +251,15 @@ class QueryLenSupport(Enum):
 
 
 try:
-    from vllm.vllm_flash_attn import flash_attn_varlen_func
+    from vllm.vllm_flash_attn import (  # type: ignore[attr-defined]
+        flash_attn_varlen_func,
+    )
 
     is_vllm_fa = True
 except ImportError:
     # For rocm use upstream flash attention
     if current_platform.is_rocm():
-        from flash_attn import flash_attn_varlen_func
+        from flash_attn import flash_attn_varlen_func  # type: ignore[no-redef]
     is_vllm_fa = False
 
 try:
@@ -355,6 +357,8 @@ class MLACommonPrefillMetadata:
     max_query_len: int
     chunked_context: ChunkedContextMetadata | None = None
     query_seq_lens: torch.Tensor | None = None
+    workspace_buffer: torch.Tensor | None = None
+    q_data_type: torch.dtype | None = None
 
 
 @dataclass
@@ -384,7 +388,7 @@ D = TypeVar("D", bound=MLACommonDecodeMetadata)
 
 
 @dataclass
-class MLACommonMetadata(Generic[D]):
+class MLACommonMetadata(AttentionMetadata, Generic[D]):
     """Metadata for MLACommon.
 
     NOTE: Please read the comment at the top of the file before trying to
@@ -432,7 +436,7 @@ class MLACommonMetadata(Generic[D]):
 
 
 M = TypeVar("M", bound=MLACommonMetadata)
-A = TypeVar("A")
+A = TypeVar("A", bound=AttentionMetadata)
 
 
 def use_flashinfer_prefill() -> bool:
@@ -474,6 +478,27 @@ def use_trtllm_ragged_deepseek_prefill() -> bool:
     )
 
 
+@dataclass
+class MLADims:
+    q_lora_rank: int | None
+    kv_lora_rank: int
+    qk_nope_head_dim: int
+    qk_rope_head_dim: int
+    v_head_dim: int
+
+
+def get_mla_dims(model_config: ModelConfig) -> MLADims:
+    hf_text_config = model_config.hf_text_config
+
+    return MLADims(
+        q_lora_rank=getattr(hf_text_config, "q_lora_rank", None),
+        kv_lora_rank=hf_text_config.kv_lora_rank,
+        qk_nope_head_dim=hf_text_config.qk_nope_head_dim,
+        qk_rope_head_dim=hf_text_config.qk_rope_head_dim,
+        v_head_dim=hf_text_config.v_head_dim,
+    )
+
+
 class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
     """
     NOTE: Please read the comment at the top of the file before trying to
@@ -558,6 +583,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             self.dcp_rank = 0
         self.dcp_local_block_size = parallel_config.cp_kv_cache_interleave_size
         self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size
+        self.cp_kv_cache_interleave_size = parallel_config.cp_kv_cache_interleave_size
 
         # Don't try to access the runner on AMD
         if self.aot_schedule:
@@ -614,7 +640,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             self._fi_prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = []
 
             self._global_hyperparameters = infer_global_hyperparameters(
-                get_per_layer_parameters(vllm_config, layer_names, MLACommonImpl)
+                get_per_layer_parameters(vllm_config, layer_names, MLACommonImpl)  # type: ignore[type-abstract]
             )
 
         if self._use_trtllm_ragged_prefill:
@@ -722,8 +748,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
     def _build_decode(
         self,
         block_table_tensor: torch.Tensor,
-        seq_lens_cpu: torch.Tensor,
         seq_lens_device: torch.Tensor,
+        max_seq_len: int,
         query_start_loc_cpu: torch.Tensor,
         query_start_loc_device: torch.Tensor,
         num_decode_tokens: int,
@@ -773,13 +799,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         query_start_loc = common_attn_metadata.query_start_loc
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
         seq_lens = common_attn_metadata.seq_lens
-        seq_lens_cpu = common_attn_metadata.seq_lens_cpu
         dcp_local_seq_lens = common_attn_metadata.dcp_local_seq_lens
-        dcp_local_seq_lens_cpu = common_attn_metadata.dcp_local_seq_lens_cpu
-
-        query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
-
-        num_computed_tokens_cpu = common_attn_metadata.seq_lens_cpu - query_seq_lens_cpu
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
@@ -794,6 +814,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
 
         prefill_metadata = None
         if num_prefills > 0:
+            num_computed_tokens_cpu = (
+                common_attn_metadata.compute_num_computed_tokens().cpu()
+            )
+
             reqs_start = num_decodes  # prefill_start
 
             context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs]
@@ -873,7 +897,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                     )
                     # Note(qcs): The max local context lengths
                     # padded to `dcp_local_block_size`.
-                    padded_local_context_lens_cpu = (
+                    padded_local_context_lens_cpu: torch.Tensor = (
                         cdiv(
                             context_lens_cpu,
                             self.dcp_virtual_block_size,
@@ -983,19 +1007,29 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 prefill_metadata.query_seq_lens = (
                     prefill_query_start_loc[1:] - prefill_query_start_loc[:-1]
                 )
+                prefill_metadata.workspace_buffer = self._workspace_buffer
 
         decode_metadata = None
         if num_decodes > 0:
             dcp_tot_seq_lens_device = None
             if self.dcp_world_size > 1:
                 dcp_tot_seq_lens_device = seq_lens[:num_decodes]
-                seq_lens_cpu = dcp_local_seq_lens_cpu
                 seq_lens = dcp_local_seq_lens
 
+                # After DCP distribution, the maximum number of tokens for any rank is
+                # ceil(L / (N * I)) * I, where L is max_seq_len, N is dcp_world_size,
+                # and I is cp_kv_cache_interleave_size.
+                # This eliminates GPU->CPU sync while minimizing workspace
+                # over-allocation.
+                num_partitions = self.dcp_world_size * self.cp_kv_cache_interleave_size
+                max_seq_len = (
+                    (max_seq_len + num_partitions - 1) // num_partitions
+                ) * self.cp_kv_cache_interleave_size
+
             decode_metadata = self._build_decode(
                 block_table_tensor=block_table_tensor[:num_decodes, ...],
-                seq_lens_cpu=seq_lens_cpu[:num_decodes],
                 seq_lens_device=seq_lens[:num_decodes],
+                max_seq_len=max_seq_len,
                 query_start_loc_cpu=query_start_loc_cpu[: num_decodes + 1],
                 query_start_loc_device=query_start_loc[: num_decodes + 1],
                 num_decode_tokens=num_decode_tokens,
@@ -1161,7 +1195,9 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
             )
 
         def get_and_maybe_dequant_weights(layer: LinearBase):
-            if not isinstance(layer.quant_method, UnquantizedLinearMethod):
+            if layer.quant_method is not None and not isinstance(
+                layer.quant_method, UnquantizedLinearMethod
+            ):
                 # NOTE: This should only be used offline, since it's O(N^3)
                 eye = torch.eye(
                     layer.input_size_per_partition,
@@ -1320,15 +1356,17 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             # v with 0s to match the qk head dim for attention backends that do
             # not support different headdims
             # We don't need to pad V if we are on a hopper system with FA3
+            device_capability = current_platform.get_device_capability()
             if not current_platform.is_rocm():
                 self._pad_v = self.vllm_flash_attn_version is None or not (
                     self.vllm_flash_attn_version == 3
-                    and current_platform.get_device_capability()[0] == 9
+                    and device_capability is not None
+                    and device_capability[0] == 9
                 )
             else:
                 self._pad_v = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120
 
-        self.dcp_world_size: int | None = None
+        self.dcp_world_size: int = -1
 
         self.chunked_prefill_workspace_size = (
             MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
@@ -1504,12 +1542,13 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         from flashinfer.prefill import trtllm_ragged_attention_deepseek
 
         assert prefill.query_seq_lens is not None
+        assert prefill.workspace_buffer is not None
 
         ret = trtllm_ragged_attention_deepseek(
             query=q,
             key=k,
             value=v,
-            workspace_buffer=self._workspace_buffer,
+            workspace_buffer=prefill.workspace_buffer,
             seq_lens=prefill.query_seq_lens,
             max_q_len=prefill.max_query_len,
             max_kv_len=prefill.max_query_len,
@@ -1538,6 +1577,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
 
         assert prefill.chunked_context is not None
         assert prefill.chunked_context.seq_lens[chunk_idx] is not None
+        assert prefill.workspace_buffer is not None
 
         out = torch.zeros(
             q.shape[0],
@@ -1546,13 +1586,13 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             device=q.device,
             dtype=q.dtype,
         )
-        self._workspace_buffer.fill_(0)
+        prefill.workspace_buffer.fill_(0)
 
         attn_out, lse = trtllm_ragged_attention_deepseek(
             query=q,
             key=k,
             value=v,
-            workspace_buffer=self._workspace_buffer,
+            workspace_buffer=prefill.workspace_buffer,
             seq_lens=prefill.chunked_context.seq_lens[chunk_idx],
             max_q_len=prefill.max_query_len,
             max_kv_len=prefill.chunked_context.max_seq_lens[chunk_idx],
@@ -1583,7 +1623,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             )
 
         def get_and_maybe_dequant_weights(layer: LinearBase):
-            if not isinstance(layer.quant_method, UnquantizedLinearMethod):
+            if layer.quant_method is not None and not isinstance(
+                layer.quant_method, UnquantizedLinearMethod
+            ):
                 # NOTE: This should only be used offline, since it's O(N^3)
                 eye = torch.eye(
                     layer.input_size_per_partition,
@@ -1887,7 +1929,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
     ) -> None:
         # TODO (zyongye): Prefill function here
         assert attn_metadata.prefill is not None
-        assert self.dcp_world_size is not None
+        assert self.dcp_world_size != -1
 
         if envs.VLLM_HAS_CONTEXT_DEFAULT:
             has_context = attn_metadata.prefill.chunked_context is not None
@@ -2000,7 +2042,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             # same expert outputs.
             return output.fill_(0)
 
-        if self.dcp_world_size is None:
+        if self.dcp_world_size == -1:
             self.dcp_world_size = get_dcp_group().world_size
 
         fp8_attention = self.kv_cache_dtype.startswith("fp8")
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 5e3fbc0abf083861cd53cc601659649fdc95435c..55a8703c6cebed31fd12a4335e066c9e66806c38 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -7,22 +7,22 @@ from typing import ClassVar
 import torch
 
 import vllm._custom_ops as ops
-from vllm.attention.backends.abstract import (
+from vllm.config.cache import CacheDType
+from vllm.logger import init_logger
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
+    AttentionCGSupport,
     AttentionLayer,
     AttentionType,
     MultipleOf,
     is_quantized_kv_cache,
 )
-from vllm.config.cache import CacheDType
-from vllm.logger import init_logger
-from vllm.platforms.interface import DeviceCapability
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonImpl,
     MLACommonMetadata,
     MLACommonMetadataBuilder,
 )
-from vllm.v1.attention.backends.utils import AttentionCGSupport
 
 logger = init_logger(__name__)
 
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index b28814aceada91ac90e28b04166861d6d5fa5704..eedaef72d5d3a28518c1975912a3f2d47cca3c9b 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -6,23 +6,24 @@ from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (
+from vllm.config import VllmConfig
+from vllm.config.cache import CacheDType
+from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
+    AttentionCGSupport,
     AttentionLayer,
     AttentionType,
     MultipleOf,
     is_quantized_kv_cache,
 )
-from vllm.attention.utils.fa_utils import (
+from vllm.v1.attention.backends.fa_utils import (
     flash_attn_supports_mla,
     get_flash_attn_version,
 )
-from vllm.config import VllmConfig
-from vllm.config.cache import CacheDType
-from vllm.logger import init_logger
-from vllm.model_executor.layers.batch_invariant import (
-    vllm_is_batch_invariant,
-)
-from vllm.platforms.interface import DeviceCapability
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonDecodeMetadata,
@@ -31,9 +32,11 @@ from vllm.v1.attention.backends.mla.common import (
     MLACommonMetadataBuilder,
     QueryLenSupport,
 )
-from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.kv_cache_interface import AttentionSpec
-from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata
+from vllm.vllm_flash_attn import (  # type: ignore[attr-defined]
+    flash_attn_varlen_func,
+    get_scheduler_metadata,
+)
 
 logger = init_logger(__name__)
 
@@ -169,8 +172,8 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
     def _build_decode(
         self,
         block_table_tensor: torch.Tensor,
-        seq_lens_cpu: torch.Tensor,
         seq_lens_device: torch.Tensor,
+        max_seq_len: int,
         query_start_loc_cpu: torch.Tensor,
         query_start_loc_device: torch.Tensor,
         num_decode_tokens: int,
@@ -178,11 +181,14 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
     ) -> FlashAttnMLADecodeMetadata:
         query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
         max_query_len = query_lens_cpu.max().item()
-        max_seq_len = seq_lens_cpu.max().item()
 
         # For Flash Attention MLA + full cudagraph
         max_num_splits = 0
-        if self.use_full_cuda_graph and num_decode_tokens <= self.max_cudagraph_size:
+        if (
+            self.use_full_cuda_graph
+            and self.max_cudagraph_size is not None
+            and num_decode_tokens <= self.max_cudagraph_size
+        ):
             # NOTE(woosuk): Setting num_splits > 1 may increase the memory
             # usage, because the intermediate buffers of size [num_splits,
             # num_heads, num_tokens, head_size] are allocated. Therefore,
@@ -193,7 +199,7 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
             max_num_splits = 1
 
         scheduler_metadata = self._schedule_decode(
-            num_reqs=seq_lens_cpu.numel(),
+            num_reqs=seq_lens_device.shape[0],
             cu_query_lens=query_start_loc_device,
             max_query_len=max_query_len,
             seqlens=seq_lens_device,
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py
index f02a4bb1ef35ad651de69376edd0eba0f1246a8e..ffd2d47c839ddc9c548e4080558710271b7b6e96 100644
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@@ -6,14 +6,15 @@ from typing import ClassVar
 import torch
 from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
 
-from vllm.attention.backends.abstract import (
+from vllm.config.cache import CacheDType
+from vllm.logger import init_logger
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
+    AttentionCGSupport,
     AttentionLayer,
     AttentionType,
     MultipleOf,
 )
-from vllm.config.cache import CacheDType
-from vllm.logger import init_logger
-from vllm.platforms.interface import DeviceCapability
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonImpl,
@@ -21,7 +22,7 @@ from vllm.v1.attention.backends.mla.common import (
     MLACommonMetadataBuilder,
     QueryLenSupport,
 )
-from vllm.v1.attention.backends.utils import AttentionCGSupport, KVCacheLayoutType
+from vllm.v1.attention.backends.utils import KVCacheLayoutType
 
 logger = init_logger(__name__)
 
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index db7c7df23728af1dba51ce79aaf8335ed6c804dd..f7cf9a022c44a31e90cdbf38f5dea392819d11ec 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -6,12 +6,6 @@ from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionLayer, AttentionType, MultipleOf
-from vllm.attention.ops.flashmla import (
-    flash_mla_with_kvcache,
-    get_mla_metadata,
-    is_flashmla_dense_supported,
-)
 from vllm.config import VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
@@ -19,6 +13,12 @@ from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
 from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
+    AttentionCGSupport,
+    AttentionLayer,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonDecodeMetadata,
@@ -28,10 +28,14 @@ from vllm.v1.attention.backends.mla.common import (
     QueryLenSupport,
 )
 from vllm.v1.attention.backends.utils import (
-    AttentionCGSupport,
     reshape_attn_output_for_spec_decode,
     reshape_query_for_spec_decode,
 )
+from vllm.v1.attention.ops.flashmla import (
+    flash_mla_with_kvcache,
+    get_mla_metadata,
+    is_flashmla_dense_supported,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm import envs
 from vllm.platforms import current_platform
@@ -81,11 +85,11 @@ class FlashMLABackend(MLACommonBackend):
         device_capability: DeviceCapability,
     ) -> str | None:
         if use_sparse:
-            from vllm.attention.ops.flashmla import is_flashmla_sparse_supported
+            from vllm.v1.attention.ops.flashmla import is_flashmla_sparse_supported
 
             return is_flashmla_sparse_supported()[1]
         else:
-            from vllm.attention.ops.flashmla import is_flashmla_dense_supported
+            from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
 
             return is_flashmla_dense_supported()[1]
 
@@ -146,8 +150,8 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
     def _build_decode(
         self,
         block_table_tensor: torch.Tensor,
-        seq_lens_cpu: torch.Tensor,
         seq_lens_device: torch.Tensor,
+        max_seq_len: int,
         query_start_loc_cpu: torch.Tensor,
         query_start_loc_device: torch.Tensor,
         num_decode_tokens: int,
diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py
index 0818078da0364920cbffef3b02b0bdf9ac37789a..a2554a53a072d538550e4a219554a76fdc3afdce 100644
--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -7,17 +7,6 @@ import numpy as np
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionLayer,
-    MultipleOf,
-)
-from vllm.attention.backends.utils import get_mla_dims
-from vllm.attention.ops.flashmla import (
-    flash_mla_sparse_prefill,
-    flash_mla_with_kvcache,
-    get_mla_metadata,
-)
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
@@ -25,16 +14,27 @@ from vllm.platforms import current_platform
 from vllm.platforms.interface import DeviceCapability
 from vllm.triton_utils import tl, triton
 from vllm.utils.math_utils import cdiv
-from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
+    AttentionLayer,
+    AttentionMetadata,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+    MultipleOf,
+)
+from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl, get_mla_dims
+from vllm.v1.attention.backends.utils import (
     reshape_attn_output_for_spec_decode,
     reshape_query_for_spec_decode,
     split_decodes_and_prefills,
     split_prefill_chunks,
 )
+from vllm.v1.attention.ops.flashmla import (
+    flash_mla_sparse_prefill,
+    flash_mla_with_kvcache,
+    get_mla_metadata,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.workspace import current_workspace_manager
 
@@ -124,7 +124,7 @@ class FlashMLASparseBackend(AttentionBackend):
 
 
 @dataclass
-class FlashMLASparseMetadata:
+class FlashMLASparseMetadata(AttentionMetadata):
     num_reqs: int
     max_query_len: int
     max_seq_len: int
@@ -511,7 +511,7 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad
         # For pure decode batches, prefill_request_id will be None
         # For mixed batches, it will have -1 for decode and request_id for prefill
         if num_prefills > 0:
-            seq_lens_cpu = common_attn_metadata.seq_lens_cpu
+            seq_lens_cpu = common_attn_metadata.seq_lens.cpu()
             seq_lens = common_attn_metadata.seq_lens
             query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
 
@@ -718,7 +718,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
         )
         self.softmax_scale = scale
         assert indexer is not None
-        self.topk_indices_buffer = indexer.topk_indices_buffer
+        self.topk_indices_buffer: torch.Tensor | None = indexer.topk_indices_buffer
         self.padding = 128 if current_platform.is_device_capability_family(100) else 64
 
         if kv_cache_dtype == "fp8_ds_mla":
@@ -930,8 +930,8 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
         if self.num_heads % self.padding != 0:
             assert self.padding % self.num_heads == 0
             logger.warning_once(
-                f"padding num_heads to {self.padding} \
-                    due to sparse attn kernel requirement"
+                f"padding num_heads to {self.padding} due to sparse attn "
+                "kernel requirement"
             )
             q_padded = q.new_empty((q.shape[0], self.padding, q.shape[2]))
             q_padded[:, : self.num_heads, :] = q
@@ -980,6 +980,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
         q = q[:num_actual_toks, ...]
         k_c_normed = k_c_normed[:num_actual_toks, ...]
         k_pe = k_pe[:num_actual_toks, ...]
+        assert self.topk_indices_buffer is not None
         topk_indices = self.topk_indices_buffer[:num_actual_toks]
 
         q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py
index 6252b44974566278874a40f219aa702194607339..fbe4776bcc4a963fafef6bd6ca19d48fba73093a 100644
--- a/vllm/v1/attention/backends/mla/indexer.py
+++ b/vllm/v1/attention/backends/mla/indexer.py
@@ -5,18 +5,18 @@ from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    MultipleOf,
-)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata, is_deep_gemm_supported
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+    MultipleOf,
+)
+from vllm.v1.attention.backends.utils import (
     split_decodes_and_prefills,
     split_prefill_chunks,
 )
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 00a0a77a1c2f79d199584cefdc46ede9033326be..9eacd5ee70d2dd8c965a37722db94fda0a7f2015 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -7,16 +7,16 @@ from typing import ClassVar
 import torch
 
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import AttentionLayer, MultipleOf
 from vllm.config import VllmConfig
+from vllm.v1.attention.backend import AttentionCGSupport, AttentionLayer, MultipleOf
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonDecodeMetadata,
     MLACommonImpl,
     MLACommonMetadata,
     MLACommonMetadataBuilder,
+    QueryLenSupport,
 )
-from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 
@@ -51,6 +51,8 @@ class AiterMLADecodeMetadata(MLACommonDecodeMetadata):
     qo_indptr: torch.Tensor | None = None
     # The dtype of MLA out tensor
     attn_out_dtype: torch.dtype = torch.bfloat16
+    # The max query output length: int
+    max_qo_len: int | None = None
 
 
 class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
@@ -60,9 +62,8 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
 class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
     # TODO(luka, lucas): audit this as part of:
     #  https://github.com/vllm-project/vllm/issues/22945
-    _cudagraph_support: ClassVar[AttentionCGSupport] = (
-        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
-    )
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
+    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM
 
     def __init__(
         self,
@@ -86,6 +87,13 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
         # TODO: we can disambiguate between decode and mixed-prefill decode here
         # so we can only use the persistent buffer if a cudagraph is actually
         # being used.
+
+        # paged_kv_last_page_len is always 1s (kernel block size is always 1),
+        # so we create it once and reuse slices in both eager and cudagraph modes.
+        self.paged_kv_last_page_len = torch.ones(
+            max_num_reqs, dtype=torch.int32, device=device
+        )
+
         if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
             self.paged_kv_indptr = torch.zeros(
                 max_num_reqs + 1, dtype=torch.int32, device=device
@@ -93,19 +101,16 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
             self.paged_kv_indices = torch.zeros(
                 max_num_pages, dtype=torch.int32, device=device
             )
-            self.paged_kv_last_page_len = torch.zeros(
-                max_num_reqs, dtype=torch.int32, device=device
-            )
 
-            self.qo_indptr = torch.arange(
-                0, max_num_reqs + 1, dtype=torch.int32, device=device
+            self.qo_indptr = torch.zeros(
+                max_num_reqs + 1, dtype=torch.int32, device=device
             )
 
     def _build_decode(
         self,
         block_table_tensor: torch.Tensor,
-        seq_lens_cpu: torch.Tensor,
         seq_lens_device: torch.Tensor,
+        max_seq_len: int,
         query_start_loc_cpu: torch.Tensor,
         query_start_loc_device: torch.Tensor,
         num_decode_tokens: int,
@@ -120,7 +125,9 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
         ).unsqueeze(0) < seq_lens_device.unsqueeze(1)
         paged_kv_indices = block_table_tensor[mask]
 
-        paged_kv_last_page_len = torch.where(seq_lens_device == 0, 1, seq_lens_device)
+        # kernel block size is always 1, so each page has exactly 1 token.
+        # last_page_len is always 1 - just slice the pre-initialized buffer.
+        paged_kv_last_page_len = self.paged_kv_last_page_len[:num_reqs]
 
         paged_kv_indptr = torch.cat(
             [
@@ -128,6 +135,8 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
                 seq_lens_device.cumsum(dim=0, dtype=torch.int32),
             ]
         )
+        qo_len = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        max_qo_len = qo_len.max().item()
 
         if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
             num_actual_pages = paged_kv_indices.size(0)
@@ -144,12 +153,13 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
             self.paged_kv_indptr[1 + num_reqs :].fill_(paged_kv_indptr[-1])
             paged_kv_indptr = self.paged_kv_indptr[: 1 + num_reqs]
 
-            self.paged_kv_last_page_len[:num_reqs].copy_(
-                paged_kv_last_page_len, non_blocking=True
-            )
-            self.paged_kv_last_page_len[num_reqs:].fill_(1)
-            paged_kv_last_page_len = self.paged_kv_last_page_len[:num_reqs]
+            # paged_kv_last_page_len already uses the pre-initialized buffer slice
+            # (set above), so no copy needed - buffer is always 1s.
 
+            self.qo_indptr[: 1 + num_reqs].copy_(
+                query_start_loc_device, non_blocking=True
+            )
+            self.qo_indptr[1 + num_reqs :] = query_start_loc_device[-1]
             qo_indptr = self.qo_indptr[: 1 + num_reqs]
 
         else:
@@ -165,6 +175,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
             paged_kv_last_page_len=paged_kv_last_page_len,
             qo_indptr=qo_indptr,
             dcp_tot_seq_lens=dcp_tot_seq_lens_device,
+            max_qo_len=max_qo_len,
             attn_out_dtype=self.decode_attn_out_dtype,
         )
 
@@ -239,6 +250,7 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
+        assert attn_metadata.decode.max_qo_len is not None
 
         if type(q) is tuple:
             q = torch.cat(q, dim=-1)
@@ -255,16 +267,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
 
         kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2)
 
-        # max_seqlen_qo must be 1 except for MTP
-        # TODO: Find the best value for MTP
-        max_seqlen_qo = 1
         rocm_aiter_ops.mla_decode_fwd(
             q,
             kv_buffer,
             o,
             self.scale,
             attn_metadata.decode.qo_indptr,
-            max_seqlen_qo,
+            attn_metadata.decode.max_qo_len,
             attn_metadata.decode.paged_kv_indptr,
             attn_metadata.decode.paged_kv_indices,
             attn_metadata.decode.paged_kv_last_page_len,
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
index c0e7f0e380b98eb6a31408a78c909ca71efe818a..997b1f62a2f5d9cbbb5c3bc6ad9211b455649ca8 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
@@ -9,25 +9,20 @@ import torch
 
 from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import (
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.v1.attention.backend import (
     AttentionBackend,
+    AttentionCGSupport,
     AttentionLayer,
     AttentionMetadata,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
 )
-from vllm.attention.backends.utils import get_mla_dims
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.v1.attention.backends.mla.common import (
-    MLACommonBaseImpl,
-)
+from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl, get_mla_dims
 from vllm.v1.attention.backends.mla.flashmla_sparse import (
     triton_convert_req_index_to_global_index,
 )
-from vllm.v1.attention.backends.utils import (
-    AttentionCGSupport,
-    AttentionMetadataBuilder,
-    CommonAttentionMetadata,
-)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 if TYPE_CHECKING:
@@ -43,7 +38,7 @@ class ROCMAiterMLASparseBackend(AttentionBackend):
         return "ROCM_AITER_MLA_SPARSE"
 
     @staticmethod
-    def get_metadata_cls() -> type[AttentionMetadata]:
+    def get_metadata_cls() -> type["ROCMAiterMLASparseMetadata"]:
         return ROCMAiterMLASparseMetadata
 
     @staticmethod
@@ -74,7 +69,7 @@ class ROCMAiterMLASparseBackend(AttentionBackend):
 
 
 @dataclass
-class ROCMAiterMLASparseMetadata:
+class ROCMAiterMLASparseMetadata(AttentionMetadata):
     num_reqs: int
     max_query_len: int
     max_seq_len: int
@@ -223,7 +218,7 @@ class ROCMAiterMLASparseImpl(MLACommonBaseImpl[ROCMAiterMLASparseMetadata]):
         )
         self.softmax_scale = scale
         assert indexer is not None
-        self.topk_indices_buffer = indexer.topk_indices_buffer
+        self.topk_indices_buffer: torch.Tensor | None = indexer.topk_indices_buffer
         self.is_fp8bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled()
 
     def _forward_bf16_kv(
@@ -294,6 +289,7 @@ class ROCMAiterMLASparseImpl(MLACommonBaseImpl[ROCMAiterMLASparseMetadata]):
             # Convert from (N, B, L) to (B, N, L)
             ql_nope = ql_nope.transpose(0, 1)
 
+        assert self.topk_indices_buffer is not None
         topk_indices = self.topk_indices_buffer[:num_actual_toks]
 
         topk_indices_global = triton_convert_req_index_to_global_index(
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index 54ad3acb93ed279c81533019ff269ec1657c0366..32d3fa3b0320e11de430e62352ee1589b5f5f66c 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -5,23 +5,23 @@ from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionLayer,
-    AttentionType,
-    is_quantized_kv_cache,
-)
-from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
 from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
+    AttentionLayer,
+    AttentionType,
+    is_quantized_kv_cache,
+)
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonImpl,
     MLACommonMetadata,
 )
+from vllm.v1.attention.ops.triton_decode_attention import decode_attention_fwd
 
 logger = init_logger(__name__)
 
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
deleted file mode 100644
index 525026bac5a7eb2849dbfe425a8c132295e12be4..0000000000000000000000000000000000000000
--- a/vllm/v1/attention/backends/pallas.py
+++ /dev/null
@@ -1,436 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from dataclasses import dataclass
-
-import torch
-
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionLayer,
-    AttentionType,
-)
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.utils.math_utils import cdiv, next_power_of_2
-
-logger = init_logger(__name__)
-
-# TPU requires the head size to be a multiple of 128.
-TPU_HEAD_SIZE_ALIGNMENT = 128
-
-# Note: TPU can fp8 as storage dtype but doesn't support converting from uint8
-# from to fp32 directly. That's why it has a dtype mapping different from GPU
-TPU_STR_DTYPE_TO_TORCH_DTYPE = {
-    "half": torch.half,
-    "bfloat16": torch.bfloat16,
-    "float": torch.float,
-    "fp8": torch.float8_e4m3fn,
-    "fp8_e4m3": torch.float8_e4m3fn,
-    "fp8_e5m2": torch.float8_e5m2,
-    "int8": torch.int8,
-    "uint8": torch.uint8,
-}
-
-try:
-    import tpu_inference  # noqa: F401
-except ImportError:
-    # Lazy import torch_xla
-    import torch_xla.core.xla_builder as xb
-    import torch_xla.experimental.custom_kernel  # noqa: F401
-    from torch.library import impl
-    from torch_xla._internal.jax_workarounds import requires_jax
-    from torch_xla.experimental.custom_kernel import XLA_LIB
-
-    @requires_jax
-    def kv_cache_update_op_impl(
-        kv: torch.Tensor,
-        slot_mapping: torch.Tensor,
-        kv_cache: torch.Tensor,
-        num_kv_update_slices: torch.Tensor,
-        page_size: int,
-        num_slices_per_block: int,
-    ):
-        from vllm.attention.ops.pallas_kv_cache_update import kv_cache_update
-
-        new_kv_cache = xb.call_jax(
-            kv_cache_update,
-            (kv, slot_mapping, kv_cache, num_kv_update_slices),
-            {"page_size": page_size, "num_slices_per_block": num_slices_per_block},
-        )
-        return new_kv_cache
-
-    XLA_LIB.define(
-        "kv_cache_update_op(Tensor kv, Tensor slot_mapping,"
-        "Tensor kv_cache, Tensor num_kv_update_slices, int page_size,"
-        "int num_slices_per_block)"
-        "-> Tensor",
-    )
-
-    @impl(XLA_LIB, "kv_cache_update_op", "XLA")
-    def kv_cache_update_op_xla(
-        kv: torch.Tensor,
-        slot_mapping: torch.Tensor,
-        kv_cache: torch.Tensor,
-        num_kv_update_slices: torch.Tensor,
-        page_size: int,
-        num_slices_per_block: int,
-    ) -> torch.Tensor:
-        new_kv_cache = kv_cache_update_op_impl(
-            kv,
-            slot_mapping,
-            kv_cache,
-            num_kv_update_slices,
-            page_size,
-            num_slices_per_block,
-        )
-        return new_kv_cache
-
-    @impl(XLA_LIB, "kv_cache_update_op", "CompositeExplicitAutograd")
-    def kv_cache_update_op_non_xla(
-        kv: torch.Tensor,
-        slot_mapping: torch.Tensor,
-        kv_cache: torch.Tensor,
-        num_kv_update_slices: torch.Tensor,
-        page_size: int,
-        num_slices_per_block: int,
-    ) -> torch.Tensor:
-        return kv_cache
-
-
-class PallasAttentionBackend(AttentionBackend):
-    @staticmethod
-    def get_name() -> str:
-        return "PALLAS"
-
-    @staticmethod
-    def get_impl_cls() -> type["PallasAttentionBackendImpl"]:
-        return PallasAttentionBackendImpl
-
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-        cache_dtype_str: str = "auto",
-    ) -> tuple[int, ...]:
-        padded_head_size = (
-            cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
-        )
-        return (num_blocks, block_size, num_kv_heads * 2, padded_head_size)
-
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        raise RuntimeError("swap_blocks is not used for the TPU backend.")
-
-    # In recent TPU generations, up to v6e, the SMEM size is 1MB. The
-    # block_tables within the PallasMetadata constitute almost the entire SMEM
-    # requirement. Its size is max_num_seqs * num_page_per_seq * 4 (Int). Here
-    # we simply make sure that the size is smaller than half of SMEM capacity.
-    @staticmethod
-    def get_min_page_size(vllm_config: VllmConfig) -> int:
-        max_num_page_per_req = (
-            1024 * 1024 // 2 // vllm_config.scheduler_config.max_num_seqs // 4
-        )
-        min_page_size = cdiv(
-            vllm_config.model_config.max_model_len, max_num_page_per_req
-        )
-        min_page_size = 1 << (min_page_size - 1).bit_length()
-        return min_page_size
-
-    @staticmethod
-    def get_max_num_seqs(model_len: int, page_size: int) -> int:
-        num_page_per_req = cdiv(model_len, page_size)
-        return 1024 * 1024 // 2 // num_page_per_req // 4
-
-    # TPU has limited SREGs (scalar registers), if page_size is too small, we
-    # can spill SREGs easily which leads to bad performance. The strategy we
-    # apply here is trying to split max-model-len to 16 pages which make the
-    # spill less likely. Meanwhile we make sure the page size is in [16, 256].
-    @staticmethod
-    def get_page_size(vllm_config: VllmConfig) -> int:
-        # TODO: This is a temporary fix for vmem OOM.
-        # For long model length, we use 16 page-size to avoid too much
-        # VMEM spill. A more robust solution should be implemented to
-        # handle VREG spills.
-        if vllm_config.model_config.max_model_len > 8192:
-            return 16
-        page_size = next_power_of_2(vllm_config.model_config.max_model_len) // 16
-        if page_size <= 16:
-            return 16
-        if page_size >= 256:
-            return 256
-        return page_size
-
-
-@dataclass
-class PallasMetadata:
-    # NOTE(sang): Definition of context_len, query_len, and seq_len.
-    # |---------- N-1 iteration --------|
-    # |---------------- N iteration ---------------------|
-    # |- tokenA -|......................|-- newTokens ---|
-    # |---------- context_len ----------|
-    # |-------------------- seq_len ---------------------|
-    #                                   |-- query_len ---|
-
-    # Used in the PallasAttentionBackendImpl
-    slot_mapping: torch.Tensor
-    block_tables: torch.Tensor
-    context_lens: torch.Tensor
-    query_start_loc: torch.Tensor
-    num_seqs: torch.Tensor
-    num_kv_update_slices: torch.Tensor
-    num_slices_per_kv_cache_update_block: int
-
-
-class PallasAttentionBackendImpl(AttentionImpl):
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: list[float] | None,
-        sliding_window: int | None,
-        kv_cache_dtype: str,
-        logits_soft_cap: float | None = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: int | None = None,
-    ) -> None:
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_kv_heads
-        self.sliding_window = sliding_window
-        self.logits_soft_cap = logits_soft_cap
-        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
-
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        if alibi_slopes is not None:
-            raise NotImplementedError("Alibi slopes is not supported.")
-
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError(
-                "Encoder self-attention and "
-                "encoder/decoder cross-attention "
-                "are not implemented for "
-                "PallasAttentionBackendImpl"
-            )
-
-        self.kv_cache_quantized_dtype = None
-        if kv_cache_dtype != "auto":
-            self.kv_cache_quantized_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE.get(
-                kv_cache_dtype.lower().strip()
-            )
-
-    def forward(
-        self,
-        layer: AttentionLayer,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: PallasMetadata,
-        output: torch.Tensor | None = None,
-        output_scale: torch.Tensor | None = None,
-        output_block_scale: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        """Forward pass with Pallas attention.
-
-        Args:
-            query: shape = [num_tokens, num_heads * head_size]
-            key: shape = [num_tokens, num_kv_heads * head_size]
-            value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache: shape =
-                [num_blocks, block_size, num_kv_heads * 2, head_size]
-            attn_metadata: Metadata for attention.
-        Returns:
-            shape = [num_tokens, num_heads * head_size]
-        """
-        if output_scale is not None or output_block_scale is not None:
-            raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for PallasAttentionBackendImpl"
-            )
-
-        # For determine_available_memory case.
-        if kv_cache.numel() == 0:
-            if output is None:
-                output = torch.ones_like(query)
-            return output
-
-        num_tokens, hidden_size = query.shape
-        query = query.view(num_tokens, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
-        if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0:
-            padded_head_size = (
-                cdiv(self.head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
-            )
-            query = torch.nn.functional.pad(
-                query, (0, padded_head_size - self.head_size), value=0.0
-            )
-            key = torch.nn.functional.pad(
-                key, (0, padded_head_size - self.head_size), value=0.0
-            )
-            value = torch.nn.functional.pad(
-                value, (0, padded_head_size - self.head_size), value=0.0
-            )
-
-        if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0:
-            # Write input keys and values to the KV cache.
-            # Skip this if sharing KV cache with an earlier attention layer.
-            slot_mapping = attn_metadata.slot_mapping
-            write_to_kv_cache(
-                key,
-                value,
-                kv_cache,
-                slot_mapping,
-                attn_metadata.num_slices_per_kv_cache_update_block,
-                attn_metadata.num_kv_update_slices,
-                self.kv_cache_quantized_dtype,
-                layer._k_scale_float,
-                layer._v_scale_float,
-            )
-
-        if self.kv_cache_quantized_dtype is not None and (
-            layer._k_scale_float == 0.0 or layer._v_scale_float == 0.0
-        ):
-            raise ValueError("k_scale_float and v_scale_float must be non-zero")
-        output = torch.ops.xla.ragged_paged_attention(
-            query,
-            kv_cache,
-            attn_metadata.context_lens,
-            attn_metadata.block_tables,
-            attn_metadata.query_start_loc,
-            attn_metadata.num_seqs,
-            # By default, the system utilizes optimized block size and
-            # vmem_limit_bytes parameters from the kernel repository. However,
-            # these can be manually adjusted for debugging if necessary.
-            num_kv_pages_per_block=None,
-            num_queries_per_block=None,
-            vmem_limit_bytes=None,
-            use_kernel=True,
-            sm_scale=self.scale,
-            sliding_window=self.sliding_window,
-            soft_cap=self.logits_soft_cap,
-            k_scale=layer._k_scale_float,
-            v_scale=layer._v_scale_float,
-        )
-
-        if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0:
-            output = output[:, :, : self.head_size]
-
-        return output.reshape(num_tokens, hidden_size)
-
-
-def write_to_kv_cache(
-    key: torch.Tensor,
-    value: torch.Tensor,
-    kv_cache: torch.Tensor,
-    slot_mapping: torch.Tensor,
-    num_slices_per_kv_cache_update_block: int,
-    num_kv_update_slices: torch.Tensor,
-    kv_cache_quantized_dtype: torch.dtype | None = None,
-    k_scale: float = 1.0,
-    v_scale: float = 1.0,
-) -> None:
-    """Write the key and values to the KV cache.
-
-    Args:
-        key: shape = [num_tokens, num_kv_heads, head_size]
-        value: shape = [num_tokens, num_kv_heads, head_size]
-        kv_cache: shape = [num_blocks, block_size, num_kv_heads * 2, head_size]
-        num_slices_per_kv_cache_update_block: int
-    """
-    _, page_size, num_combined_kv_heads, head_size = kv_cache.shape
-    head_size = cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
-
-    if kv_cache_quantized_dtype is not None:
-        dtype_info = torch.finfo(kv_cache_quantized_dtype)
-        key = key.to(torch.float32) / k_scale
-        # NOTE: clamp is added here to avoid out of range of quantized dtype
-        key = torch.clamp(key, dtype_info.min, dtype_info.max)
-        key = key.to(kv_cache_quantized_dtype)
-        value = value.to(torch.float32) / v_scale
-        value = torch.clamp(value, dtype_info.min, dtype_info.max)
-        value = value.to(kv_cache_quantized_dtype)
-
-    kv = torch.cat([key, value], axis=-1).reshape(-1, num_combined_kv_heads, head_size)
-
-    torch.ops.xla.dynamo_set_buffer_donor_(kv_cache, True)
-
-    kv_cache = kv_cache.flatten(0, 1)
-    new_kv_cache = torch.ops.xla.kv_cache_update_op(
-        kv,
-        slot_mapping,
-        kv_cache,
-        num_kv_update_slices,
-        page_size,
-        num_slices_per_kv_cache_update_block,
-    )
-    # NOTE: the in-place copy will be optimized away by XLA compiler.
-    kv_cache.copy_(new_kv_cache)
-
-
-# We can move this function to a common utils file if it's also useful for other
-# hardware.
-def dtype_bits(dtype: torch.dtype):
-    if dtype.is_floating_point:
-        try:
-            return torch.finfo(dtype).bits
-        except TypeError:
-            pass
-    elif dtype.is_complex:
-        if dtype is torch.complex32:
-            return 32
-        elif dtype is torch.complex64:
-            return 64
-        elif dtype is torch.complex128:
-            return 128
-    else:
-        try:
-            return torch.iinfo(dtype).bits
-        # torch.iinfo cannot support int4, int2, bits8...
-        except TypeError:
-            pass
-    str_dtype = str(dtype)
-    # support torch.int4, torch.int5, torch.uint5...
-    if str_dtype.startswith("torch.int") or str_dtype.startswith("torch.uint"):
-        return int(str_dtype[-1])
-    raise TypeError(f"Getting the bit width of {dtype} is not supported")
-
-
-def get_dtype_packing(dtype):
-    bits = dtype_bits(dtype)
-    if 32 % bits != 0:
-        raise ValueError(
-            f"The bit width must be divisible by 32, but got bits={bits}, "
-            "dtype={dtype}"
-        )
-    return 32 // bits
-
-
-def get_page_size_bytes(
-    block_size: int, num_kv_heads: int, head_size: int, kv_cache_dtype: torch.dtype
-) -> int:
-    """Returns the size in bytes of one page of the KV cache."""
-    padded_head_size = (
-        cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
-    )
-    num_combined_kv_heads = num_kv_heads * 2
-
-    # NOTE: for the implicit padding in XLA
-    packing = get_dtype_packing(kv_cache_dtype)
-    num_combined_kv_heads = cdiv(num_combined_kv_heads, packing) * packing
-
-    kv_cache_dtype_bits = dtype_bits(kv_cache_dtype)
-    return (
-        block_size * num_combined_kv_heads * padded_head_size * kv_cache_dtype_bits // 8
-    )
diff --git a/vllm/attention/backends/registry.py b/vllm/v1/attention/backends/registry.py
similarity index 96%
rename from vllm/attention/backends/registry.py
rename to vllm/v1/attention/backends/registry.py
index eaa0fa1d5db3999ba689bdbd79568b825500f0d6..bd45702fa587d8df3c5eea1d727a4572031ec9f2 100644
--- a/vllm/attention/backends/registry.py
+++ b/vllm/v1/attention/backends/registry.py
@@ -10,7 +10,7 @@ from vllm.logger import init_logger
 from vllm.utils.import_utils import resolve_obj_by_qualname
 
 if TYPE_CHECKING:
-    from vllm.attention.backends.abstract import AttentionBackend
+    from vllm.v1.attention.backend import AttentionBackend
 
 logger = init_logger(__name__)
 
@@ -42,6 +42,9 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
     """
 
     FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+    FLASH_ATTN_DIFFKV = (
+        "vllm.v1.attention.backends.flash_attn_diffkv.FlashAttentionDiffKVBackend"
+    )
     TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
     ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
     ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"
@@ -66,7 +69,6 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
         "vllm.v1.attention.backends.mla.flashmla_sparse.FlashMLASparseBackend"
     )
     FLASH_ATTN_MLA = "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend"
-    PALLAS = "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
     IPEX = "vllm.v1.attention.backends.ipex.IpexAttentionBackend"
     NO_ATTENTION = "vllm.v1.attention.backends.no_attention.NoAttentionBackend"
     FLEX_ATTENTION = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
@@ -77,7 +79,8 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
     )
     CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend"
     # Placeholder for third-party/custom backends - must be registered before use
-    CUSTOM = ""
+    # set to None to avoid alias with other backend, whose value is an empty string
+    CUSTOM = None
 
     def get_path(self, include_classname: bool = True) -> str:
         """Get the class path for this backend (respects overrides).
@@ -139,7 +142,8 @@ class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
     LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend"
     GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend"
     # Placeholder for third-party/custom backends - must be registered before use
-    CUSTOM = ""
+    # set to None to avoid alias with other backend, whose value is an empty string
+    CUSTOM = None
 
     def get_path(self, include_classname: bool = True) -> str:
         """Get the class path for this backend (respects overrides).
@@ -201,8 +205,8 @@ _MAMBA_ATTN_OVERRIDES: dict[MambaAttentionBackendEnum, str] = {}
 
 def register_backend(
     backend: AttentionBackendEnum | MambaAttentionBackendEnum,
-    is_mamba: bool = False,
     class_path: str | None = None,
+    is_mamba: bool = False,
 ) -> Callable[[type], type]:
     """Register or override a backend implementation.
 
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index b6aa0ae2be48eec956c69efb5e4083897467b066..f384aaa46fa684c762884249a588148a5f3f760a 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -7,25 +7,25 @@ from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    MultipleOf,
-)
 from vllm.attention.layer import Attention
-from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.utils.platform_utils import get_cu_count
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
+    AttentionImpl,
     AttentionMetadataBuilder,
+    AttentionType,
     CommonAttentionMetadata,
+    MultipleOf,
+)
+from vllm.v1.attention.backends.utils import (
     split_decodes_prefills_and_extends,
 )
+from vllm.v1.attention.ops.merge_attn_states import merge_attn_states
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 _PARTITION_SIZE_ROCM = 256
@@ -337,7 +337,7 @@ class AiterFlashAttentionMetadataBuilder(
 
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
 
-        seq_lens = common_attn_metadata.seq_lens_cpu
+        seq_lens = common_attn_metadata.seq_lens.cpu()
 
         query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
 
@@ -367,7 +367,7 @@ class AiterFlashAttentionMetadataBuilder(
         if num_extends > 0:
             num_extends_slice = slice(num_decodes, num_decodes + num_extends)
             query_lens_for_extend = query_lens_cpu[num_extends_slice]
-            seq_lens_for_extend = common_attn_metadata.seq_lens_cpu[num_extends_slice]
+            seq_lens_for_extend = seq_lens[num_extends_slice]
             computed_kv_lens = seq_lens_for_extend - query_lens_for_extend
             swa_metadata = None
             if self.aot_sliding_window is not None:
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
index 16fb52ab501c102193bafc8596c81f7bcf62ab55..9589c3128f8e0a36e60c323e5b247894851ba3b0 100644
--- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -5,12 +5,12 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import AttentionType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kFp8StaticTensorSym,
 )
+from vllm.v1.attention.backend import AttentionType
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.rocm_attn import (
     RocmAttentionBackend,
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index e2410a70b1a637848463f1eedeb6adaf93c66958..73747aaed4a451908b1a408d94f8083ce7878758 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -7,13 +7,6 @@ from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-)
-from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
-from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -21,11 +14,22 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     kFp8StaticTensorSym,
 )
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
+    AttentionImpl,
     AttentionMetadataBuilder,
+    AttentionType,
     CommonAttentionMetadata,
+    MultipleOf,
+)
+from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
+from vllm.v1.attention.ops.chunked_prefill_paged_decode import (
+    chunked_prefill_paged_decode,
+)
+from vllm.v1.attention.ops.paged_attn import PagedAttention
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
 
@@ -124,7 +128,7 @@ class RocmAttentionMetadataBuilder(AttentionMetadataBuilder[RocmAttentionMetadat
             prefix_kv_lens = torch.tensor(
                 [common_prefix_len], dtype=torch.int32, device=self.device
             )
-            suffix_kv_lens = common_attn_metadata.seq_lens_cpu - common_prefix_len
+            suffix_kv_lens = common_attn_metadata.seq_lens.cpu() - common_prefix_len
             suffix_kv_lens = suffix_kv_lens.to(self.device)
         else:
             cu_prefix_query_lens = None
@@ -152,7 +156,27 @@ class RocmAttentionMetadataBuilder(AttentionMetadataBuilder[RocmAttentionMetadat
 
 class RocmAttentionBackend(AttentionBackend):
     accept_output_buffer: bool = True
-    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_dtypes: ClassVar[list[torch.dtype]] = [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ]
+
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        # ROCM paged attention kernel only supports block sizes 16 and 32
+        # due to shared memory (LDS) constraints on AMD GPUs.
+        # See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
+
+        # However, The limitations in [16, 32] are reasonable for a native C++ kernel,
+        # but vLLM should allow support for non-standard sizes via the Triton path,
+        # as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
+        # where the Triton kernel under rocm_atten does not support inference
+        # for a non-standard qwen3-next model with a block_size of 544.
+        # We have fixed the Triton kernel so that the standard model uses the original
+        # bit-addressing logic, while the non-standard model
+        # uses our optimized kernel logic.
+        return [16, 32, 544]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -165,7 +189,7 @@ class RocmAttentionBackend(AttentionBackend):
             raise ValueError(
                 f"Head size {head_size} is not supported by {attn_type}. "
                 f"Supported head sizes are: {cls.get_supported_head_sizes()}. "
-                "Set --attention-config.backend=FLEX_ATTENTION to use "
+                "Set --attention-backend=FLEX_ATTENTION to use "
                 "FlexAttention backend which supports all head sizes."
             )
 
@@ -309,16 +333,38 @@ class RocmAttentionImpl(AttentionImpl):
         if self.kv_sharing_target_layer_name is None:
             # Reshape the input keys and values and store them in the cache.
             # Skip this if sharing KV cache with an earlier attention layer.
-            PagedAttention.write_to_paged_cache(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
-            )
+
+            # Get the actual block_size from value_cache
+            # value_cache shape: [num_blocks, num_heads, head_size, block_size]
+            block_size = value_cache.shape[3]
+            # Determine if it is a power of 2
+            is_pow2 = block_size > 0 and (block_size & (block_size - 1) == 0)
+
+            if is_pow2:
+                # Normal 16, 32, 64, etc., use vLLM native HIP C++ logic
+                PagedAttention.write_to_paged_cache(
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    attn_metadata.slot_mapping,
+                    self.kv_cache_dtype,
+                    layer._k_scale,
+                    layer._v_scale,
+                )
+            else:
+                # Case B: Non-standard blocks (e.g., 544 in Qwen3),
+                # force using our modified Triton logic
+                triton_reshape_and_cache_flash(
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    attn_metadata.slot_mapping,
+                    self.kv_cache_dtype,
+                    layer._k_scale,
+                    layer._v_scale,
+                )
 
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(self.fp8_dtype)
diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py
index c8fe0faf71088763962a5af2115e49571d15b63c..dc6b425ce860f39702494b7cfe7939932838c71b 100644
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -2,15 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
-import torch
-
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
-from vllm.v1.attention.backends.utils import (
-    PAD_SLOT_ID,
-    CommonAttentionMetadata,
-    compute_causal_conv1d_metadata,
-    split_decodes_and_prefills,
+from vllm.v1.attention.backend import AttentionBackend
+from vllm.v1.attention.backends.mamba_attn import (
+    BaseMambaAttentionMetadata,
+    BaseMambaAttentionMetadataBuilder,
 )
 
 
@@ -21,84 +16,11 @@ class ShortConvAttentionBackend(AttentionBackend):
 
 
 @dataclass
-class ShortConvAttentionMetadata:
-    num_prefills: int
-    num_prefill_tokens: int
-    num_decodes: int
-    num_decode_tokens: int
-
-    query_start_loc: torch.Tensor
-    state_indices_tensor: torch.Tensor
-    has_initial_states_p: torch.Tensor | None
-
-    # For causal_conv1d
-    nums_dict: dict | None = None
-    batch_ptr: torch.Tensor | None = None
-    token_chunk_offset_ptr: torch.Tensor | None = None
+class ShortConvAttentionMetadata(BaseMambaAttentionMetadata):
+    pass
 
 
 class ShortConvAttentionMetadataBuilder(
     BaseMambaAttentionMetadataBuilder[ShortConvAttentionMetadata]
 ):
-    def build(
-        self,
-        common_prefix_len: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        fast_build: bool = False,
-    ) -> ShortConvAttentionMetadata:
-        num_reqs = common_attn_metadata.num_reqs
-        query_start_loc = common_attn_metadata.query_start_loc
-        state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
-
-        # for causal_conv1d
-        nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None
-
-        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(
-                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
-            )
-        )
-
-        has_initial_states_p = None
-        if num_prefills > 0:
-            has_initial_states_cpu = (
-                common_attn_metadata.num_computed_tokens_cpu[
-                    num_reqs - num_prefills : num_reqs
-                ]
-                > 0
-            )
-            has_initial_states_p = has_initial_states_cpu.to(query_start_loc.device)
-
-            query_start_loc_p = (
-                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
-                - num_decode_tokens
-            )
-
-            nums_dict, batch_ptr, token_chunk_offset_ptr = (
-                compute_causal_conv1d_metadata(query_start_loc_p)
-            )
-
-        elif (
-            num_decodes > 0
-            and num_decodes <= self.decode_cudagraph_max_bs
-            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
-        ):
-            self.state_indices_tensor[:num_decodes].copy_(
-                state_indices_tensor, non_blocking=True
-            )
-            state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
-            state_indices_tensor[num_decodes:] = PAD_SLOT_ID
-
-        attn_metadata = ShortConvAttentionMetadata(
-            query_start_loc=query_start_loc,
-            state_indices_tensor=state_indices_tensor,
-            has_initial_states_p=has_initial_states_p,
-            num_prefills=num_prefills,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decodes=num_decodes,
-            num_decode_tokens=num_decode_tokens,
-            nums_dict=nums_dict,
-            batch_ptr=batch_ptr,
-            token_chunk_offset_ptr=token_chunk_offset_ptr,
-        )
-        return attn_metadata
+    metadata_cls = ShortConvAttentionMetadata
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index 523f759e05a21ac96ca716bb0c8a2c0dc0d69e48..c9c85ddc7c7e5ebdc112d3ffbeba56b8c3c22d44 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -9,20 +9,20 @@ from typing import ClassVar, Optional
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionImpl,
+    AttentionMetadataBuilder,
     AttentionType,
+    CommonAttentionMetadata,
     MultipleOf,
 )
-from vllm.attention.ops.triton_unified_attention import unified_attention
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
 from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder,
-    CommonAttentionMetadata,
     split_decodes_and_prefills,
 )
+from vllm.v1.attention.ops.triton_unified_attention import unified_attention
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
@@ -155,7 +155,9 @@ class TreeAttentionMetadataBuilder(AttentionMetadataBuilder[TreeAttentionMetadat
         self.block_size = kv_cache_spec.block_size
 
         spec_config = vllm_config.speculative_config
-        spec_token_tree = (spec := spec_config) and spec.speculative_token_tree
+        spec_token_tree: str | None = None
+        if spec := spec_config:
+            spec_token_tree = spec.speculative_token_tree
         tree_choices: list[tuple[int, ...]] = (
             ast.literal_eval(spec_token_tree) if spec_token_tree is not None else [(0,)]
         )
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index ca7be990ca55589e5ee286b95918e7953e55218b..4cc438d9f40d5a7e58ef99849445ec45ec30e90b 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -7,16 +7,6 @@ from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    MultipleOf,
-)
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
-from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
@@ -27,11 +17,20 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.platforms.interface import DeviceCapability
 from vllm.utils.math_utils import next_power_of_2
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
+    AttentionImpl,
     AttentionMetadataBuilder,
+    AttentionType,
     CommonAttentionMetadata,
+    MultipleOf,
 )
+from vllm.v1.attention.ops.triton_prefill_attention import context_attention_fwd
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
+from vllm.v1.attention.ops.triton_unified_attention import unified_attention
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
@@ -220,7 +219,7 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet
             prefix_kv_lens = torch.tensor(
                 [common_prefix_len], dtype=torch.int32, device=self.device
             )
-            suffix_kv_lens = common_attn_metadata.seq_lens_cpu - common_prefix_len
+            suffix_kv_lens = common_attn_metadata.seq_lens.cpu() - common_prefix_len
             suffix_kv_lens = suffix_kv_lens.to(self.device)
         else:
             cu_prefix_query_lens = None
@@ -289,6 +288,19 @@ class TritonAttentionBackend(AttentionBackend):
             raise ValueError("Block size must be a multiple of 16.")
         return (num_blocks, 2, block_size, num_kv_heads, head_size)
 
+    @staticmethod
+    def get_kv_cache_stride_order(
+        include_num_layers_dimension: bool = False,
+    ) -> tuple[int, ...]:
+        # `stride_order` indicates the permutation that gets
+        # us from `get_kv_cache_shape` to the actual memory layout we want.
+        if include_num_layers_dimension:
+            # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size)
+            return (1, 0, 2, 3, 4, 5)
+
+        # (num_blocks, 2, block_size, num_kv_heads, head_size)
+        return (0, 1, 2, 3, 4)
+
     @staticmethod
     def use_cascade_attention(*args, **kwargs) -> bool:
         return False
@@ -309,6 +321,16 @@ class TritonAttentionBackend(AttentionBackend):
     def supports_sink(cls) -> bool:
         return True
 
+    @classmethod
+    def supports_attn_type(cls, attn_type: str) -> bool:
+        """TritonAttention supports all attention types."""
+        return attn_type in (
+            AttentionType.DECODER,
+            AttentionType.ENCODER,
+            AttentionType.ENCODER_ONLY,
+            AttentionType.ENCODER_DECODER,
+        )
+
     @classmethod
     def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
         return True
@@ -341,6 +363,8 @@ class TritonAttentionImpl(AttentionImpl):
         self.alibi_slopes = alibi_slopes
         if sliding_window is None:
             self.sliding_window = (-1, -1)
+        elif attn_type in (AttentionType.ENCODER, AttentionType.ENCODER_ONLY):
+            self.sliding_window = (sliding_window - 1, sliding_window - 1)
         else:
             self.sliding_window = (sliding_window - 1, 0)
         self.kv_cache_dtype = kv_cache_dtype
@@ -352,10 +376,6 @@ class TritonAttentionImpl(AttentionImpl):
 
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        if attn_type not in [AttentionType.DECODER, AttentionType.ENCODER_DECODER]:
-            raise NotImplementedError(
-                "Encoder self-attention is not implemented for TritonAttentionImpl"
-            )
         self.attn_type = attn_type
         self.fp8_dtype = current_platform.fp8_dtype()
 
@@ -417,6 +437,21 @@ class TritonAttentionImpl(AttentionImpl):
         # performance to make sure it does not introduce any overhead.
 
         num_actual_tokens = attn_metadata.num_actual_tokens
+
+        # Handle encoder attention differently - no KV cache needed
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            # For encoder attention,
+            # we use direct Q, K, V tensors without caching
+            return self._forward_encoder_attention(
+                query[:num_actual_tokens],
+                key[:num_actual_tokens],
+                value[:num_actual_tokens],
+                output[:num_actual_tokens],
+                attn_metadata,
+                layer,
+            )
+
+        # For decoder and cross-attention, use KV cache as before
         key_cache, value_cache = kv_cache.unbind(1)
 
         if (
@@ -495,3 +530,49 @@ class TritonAttentionImpl(AttentionImpl):
         )
 
         return output
+
+    def _forward_encoder_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        output: torch.Tensor,
+        attn_metadata: TritonAttentionMetadata,
+        layer: torch.nn.Module,
+    ) -> torch.Tensor:
+        """Forward pass for encoder attention without KV cache.
+
+        Args:
+            query: shape = [num_encoder_tokens, num_heads, head_size]
+            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
+            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
+            output: shape = [num_encoder_tokens, num_heads, head_size]
+            attn_metadata: Encoder attention metadata
+            layer: The attention layer
+        """
+        # For encoder attention, process FP8 quantization if needed
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError(
+                "quantization is not supported for encoder attention"
+            )
+
+        # Use encoder-specific metadata for sequence information
+        query_start_loc = attn_metadata.query_start_loc
+        seq_lens = attn_metadata.seq_lens
+        max_query_len = attn_metadata.max_query_len
+
+        # Call flash attention directly on Q, K, V tensors
+        context_attention_fwd(
+            q=query,
+            k=key,
+            v=value,
+            o=output,
+            b_start_loc=query_start_loc,
+            b_seq_len=seq_lens,
+            max_input_len=max_query_len,
+            is_causal=False,  # Encoder attention is bidirectional
+            softmax_scale=self.scale,
+            sliding_window_q=self.sliding_window[0],
+            sliding_window_k=self.sliding_window[1],
+        )
+        return output
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index f4ff6214f24e1703412c1ab976514acdab5ee328..c549bf7b5e2ad0c3ec1192ca323c30a76e9aa8ba 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -1,15 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import abc
-import enum
 import functools
-from abc import abstractmethod
+from collections.abc import Callable
 from dataclasses import dataclass, field, fields, make_dataclass
 from typing import (
     TYPE_CHECKING,
     Any,
-    ClassVar,
-    Generic,
     Literal,
     Protocol,
     TypeVar,
@@ -18,7 +14,7 @@ from typing import (
 
 import numpy as np
 import torch
-from typing_extensions import deprecated, runtime_checkable
+from typing_extensions import runtime_checkable
 
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.utils.math_utils import cdiv
@@ -28,17 +24,18 @@ if TYPE_CHECKING:
     from vllm.v1.worker.gpu_input_batch import InputBatch
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionMetadata,
-)
 from vllm.distributed.kv_transfer.kv_connector.utils import (
     get_kv_connector_cache_layout,
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionMetadata,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
 from vllm.v1.worker.ubatch_utils import UBatchSlice
 
 logger = init_logger(__name__)
@@ -52,113 +49,6 @@ def is_valid_kv_cache_layout(value: str) -> bool:
     return value in get_args(KVCacheLayoutType)
 
 
-@dataclass
-class CommonAttentionMetadata:
-    """
-    Per-batch attention metadata, shared across layers and backends.
-    AttentionMetadataBuilder instances use it to construct per-layer metadata.
-
-    For many of the tensors we keep both GPU and CPU versions.
-    """
-
-    query_start_loc: torch.Tensor
-    query_start_loc_cpu: torch.Tensor
-    """(batch_size + 1,), the start location of each request in query Tensor"""
-
-    seq_lens: torch.Tensor
-    """(batch_size,), the number of computed tokens for each request"""
-    num_reqs: int
-    """Number of requests"""
-    # TODO(lucas): rename to num_tokens since it may be padded and this is misleading
-    num_actual_tokens: int
-    """Total number of tokens in batch"""
-    max_query_len: int
-    """Longest query in batch"""
-    max_seq_len: int
-    """Longest context length (may be an upper bound)"""
-
-    block_table_tensor: torch.Tensor
-    slot_mapping: torch.Tensor
-
-    causal: bool = True
-
-    # Needed by FastPrefillAttentionBuilder
-    logits_indices_padded: torch.Tensor | None = None
-    num_logits_indices: int | None = None
-
-    # Needed by CrossAttentionBuilder
-    encoder_seq_lens: torch.Tensor | None = None
-    encoder_seq_lens_cpu: np.ndarray | None = None
-
-    dcp_local_seq_lens: torch.Tensor | None = None
-    dcp_local_seq_lens_cpu: torch.Tensor | None = None
-    """Sequence lengths of the local rank in decode context parallelism world"""
-
-    # WARNING: Deprecated fields. Will be removed in a future release (v0.14.0)
-    _seq_lens_cpu: torch.Tensor | None = None
-    _num_computed_tokens_cpu: torch.Tensor | None = None
-
-    @property
-    @deprecated(
-        """
-    Prefer using device seq_lens directly to avoid implicit H<>D sync.
-    If a CPU copy is needed, use `seq_lens.cpu()` instead.
-    Will be removed in a future release (v0.14.0)
-    """
-    )
-    def seq_lens_cpu(self) -> torch.Tensor:
-        if self._seq_lens_cpu is None:
-            self._seq_lens_cpu = self.seq_lens.to("cpu")
-        return self._seq_lens_cpu
-
-    @property
-    @deprecated(
-        """
-    Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
-    async scheduling. If a CPU copy is needed, it can be derived from 
-    query_start_loc_cpu and seq_lens.
-    Will be removed in a future release (v0.14.0)
-    """
-    )
-    def num_computed_tokens_cpu(self) -> torch.Tensor:
-        if self._num_computed_tokens_cpu is None:
-            query_seq_lens = (
-                self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1]
-            )
-            self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens
-        return self._num_computed_tokens_cpu
-
-    # TODO(lucas): remove once we have FULL-CG spec-decode support
-    def unpadded(
-        self, num_actual_tokens: int, num_actual_reqs: int
-    ) -> "CommonAttentionMetadata":
-        maybe_slice_reqs = lambda x: x[:num_actual_reqs] if x is not None else None
-        return CommonAttentionMetadata(
-            query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
-            query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
-            seq_lens=self.seq_lens[:num_actual_reqs],
-            _seq_lens_cpu=self._seq_lens_cpu[:num_actual_reqs]
-            if self._seq_lens_cpu is not None
-            else None,
-            _num_computed_tokens_cpu=self._num_computed_tokens_cpu[:num_actual_reqs]
-            if self._num_computed_tokens_cpu is not None
-            else None,
-            num_reqs=num_actual_reqs,
-            num_actual_tokens=num_actual_tokens,
-            max_query_len=self.max_query_len,
-            max_seq_len=self.max_seq_len,
-            block_table_tensor=self.block_table_tensor[:num_actual_reqs],
-            slot_mapping=self.slot_mapping[:num_actual_tokens],
-            causal=self.causal,
-            logits_indices_padded=self.logits_indices_padded,
-            num_logits_indices=self.num_logits_indices,
-            encoder_seq_lens=maybe_slice_reqs(self.encoder_seq_lens),
-            encoder_seq_lens_cpu=maybe_slice_reqs(self.encoder_seq_lens_cpu),
-            dcp_local_seq_lens=maybe_slice_reqs(self.dcp_local_seq_lens),
-            dcp_local_seq_lens_cpu=maybe_slice_reqs(self.dcp_local_seq_lens_cpu),
-        )
-
-
 def slice_query_start_locs(
     query_start_loc: torch.Tensor,
     request_slice: slice,
@@ -200,10 +90,11 @@ def _make_metadata_with_slice(
     )
     # NOTE: last token can be outside of the last request if we have CG padding.
 
-    # If the "middle" request has tokens in both ubatches, we have to split it.
-    # If ubatch_slice is the first ubatch then we will be splitting the last
-    # request. If it's the second microbatch, then we will be splitting the
-    # first request
+    # If the request is split across ubatches, we have to adjust the metadata.
+    # splits_first_request: The first request in this slice is the continuation of
+    #                       a request that started in a previous slice.
+    # splits_last_request:  The last request in this slice continues into the
+    #                       next slice.
     splits_first_request = first_tok > start_locs[first_req]
     splits_last_request = last_tok < start_locs[last_req + 1] - 1
 
@@ -224,7 +115,10 @@ def _make_metadata_with_slice(
     seq_lens_cpu = attn_metadata.seq_lens_cpu[request_slice]
 
     if splits_last_request:
-        tokens_skipped = query_start_loc_cpu[-1] - token_slice.stop
+        # NOTE: We use start_locs (the original query_start_loc_cpu) to calculate
+        # the tokens skipped because query_start_loc_cpu might have been modified
+        # if splits_first_request is True.
+        tokens_skipped = start_locs[last_req + 1] - token_slice.stop
         query_start_loc[-1] -= tokens_skipped
         query_start_loc_cpu[-1] -= tokens_skipped
 
@@ -284,158 +178,12 @@ def split_attn_metadata(
     return results
 
 
-M = TypeVar("M")
-
-
-class AttentionCGSupport(enum.Enum):
-    """Constants for the cudagraph support of the attention backend
-    Here we do not consider the cascade attention, as currently
-    it is never cudagraph supported."""
-
-    ALWAYS = 3
-    """Cudagraph always supported; supports mixed-prefill-decode"""
-    UNIFORM_BATCH = 2
-    """Cudagraph supported for batches the only contain query lengths that are
-    the same, this can be used for spec-decode
-        i.e. "decodes" are 1 + num_speculative_tokens"""
-    UNIFORM_SINGLE_TOKEN_DECODE = 1
-    """Cudagraph supported for batches the only contain query_len==1 decodes"""
-    NEVER = 0
-    """NO cudagraph support"""
-
-
-class AttentionMetadataBuilder(abc.ABC, Generic[M]):
-    # Does this backend/builder support CUDA Graphs for attention (default: no).
-    # Do not access directly. Call get_cudagraph_support() instead.
-    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER
-    # Does this backend/builder reorder the batch?
-    # If not, set this to None. Otherwise set it to the query
-    # length that will be pulled into the front of the batch.
-    reorder_batch_threshold: int | None = None
-
-    @abstractmethod
-    def __init__(
-        self,
-        kv_cache_spec: AttentionSpec,
-        layer_names: list[str],
-        vllm_config: VllmConfig,
-        device: torch.device,
-    ):
-        self.kv_cache_spec = kv_cache_spec
-        self.layer_names = layer_names
-        self.vllm_config = vllm_config
-        self.device = device
-
-    @classmethod
-    def get_cudagraph_support(
-        cls: type["AttentionMetadataBuilder"],
-        vllm_config: VllmConfig,
-        kv_cache_spec: AttentionSpec,
-    ) -> AttentionCGSupport:
-        """Get the cudagraph support level of this builder class."""
-        return cls._cudagraph_support
-
-    def _init_reorder_batch_threshold(
-        self,
-        reorder_batch_threshold: int | None = 1,
-        supports_spec_as_decode: bool = False,
-        supports_dcp_with_varlen: bool = False,
-    ) -> None:
-        self.reorder_batch_threshold = reorder_batch_threshold
-        if self.reorder_batch_threshold is not None and supports_spec_as_decode:
-            # If the backend supports spec-as-decode kernels, then we can set
-            # the reorder_batch_threshold based on the number of speculative
-            # tokens from the config.
-            speculative_config = self.vllm_config.speculative_config
-            if (
-                speculative_config is not None
-                and speculative_config.num_speculative_tokens is not None
-            ):
-                self.reorder_batch_threshold = max(
-                    self.reorder_batch_threshold,
-                    1 + speculative_config.num_speculative_tokens,
-                )
-
-        if (
-            self.vllm_config.parallel_config.decode_context_parallel_size > 1
-            and not supports_dcp_with_varlen
-        ):
-            self.reorder_batch_threshold = 1
-
-    @abstractmethod
-    def build(
-        self,
-        common_prefix_len: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        fast_build: bool = False,
-    ) -> M:
-        """
-        Central method that builds attention metadata.
-        Some builders (MLA) require reorder_batch to be called prior to build.
-
-        Args:
-            common_prefix_len: The length of the common prefix of the batch.
-            common_attn_metadata: The common attention metadata.
-            fast_build: The meta-data will prioritize speed of building over
-                then speed at execution. Can be used for spec-decode where the
-                result of a build call may only be used for few layers/iters.
-        """
-        raise NotImplementedError
-
-    def build_for_cudagraph_capture(
-        self, common_attn_metadata: CommonAttentionMetadata
-    ) -> M:
-        """
-        Build attention metadata for CUDA graph capture. Uses build by default.
-        Subclasses that override this method should call self.build or
-        super().build_for_cudagraph_capture.
-        """
-        return self.build(
-            common_prefix_len=0, common_attn_metadata=common_attn_metadata
-        )
-
-    def build_for_drafting(
-        self,
-        common_attn_metadata: CommonAttentionMetadata,
-        draft_index: int,
-    ) -> M:
-        """
-        Build attention metadata for draft model. Uses build by default.
-
-        Args:
-            common_attn_metadata: The common attention metadata.
-            draft_index: The index of the current draft operation.
-                When speculating a chain of tokens, this index refers to the
-                draft attempt for the i-th token.
-                For tree-based attention, this index instead refers to the
-                draft attempt for the i-th level in the tree of tokens.
-        """
-        return self.build(
-            common_prefix_len=0,
-            common_attn_metadata=common_attn_metadata,
-            fast_build=True,
-        )
-
-    def use_cascade_attention(
-        self,
-        common_prefix_len: int,
-        query_lens: np.ndarray,
-        num_query_heads: int,
-        num_kv_heads: int,
-        use_alibi: bool,
-        use_sliding_window: bool,
-        use_local_attention: bool,
-        num_sms: int,
-        dcp_world_size: int,
-    ) -> bool:
-        return False
-
-
 @functools.lru_cache
 def get_kv_cache_layout():
     # Format specified by the code.
     global _KV_CACHE_LAYOUT_OVERRIDE
 
+    cache_layout: Literal["NHD", "HND"] | None = None
     if _KV_CACHE_LAYOUT_OVERRIDE is not None:
         cache_layout = _KV_CACHE_LAYOUT_OVERRIDE
         logger.info_once(
@@ -491,7 +239,11 @@ def get_per_layer_parameters(
     to use during `plan`.
     """
 
-    layers = get_layers_from_vllm_config(vllm_config, AttentionLayerBase, layer_names)
+    layers = get_layers_from_vllm_config(
+        vllm_config,
+        AttentionLayerBase,  # type: ignore[type-abstract]
+        layer_names,
+    )
     per_layer_params: dict[str, PerLayerParameters] = {}
 
     for key, layer in layers.items():
@@ -598,7 +350,7 @@ def make_local_attention_virtual_batches(
     attn_chunk_size: int,
     common_attn_metadata: CommonAttentionMetadata,
     block_size: int = 0,
-) -> CommonAttentionMetadata:
+) -> tuple[CommonAttentionMetadata, Callable[[torch.Tensor], torch.Tensor]]:
     query_start_loc_np = common_attn_metadata.query_start_loc_cpu.numpy()
     seq_lens_np = common_attn_metadata.seq_lens_cpu.numpy()
     block_table = common_attn_metadata.block_table_tensor
@@ -710,9 +462,12 @@ def make_local_attention_virtual_batches(
     # tensor first, which recovers perf.
     batch_indices_torch = torch.from_numpy(batch_indices)
     block_indices_torch = torch.from_numpy(block_indices)
-    block_table_local = block_table[batch_indices_torch, block_indices_torch].view(
-        virtual_batches, -1
-    )
+
+    # Save as a lambda so we can return this for update_block_table
+    make_block_table = lambda block_table: block_table[
+        batch_indices_torch, block_indices_torch
+    ].view(virtual_batches, -1)
+    block_table_local = make_block_table(block_table)
 
     query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local)
     seq_lens_cpu = torch.from_numpy(seqlens_k_local)
@@ -731,7 +486,7 @@ def make_local_attention_virtual_batches(
         causal=True,
         _seq_lens_cpu=seq_lens_cpu,
         _num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
-    )
+    ), make_block_table
 
 
 def make_kv_sharing_fast_prefill_common_attn_metadata(
@@ -793,6 +548,9 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
     return common_attn_metadata
 
 
+M = TypeVar("M")
+
+
 def subclass_attention_backend(
     name_prefix: str,
     attention_backend_cls: type[AttentionBackend],
@@ -808,6 +566,15 @@ def subclass_attention_backend(
     )
 
 
+def subclass_attention_backend_with_overrides(
+    name_prefix: str,
+    attention_backend_cls: type[AttentionBackend],
+    overrides: dict[str, Any],
+) -> type[AttentionBackend]:
+    name: str = name_prefix + attention_backend_cls.__name__  # type: ignore
+    return type(name, (attention_backend_cls,), overrides)
+
+
 def split_decodes_prefills_and_extends(
     common_attn_metadata: CommonAttentionMetadata,
     decode_threshold: int = 1,
@@ -990,9 +757,9 @@ def reorder_batch_to_split_decodes_and_prefills(
     num_scheduled_tokens_np = np.array(num_scheduled_tokens)
     num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs]
 
-    is_decode = num_scheduled_tokens_np <= decode_threshold
-    is_extend = (~is_decode) & (num_computed_tokens_np > 0)
-    is_prefill = (~is_decode) & (num_computed_tokens_np == 0)
+    is_prefill = num_computed_tokens_np == 0
+    is_decode = (num_scheduled_tokens_np <= decode_threshold) & (~is_prefill)
+    is_extend = (num_scheduled_tokens_np > decode_threshold) & (~is_prefill)
 
     # Desired order: decode → extend → prefill
     req_regions = np.zeros(is_decode.shape, dtype=np.int32)  # 0 = decode by default
@@ -1080,7 +847,7 @@ class KVSharingFastPrefillMetadata(Protocol):
 
 def create_fast_prefill_custom_backend(
     prefix: str,
-    underlying_attn_backend: AttentionBackend,
+    underlying_attn_backend: type[AttentionBackend],
 ) -> type[AttentionBackend]:
     underlying_builder = underlying_attn_backend.get_builder_cls()
 
diff --git a/vllm/v1/attention/ops/__init__.py b/vllm/v1/attention/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/v1/attention/ops/chunked_prefill_paged_decode.py
similarity index 76%
rename from vllm/attention/ops/chunked_prefill_paged_decode.py
rename to vllm/v1/attention/ops/chunked_prefill_paged_decode.py
index 120261a5eb8982a19cbda556bdb09ac249bee2a5..f9ae0df00315a6645f80bdabc3a7f82cc9cb887e 100644
--- a/vllm/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/v1/attention/ops/chunked_prefill_paged_decode.py
@@ -46,6 +46,7 @@ def kernel_paged_attention_2d(
     output_stride_0: tl.int64,  # int
     output_stride_1: tl.int64,  # int, should be equal to head_size
     BLOCK_SIZE: tl.constexpr,  # int
+    PHYSICAL_BLOCK_SIZE: tl.constexpr,  # int
     HEAD_SIZE: tl.constexpr,  # int
     HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
     USE_ALIBI_SLOPES: tl.constexpr,  # bool
@@ -104,14 +105,15 @@ def kernel_paged_attention_2d(
 
     if not USE_SINKS:
         M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32)
+        L = tl.zeros([num_queries_per_kv_padded], dtype=tl.float32)
     else:
         M = tl.load(
             sink_ptr + query_head_idx,
             mask=head_mask,
             other=float("-inf"),
         ).to(dtype=tl.float32)
+        L = tl.where(float("-inf") < M, 1.0, 0.0)
 
-    L = tl.full([num_queries_per_kv_padded], 1.0, dtype=tl.float32)
     acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED], dtype=tl.float32)
 
     # sequence len for this particular sequence
@@ -125,30 +127,45 @@ def kernel_paged_attention_2d(
 
     num_blocks = cdiv_fn(seq_len, BLOCK_SIZE)
 
+    offs_n = tl.arange(0, BLOCK_SIZE)
+    offs_d = tl.arange(0, HEAD_SIZE_PADDED)
     # iterate through tiles
     for j in range(0, num_blocks):
-        physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j)
-
-        offs_n = tl.arange(0, BLOCK_SIZE)
-        offs_d = tl.arange(0, HEAD_SIZE_PADDED)
-
-        v_offset = (
-            physical_block_idx * stride_v_cache_0
-            + kv_head_idx * stride_v_cache_1
-            + offs_d[None, :] * stride_v_cache_2
-            + offs_n[:, None] * stride_v_cache_3
-        )
-
+        start_n = j * BLOCK_SIZE
+        # Calculate the logical location within a non-standard physical block,
+        # such as 544 in Qwen/Qwen3-Next-80B-A3B-Thinking.
+        # Supports non-contiguous mapping
+        # from logical blocks to physical blocks
+        abs_token_idx = start_n + offs_n
+        l_block_idx = abs_token_idx // PHYSICAL_BLOCK_SIZE
+        # Vectorized loading of physical block IDs
+        p_block_idx = tl.load(block_tables_ptr + block_table_offset + l_block_idx)
+        internal_offsets = abs_token_idx % PHYSICAL_BLOCK_SIZE
+
+        # 5D addressing logic of K
         k_offset = (
-            physical_block_idx * stride_k_cache_0
+            p_block_idx[None, :] * stride_k_cache_0
             + kv_head_idx * stride_k_cache_1
             + (offs_d[:, None] // x) * stride_k_cache_2
-            + offs_n[None, :] * stride_k_cache_3
+            + internal_offsets[None, :] * stride_k_cache_3
             + (offs_d[:, None] % x) * stride_k_cache_4
         )
 
+        # 4D addressing logic of V (Slot is innermost)
+        v_offset = (
+            p_block_idx[:, None] * stride_v_cache_0
+            + kv_head_idx * stride_v_cache_1
+            + offs_d[None, :] * stride_v_cache_2
+            + internal_offsets[:, None] * stride_v_cache_3
+        )
+
         # K : (HEAD_SIZE, BLOCK_SIZE)
-        K_load = tl.load(key_cache_ptr + k_offset, mask=dim_mask[:, None], other=0.0)
+        K_load = tl.load(
+            key_cache_ptr + k_offset,
+            mask=dim_mask[:, None],
+            other=0.0,
+            eviction_policy="evict_last",
+        )
 
         if K_load.dtype.is_fp8():
             K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype)
@@ -156,7 +173,12 @@ def kernel_paged_attention_2d(
             K = K_load
 
         # V : (BLOCK_SIZE, HEAD_SIZE)
-        V_load = tl.load(value_cache_ptr + v_offset, mask=dim_mask[None, :], other=0.0)
+        V_load = tl.load(
+            value_cache_ptr + v_offset,
+            mask=dim_mask[None, :],
+            other=0.0,
+            eviction_policy="evict_last",
+        )
 
         if V_load.dtype.is_fp8():
             V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype)
@@ -167,9 +189,9 @@ def kernel_paged_attention_2d(
         boundary = tl.full([BLOCK_SIZE], seq_len, dtype=tl.int32)
         seq_mask = seq_offset[None, :] < boundary
 
-        # S : (num_queries_per_kv, BLOCK_SIZE,)
-        S = tl.where(head_mask[:, None] & seq_mask, 0.0, float("-inf")).to(tl.float32)
-        S += scale * tl.dot(Q, K)
+        # First calculate the dot, then apply the mask.
+        qk = scale * tl.dot(Q, K)
+        S = tl.where(head_mask[:, None] & seq_mask, qk, float("-inf"))
 
         context_len = seq_len - 1
 
@@ -184,13 +206,15 @@ def kernel_paged_attention_2d(
         m_j = tl.maximum(M, tl.max(S, axis=1))
 
         # P : (num_queries_per_kv, BLOCK_SIZE,)
-        P = tl.exp(S - m_j[:, None])
+        p = tl.exp(S - m_j[:, None])
+        p = tl.where(m_j[:, None] == float("-inf"), 0.0, p)
 
         # l_j : (num_queries_per_kv,)
-        l_j = tl.sum(P, axis=1)
+        l_j = tl.sum(p, axis=1)
 
         # alpha : (num_queries_per_kv, )
         alpha = tl.exp(M - m_j)
+        alpha = tl.where(float("-inf") == M, 0.0, alpha)
 
         # acc : (num_queries_per_kv, BLOCK_SIZE,)
         acc = acc * alpha[:, None]
@@ -200,10 +224,10 @@ def kernel_paged_attention_2d(
         M = m_j
 
         # acc : (num_queries_per_kv, BLOCK_SIZE,)
-        acc += tl.dot(P.to(V.dtype), V)
+        acc += tl.dot(p.to(V.dtype), V)
 
     # epilogue
-    acc = acc / L[:, None]
+    acc = acc / (L[:, None] + 1e-10)
     if USE_FP8:
         acc = acc * tl.load(out_scale_inv)
         acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
@@ -241,9 +265,10 @@ def chunked_prefill_paged_decode(
     output_scale=None,
     # Optional tensor for sinks
     sinks=None,
+    is_block_table_ptr: bool = False,
 ):
     if sm_scale is None:
-        sm_scale = 1.0 / (query.shape[1] ** 0.5)
+        sm_scale = 1.0 / (query.shape[2] ** 0.5)
 
     use_alibi_slopes = alibi_slopes is not None
 
@@ -292,7 +317,10 @@ def chunked_prefill_paged_decode(
         elif kv_cache_dtype == "fp8_e5m2":
             target_dtype = torch.float8_e5m2
         else:
-            raise ValueError("Unsupported FP8 dtype:", kv_cache_dtype)
+            raise ValueError(
+                f"Unsupported FP8 kv_cache_dtype {kv_cache_dtype}: "
+                f"should be one of 'fp8', 'fp8_e4m3', 'fp8_e5m2'."
+            )
 
         key_cache = key_cache.view(target_dtype)
         value_cache = value_cache.view(target_dtype)
@@ -301,6 +329,8 @@ def chunked_prefill_paged_decode(
 
     from vllm.platforms.rocm import use_rocm_custom_paged_attention
 
+
+    use_custom = False
     # use_custom = use_rocm_custom_paged_attention(
     #     query.dtype,
     #     head_size,
@@ -312,7 +342,16 @@ def chunked_prefill_paged_decode(
     #     alibi_slopes,
     #     sinks,
     # )
-    use_custom = False
+    # Triton is only forced when encountering a non-standard block
+    # like Qwen3 with a size of 544.
+    # 1. Check if block_size is a power of 2 (16, 32, 64...)
+    # 2. If it's a power of 2, we trust the vLLM's native use_custom decision.
+    # 3. If it's not a power of 2 (such as Qwen3's 544),
+    # then our Triton path is forced.
+    is_pow2 = block_size > 0 and (block_size & (block_size - 1) == 0)
+    if not is_pow2:
+        use_custom = False
+
     if use_custom:
         _PARTITION_SIZE_ROCM = 256
         max_num_partitions = (
@@ -354,6 +393,25 @@ def chunked_prefill_paged_decode(
             fp8_out_scale=output_scale,
         )
     else:
+        real_block_size = value_cache.shape[3]
+        # The standard model directly uses the original block_size.
+        # Non-standard 544 uses 32 to accommodate integer division logic.
+        TRITON_BLOCK_SIZE = block_size if is_pow2 else 32
+        if is_block_table_ptr:
+            # Using the physical base address of tensors
+            kv_element_size = key_cache.element_size()
+            block_byte_stride = key_cache.stride(0) * kv_element_size
+            # Get the starting physical address of the KV Cache
+            base_addr = key_cache.data_ptr()
+
+            # Normalization: Directly calculate the block offset
+            # of the pointer relative to the base address
+            processed_block_table = ((block_table - base_addr) // block_byte_stride).to(
+                torch.int32
+            )
+        else:
+            processed_block_table = block_table.to(torch.int32)
+
         kernel_paged_attention_2d[
             (
                 num_seqs,
@@ -365,7 +423,7 @@ def chunked_prefill_paged_decode(
             key_cache_ptr=key_cache,
             value_cache_ptr=value_cache,
             sink_ptr=sinks,
-            block_tables_ptr=block_table,
+            block_tables_ptr=processed_block_table,
             seq_lens_ptr=seq_lens,
             alibi_slopes_ptr=alibi_slopes,
             scale=sm_scale,
@@ -375,12 +433,13 @@ def chunked_prefill_paged_decode(
             num_query_heads=num_query_heads,
             num_queries_per_kv=num_queries_per_kv,
             num_queries_per_kv_padded=num_queries_per_kv_padded,
-            block_table_stride=block_table.stride(0),
+            block_table_stride=processed_block_table.stride(0),
             query_stride_0=query.stride(0),
             query_stride_1=query.stride(1),
             output_stride_0=output.stride(0),
             output_stride_1=output.stride(1),
-            BLOCK_SIZE=block_size,
+            BLOCK_SIZE=TRITON_BLOCK_SIZE,
+            PHYSICAL_BLOCK_SIZE=real_block_size,
             HEAD_SIZE=head_size,
             HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
             USE_ALIBI_SLOPES=use_alibi_slopes,
diff --git a/vllm/attention/ops/common.py b/vllm/v1/attention/ops/common.py
similarity index 100%
rename from vllm/attention/ops/common.py
rename to vllm/v1/attention/ops/common.py
diff --git a/vllm/attention/ops/flashmla.py b/vllm/v1/attention/ops/flashmla.py
similarity index 97%
rename from vllm/attention/ops/flashmla.py
rename to vllm/v1/attention/ops/flashmla.py
index 87ff2bffadaebeb38b294b37cbf44e7d00929652..bf8f81975e7ffff69f1fbbbed71ef4913dee57d0 100644
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/v1/attention/ops/flashmla.py
@@ -59,7 +59,7 @@ def is_flashmla_dense_supported() -> tuple[bool, str | None]:
     is_availble, maybe_reason = _is_flashmla_available()
     if not is_availble:
         return False, maybe_reason
-    if current_platform.get_device_capability()[0] != 9:
+    if not current_platform.is_device_capability_family(90):
         return False, "FlashMLA Dense is only supported on Hopper devices."
     return True, None
 
@@ -71,7 +71,10 @@ def is_flashmla_sparse_supported() -> tuple[bool, str | None]:
     is_availble, maybe_reason = _is_flashmla_available()
     if not is_availble:
         return False, maybe_reason
-    if current_platform.get_device_capability()[0] not in (9, 10):
+    if not (
+        current_platform.is_device_capability_family(90)
+        or current_platform.is_device_capability_family(100)
+    ):
         return (
             False,
             "FlashMLA Sparse is only supported on Hopper and Blackwell devices.",
diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/v1/attention/ops/merge_attn_states.py
similarity index 91%
rename from vllm/attention/ops/merge_attn_states.py
rename to vllm/v1/attention/ops/merge_attn_states.py
index b5850da0c2a7ed473316744db79f44cb950092d1..6c91461817d5a8d32ac9ff4ed8eea321bcbb58d1 100644
--- a/vllm/attention/ops/merge_attn_states.py
+++ b/vllm/v1/attention/ops/merge_attn_states.py
@@ -16,7 +16,7 @@ def merge_attn_states(
     output_lse: torch.Tensor | None = None,
 ) -> None:
     # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel
-    # is not support for FP8 dtype, fallback to use Triton kernel.
+    # does not support FP8 dtype, fallback to use Triton kernel.
     def supported_dtypes(o: torch.Tensor) -> bool:
         return o.dtype in [torch.float32, torch.half, torch.bfloat16]
 
@@ -41,7 +41,7 @@ def merge_attn_states(
             output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse
         )
     else:
-        from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
+        from vllm.v1.attention.ops.triton_merge_attn_states import merge_attn_states
 
         return merge_attn_states(
             output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/v1/attention/ops/paged_attn.py
similarity index 97%
rename from vllm/attention/ops/paged_attn.py
rename to vllm/v1/attention/ops/paged_attn.py
index b898596708fa15fa505464fc4e7a188e84bcd24b..79a5347ac7bd142b50ffe981b39b8670cf9674ed 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/v1/attention/ops/paged_attn.py
@@ -11,7 +11,7 @@ from vllm import envs
 if current_platform.is_cuda_alike():
     from vllm import _custom_ops as ops
 elif current_platform.is_xpu():
-    from vllm._ipex_ops import ipex_ops as ops
+    from vllm._ipex_ops import ipex_ops as ops  # type: ignore[no-redef]
     
 
 class PagedAttention:
diff --git a/vllm/attention/ops/pallas_kv_cache_update.py b/vllm/v1/attention/ops/pallas_kv_cache_update.py
similarity index 100%
rename from vllm/attention/ops/pallas_kv_cache_update.py
rename to vllm/v1/attention/ops/pallas_kv_cache_update.py
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/v1/attention/ops/prefix_prefill.py
similarity index 87%
rename from vllm/attention/ops/prefix_prefill.py
rename to vllm/v1/attention/ops/prefix_prefill.py
index 5c715cf183717432acaf83af57c479133b533c61..9be6d0862440ca34c3f4f873321fa84fa55e7ebf 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/v1/attention/ops/prefix_prefill.py
@@ -83,6 +83,7 @@ def _fwd_kernel(
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_DMODEL_PADDED: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
+    PHYSICAL_BLOCK_SIZE: tl.constexpr,
     BLOCK_N: tl.constexpr,
     SLIDING_WINDOW: tl.constexpr,
     num_unroll_cache: tl.constexpr,
@@ -143,42 +144,52 @@ def _fwd_kernel(
     # initialize pointer to m and l
     if not USE_SINKS:
         m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
     else:
         m_i = tl.load(
             sink_ptr + tl.full([BLOCK_M], cur_head, dtype=tl.int64),
             mask=(offs_m < cur_batch_query_len),
             other=float("-inf"),
         ).to(dtype=tl.float32)
+        l_i = tl.where(m_i > float("-inf"), 1.0, 0.0)
 
-    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
     acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32)  # [M,D]
 
     # compute query against context (no causal mask here)
     for start_n in tl.range(
         0, cur_batch_ctx_len, BLOCK_SIZE, loop_unroll_factor=num_unroll_cache
     ):
-        start_n = tl.multiple_of(start_n, BLOCK_SIZE)
-        # -- compute qk ----
+        # Under a block size of 544 (Qwen/Qwen3-Next-80B-A3B-Thinking),
+        # replace one physical block every 17 32-Tile blocks
+        # Calculate the logical block index of each of the 32 tokens
+        # in the current Tile (handling cross-block cases).
+        token_indices = start_n + offs_bs_n
+        bn_logical_indices = token_indices // PHYSICAL_BLOCK_SIZE
+
+        # 2. Vectorized loading of physical block IDs from B_Loc
         bn = tl.load(
-            B_Loc
-            + cur_batch * stride_b_loc_b
-            + (start_n // BLOCK_SIZE) * stride_b_loc_s
+            B_Loc + cur_batch * stride_b_loc_b + bn_logical_indices * stride_b_loc_s
         ).to(tl.int64)
-        # [D,BLOCK_SIZE]
+
+        # 3. Calculate the exact offset of
+        # each token within its physical block.
+        internal_offsets = token_indices % PHYSICAL_BLOCK_SIZE
+
+        # Addressing of K (5D)
         off_k = (
             bn[None, :] * stride_k_cache_bs
             + cur_kv_head * stride_k_cache_h
             + (offs_d[:, None] // x) * stride_k_cache_d
-            + ((start_n + offs_bs_n[None, :]) % BLOCK_SIZE) * stride_k_cache_bl
+            + internal_offsets[None, :] * stride_k_cache_bl
             + (offs_d[:, None] % x) * stride_k_cache_x
         )
 
-        # [BLOCK_SIZE,D]
+        # Addressing of V (4D)
         off_v = (
             bn[:, None] * stride_v_cache_bs
             + cur_kv_head * stride_v_cache_h
             + offs_d[None, :] * stride_v_cache_d
-            + offs_bs_n[:, None] * stride_v_cache_bl
+            + internal_offsets[:, None] * stride_v_cache_bl
         )
 
         if (
@@ -199,12 +210,12 @@ def _fwd_kernel(
         else:
             k = k_load
 
-        qk = tl.zeros([BLOCK_M, BLOCK_SIZE], dtype=tl.float32)  # [M,N]
-        qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION)
+        # qk = tl.zeros([BLOCK_M, BLOCK_SIZE], dtype=tl.float32)  # [M,N]
+        qk = sm_scale * tl.dot(q, k, input_precision=IN_PRECISION)
         qk = tl.where(
             (start_n + offs_bs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf")
         )
-        qk *= sm_scale
+        # qk *= sm_scale
         if SLIDING_WINDOW > 0:
             # (cur_batch_ctx_len + offs_m[:, None]) are the positions of
             # Q entries in sequence
@@ -221,14 +232,16 @@ def _fwd_kernel(
                 (cur_batch_ctx_len + offs_m[:, None]) - (start_n + offs_bs_n[None, :])
                 < SLIDING_WINDOW,
                 qk,
-                -10000,
+                float("-inf"),
             )
 
         # compute running maximum
         m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
         p = tl.exp(qk - m_ij[:, None])
+        p = tl.where(m_ij[:, None] == float("-inf"), 0.0, p)
         l_ij = tl.sum(p, axis=1)
         alpha = tl.exp(m_i - m_ij)
+        alpha = tl.where(m_i == float("-inf"), 0.0, alpha)
         acc = acc * alpha[:, None]
 
         # update acc
@@ -297,14 +310,17 @@ def _fwd_kernel(
             qk = tl.where(
                 offs_m[:, None] - (start_n + offs_n[None, :]) < SLIDING_WINDOW,
                 qk,
-                -10000,
+                float("-inf"),
             )
 
         # compute running maximum
         m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
         p = tl.exp(qk - m_ij[:, None])
+        p = tl.where(m_ij[:, None] == float("-inf"), 0.0, p)
         l_ij = tl.sum(p, axis=1)
         alpha = tl.exp(m_i - m_ij)
+        # To prevent NaN from appearing in the first round
+        alpha = tl.where(m_i == float("-inf"), 0.0, alpha)
         acc = acc * alpha[:, None]
 
         # update acc
@@ -321,7 +337,7 @@ def _fwd_kernel(
         l_i = l_i * alpha + l_ij
         m_i = m_ij
 
-    acc = acc / l_i[:, None]
+    acc = acc / (l_i[:, None] + 1e-10)
 
     # initialize pointers to output
     off_o = (
@@ -641,6 +657,7 @@ def context_attention_fwd(
     skip_decode=False,
     fp8_out_scale=None,
     sinks=None,
+    is_block_table_ptr: bool = False,
 ):
     q_dtype_is_f32 = q.dtype is torch.float32
 
@@ -693,6 +710,19 @@ def context_attention_fwd(
     if sliding_window is None or sliding_window <= 0:
         sliding_window = 0
 
+    if is_block_table_ptr:
+        kv_element_size = k_cache.element_size()
+        block_byte_stride = k_cache.stride(0) * kv_element_size
+        # The physical starting point of the obtained KV Cache Pool
+        base_addr = k_cache.data_ptr()
+
+        mask = b_loc > 0
+        processed_b_loc = torch.where(
+            mask, (b_loc - base_addr) // block_byte_stride, b_loc
+        ).to(torch.int32)
+    else:
+        processed_b_loc = b_loc.to(torch.int32)
+
     if alibi_slopes is not None:
         assert sinks is None, "Sinks arg is not supported with alibi"
         assert fp8_out_scale is None, "FP8 output not supported with alibi"
@@ -756,17 +786,34 @@ def context_attention_fwd(
     max_seq_len = 0 if max_seq_len is None else max_seq_len
     extra_kargs = {}
     if current_platform.is_rocm():
-        extra_kargs = {"kpack": 1, "waves_per_eu": 2}
+        extra_kargs = {}
+
+    real_block_size = v_cache.shape[3]
+    is_pow2 = real_block_size > 0 and (real_block_size & (real_block_size - 1) == 0)
+    # For standard models involving powers of 2,
+    # follow the original logic (Llama 128/64)
+    # For non-standard models (Qwen3-next block_size 544), set to 32.
+    if is_pow2:
+        BLOCK_M = 128
+        BLOCK_N = 64
+    else:
+        BLOCK_M = 32
+        BLOCK_N = 32
+
+    # TRITON_BLOCK_SIZE is kept at 32 to ensure
+    # correct alignment logic when the kernel handles
+    # non-standard sizes (such as 544).
+    TRITON_BLOCK_SIZE = 32
 
-    grid = lambda META: (batch, head, triton.cdiv(max_input_len, META["BLOCK_M"]))
-    _fwd_kernel[grid](
+    grid_fn = lambda META: (batch, head, triton.cdiv(max_input_len, META["BLOCK_M"]))
+    _fwd_kernel[grid_fn](
         q,
         k,
         v,
         k_cache,
         v_cache,
         sinks,
-        b_loc,
+        processed_b_loc,
         sm_scale,
         k_scale,
         v_scale,
@@ -775,8 +822,8 @@ def context_attention_fwd(
         b_seq_len,
         k_cache.shape[4],
         o,
-        b_loc.stride(0),
-        b_loc.stride(1),
+        processed_b_loc.stride(0),
+        processed_b_loc.stride(1),
         q.stride(0),
         q.stride(1),
         q.stride(2),
@@ -789,16 +836,17 @@ def context_attention_fwd(
         o.stride(0),
         o.stride(1),
         o.stride(2),
-        k_cache.stride(0),
-        k_cache.stride(1),
-        k_cache.stride(2),
-        k_cache.stride(3),
-        k_cache.stride(4),  # [num_blocks, num_kv_heads, head_size/x, block_size, x]
-        v_cache.stride(0),
-        v_cache.stride(1),
-        v_cache.stride(2),
-        v_cache.stride(3),  # [num_blocks, num_kv_heads, head_size, block_size]
-        BLOCK_SIZE=v_cache.shape[3],
+        stride_k_cache_bs=k_cache.stride(0),
+        stride_k_cache_h=k_cache.stride(1),
+        stride_k_cache_d=k_cache.stride(2),
+        stride_k_cache_bl=k_cache.stride(3),
+        stride_k_cache_x=k_cache.stride(4),
+        stride_v_cache_bs=v_cache.stride(0),
+        stride_v_cache_h=v_cache.stride(1),
+        stride_v_cache_d=v_cache.stride(2),
+        stride_v_cache_bl=v_cache.stride(3),
+        BLOCK_SIZE=TRITON_BLOCK_SIZE,
+        PHYSICAL_BLOCK_SIZE=real_block_size,
         num_queries_per_kv=num_queries_per_kv,
         IN_PRECISION=IN_PRECISION,
         BLOCK_DMODEL=Lk,
@@ -806,8 +854,8 @@ def context_attention_fwd(
         SLIDING_WINDOW=sliding_window,
         SKIP_DECODE=skip_decode,
         USE_FP8=fp8_out_scale is not None,
-        BLOCK_M=128,
-        BLOCK_N=64,
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
         num_unroll_cache=4,
         num_unroll_request=1,
         num_warps=4,
diff --git a/vllm/attention/ops/rocm_aiter_mla_sparse.py b/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
similarity index 98%
rename from vllm/attention/ops/rocm_aiter_mla_sparse.py
rename to vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
index 080e92ecc9408b2db3a5c159dd998c81c14b6fc1..1e89d48dbbb6f02487510f586b7752c8e54e9874 100644
--- a/vllm/attention/ops/rocm_aiter_mla_sparse.py
+++ b/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
@@ -37,9 +37,9 @@ def fp8_mqa_logits_torch(
     Returns:
         Logits tensor of shape [M, N], dtype `torch.float32`.
     """
-    kv, scale = kv
-    seq_len_kv = kv.shape[0]
-    k = kv.to(torch.bfloat16)
+    k_fp8, scale = kv
+    seq_len_kv = k_fp8.shape[0]
+    k = k_fp8.to(torch.bfloat16)
     q = q.to(torch.bfloat16)
 
     mask_lo = (
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/v1/attention/ops/triton_decode_attention.py
similarity index 99%
rename from vllm/attention/ops/triton_decode_attention.py
rename to vllm/v1/attention/ops/triton_decode_attention.py
index eef6e38d24ce74802dce769e6ac62b90066a6d03..69241e814b5736168ff41d8de18efd7f2cecf3f9 100644
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/v1/attention/ops/triton_decode_attention.py
@@ -285,10 +285,7 @@ def _fwd_grouped_kernel_stage1(
     cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
     split_kv_id = tl.program_id(2)
 
-    if kv_group_num > BLOCK_H:
-        VALID_BLOCK_H: tl.constexpr = BLOCK_H
-    else:
-        VALID_BLOCK_H: tl.constexpr = kv_group_num
+    VALID_BLOCK_H: tl.constexpr = BLOCK_H if kv_group_num > BLOCK_H else kv_group_num
     cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
     mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
     mask_h = mask_h & (cur_head < q_head_num)
diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/v1/attention/ops/triton_merge_attn_states.py
similarity index 100%
rename from vllm/attention/ops/triton_merge_attn_states.py
rename to vllm/v1/attention/ops/triton_merge_attn_states.py
diff --git a/vllm/v1/attention/ops/triton_prefill_attention.py b/vllm/v1/attention/ops/triton_prefill_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..046d0c1707a9721c7209c65132009858cd571e14
--- /dev/null
+++ b/vllm/v1/attention/ops/triton_prefill_attention.py
@@ -0,0 +1,272 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/sgl-project/sglang/blob/97cb762bb65ebf05025eb342de03c184660427a3/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py
+# Changes:
+# - Add support for sliding window attention
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for prefill.
+It supports page size = 1.
+"""
+
+# Adapted from
+# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
+import torch
+
+from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+
+
+@triton.jit
+def _fwd_kernel(
+    Q,
+    K,
+    V,
+    sm_scale,
+    B_Start_Loc,
+    B_Seqlen,
+    Out,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_obs,
+    stride_oh,
+    kv_group_num: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    SLIDING_WINDOW_Q: tl.constexpr,
+    SLIDING_WINDOW_K: tl.constexpr,
+    Lk: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    start_m = tl.program_id(2)
+
+    cur_kv_head = cur_head // kv_group_num
+
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+
+    block_start_loc = BLOCK_M * start_m
+
+    # initialize offsets
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    off_q = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :]
+    )
+    off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None]
+    off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :]
+
+    mask_d = offs_d < Lk
+
+    q = tl.load(
+        Q + off_q,
+        mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :]),
+        other=0.0,
+    )
+
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+
+    # initialize pointer to m and l
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+
+    block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)
+
+    # Calculate the end position for attention computation
+    end_n = cur_batch_seq_len
+
+    # Apply causal attention pruning and sliding window attention pruning
+    end_n = tl.minimum(end_n, (start_m + 1) * BLOCK_M) if IS_CAUSAL else end_n
+
+    # Calculate the start position for backward sliding window
+    start_n_limit = 0
+    end_n_limit = block_mask * end_n
+
+    for start_n in range(start_n_limit, end_n_limit, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk ----
+        k = tl.load(
+            k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
+            mask=((start_n + offs_n[None, :]) < cur_batch_seq_len) & (mask_d[:, None]),
+            other=0.0,
+        )
+
+        # Apply attention mask (causal + bidirectional sliding window)
+        # Position indices in the sequence
+        pos_q = offs_m[:, None]  # Query positions [BLOCK_M, 1]
+        pos_k = start_n + offs_n[None, :]  # Key positions [1, BLOCK_N]
+
+        # Valid sequence mask
+        mask = pos_k < cur_batch_seq_len
+        # Causal mask
+        if IS_CAUSAL:
+            mask &= pos_q >= pos_k
+
+        # Bidirectional sliding window masks
+        sliding_mask_q = (
+            pos_q - pos_k <= SLIDING_WINDOW_Q if SLIDING_WINDOW_Q > 0 else None
+        )
+        sliding_mask_k = (
+            pos_k - pos_q <= SLIDING_WINDOW_K if SLIDING_WINDOW_K > 0 else None
+        )
+        if sliding_mask_q is not None:
+            mask &= sliding_mask_q
+        if sliding_mask_k is not None:
+            mask &= sliding_mask_k
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.where(mask, 0, float("-inf"))
+        qk += tl.dot(q, k)
+        qk *= sm_scale
+
+        # -- compute m_ij, p, l_ij
+        m_ij = tl.max(qk, 1)
+        # For sliding window there's a chance the max is -inf due to masking of
+        # the entire row. In this case we need to set m_j 0 to avoid NaN
+        m_ij_valid_mask = m_ij > float("-inf")
+        m_ij_masked = tl.where(m_ij_valid_mask, m_ij, 0.0)
+        # -- compute p and l_ij --
+        p = tl.exp(qk - m_ij_masked[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- update m_i and l_i
+        m_i_new = tl.maximum(m_i, m_ij)
+        m_i_new_mask = m_i_new > float("-inf")
+        alpha = tl.exp(m_i - m_i_new)
+        beta = tl.exp(m_ij - m_i_new)
+        # mask alpha and beta for sliding window
+        alpha = tl.where(m_i_new_mask, alpha, 1.0)
+        beta = tl.where(m_i_new_mask, beta, 0.0)
+        l_i_new = alpha * l_i + beta * l_ij
+        # -- update output accumulator --
+        # scale p
+        # For sliding window there's a chance the l_i_new is 0 due to masking
+        # the entire row. We need to set l_i_new 1 to avoid zero division
+        l_i_new_mask = (l_i_new != 0.0) & (m_i_new_mask > float("-inf"))
+        l_i_new_safe = tl.where(l_i_new_mask, l_i_new, 1.0)
+        p_scale = beta / l_i_new_safe
+        p = p * p_scale[:, None]
+        # scale acc
+        acc_scale = l_i / l_i_new_safe * alpha
+        acc = acc * acc_scale[:, None]
+        # update acc
+        v = tl.load(
+            v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
+            mask=((start_n + offs_n[:, None]) < cur_batch_seq_len) & (mask_d[None, :]),
+            other=0.0,
+        )
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v)
+        # update m_i and l_i
+        l_i = l_i_new
+        m_i = m_i_new
+    # initialize pointers to output
+    off_o = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
+        + cur_head * stride_oh
+        + offs_d[None, :]
+    )
+    out_ptrs = Out + off_o
+    tl.store(
+        out_ptrs, acc, mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :])
+    )
+
+
+def get_block_size(dtype: torch.dtype) -> int:
+    if dtype == torch.float32:
+        return 32
+    elif current_platform.is_cuda_alike() and current_platform.has_device_capability(
+        80
+    ):
+        return 128
+    else:
+        return 64
+
+
+def context_attention_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    o: torch.Tensor,
+    b_start_loc: torch.Tensor,
+    b_seq_len: torch.Tensor,
+    max_input_len: int,
+    is_causal: bool = True,
+    softmax_scale: float | None = None,
+    sliding_window_q: int | None = None,
+    sliding_window_k: int | None = None,
+):
+    """
+    q, k, v: [b * s, head, head_dim]
+    b_start_loc: [b]
+    b_seq_len: [b]
+    out: [b * s, head, head_dim]
+    """
+    BLOCK = get_block_size(q.dtype)
+
+    Lq, Lk, _ = q.shape[-1], k.shape[-1], v.shape[-1]
+
+    sm_scale = 1.0 / (Lq**0.5) if softmax_scale is None else softmax_scale
+    batch, head = b_seq_len.shape[0], q.shape[1]
+    kv_group_num = q.shape[1] // k.shape[1]
+
+    grid = (batch, head, triton.cdiv(max_input_len, BLOCK))
+    num_warps = 4 if Lk <= 64 else 8
+
+    sliding_window_q = sliding_window_q if sliding_window_q is not None else 0
+    sliding_window_k = sliding_window_k if sliding_window_k is not None else 0
+
+    _fwd_kernel[grid](
+        q,
+        k,
+        v,
+        sm_scale,
+        b_start_loc,
+        b_seq_len,
+        o,
+        q.stride(0),
+        q.stride(1),
+        k.stride(0),
+        k.stride(1),
+        v.stride(0),
+        v.stride(1),
+        o.stride(0),
+        o.stride(1),
+        kv_group_num=kv_group_num,
+        BLOCK_M=BLOCK,
+        BLOCK_DMODEL=triton.next_power_of_2(Lk),
+        BLOCK_N=BLOCK,
+        IS_CAUSAL=is_causal,
+        SLIDING_WINDOW_Q=sliding_window_q,
+        SLIDING_WINDOW_K=sliding_window_k,
+        num_warps=num_warps,
+        num_stages=1,
+        Lk=Lk,
+    )
diff --git a/vllm/v1/attention/ops/triton_reshape_and_cache_flash.py b/vllm/v1/attention/ops/triton_reshape_and_cache_flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5c9a9c96662c95a2cfd0e13ef5888657c0cad86
--- /dev/null
+++ b/vllm/v1/attention/ops/triton_reshape_and_cache_flash.py
@@ -0,0 +1,395 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+
+
+@triton.jit
+def reshape_and_cache_kernel_flash(
+    key_ptr,  # [num_tokens, num_heads, head_size]
+    value_ptr,  # [num_tokens, num_heads, head_size]
+    key_cache_ptr,  # [num_blocks, block_size, num_heads, head_size]
+    value_cache_ptr,  # [num_blocks, block_size, num_heads, head_size]
+    slot_mapping_ptr,  # [num_tokens]
+    k_scale,  # float32
+    v_scale,  # float32
+    # strides
+    key_stride: tl.int64,
+    value_stride: tl.int64,
+    block_stride: tl.int64,
+    head_stride: tl.int64,
+    dim_stride_k: tl.int64,
+    dim_stride_v: tl.int64,
+    page_stride: tl.int64,
+    num_heads: tl.constexpr,
+    head_size: tl.constexpr,
+    block_size: tl.constexpr,
+    x: tl.constexpr,
+    USE_HEAD_MAJOR_LAYOUT: tl.constexpr,
+    # FP8 flags
+    FP8_KV_CACHE: tl.constexpr,
+    # tune parameters
+    TILE_SIZE: tl.constexpr,
+):
+    token_idx = tl.program_id(axis=0)
+    slot_idx = tl.load(slot_mapping_ptr + token_idx).to(tl.int64)
+    if slot_idx < 0:
+        # Padding token that should be ignored.
+        return
+
+    block_idx = slot_idx // block_size
+    block_offset = slot_idx % block_size
+
+    tile_i = tl.program_id(axis=1)
+    tile_offs = tl.arange(0, TILE_SIZE)
+    tile_pos = tile_i * TILE_SIZE + tile_offs
+    src_key_idx = token_idx * key_stride
+    src_value_idx = token_idx * value_stride
+
+    if USE_HEAD_MAJOR_LAYOUT:
+        # Decompose the tile index back into head and dim coordinates.
+        cur_head = tile_pos // head_size
+        cur_dim = tile_pos % head_size
+        # Value addressing (4D): [Block, Head, Dim, Slot]
+        tgt_idx_v = (
+            block_idx * block_stride
+            + cur_head * head_stride
+            + cur_dim * dim_stride_v
+            + block_offset * 1
+        )
+        # Key addressing (5D): [Block, Head, Dim//8, Slot, 8]
+        tgt_idx_k = (
+            block_idx * block_stride
+            + cur_head * head_stride
+            + (cur_dim // x) * dim_stride_k
+            + block_offset * x
+            + (cur_dim % x)
+        )
+    else:
+        tgt_base = block_idx * block_stride + block_offset * page_stride
+        tgt_idx_k = tgt_base + tile_pos
+        tgt_idx_v = tgt_base + tile_pos
+
+    # [TILE_SIZE]
+    key_load = tl.load(
+        key_ptr + src_key_idx + tile_pos, mask=tile_pos < (num_heads * head_size)
+    )
+    if FP8_KV_CACHE:
+        # tl.store will do the correct implicit cast to fp8,
+        # based on the key_cache_ptr.dtype.element_ty
+        key_tile = key_load if key_load.dtype.is_fp8() else key_load / tl.load(k_scale)
+    else:
+        key_tile = key_load
+
+    # [TILE_SIZE]
+    value_load = tl.load(
+        value_ptr + src_value_idx + tile_pos, mask=tile_pos < (num_heads * head_size)
+    )
+    if FP8_KV_CACHE:
+        if value_load.dtype.is_fp8():
+            value_tile = value_load
+        else:
+            # tl.store will do the correct implicit cast to fp8,
+            #  based on the value_cache_ptr.dtype.element_ty
+            value_tile = value_load / tl.load(v_scale)
+    else:
+        value_tile = value_load
+
+    tl.store(
+        key_cache_ptr + tgt_idx_k,
+        key_tile,
+        mask=tile_pos < (num_heads * head_size),
+    )
+    tl.store(
+        value_cache_ptr + tgt_idx_v,
+        value_tile,
+        mask=tile_pos < (num_heads * head_size),
+    )
+    return
+
+
+def triton_reshape_and_cache_flash(
+    key: torch.Tensor,  # [num_tokens, num_heads, head_size]
+    value: torch.Tensor,  # [num_tokens, num_heads, head_size]
+    # [num_blocks, block_size, num_heads, head_size]
+    key_cache: torch.Tensor,
+    # [num_blocks, block_size, num_heads, head_size]
+    value_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,  # [num_tokens]
+    kv_cache_dtype: str,  # "auto", "fp8"
+    k_scale: torch.Tensor,  # float32
+    v_scale: torch.Tensor,  # float32
+):
+    num_heads = key.shape[1]
+    head_size = key.shape[2]
+
+    use_head_major_layout = key_cache.ndim == 5
+    if use_head_major_layout:
+        block_size = key_cache.shape[3]
+        x = key_cache.shape[4]
+        head_stride = key_cache.stride(1)
+        dim_stride_k = key_cache.stride(2)
+        dim_stride_v = value_cache.stride(2)
+    else:
+        block_size = key_cache.shape[1]
+        x = 1
+        dim_stride_k = 0
+        dim_stride_v = 0
+        head_stride = key_cache.stride()[2]
+    n = num_heads * head_size
+    key_stride = key.stride()[0]
+    value_stride = value.stride()[0]
+    block_stride = key_cache.stride()[0]
+    page_stride = key_cache.stride()[1]
+
+    assert kv_cache_dtype == "auto" or kv_cache_dtype.startswith("fp8"), (
+        f"unsupported kv_cache_dtype (str), got {kv_cache_dtype}."
+    )
+    kv_cache_torch_dtype = (
+        current_platform.fp8_dtype()
+        if kv_cache_dtype.startswith("fp8")
+        else key_cache.dtype
+    )
+
+    if key_cache.dtype != kv_cache_torch_dtype and kv_cache_dtype.startswith("fp8"):
+        # to avoid erounous implicit cast in triton kernel (tl.store to uint8)
+        # (e.g. explicit cast to fp8e4m3fnuz is not supported in triton 3.4)
+        key_cache = key_cache.view(kv_cache_torch_dtype)
+        value_cache = value_cache.view(kv_cache_torch_dtype)
+    assert kv_cache_dtype != torch.uint8, (
+        "explicit fp8 cast and store to "
+        "uint8 is not supported by triton reshape_and_cache_flash"
+    )
+
+    FP8_KV_CACHE = kv_cache_dtype.startswith("fp8")
+    assert (not FP8_KV_CACHE) or kv_cache_torch_dtype in [
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+        torch.uint8,
+        torch.float8_e4m3fnuz,
+    ], (
+        "unsupported dtype of KV cache tensor, got "
+        "{kv_cache_torch_dtype}. Supported kv cache dtypes: fp8e4m3fn, "
+        "fp8e5m2, uint8, bfloat16, float16, float32, fp8e4m3fnuz."
+    )
+
+    # heuristics instead of autotuning
+    TILE_SIZE = min(2048, triton.next_power_of_2(n))
+    if current_platform.is_rocm() or current_platform.is_xpu():
+        num_stages = 4
+        num_warps = 8
+    else:  # cuda
+        num_stages = 10
+        num_warps = 16
+        if torch.cuda.get_device_capability(key.device)[0] < 9:
+            TILE_SIZE = min(512, TILE_SIZE)
+
+    # TODO(ngl): maybe replace with static launch grid to avoid overhead if
+    #   using cudagraphs
+    grid = lambda meta: (
+        slot_mapping.shape[0],
+        triton.cdiv(n, meta["TILE_SIZE"]),
+    )
+
+    reshape_and_cache_kernel_flash[grid](
+        key_ptr=key,
+        value_ptr=value,
+        key_cache_ptr=key_cache,
+        value_cache_ptr=value_cache,
+        slot_mapping_ptr=slot_mapping,
+        k_scale=k_scale,
+        v_scale=v_scale,
+        # strides
+        key_stride=key_stride,
+        value_stride=value_stride,
+        block_stride=block_stride,
+        head_stride=head_stride,
+        dim_stride_k=dim_stride_k,
+        dim_stride_v=dim_stride_v,
+        page_stride=page_stride,
+        num_heads=num_heads,
+        head_size=head_size,
+        block_size=block_size,
+        x=x,
+        USE_HEAD_MAJOR_LAYOUT=use_head_major_layout,
+        # FP8 flags
+        FP8_KV_CACHE=FP8_KV_CACHE,
+        # autotune parameters
+        TILE_SIZE=TILE_SIZE,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+
+@triton.jit
+def reshape_and_cache_kernel_flash_diffkv(
+    key_ptr,  # [num_tokens, num_heads, head_size]
+    value_ptr,  # [num_tokens, num_heads, head_size_v]
+    kv_cache_ptr,  # [num_blocks, block_size, num_heads, head_size + head_size_v]
+    slot_mapping_ptr,  # [num_tokens]
+    k_scale,  # float32
+    v_scale,  # float32
+    # strides
+    key_stride: tl.int64,
+    value_stride: tl.int64,
+    block_stride: tl.int64,
+    page_stride: tl.int64,
+    num_heads: tl.constexpr,
+    head_size_k: tl.constexpr,
+    head_size_v: tl.constexpr,
+    block_size: tl.constexpr,
+    # FP8 flags
+    FP8_KV_CACHE: tl.constexpr,
+    # tune parameters
+    TILE_SIZE: tl.constexpr,
+):
+    token_idx = tl.program_id(axis=0)
+    slot_idx = tl.load(slot_mapping_ptr + token_idx).to(tl.int64)
+    if slot_idx < 0:
+        # Padding token that should be ignored.
+        return
+
+    tile_i = tl.program_id(axis=1)
+    tile_offs = tl.arange(0, TILE_SIZE)
+
+    block_idx = slot_idx // block_size
+    block_offset = slot_idx % block_size
+
+    src_key_idx = token_idx * key_stride + tile_i * head_size_k
+    src_value_idx = token_idx * value_stride + tile_i * head_size_v
+
+    tgt_idx = (
+        block_idx * block_stride
+        + block_offset * page_stride
+        + tile_i * (head_size_k + head_size_v)
+    )
+
+    # [TILE_SIZE]
+    key_load = tl.load(key_ptr + src_key_idx + tile_offs, mask=tile_offs < head_size_k)
+    if FP8_KV_CACHE:
+        # tl.store will do the correct implicit cast to fp8,
+        # based on the key_cache_ptr.dtype.element_ty
+        key_tile = key_load if key_load.dtype.is_fp8() else key_load / tl.load(k_scale)
+    else:
+        key_tile = key_load
+
+    # [TILE_SIZE]
+    value_load = tl.load(
+        value_ptr + src_value_idx + tile_offs, mask=tile_offs < head_size_v
+    )
+    if FP8_KV_CACHE:
+        if value_load.dtype.is_fp8():
+            value_tile = value_load
+        else:
+            # tl.store will do the correct implicit cast to fp8,
+            #  based on the value_cache_ptr.dtype.element_ty
+            value_tile = value_load / tl.load(v_scale)
+    else:
+        value_tile = value_load
+
+    tl.store(
+        kv_cache_ptr + tgt_idx + tile_offs,
+        key_tile,
+        mask=tile_offs < head_size_k,
+    )
+    tl.store(
+        kv_cache_ptr + tgt_idx + head_size_k + tile_offs,
+        value_tile,
+        mask=tile_offs < head_size_v,
+    )
+    return
+
+
+def triton_reshape_and_cache_flash_diffkv(
+    key: torch.Tensor,  # [num_tokens, num_heads, head_size]
+    value: torch.Tensor,  # [num_tokens, num_heads, head_size_v]
+    # [num_blocks, block_size, num_heads, head_size + head_size_v]
+    kv_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,  # [num_tokens]
+    kv_cache_dtype: str,  # "auto", "fp8"
+    k_scale: torch.Tensor,  # float32
+    v_scale: torch.Tensor,  # float32
+):
+    num_heads = key.shape[1]
+    head_size_k = key.shape[2]
+    head_size_v = value.shape[2]
+    block_size = kv_cache.shape[1]
+
+    k_stride = key.stride()[0]
+    v_stride = value.stride()[0]
+    block_stride = kv_cache.stride()[0]
+    page_stride = kv_cache.stride()[1]
+
+    assert kv_cache_dtype == "auto" or kv_cache_dtype.startswith("fp8"), (
+        f"unsupported kv_cache_dtype (str), got {kv_cache_dtype}."
+    )
+    kv_cache_torch_dtype = (
+        current_platform.fp8_dtype()
+        if kv_cache_dtype.startswith("fp8")
+        else kv_cache.dtype
+    )
+
+    if kv_cache.dtype != kv_cache_torch_dtype and kv_cache_dtype.startswith("fp8"):
+        # to avoid erounous implicit cast in triton kernel (tl.store to uint8)
+        # (e.g. explicit cast to fp8e4m3fnuz is not supported in triton 3.4)
+        kv_cache = kv_cache.view(kv_cache_torch_dtype)
+    assert kv_cache_dtype != torch.uint8, (
+        "explicit fp8 cast and store to "
+        "uint8 is not supported by triton reshape_and_cache_flash_diffkv"
+    )
+
+    FP8_KV_CACHE = kv_cache_dtype.startswith("fp8")
+    assert (not FP8_KV_CACHE) or kv_cache_torch_dtype in [
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+        torch.uint8,
+        torch.float8_e4m3fnuz,
+    ], (
+        "unsupported dtype of KV cache tensor, got "
+        "{kv_cache_torch_dtype}. Supported kv cache dtypes: fp8e4m3fn, "
+        "fp8e5m2, uint8, bfloat16, float16, float32, fp8e4m3fnuz."
+    )
+
+    # heuristics instead of autotuning
+    TILE_SIZE = max(head_size_k, head_size_v)
+    TILE_SIZE = triton.next_power_of_2(TILE_SIZE)
+    if current_platform.is_rocm() or current_platform.is_xpu():
+        num_stages = 4
+        num_warps = 8
+    else:  # cuda
+        num_stages = 10
+        num_warps = 16
+
+    # TODO(ngl): maybe replace with static launch grid to avoid overhead if
+    #   using cudagraphs
+    grid = lambda meta: (
+        slot_mapping.shape[0],
+        num_heads,
+    )
+
+    reshape_and_cache_kernel_flash_diffkv[grid](
+        key_ptr=key,
+        value_ptr=value,
+        kv_cache_ptr=kv_cache,
+        slot_mapping_ptr=slot_mapping,
+        k_scale=k_scale,
+        v_scale=v_scale,
+        # strides
+        key_stride=k_stride,
+        value_stride=v_stride,
+        block_stride=block_stride,
+        page_stride=page_stride,
+        num_heads=num_heads,
+        head_size_k=head_size_k,
+        head_size_v=head_size_v,
+        block_size=block_size,
+        # FP8 flags
+        FP8_KV_CACHE=FP8_KV_CACHE,
+        # autotune parameters
+        TILE_SIZE=TILE_SIZE,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/v1/attention/ops/triton_unified_attention.py
similarity index 94%
rename from vllm/attention/ops/triton_unified_attention.py
rename to vllm/v1/attention/ops/triton_unified_attention.py
index f61c8e9b89c24810542c02891af70ad88aa77a15..345889969d3e1cc9a837b799e14c5fcc165d9d22 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/v1/attention/ops/triton_unified_attention.py
@@ -189,9 +189,14 @@ def kernel_unified_attention_2d(
         + 1
     )
 
-    # adjust for potential padding in the last q_block by considering the
-    # actual sequence length
-    max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len)
+    if USE_MM_PREFIX:
+        # image bidirectional attention ranges require a full range
+        # including q_block padding to make sure doc mask is correct
+        max_seq_prefix_len = tl.maximum(max_seq_prefix_len, seq_len)
+    else:
+        # adjust for potential padding in the last q_block by considering the
+        # actual sequence length
+        max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len)
 
     # calculate the number of tiles that need to be processed to
     # cover the longest sequence prefix (due to causal masking, tiles beyond
@@ -202,7 +207,8 @@ def kernel_unified_attention_2d(
     # Default: keep previous global behavior
     tile_start = 0
     tile_end = num_tiles
-    if SLIDING_WINDOW > 0:
+    # TODO(Isotr0py): sliding window pruning with image bidirectional mask
+    if SLIDING_WINDOW > 0 and not USE_MM_PREFIX:
         # Query rows covered by this Q-block
         qpos_lo = q_block_local_idx * BLOCK_Q
         qpos_hi = tl.minimum(
@@ -357,6 +363,12 @@ def kernel_unified_attention_2d(
         L = L * alpha + l_j
         M = m_j
 
+        if SLIDING_WINDOW:
+            qpos_lo = q_block_local_idx * BLOCK_Q
+            V = tl.where(
+                (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0
+            )
+
         # acc : (BLOCK_M, HEAD_SIZE_PADDED)
         acc += tl.dot(P.to(V.dtype), V)
 
@@ -533,10 +545,33 @@ def kernel_unified_attention_3d(
     # this prefix can be skipped)
     num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE)
 
-    # iterate through tiles within current segment
+    # ---- Sliding-window tile pruning --------------------
+    # Default: keep previous global behavior
+    tile_start = 0
+    tile_end = num_tiles
+    # TODO(Isotr0py): sliding window pruning with image bidirectional mask
+    if SLIDING_WINDOW > 0 and not USE_MM_PREFIX:
+        # Query rows covered by this Q-block
+        qpos_lo = q_block_local_idx * BLOCK_Q
+        qpos_hi = tl.minimum(
+            qpos_lo + (BLOCK_M - 1) // num_queries_per_kv,
+            cur_batch_query_len - 1,
+        )
+        # For sliding window, each query position q can only attend to
+        # keys in the range [q_abs - SLIDING_WINDOW + 1, q_abs]
+        # where q_abs = context_len + q
+        # The union of allowed key positions for this Q-block is:
+        # [context_len + qpos_lo - SLIDING_WINDOW + 1, context_len + qpos_hi]
+        first_allowed_key = context_len + qpos_lo - SLIDING_WINDOW + 1
+        last_allowed_key = context_len + qpos_hi
+        # Convert to tile indices and clamp
+        tile_start = tl.maximum(0, first_allowed_key // TILE_SIZE)
+        tile_end = tl.minimum((last_allowed_key // TILE_SIZE) + 1, num_tiles)
+
+    # iterate through tiles (now limited to the sliding window range)
     for j in range(
-        segm_idx * tiles_per_segment,
-        min((segm_idx + 1) * tiles_per_segment, num_tiles),
+        max(segm_idx * tiles_per_segment, tile_start),
+        min((segm_idx + 1) * tiles_per_segment, tile_end),
     ):
         seq_offset = j * TILE_SIZE + offs_t
         tile_mask = seq_offset < max_seq_prefix_len
@@ -672,6 +707,12 @@ def kernel_unified_attention_3d(
         L = L * alpha + l_j
         M = m_j
 
+        if SLIDING_WINDOW:
+            qpos_lo = q_block_local_idx * BLOCK_Q
+            V = tl.where(
+                (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0
+            )
+
         # acc : (BLOCK_M, HEAD_SIZE_PADDED)
         acc += tl.dot(P.to(V.dtype), V)
 
diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py
similarity index 60%
rename from vllm/attention/ops/vit_attn_wrappers.py
rename to vllm/v1/attention/ops/vit_attn_wrappers.py
index 892c4209c01e04c36d56642a193d325bf10ddfad..f077a61c984f91b4623709502c281aed7509966e 100644
--- a/vllm/attention/ops/vit_attn_wrappers.py
+++ b/vllm/v1/attention/ops/vit_attn_wrappers.py
@@ -24,15 +24,29 @@ def flash_attn_maxseqlen_wrapper(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    cu_seqlens: torch.Tensor,
-    max_seqlen: torch.Tensor,
     batch_size: int,
     is_rocm_aiter: bool,
+    fa_version: int | None,
+    scale: float | None = None,
+    cu_seqlens: torch.Tensor | None = None,
+    max_seqlen: torch.Tensor | None = None,
 ) -> torch.Tensor:
+    kwargs = {}
     if is_rocm_aiter:
         from aiter import flash_attn_varlen_func
     else:
-        from vllm.attention.utils.fa_utils import flash_attn_varlen_func
+        from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func
+
+        if not current_platform.is_rocm() and fa_version is not None:
+            kwargs["fa_version"] = fa_version
+
+    q_len = q.size(1)
+    if cu_seqlens is None:
+        cu_seqlens = torch.arange(
+            0, (batch_size + 1) * q_len, step=q_len, dtype=torch.int32, device=q.device
+        )
+    max_seqlen = q_len if max_seqlen is None else max_seqlen.item()
+
     q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
     output = flash_attn_varlen_func(
         q,
@@ -40,10 +54,12 @@ def flash_attn_maxseqlen_wrapper(
         v,
         cu_seqlens_q=cu_seqlens,
         cu_seqlens_k=cu_seqlens,
-        max_seqlen_q=max_seqlen.item(),
-        max_seqlen_k=max_seqlen.item(),
+        max_seqlen_q=max_seqlen,
+        max_seqlen_k=max_seqlen,
         dropout_p=0.0,
         causal=False,
+        softmax_scale=scale,
+        **kwargs,
     )
     context_layer = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size)
     return context_layer
@@ -53,10 +69,12 @@ def flash_attn_maxseqlen_wrapper_fake(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    cu_seqlens: torch.Tensor,
-    max_seqlen: torch.Tensor,
     batch_size: int,
     is_rocm_aiter: bool,
+    fa_version: int | None,
+    scale: float | None = None,
+    cu_seqlens: torch.Tensor | None = None,
+    max_seqlen: torch.Tensor | None = None,
 ) -> torch.Tensor:
     return torch.empty_like(q)
 
@@ -72,23 +90,50 @@ def vit_flash_attn_wrapper(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    cu_seqlens: torch.Tensor,
-    max_seqlen: torch.Tensor,
     batch_size: int,
     is_rocm_aiter: bool,
+    fa_version: int | None,
+    scale: float | None = None,
+    cu_seqlens: torch.Tensor | None = None,
+    max_seqlen: torch.Tensor | None = None,
 ) -> torch.Tensor:
     return torch.ops.vllm.flash_attn_maxseqlen_wrapper(
-        q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter
+        q,
+        k,
+        v,
+        batch_size,
+        is_rocm_aiter,
+        fa_version,
+        scale,
+        cu_seqlens,
+        max_seqlen,
     )
 
 
+def apply_sdpa(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    scale: float | None = None,
+) -> torch.Tensor:
+    """
+    Input shape:
+    (batch_size x seq_len x num_heads x head_size)
+    """
+    q, k, v = (einops.rearrange(x, "b s h d -> b h s d") for x in [q, k, v])
+    output = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, scale=scale)
+    output = einops.rearrange(output, "b h s d -> b s h d ")
+    return output
+
+
 # TODO: Once we have a torch 2.10, we can use tensor slices
 # so we won't need to wrap this in custom ops
 def torch_sdpa_wrapper(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    cu_seqlens: torch.Tensor,
+    scale: float | None = None,
+    cu_seqlens: torch.Tensor | None = None,
 ) -> torch.Tensor:
     # Never remove the contiguous logic for ROCm
     # Without it, hallucinations occur with the backend
@@ -97,6 +142,9 @@ def torch_sdpa_wrapper(
         k = k.contiguous()
         v = v.contiguous()
 
+    if cu_seqlens is None:
+        return apply_sdpa(q, k, v, scale=scale)
+
     outputs = []
 
     lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
@@ -104,11 +152,7 @@ def torch_sdpa_wrapper(
     k_chunks = torch.split(k, lens, dim=1)
     v_chunks = torch.split(v, lens, dim=1)
     for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks):
-        q_i, k_i, v_i = (
-            einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
-        )
-        output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
-        output_i = einops.rearrange(output_i, "b h s d -> b s h d ")
+        output_i = apply_sdpa(q_i, k_i, v_i, scale=scale)
         outputs.append(output_i)
     context_layer = torch.cat(outputs, dim=1)
     return context_layer
@@ -118,7 +162,8 @@ def torch_sdpa_wrapper_fake(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    cu_seqlens: torch.Tensor,
+    scale: float | None,
+    cu_seqlens: torch.Tensor | None,
 ) -> torch.Tensor:
     return torch.empty_like(q)
 
@@ -134,6 +179,7 @@ def vit_torch_sdpa_wrapper(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    cu_seqlens: torch.Tensor,
+    scale: float | None = None,
+    cu_seqlens: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, cu_seqlens)
+    return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, scale, cu_seqlens)
diff --git a/vllm/attention/selector.py b/vllm/v1/attention/selector.py
similarity index 97%
rename from vllm/attention/selector.py
rename to vllm/v1/attention/selector.py
index e66f698add99d534869f395c0731f74bcf34d030..e364c3235cfebac225926f63f9d48e457791990a 100644
--- a/vllm/attention/selector.py
+++ b/vllm/v1/attention/selector.py
@@ -6,14 +6,14 @@ from typing import NamedTuple, cast, get_args
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend, AttentionType
-from vllm.attention.backends.registry import (
-    MAMBA_TYPE_TO_BACKEND_MAP,
-    MambaAttentionBackendEnum,
-)
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
 from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.v1.attention.backend import AttentionBackend, AttentionType
+from vllm.v1.attention.backends.registry import (
+    MAMBA_TYPE_TO_BACKEND_MAP,
+    MambaAttentionBackendEnum,
+)
 
 logger = init_logger(__name__)
 
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index c779e3d34b3ed899af59655b972457beb91d9eb5..cf93218a18731c251b2593f3c2570b6a731b1b40 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -254,6 +254,10 @@ class BlockPool:
             [] if self.enable_kv_cache_events else None
         )
         for i, blk in enumerate(new_full_blocks):
+            # Some blocks may be null blocks when enabling sparse attention like
+            # sliding window attention. We skip null blocks here.
+            if blk.is_null:
+                continue
             assert blk.block_hash is None
             block_hash = new_block_hashes[i]
 
@@ -270,10 +274,8 @@ class BlockPool:
             if num_cached_blocks == 0:
                 parent_block_hash: ExternalBlockHash | None = None
             else:
-                parent_block = blocks[num_cached_blocks - 1]
-                assert parent_block.block_hash is not None
                 parent_block_hash = maybe_convert_block_hash(
-                    get_block_hash(parent_block.block_hash)
+                    block_hashes[num_cached_blocks - 1]
                 )
 
             self.kv_event_queue.append(
@@ -288,6 +290,9 @@ class BlockPool:
                     if request.lora_request
                     else None,
                     medium=MEDIUM_GPU,
+                    lora_name=request.lora_request.name
+                    if request.lora_request
+                    else None,
                 )
             )
 
@@ -363,7 +368,7 @@ class BlockPool:
             )
         return True
 
-    def touch(self, blocks: tuple[Sequence[KVCacheBlock], ...]) -> None:
+    def touch(self, blocks: Sequence[KVCacheBlock]) -> None:
         """Touch a block increases its reference count by 1, and may remove
         the block from the free queue. This is used when a block is hit by
         another request with the same prefix.
@@ -371,15 +376,14 @@ class BlockPool:
         Args:
             blocks: A list of blocks to touch.
         """
-        for blocks_per_group in blocks:
-            for block in blocks_per_group:
-                # ref_cnt=0 means this block is in the free list (i.e. eviction
-                # candidate), so remove it.
-                if block.ref_cnt == 0 and not block.is_null:
-                    self.free_block_queue.remove(block)
-                block.ref_cnt += 1
-                if self.metrics_collector:
-                    self.metrics_collector.on_block_accessed(block)
+        for block in blocks:
+            # ref_cnt=0 means this block is in the free list (i.e. eviction
+            # candidate), so remove it.
+            if block.ref_cnt == 0 and not block.is_null:
+                self.free_block_queue.remove(block)
+            block.ref_cnt += 1
+            if self.metrics_collector:
+                self.metrics_collector.on_block_accessed(block)
 
     def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
         """Free a list of blocks. The blocks should be ordered by their
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 4b09b76c1c5918da07471aa817805ba451511b56..4550e2b7956207a6e266ef2e189e6dc278faccea 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -14,7 +14,7 @@ from vllm.v1.core.kv_cache_utils import (
 )
 from vllm.v1.core.single_type_kv_cache_manager import (
     CrossAttentionManager,
-    FullAttentionManager,
+    SingleTypeKVCacheManager,
     get_manager_for_kv_cache_spec,
 )
 from vllm.v1.kv_cache_interface import (
@@ -60,6 +60,7 @@ class KVCacheCoordinator(ABC):
             get_manager_for_kv_cache_spec(
                 kv_cache_spec=kv_cache_group.kv_cache_spec,
                 block_pool=self.block_pool,
+                enable_caching=enable_caching,
                 kv_cache_group_id=i,
                 dcp_world_size=dcp_world_size,
                 pcp_world_size=pcp_world_size,
@@ -73,6 +74,7 @@ class KVCacheCoordinator(ABC):
         num_tokens: int,
         new_computed_blocks: tuple[Sequence[KVCacheBlock], ...],
         num_encoder_tokens: int,
+        total_computed_tokens: int,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -85,9 +87,10 @@ class KVCacheCoordinator(ABC):
                 prefix caching.
             num_encoder_tokens: The number of encoder tokens for allocating
                 blocks for cross-attention.
+            total_computed_tokens: Include both local and external tokens.
 
         Returns:
-            The number of blocks.
+            The number of blocks to allocate.
         """
         num_blocks_to_allocate = 0
         for i, manager in enumerate(self.single_type_managers):
@@ -95,30 +98,48 @@ class KVCacheCoordinator(ABC):
                 # For cross-attention, we issue a single static allocation
                 # of blocks based on the number of encoder input tokens.
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                    request_id, num_encoder_tokens, []
+                    request_id, num_encoder_tokens, [], 0
                 )
             else:
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                    request_id, num_tokens, new_computed_blocks[i]
+                    request_id,
+                    num_tokens,
+                    new_computed_blocks[i],
+                    total_computed_tokens,
                 )
         return num_blocks_to_allocate
 
-    def save_new_computed_blocks(
-        self, request_id: str, new_computed_blocks: tuple[Sequence[KVCacheBlock], ...]
+    def allocate_new_computed_blocks(
+        self,
+        request_id: str,
+        new_computed_blocks: tuple[Sequence[KVCacheBlock], ...],
+        num_local_computed_tokens: int,
+        num_external_computed_tokens: int,
     ) -> None:
         """
-        Add the new computed blocks to the request.
+        Add the new computed blocks to the request. Optionally allocate new
+            blocks for external computed tokens (if any).
 
         Args:
             request_id: The request ID.
             new_computed_blocks: The new computed blocks just hitting the
                 prefix cache.
+            num_local_computed_tokens: The number of local computed tokens.
+            num_external_computed_tokens: The number of external computed tokens.
         """
         for i, manager in enumerate(self.single_type_managers):
-            manager.save_new_computed_blocks(request_id, new_computed_blocks[i])
+            manager.allocate_new_computed_blocks(
+                request_id,
+                new_computed_blocks[i],
+                num_local_computed_tokens,
+                num_external_computed_tokens,
+            )
 
     def allocate_new_blocks(
-        self, request_id: str, num_tokens: int, num_encoder_tokens: int = 0
+        self,
+        request_id: str,
+        num_tokens: int,
+        num_encoder_tokens: int = 0,
     ) -> tuple[list[KVCacheBlock], ...]:
         """
         Allocate new blocks for the request to give it at least `num_tokens`
@@ -184,17 +205,20 @@ class KVCacheCoordinator(ABC):
             for manager in self.single_type_managers
         ]
 
-    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
+    def remove_skipped_blocks(
+        self, request_id: str, total_computed_tokens: int
+    ) -> None:
         """
         Remove the blocks that are no longer needed from `blocks` and replace
         the removed blocks with null_block.
 
         Args:
             request_id: The request ID.
-            num_computed_tokens: The number of tokens that have been computed.
+            total_computed_tokens: The total number of computed tokens, including
+                local computed tokens and external computed tokens.
         """
         for manager in self.single_type_managers:
-            manager.remove_skipped_blocks(request_id, num_computed_tokens)
+            manager.remove_skipped_blocks(request_id, total_computed_tokens)
 
     def get_blocks(self, request_id: str) -> tuple[list[KVCacheBlock], ...]:
         """
@@ -330,9 +354,6 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
     """
     KV cache coordinator for hybrid models with multiple KV cache types, and
     thus multiple kv cache groups.
-    To simplify `find_longest_cache_hit`, it only supports the combination of
-    two types of KV cache groups, and one of them must be full attention.
-    May extend to more general cases in the future.
     """
 
     def __init__(
@@ -373,70 +394,46 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
 
     def verify_and_split_kv_cache_groups(self) -> None:
         """
-        Verifies that the model has exactly two types of KV cache groups, and
-        one of them is full attention. Then, split the kv cache groups into full
-        attention groups and other groups.
+        Groups KV cache groups by their spec type for efficient batch processing
+        during cache hit lookup.
         """
-        full_attention_spec: FullAttentionSpec | None = None
-        other_spec: KVCacheSpec | None = None
-        self.full_attention_group_ids: list[int] = []
-        self.other_group_ids: list[int] = []
+        attention_groups: list[
+            tuple[KVCacheSpec, list[int], type[SingleTypeKVCacheManager]]
+        ] = []
+
         for i, g in enumerate(self.kv_cache_config.kv_cache_groups):
-            if isinstance(g.kv_cache_spec, FullAttentionSpec):
-                if full_attention_spec is None:
-                    full_attention_spec = g.kv_cache_spec
-                else:
-                    assert full_attention_spec == g.kv_cache_spec, (
-                        "HybridKVCacheCoordinator assumes exactly one type of "
-                        "full attention groups now."
+            manager_cls = self.single_type_managers[i].__class__
+            spec = g.kv_cache_spec
+
+            # Try to find an existing group with the same spec
+            for existing_spec, group_ids, existing_cls in attention_groups:
+                if existing_spec == spec:
+                    assert manager_cls is existing_cls, (
+                        "Expected same manager class for identical KV cache specs."
                     )
-                self.full_attention_group_ids.append(i)
+                    group_ids.append(i)
+                    break
             else:
-                if other_spec is None:
-                    other_spec = g.kv_cache_spec
-                else:
-                    assert other_spec == g.kv_cache_spec, (
-                        "HybridKVCacheCoordinator assumes "
-                        "exactly one other type of groups now."
-                    )
-                self.other_group_ids.append(i)
+                attention_groups.append((spec, [i], manager_cls))
 
-        assert full_attention_spec is not None, (
-            "HybridKVCacheCoordinator assumes exactly one type of full "
-            "attention groups now."
+        assert len(attention_groups) > 1, (
+            "HybridKVCacheCoordinator requires at least two attention groups."
         )
-        assert other_spec is not None, (
-            "HybridKVCacheCoordinator assumes exactly one type of other groups now."
+
+        # Put full attention first: its efficient left-to-right scan provides
+        # a tighter initial bound, reducing work for subsequent groups.
+        self.attention_groups = sorted(
+            attention_groups,
+            key=lambda x: not isinstance(x[0], FullAttentionSpec),
         )
 
-        self.full_attention_manager_cls = FullAttentionManager
-        self.other_attention_cls = self.single_type_managers[
-            self.other_group_ids[0]
-        ].__class__
-        self.full_attention_spec = full_attention_spec
-        self.other_spec = other_spec
-        self.full_attention_block_size = self.full_attention_spec.block_size
-        self.other_block_size = self.other_spec.block_size
-        # The LCM of the block sizes of full attention and other attention.
+        # The LCM of the block sizes of all attention types.
         # The cache hit length must be a multiple of the LCM of the block sizes
         # to make sure the cache hit length is a multiple of the block size of
         # each attention type. Requiring this because we don't support partial
         # block cache hit yet.
-        self.lcm_block_size = lcm(self.full_attention_block_size, self.other_block_size)
-
-        if max(self.full_attention_group_ids) < min(self.other_group_ids):
-            self.full_attn_first = True
-        elif max(self.other_group_ids) < min(self.full_attention_group_ids):
-            self.full_attn_first = False
-        else:
-            raise ValueError(
-                "HybridKVCacheCoordinator assumes the full "
-                "attention group ids and other attention group ids "
-                "do not interleave, either full attention group ids "
-                "are before other attention group ids or vice versa."
-                "This is for simplifying merging hit_blocks_full_attn and "
-                "hit_blocks_other_attn to hit_blocks."
-            )
+        block_sizes = [spec.block_size for spec, _, _ in attention_groups]
+        self.lcm_block_size = lcm(*block_sizes)
 
     def find_longest_cache_hit(
         self,
@@ -444,7 +441,12 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
         max_cache_hit_length: int,
     ) -> tuple[tuple[list[KVCacheBlock], ...], int]:
         """
-        Find the longest cache hit for the request.
+        Find the longest cache hit using an iterative fixed-point algorithm.
+
+        Each attention type either accepts the current candidate length or
+        reduces it. If any type reduces the length, restart checks over all
+        types. This converges because length monotonically decreases and is
+        bounded below by 0.
 
         Args:
             block_hashes: The block hashes of the request.
@@ -452,75 +454,63 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
 
         Returns:
             A tuple containing:
-                - A list of the cache hit blocks for each single type manager.
+                - A tuple of the cache hit blocks for each single type manager.
                 - The number of tokens of the longest cache hit.
         """
-        # First, find the longest cache hit for full attention.
-        if self.full_attention_spec.block_size == self.hash_block_size:
-            # Common case.
-            full_attention_block_hashes: BlockHashList = block_hashes
-        else:
-            # block_size is a multiple of hash_block_size. This happens when different
-            # KV cache groups have different block sizes. In this case, we need to
-            # recalculate block_hashes at the granularity of block_size, using the
-            # original block_hashes (at the granularity of hash_block_size).
-            full_attention_block_hashes = BlockHashListWithBlockSize(
-                block_hashes, self.hash_block_size, self.full_attention_spec.block_size
-            )
-        hit_blocks_full_attn = self.full_attention_manager_cls.find_longest_cache_hit(
-            block_hashes=full_attention_block_hashes,
-            max_length=max_cache_hit_length,
-            kv_cache_group_ids=self.full_attention_group_ids,
-            block_pool=self.block_pool,
-            kv_cache_spec=self.full_attention_spec,
-            use_eagle=self.use_eagle,
-            alignment_tokens=self.lcm_block_size,
-        )
-        hit_length = len(hit_blocks_full_attn[0]) * self.full_attention_block_size
-
-        # Next, find the cache hit for the other attention WITHIN
-        # the cache hit of full attention.
-        if self.other_spec.block_size == self.hash_block_size:
-            # Common case.
-            other_block_hashes: BlockHashList = block_hashes
-        else:
-            # Similar to the full attention case, here we need to recalculate
-            # block_hashes at the granularity of block_size, using the original
-            # block_hashes (at the granularity of hash_block_size).
-            other_block_hashes = BlockHashListWithBlockSize(
-                block_hashes, self.hash_block_size, self.other_spec.block_size
+
+        def _get_block_hashes(kv_cache_spec: KVCacheSpec) -> BlockHashList:
+            if kv_cache_spec.block_size == self.hash_block_size:
+                return block_hashes
+            return BlockHashListWithBlockSize(
+                block_hashes, self.hash_block_size, kv_cache_spec.block_size
             )
-        hit_blocks_other_attn = self.other_attention_cls.find_longest_cache_hit(
-            block_hashes=other_block_hashes,
-            max_length=hit_length,
-            kv_cache_group_ids=self.other_group_ids,
-            block_pool=self.block_pool,
-            kv_cache_spec=self.other_spec,
-            use_eagle=self.use_eagle,
-            alignment_tokens=self.lcm_block_size,
-        )
-        hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size
-
-        # NOTE: the prefix cache hit length must be a multiple of block_size as
-        # we don't support partial block cache hit yet. The cache hit length
-        # of other attention is ensured to be a multiple of the block size of
-        # full attention layers in current implementation, because hit_length is
-        # a multiple of other attention's block size, and other attention's
-        # block size is a multiple of full attention's block size (verified in
-        # `verify_and_split_kv_cache_groups`).
-        assert hit_length % self.full_attention_block_size == 0
-
-        # Truncate the full attention cache hit to the length of the
-        # cache hit of the other attention.
-        for group_hit_blocks in hit_blocks_full_attn:
-            del group_hit_blocks[hit_length // self.full_attention_block_size :]
-
-        # Merge the hit blocks of full attention and other attention.
-        if self.full_attn_first:
-            hit_blocks = hit_blocks_full_attn + hit_blocks_other_attn
-        else:
-            hit_blocks = hit_blocks_other_attn + hit_blocks_full_attn
-        return hit_blocks, hit_length
+
+        num_groups = len(self.kv_cache_config.kv_cache_groups)
+        hit_length = max_cache_hit_length
+        hit_blocks_by_group: list[list[KVCacheBlock] | None] = [None] * num_groups
+
+        while True:
+            curr_hit_length = hit_length
+
+            for spec, group_ids, manager_cls in self.attention_groups:
+                is_full_attn = isinstance(spec, FullAttentionSpec)
+
+                # Full attention: reuse cached blocks (downward-closed property)
+                cached_blocks = hit_blocks_by_group[group_ids[0]]
+                if is_full_attn and cached_blocks is not None:
+                    # For full attention, we only need to compute the cache hit
+                    # length once. Starting from the second iteration, if the
+                    # curr_hit_length is reduced by other groups, we can simply
+                    # keep the first (curr_hit_length // block_size) blocks from
+                    # the last iteration.
+                    num_blocks = curr_hit_length // spec.block_size
+                    curr_hit_length = num_blocks * spec.block_size
+                    for group_id in group_ids:
+                        blocks = hit_blocks_by_group[group_id]
+                        assert blocks is not None
+                        del blocks[num_blocks:]
+                else:
+                    hit_blocks = manager_cls.find_longest_cache_hit(
+                        block_hashes=_get_block_hashes(spec),
+                        max_length=curr_hit_length,
+                        kv_cache_group_ids=group_ids,
+                        block_pool=self.block_pool,
+                        kv_cache_spec=spec,
+                        use_eagle=self.use_eagle,
+                        alignment_tokens=self.lcm_block_size,
+                    )
+                    curr_hit_length = len(hit_blocks[0]) * spec.block_size
+                    for group_id, blocks in zip(group_ids, hit_blocks):
+                        hit_blocks_by_group[group_id] = blocks
+
+            if curr_hit_length < hit_length:
+                hit_length = curr_hit_length
+            else:
+                break
+
+        return tuple(
+            blocks if blocks is not None else [] for blocks in hit_blocks_by_group
+        ), hit_length
 
 
 def get_kv_cache_coordinator(
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 13086a66f6ea6f4bdbd1dbad51cb35cd3cab07f3..2197107c1fc6d2cd1d96ef6fca55834a7befc878 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -210,6 +210,7 @@ class KVCacheManager:
         num_new_computed_tokens: int = 0,
         new_computed_blocks: KVCacheBlocks | None = None,
         num_lookahead_tokens: int = 0,
+        num_external_computed_tokens: int = 0,
         delay_cache_blocks: bool = False,
         num_encoder_tokens: int = 0,
     ) -> KVCacheBlocks | None:
@@ -217,16 +218,16 @@ class KVCacheManager:
 
         Args:
             request: The request to allocate slots.
-            num_new_tokens: The number of tokens to allocate, including external
-                tokens. Note that this does not include tokens that have
-                already been computed locally (i.e. new_computed_blocks).
+            num_new_tokens: The number of new tokens to be allocated and computed.
             num_new_computed_tokens: The number of new computed tokens just
                 hitting the prefix caching, excluding external tokens.
             new_computed_blocks: The cached blocks for the above new computed
-                tokens.
+                tokens, grouped as a tuple by kv cache groups.
             num_lookahead_tokens: The number of speculative tokens to allocate.
                 This is used by spec decode proposers with kv-cache such
                 as eagle.
+            num_external_computed_tokens: The number of tokens that their
+                KV caches are not cached by vLLM but cached by the connector.
             delay_cache_blocks: Whether to skip caching the blocks. This is
                 used by P/D when allocating blocks used in a KV transfer
                 which will complete in a future step.
@@ -236,29 +237,81 @@ class KVCacheManager:
 
         Blocks layout:
         ```
-        -----------------------------------------------------------------------
-        | < computed > | < new computed > |    < new >    | < pre-allocated > |
-        -----------------------------------------------------------------------
-        |                  < required >                   |
-        --------------------------------------------------
-        |                    < full >                  |
-        ------------------------------------------------
-                                          | <new full> |
-                                          --------------
+        ----------------------------------------------------------------------
+        | < comp > | < new_comp > | < ext_comp >  | < new >  | < lookahead > |
+        ----------------------------------------------------------------------
+                                                  |   < to be computed >     |
+        ----------------------------------------------------------------------
+                                  |            < to be allocated >           |
+        ----------------------------------------------------------------------
+                                  | < to be cached (roughly, |
+                                  | details below)>          |
+        ----------------------------------------------------------------------
+        | Prefix-cached tokens from either vLLM   |
+        | or connector. Can be safely removed if  |
+        | they are outside sliding window.        |
+        ----------------------------------------------------------------------
+        |   < cached by vLLM >    | not cached by |
+                                  | vLLM, but     |
+        | ref_cnt  | ref_cnt not  | cached by     |
+        | increased| increased yet| connector     |
+        ----------------------------------------------------------------------
         ```
-        The following *_blocks are illustrated in this layout.
+
+        Abbrivations:
+
+        ```
+        comp      = request.num_computed_tokens
+        new_comp  = num_new_computed_tokens
+                  = len(new_computed_blocks) * block_size
+        ext_comp  = num_external_computed_tokens, cached by the connector
+        new       = num_new_tokens, including unverified draft tokens
+        lookahead = num_lookahead_tokens
+        ```
+
+        NOTE: for new tokens which include both verified and unverified draft
+        tokens, we only cache the verified tokens (by capping the number at
+        `request.num_tokens`).
+
+        The allocation has three stages:
+        - Free unnecessary blocks in `comp` and check
+           if we have sufficient free blocks (return None if not).
+        - Handle prefix tokens (`comp + new_comp + ext_comp`):
+            - Free unnecessary blocks (e.g. outside sliding window)
+            - Allocate new blocks for `ext_comp` tokens inside
+              sliding window
+        - Allocate new blocks for tokens to be computed (`new + lookahead`)
 
         Returns:
             A list of new allocated blocks.
         """
-        if num_new_tokens == 0:
-            raise ValueError("num_new_tokens must be greater than 0")
+        # When loading KV data asynchronously, we may have zero new tokens to
+        # compute while still allocating slots for externally computed tokens.
+        if num_new_tokens == 0 and num_external_computed_tokens == 0:
+            raise ValueError(
+                "num_new_tokens must be greater than 0 when there are no "
+                "external computed tokens"
+            )
 
         if new_computed_blocks is not None:
             new_computed_block_list = new_computed_blocks.blocks
         else:
             new_computed_block_list = self.empty_kv_cache_blocks.blocks
 
+        # The number of computed tokens is the number of computed tokens plus
+        # the new prefix caching hits
+        num_local_computed_tokens = (
+            request.num_computed_tokens + num_new_computed_tokens
+        )
+        total_computed_tokens = min(
+            num_local_computed_tokens + num_external_computed_tokens,
+            self.max_model_len,
+        )
+        num_tokens_need_slot = min(
+            total_computed_tokens + num_new_tokens + num_lookahead_tokens,
+            self.max_model_len,
+        )
+
         # Free the blocks that are skipped during the attention computation
         # (e.g., tokens outside the sliding window).
         # We can do this even if we cannot schedule this request due to
@@ -266,15 +319,7 @@ class KVCacheManager:
         # Should call this function before allocating new blocks to reduce
         # the number of evicted blocks.
         self.coordinator.remove_skipped_blocks(
-            request.request_id, request.num_computed_tokens
-        )
-
-        # The number of computed tokens is the number of computed tokens plus
-        # the new prefix caching hits
-        num_computed_tokens = request.num_computed_tokens + num_new_computed_tokens
-        num_tokens_need_slot = min(
-            num_computed_tokens + num_new_tokens + num_lookahead_tokens,
-            self.max_model_len,
+            request.request_id, total_computed_tokens
         )
 
         num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
@@ -282,25 +327,25 @@ class KVCacheManager:
             num_tokens=num_tokens_need_slot,
             new_computed_blocks=new_computed_block_list,
             num_encoder_tokens=num_encoder_tokens,
+            total_computed_tokens=num_local_computed_tokens
+            + num_external_computed_tokens,
         )
 
         if num_blocks_to_allocate > self.block_pool.get_num_free_blocks():
             # Cannot allocate new blocks
             return None
 
-        # Touch the computed blocks to make sure they won't be evicted.
-        if self.enable_caching:
-            self.block_pool.touch(new_computed_block_list)
-        else:
-            assert not any(new_computed_block_list), (
-                "Computed blocks should be empty when prefix caching is disabled"
-            )
-
-        if new_computed_block_list is not self.empty_kv_cache_blocks.blocks:
+        if (
+            new_computed_block_list is not self.empty_kv_cache_blocks.blocks
+            or num_external_computed_tokens > 0
+        ):
             # Append the new computed blocks to the request blocks until now to
             # avoid the case where the new blocks cannot be allocated.
-            self.coordinator.save_new_computed_blocks(
-                request.request_id, new_computed_block_list
+            self.coordinator.allocate_new_computed_blocks(
+                request_id=request.request_id,
+                new_computed_blocks=new_computed_block_list,
+                num_local_computed_tokens=num_local_computed_tokens,
+                num_external_computed_tokens=num_external_computed_tokens,
             )
 
         new_blocks = self.coordinator.allocate_new_blocks(
@@ -312,12 +357,14 @@ class KVCacheManager:
         if not self.enable_caching or delay_cache_blocks:
             return self.create_kv_cache_blocks(new_blocks)
 
-        # NOTE(woosuk): We want to commit (cache) up to num_computed_tokens +
-        # num_new_tokens, but must exclude "non-committable" tokens (e.g.,
-        # draft tokens that could be rejected). Therefore, we cap the number
-        # at `request.num_tokens`, ensuring only "finalized" tokens are cached.
+        # NOTE(woosuk): We want to commit (cache) up to num_local_computed_tokens
+        # + num_external_computed_tokens + num_new_tokens, but must exclude
+        # "non-committable" tokens (e.g., draft tokens that could be rejected).
+        # Therefore, we cap the number at `request.num_tokens`, ensuring only
+        # "finalized" tokens are cached.
         num_tokens_to_cache = min(
-            num_computed_tokens + num_new_tokens, request.num_tokens
+            total_computed_tokens + num_new_tokens,
+            request.num_tokens,
         )
         self.coordinator.cache_blocks(request, num_tokens_to_cache)
 
@@ -333,6 +380,19 @@ class KVCacheManager:
         """
         self.coordinator.free(request.request_id)
 
+    def remove_skipped_blocks(
+        self, request_id: str, total_computed_tokens: int
+    ) -> None:
+        """Remove the blocks that are no longer needed from `blocks` and replace
+        the removed blocks with null_block.
+
+        Args:
+            request_id: The request ID.
+            total_computed_tokens: The total number of computed tokens, including
+                local computed tokens and external computed tokens.
+        """
+        self.coordinator.remove_skipped_blocks(request_id, total_computed_tokens)
+
     def evict_blocks(self, block_ids: set[int]) -> None:
         """evict blocks from the prefix cache by their block IDs.
 
@@ -408,7 +468,13 @@ class KVCacheManager:
         return self.get_blocks(request_id).get_block_ids()
 
     def cache_blocks(self, request: Request, num_computed_tokens: int) -> None:
-        """Cache the blocks for the request, if enabled."""
+        """Cache the blocks for the request, if enabled.
+
+        Args:
+            request: The request to cache the blocks.
+            num_computed_tokens: The number of computed tokens, including tokens
+                that are already cached and tokens to be cached.
+        """
         if self.enable_caching:
             self.coordinator.cache_blocks(request, num_computed_tokens)
 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index e4360de3717d19b0b5ae90bb7b4cca604df2c2a1..7f900bd9e73efc4732ead3039635cd0627fbcf23 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -14,7 +14,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils.hashing import sha256_cbor, xxhash_cbor
 from vllm.utils.math_utils import cdiv
-from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import format_gib
 from vllm.v1.kv_cache_interface import (
     ChunkedLocalAttentionSpec,
     FullAttentionSpec,
@@ -606,6 +606,43 @@ def get_request_block_hasher(
     return request_block_hasher
 
 
+def _check_enough_kv_cache_memory(
+    available_memory: int,
+    get_needed_memory: Callable[[], int],
+    max_model_len: int,
+    estimate_max_model_len: Callable[[int], int],
+):
+    if available_memory <= 0:
+        raise ValueError(
+            "No available memory for the cache blocks. "
+            "Try increasing `gpu_memory_utilization` when initializing the engine. "
+            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+            "for more details."
+        )
+
+    needed_memory = get_needed_memory()
+
+    if needed_memory > available_memory:
+        estimated_max_len = estimate_max_model_len(available_memory)
+        estimated_msg = ""
+        if estimated_max_len > 0:
+            estimated_msg = (
+                "Based on the available memory, "
+                f"the estimated maximum model length is {estimated_max_len}. "
+            )
+
+        raise ValueError(
+            f"To serve at least one request with the models's max seq len "
+            f"({max_model_len}), ({format_gib(needed_memory)} GiB KV "
+            f"cache is needed, which is larger than the available KV cache "
+            f"memory ({format_gib(available_memory)} GiB). {estimated_msg}"
+            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
+            f"when initializing the engine. "
+            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+            f"for more details."
+        )
+
+
 def max_memory_usage_bytes(
     vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
 ) -> int:
@@ -624,6 +661,9 @@ def estimate_max_model_len(
     Estimates the maximum model length that can fit in the available memory
     using binary search.
 
+    This function temporarily modifies max_model_len during estimation but
+    restores the original value before returning, ensuring no side effects.
+
     Args:
         vllm_config: The global VllmConfig
         kv_cache_spec: The kv cache spec of each attention layer in the model
@@ -632,33 +672,38 @@ def estimate_max_model_len(
     Returns:
         The estimated maximum model length that can fit in the available memory.
     """
+    # Save the original max_model_len to restore after estimation
+    original_max_model_len = vllm_config.model_config.max_model_len
 
     # Define a function to check if a given model length fits in memory
     def fits_in_memory(model_len: int) -> bool:
-        # Modify the max_model_len for this calculation
+        # Temporarily modify the max_model_len for this calculation
         vllm_config.model_config.max_model_len = model_len
         # Calculate memory needed for the given model length
         memory_needed = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
         return memory_needed <= available_memory
 
-    # Binary search for the maximum model length
-    current_max = vllm_config.model_config.max_model_len
-    left, right = 1, current_max
-
-    # If even the smallest model length doesn't fit, return 0
-    if not fits_in_memory(left):
-        return 0
-
-    # Binary search for the maximum model length that fits
-    result = 1
-    while left <= right:
-        mid = (left + right) // 2
-        if fits_in_memory(mid):
-            result = mid
-            left = mid + 1
-        else:
-            right = mid - 1
-    return result
+    try:
+        # Binary search for the maximum model length
+        left, right = 1, original_max_model_len
+
+        # If even the smallest model length doesn't fit, return 0
+        if not fits_in_memory(left):
+            return 0
+
+        # Binary search for the maximum model length that fits
+        result = 1
+        while left <= right:
+            mid = (left + right) // 2
+            if fits_in_memory(mid):
+                result = mid
+                left = mid + 1
+            else:
+                right = mid - 1
+        return result
+    finally:
+        # Always restore the original max_model_len to avoid side effects
+        vllm_config.model_config.max_model_len = original_max_model_len
 
 
 def check_enough_kv_cache_memory(
@@ -680,43 +725,12 @@ def check_enough_kv_cache_memory(
     """
 
     # No need to check for available memory if the kv_cache_spec is empty
-    if not kv_cache_spec:
-        return
-
-    if available_memory <= 0:
-        raise ValueError(
-            "No available memory for the cache blocks. "
-            "Try increasing `gpu_memory_utilization` when "
-            "initializing the engine. "
-            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
-            "for more details."
-        )
-
-    max_model_len = vllm_config.model_config.max_model_len
-    needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
-
-    if needed_memory > available_memory:
-        # Estimate the maximum model length that can fit in the available memory
-        estimated_max_len = estimate_max_model_len(
-            vllm_config, kv_cache_spec, available_memory
-        )
-        estimated_msg = ""
-        if estimated_max_len > 0:
-            estimated_msg = (
-                "Based on the available memory, "
-                f"the estimated maximum model length is {estimated_max_len}."
-            )
-
-        raise ValueError(
-            f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
-            f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory / GiB_bytes:.2f} GiB). "
-            f"{estimated_msg} "
-            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
-            f"when initializing the engine. "
-            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
-            f"for more details."
+    if kv_cache_spec:
+        _check_enough_kv_cache_memory(
+            available_memory,
+            lambda: max_memory_usage_bytes(vllm_config, kv_cache_spec.values()),
+            vllm_config.model_config.max_model_len,
+            lambda am: estimate_max_model_len(vllm_config, kv_cache_spec, am),
         )
 
 
@@ -1301,6 +1315,140 @@ def _report_kv_cache_config(
     )
 
 
+def _max_memory_usage_bytes_from_groups(
+    vllm_config: VllmConfig,
+    kv_cache_groups: list[KVCacheGroupSpec],
+) -> int:
+    """
+    Calculate maximum memory usage in bytes from KV cache groups.
+
+    This correctly accounts for padding in hybrid models. For example, if a
+    model has 8 full attention layers and 9 sliding window layers, they will
+    be padded to 9 full + 9 sliding window for uniform group sizes.
+    """
+    if not kv_cache_groups:
+        return 0
+
+    # UniformTypeKVCacheSpecs special case (single group, per-layer specs)
+    if len(kv_cache_groups) == 1 and isinstance(
+        kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs
+    ):
+        per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
+        return sum(
+            spec.max_memory_usage_bytes(vllm_config)
+            for spec in per_layer_specs.values()
+        )
+
+    # General case: group_size pools, each shared by one layer per group
+    # Memory = group_size * page_size * blocks_for_max_len
+    group_size = max(len(group.layer_names) for group in kv_cache_groups)
+    page_size = get_uniform_page_size(
+        [group.kv_cache_spec for group in kv_cache_groups]
+    )
+    any_spec = kv_cache_groups[0].kv_cache_spec
+    blocks_needed = cdiv(any_spec.max_memory_usage_bytes(vllm_config), page_size)
+
+    return group_size * page_size * blocks_needed
+
+
+def _estimate_max_model_len_from_groups(
+    vllm_config: VllmConfig,
+    kv_cache_groups: list[KVCacheGroupSpec],
+    available_memory: int,
+) -> int:
+    """
+    Binary search for the maximum model length that fits in available memory.
+    Returns 0 if even 1 token doesn't fit.
+    """
+    original_max = vllm_config.model_config.max_model_len
+
+    def fits(model_len: int) -> bool:
+        vllm_config.model_config.max_model_len = model_len
+        return (
+            _max_memory_usage_bytes_from_groups(vllm_config, kv_cache_groups)
+            <= available_memory
+        )
+
+    try:
+        left, right = 1, original_max
+        if not fits(left):
+            return 0
+        result = 1
+        while left <= right:
+            mid = (left + right) // 2
+            if fits(mid):
+                result = mid
+                left = mid + 1
+            else:
+                right = mid - 1
+        return result
+    finally:
+        vllm_config.model_config.max_model_len = original_max
+
+
+def _auto_fit_max_model_len(
+    vllm_config: VllmConfig,
+    kv_cache_groups: list[KVCacheGroupSpec],
+    available_memory: list[int],
+) -> None:
+    """
+    When max_model_len is set to -1, this function estimates the largest
+    context length that can be supported with the available GPU memory.
+    It uses binary search to find the maximum length that fits across all
+    workers.
+
+    Args:
+        vllm_config: The global VllmConfig (will be modified in-place)
+        kv_cache_groups: The global KV cache groups (from get_kv_cache_groups).
+            This correctly accounts for padding in hybrid models.
+        available_memory: Memory available for KV cache in bytes for each
+            worker.
+    """
+    original_max = vllm_config.model_config.max_model_len
+
+    if not kv_cache_groups:
+        # All workers have empty specs (attention-free model)
+        logger.info_once(
+            "Auto-fit max_model_len: attention-free model, "
+            "using derived max_model_len=%d",
+            original_max,
+            scope="local",
+        )
+        return
+
+    # Use minimum available memory across all workers
+    min_available_memory = min(available_memory)
+    auto_fit_max = _estimate_max_model_len_from_groups(
+        vllm_config, kv_cache_groups, min_available_memory
+    )
+
+    if auto_fit_max <= 0:
+        raise ValueError(
+            "Cannot auto-fit max_model_len: not enough GPU memory available "
+            "to serve even a single token. Try increasing `gpu_memory_utilization`."
+        )
+
+    if auto_fit_max >= original_max:
+        # The model's full context length fits in memory
+        logger.info_once(
+            "Auto-fit max_model_len: full model context length %d fits in "
+            "available GPU memory",
+            original_max,
+            scope="local",
+        )
+    else:
+        # Need to reduce max_model_len to fit in memory
+        vllm_config.model_config.max_model_len = auto_fit_max
+        logger.info_once(
+            "Auto-fit max_model_len: reduced from %d to %d to fit in "
+            "available GPU memory (%s GiB available for KV cache)",
+            original_max,
+            auto_fit_max,
+            format_gib(min_available_memory),
+            scope="local",
+        )
+
+
 def get_kv_cache_configs(
     vllm_config: VllmConfig,
     kv_cache_specs: list[dict[str, KVCacheSpec]],
@@ -1317,10 +1465,12 @@ def get_kv_cache_configs(
     1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
        the whole model.
     2. Generate the KV cache groups based on the layer ratio of the whole model.
-    3. Generate the KV cache configs for each worker based on the KV cache
+       This also handles spec unification for hybrid models.
+    3. Handle auto-fit max_model_len and memory checks using the unified specs.
+    4. Generate the KV cache configs for each worker based on the KV cache
        grouping strategy. (This is reasonable because the layer ratio of
        different PP stages are similar.)
-    4. Change the num_blocks of each worker to the smallest among all workers
+    5. Change the num_blocks of each worker to the smallest among all workers
        and shrink tensor sizes proportionally to avoid allocating unused memory.
 
     Args:
@@ -1333,14 +1483,6 @@ def get_kv_cache_configs(
         The generated KVCacheConfigs for each worker.
     """
 
-    # Check if the available memory is enough for each worker.
-    for kv_cache_spec_one_worker, available_memory_one_worker in zip(
-        kv_cache_specs, available_memory
-    ):
-        check_enough_kv_cache_memory(
-            vllm_config, kv_cache_spec_one_worker, available_memory_one_worker
-        )
-
     # Merge the KV cache specs of all workers. Different PP stages may have
     # different layer names, and different TP ranks of the same PP stage should
     # have the same KV cache spec.
@@ -1354,8 +1496,32 @@ def get_kv_cache_configs(
                     "The KV cache specs for the same layer are different "
                     "across workers. This is not supported yet."
                 )
+
+    # Get global KV cache groups. This also handles spec unification for
+    # hybrid models when disable_hybrid_kv_cache_manager is enabled.
+    # After this call, merged_kv_cache_specs may be modified in-place.
     global_kv_cache_groups = get_kv_cache_groups(vllm_config, merged_kv_cache_specs)
 
+    # If original_max_model_len was -1, automatically
+    # determine the maximum model length that fits in available GPU memory.
+    # We use the global groups here to correctly account for padding.
+    if vllm_config.model_config.original_max_model_len == -1:
+        _auto_fit_max_model_len(vllm_config, global_kv_cache_groups, available_memory)
+
+    # Check if the available memory is enough (using min across all workers).
+    # We use the global groups to correctly account for padding.
+    if global_kv_cache_groups:
+        _check_enough_kv_cache_memory(
+            min(available_memory),
+            lambda: _max_memory_usage_bytes_from_groups(
+                vllm_config, global_kv_cache_groups
+            ),
+            vllm_config.model_config.max_model_len,
+            lambda am: _estimate_max_model_len_from_groups(
+                vllm_config, global_kv_cache_groups, am
+            ),
+        )
+
     kv_cache_configs: list[KVCacheConfig] = []
     for kv_cache_spec_one_worker, available_memory_one_worker in zip(
         kv_cache_specs, available_memory
diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py
index df61eebb395e521ffee1d2877986d33b21aeff18..3c66a23208ec8a75e5e1d4a827058b9ea0ccf0da 100644
--- a/vllm/v1/core/sched/async_scheduler.py
+++ b/vllm/v1/core/sched/async_scheduler.py
@@ -10,15 +10,14 @@ logger = init_logger(__name__)
 
 
 class AsyncScheduler(Scheduler):
-    def _update_after_schedule(
-        self,
-        scheduler_output: SchedulerOutput,
-    ) -> None:
+    def _update_after_schedule(self, scheduler_output: SchedulerOutput) -> None:
         super()._update_after_schedule(scheduler_output)
+        has_structured_output_requests = False
         pending_structured_output_tokens = False
         spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
         for req_id in scheduler_output.num_scheduled_tokens:
             request = self.requests[req_id]
+            has_structured_output_requests |= request.use_structured_output
             pending_structured_output_tokens |= (
                 request.use_structured_output and request.num_output_placeholders > 0
             )
@@ -36,14 +35,13 @@ class AsyncScheduler(Scheduler):
                 # We will update the actual spec token ids in the worker process.
                 request.spec_token_ids = [-1] * self.num_spec_tokens
 
+        scheduler_output.has_structured_output_requests = has_structured_output_requests
         scheduler_output.pending_structured_output_tokens = (
             pending_structured_output_tokens
         )
 
     def _update_request_with_output(
-        self,
-        request: Request,
-        new_token_ids: list[int],
+        self, request: Request, new_token_ids: list[int]
     ) -> tuple[list[int], bool]:
         if request.discard_latest_async_tokens:
             # If the request is force preempted in reset_prefix_cache, we
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index 596ab05ad320a4c027b5f4c4c3073e79616caf79..92d8d929287b95645c044ee29283868b0483d7f9 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -85,11 +85,27 @@ class SchedulerInterface(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def update_draft_token_ids(
-        self,
-        draft_token_ids: "DraftTokenIds",
+    def update_draft_token_ids(self, draft_token_ids: "DraftTokenIds") -> None:
+        """Update requests with newly generated draft token ids, applying
+        structured output grammar validation if needed.
+
+        Args:
+            draft_token_ids: The input draft token ids for each request.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def update_draft_token_ids_in_output(
+        self, draft_token_ids: "DraftTokenIds", scheduler_output: "SchedulerOutput"
     ) -> None:
-        """Update the draft token ids for the scheduled requests."""
+        """Update scheduler output with newly generated draft token ids, applying
+        structured output grammar validation if needed.
+
+        Args:
+            draft_token_ids: The input draft token ids for each request.
+            scheduler_output: Update the given scheduler_output
+                with the corresponding draft token ids.
+        """
         raise NotImplementedError
 
     @abstractmethod
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index b69fa87ebddc8ed7dd10a151fc6a0425210d4cd9..7e53f4f2ec9e8471b06a38c8ec3e38700c61f2ac 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -5,8 +5,6 @@ from dataclasses import dataclass
 from functools import cached_property
 from typing import TYPE_CHECKING
 
-from typing_extensions import deprecated
-
 from vllm._bc_linter import bc_linter_include
 
 if TYPE_CHECKING:
@@ -93,10 +91,14 @@ class NewRequestData:
         prompt_embeds_shape = (
             self.prompt_embeds.shape if self.prompt_embeds is not None else None
         )
+        prefill_token_ids_len = (
+            len(self.prefill_token_ids) if self.prefill_token_ids is not None else None
+        )
         return (
             f"NewRequestData("
             f"req_id={self.req_id},"
             f"prompt_token_ids_len={prompt_token_ids_len},"
+            f"prefill_token_ids_len={prefill_token_ids_len},"
             f"mm_features={self.mm_features},"
             f"sampling_params={self.sampling_params},"
             f"block_ids={self.block_ids},"
@@ -125,22 +127,44 @@ class CachedRequestData:
     num_computed_tokens: list[int]
     num_output_tokens: list[int]
 
+    # Version of dataclass repr with token IDs obfuscated.
+    def anon_repr(self) -> str:
+        new_token_ids_lens = [len(toks) for toks in self.new_token_ids]
+        all_token_ids_lens = {
+            req_id: len(toks) for req_id, toks in self.all_token_ids.items()
+        }
+        return (
+            f"CachedRequestData("
+            f"req_ids={self.req_ids},"
+            f"resumed_req_ids={self.resumed_req_ids},"
+            f"new_token_ids_lens={new_token_ids_lens},"
+            f"all_token_ids_lens={all_token_ids_lens},"
+            f"new_block_ids={self.new_block_ids},"
+            f"num_computed_tokens={self.num_computed_tokens},"
+            f"num_output_tokens={self.num_output_tokens}"
+            f")"
+        )
+
+    def __repr__(self) -> str:
+        return self.anon_repr()
+
     @property
     def num_reqs(self) -> int:
         return len(self.req_ids)
 
     @cached_property
-    @deprecated("This will be removed in v0.14, use `resumed_req_ids` instead.")
-    def resumed_from_preemption(self) -> list[bool]:
-        return [req_id in self.resumed_req_ids for req_id in self.req_ids]
+    def _req_id_to_num_output_tokens(self) -> dict[str, int]:
+        """Cache mapping of req_id to num_output_tokens for O(1) lookup.
 
-    @cached_property
-    @deprecated("This will be removed in v0.14, use `all_token_ids` instead.")
-    def resumed_req_token_ids(self) -> list[list[int] | None]:
-        return [
-            self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None
-            for req_id in self.req_ids
-        ]
+        This cached property is safe because CachedRequestData instances
+        are created fresh each scheduling iteration and not mutated during
+        computation of iteration details.
+        """
+        return dict(zip(self.req_ids, self.num_output_tokens))
+
+    def is_context_phase(self, req_id: str) -> bool:
+        num_output_tokens = self._req_id_to_num_output_tokens.get(req_id)
+        return num_output_tokens is not None and num_output_tokens == 0
 
     @classmethod
     def make_empty(cls) -> "CachedRequestData":
@@ -197,10 +221,17 @@ class SchedulerOutput:
     # Only used for v2 model runner.
     preempted_req_ids: set[str] | None = None
 
+    # Whether any of the scheduled requests use structured output.
+    # Set only in async scheduling case.
+    has_structured_output_requests: bool = False
+
     # Whether the scheduled requests have all the output tokens they
     # need to perform grammar bitmask computation.
     pending_structured_output_tokens: bool = False
 
+    # Used for adjusting acceptance rate calculation.
+    num_invalid_spec_tokens: dict[str, int] | None = None
+
     # KV Cache Connector metadata.
     kv_connector_metadata: KVConnectorMetadata | None = None
 
diff --git a/vllm/v1/core/sched/request_queue.py b/vllm/v1/core/sched/request_queue.py
index a00ca1912b0f39c2b6d9e6d26e2cfdd6786f5e50..38c7db94a45e5f8e7116660f2d3547ab54138fb6 100644
--- a/vllm/v1/core/sched/request_queue.py
+++ b/vllm/v1/core/sched/request_queue.py
@@ -71,11 +71,6 @@ class RequestQueue(ABC):
         """Iterate over the queue according to the policy."""
         pass
 
-    @abstractmethod
-    def __reversed__(self) -> Iterator[Request]:
-        """Iterate over the queue in reverse order."""
-        pass
-
 
 class FCFSRequestQueue(deque[Request], RequestQueue):
     """A first-come-first-served queue that supports deque operations."""
@@ -100,8 +95,12 @@ class FCFSRequestQueue(deque[Request], RequestQueue):
 
     def prepend_requests(self, requests: RequestQueue) -> None:
         """Prepend all requests from another queue to the front of this
-        queue."""
-        self.extendleft(reversed(requests))
+        queue.
+
+        Note: The requests will be prepended in reverse order of their
+        appearance in the `requests` queue.
+        """
+        self.extendleft(requests)
 
     def remove_request(self, request: Request) -> None:
         """Remove a specific request from the queue."""
@@ -128,10 +127,6 @@ class FCFSRequestQueue(deque[Request], RequestQueue):
         """Iterate over the queue according to FCFS policy."""
         return super().__iter__()
 
-    def __reversed__(self) -> Iterator[Request]:
-        """Iterate over the queue in reverse order."""
-        return super().__reversed__()
-
 
 class PriorityRequestQueue(RequestQueue):
     """
@@ -202,10 +197,6 @@ class PriorityRequestQueue(RequestQueue):
         while heap_copy:
             yield heapq.heappop(heap_copy)
 
-    def __reversed__(self) -> Iterator[Request]:
-        """Iterate over the queue in reverse priority order."""
-        return reversed(list(self))
-
 
 def create_request_queue(policy: SchedulingPolicy) -> RequestQueue:
     """Create request queue based on scheduling policy."""
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index c539c8bb8d34bf34a8e39d23c96e3d253bb1643f..6cd39e24550ffea72f68a98ce6235e43a0a75525 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -6,6 +6,8 @@ from collections import defaultdict
 from collections.abc import Iterable
 from typing import Any
 
+import numpy as np
+
 from vllm import envs
 from vllm.compilation.cuda_graph import CUDAGraphStat
 from vllm.config import VllmConfig
@@ -24,6 +26,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import (
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
 from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
+    RoutedExpertsReader,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.v1.core.encoder_cache_manager import (
     EncoderCacheManager,
@@ -43,6 +48,7 @@ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_qu
 from vllm.v1.core.sched.utils import check_stop, remove_all
 from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
 from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.metrics.perf import ModelMetrics, PerfStats
 from vllm.v1.metrics.stats import (
     PrefixCacheStats,
     SchedulerStats,
@@ -126,7 +132,7 @@ class Scheduler(SchedulerInterface):
 
         self.kv_event_publisher = EventPublisherFactory.create(
             self.kv_events_config,
-            self.parallel_config.data_parallel_rank,
+            self.parallel_config.data_parallel_index,
         )
         self.ec_connector = None
         if self.vllm_config.ec_transfer_config is not None:
@@ -187,6 +193,12 @@ class Scheduler(SchedulerInterface):
             if self.is_encoder_decoder
             else EncoderCacheManager(cache_size=encoder_cache_size)
         )
+        # For encoder-decoder models, allocate the maximum number of tokens for Cross
+        # Attn blocks, as for Whisper its input is always padded to the maximum length.
+        # TODO (NickLucche): Generalize to models with variable-length encoder inputs.
+        self._num_encoder_max_input_tokens = (
+            MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(vllm_config.model_config)
+        )
 
         speculative_config = vllm_config.speculative_config
         self.use_eagle = False
@@ -212,6 +224,30 @@ class Scheduler(SchedulerInterface):
         )
         self.use_pp = self.parallel_config.pipeline_parallel_size > 1
         self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
+        self.perf_metrics: ModelMetrics | None = None
+        if self.log_stats and vllm_config.observability_config.enable_mfu_metrics:
+            self.perf_metrics = ModelMetrics(vllm_config)
+
+        if self.vllm_config.model_config.enable_return_routed_experts:
+            assert self.dcp_world_size == 1 and self.pcp_world_size == 1, (
+                "enable_return_routed_experts does not support context parallelism "
+                "(dcp_world_size > 1 or pcp_world_size > 1)"
+            )
+
+            self.routed_experts_reader = RoutedExpertsReader.create()
+
+            assert len(kv_cache_config.kv_cache_groups) > 0, (
+                "enable_return_routed_experts requires at least one kv cache group"
+            )
+            self.max_num_kv_tokens = (
+                kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups) + 1
+            ) * self.block_size
+
+            self.routed_experts_reader.attach_buffer(
+                max_num_kv_tokens=self.max_num_kv_tokens,
+                model_config=self.vllm_config.model_config,
+                instance_id=self.vllm_config.instance_id,
+            )
 
     def schedule(self) -> SchedulerOutput:
         # NOTE(woosuk) on the scheduling algorithm:
@@ -434,7 +470,12 @@ class Scheduler(SchedulerInterface):
                 if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
                     is_ready = self._update_waiting_for_remote_kv(request)
                     if is_ready:
-                        request.status = RequestStatus.WAITING
+                        if request.num_preemptions:
+                            # We must be loading for a resumed preemption
+                            # rather than a new request.
+                            request.status = RequestStatus.PREEMPTED
+                        else:
+                            request.status = RequestStatus.WAITING
                     else:
                         logger.debug(
                             "%s is still in WAITING_FOR_REMOTE_KVS state.",
@@ -568,30 +609,30 @@ class Scheduler(SchedulerInterface):
                     0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens
                 )
 
-                # Determine if we need to allocate cross-attention blocks.
-                if self.is_encoder_decoder and request.has_encoder_inputs:
-                    # TODO(russellb): For Whisper, we know that the input is
-                    # always padded to the maximum length. If we support other
-                    # encoder-decoder models, this will need to be updated if we
-                    # want to only allocate what is needed.
-                    num_encoder_tokens = (
-                        self.scheduler_config.max_num_encoder_input_tokens
-                    )
-                else:
-                    num_encoder_tokens = 0
+                num_encoder_tokens = (
+                    self._num_encoder_max_input_tokens
+                    if self.is_encoder_decoder and request.has_encoder_inputs
+                    else 0
+                )
 
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
-                    num_new_tokens + num_external_computed_tokens,
-                    num_new_local_computed_tokens,
-                    new_computed_blocks,
+                    num_new_tokens,
+                    num_new_computed_tokens=num_new_local_computed_tokens,
+                    new_computed_blocks=new_computed_blocks,
                     num_lookahead_tokens=effective_lookahead_tokens,
+                    num_external_computed_tokens=num_external_computed_tokens,
                     delay_cache_blocks=load_kv_async,
                     num_encoder_tokens=num_encoder_tokens,
                 )
 
                 if new_blocks is None:
                     # The request cannot be scheduled.
+
+                    # NOTE: we need to untouch the request from the encode cache
+                    # manager
+                    if request.has_encoder_inputs:
+                        self.encoder_cache_manager.free(request)
                     break
 
                 # KVTransfer: the connector uses this info to determine
@@ -601,7 +642,7 @@ class Scheduler(SchedulerInterface):
                 if self.connector is not None:
                     self.connector.update_state_after_alloc(
                         request,
-                        new_computed_blocks + new_blocks,
+                        self.kv_cache_manager.get_blocks(request.request_id),
                         num_external_computed_tokens,
                     )
 
@@ -756,11 +797,7 @@ class Scheduler(SchedulerInterface):
             self._update_after_schedule(scheduler_output)
         return scheduler_output
 
-    def _preempt_request(
-        self,
-        request: Request,
-        timestamp: float,
-    ) -> None:
+    def _preempt_request(self, request: Request, timestamp: float) -> None:
         """Preempt a request and put it back to the waiting queue.
 
         NOTE: The request should be popped from the running queue outside of this
@@ -773,6 +810,7 @@ class Scheduler(SchedulerInterface):
         self.encoder_cache_manager.free(request)
         request.status = RequestStatus.PREEMPTED
         request.num_computed_tokens = 0
+        request.spec_token_ids.clear()
         request.num_preemptions += 1
         if self.log_stats:
             request.record_event(EngineCoreEventType.PREEMPTED, timestamp)
@@ -780,10 +818,7 @@ class Scheduler(SchedulerInterface):
         # Put the request back to the waiting queue.
         self.waiting.prepend_request(request)
 
-    def _update_after_schedule(
-        self,
-        scheduler_output: SchedulerOutput,
-    ) -> None:
+    def _update_after_schedule(self, scheduler_output: SchedulerOutput) -> None:
         # Advance the number of computed tokens for the request AFTER
         # the request is scheduled.
         # 1. The scheduler_output of the current step has to include the
@@ -1000,8 +1035,7 @@ class Scheduler(SchedulerInterface):
             )
             curr_embeds_start, curr_embeds_end = (
                 mm_feature.mm_position.get_embeds_indices_in_range(
-                    start_idx_rel,
-                    end_idx_rel,
+                    start_idx_rel, end_idx_rel
                 )
             )
             # There's no embeddings in the current range of encoder placeholder tokens
@@ -1028,8 +1062,7 @@ class Scheduler(SchedulerInterface):
         )
 
     def get_grammar_bitmask(
-        self,
-        scheduler_output: SchedulerOutput,
+        self, scheduler_output: SchedulerOutput
     ) -> GrammarOutput | None:
         # Collect list of scheduled request ids that use structured output.
         # The corresponding rows of the bitmask will be in this order.
@@ -1066,6 +1099,10 @@ class Scheduler(SchedulerInterface):
         kv_connector_output = model_runner_output.kv_connector_output
         cudagraph_stats = model_runner_output.cudagraph_stats
 
+        perf_stats: PerfStats | None = None
+        if self.perf_metrics and self.perf_metrics.is_enabled():
+            perf_stats = self.perf_metrics.get_step_perf_stats_per_gpu(scheduler_output)
+
         outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
         spec_decoding_stats: SpecDecodingStats | None = None
         kv_connector_stats: KVConnectorStats | None = (
@@ -1129,6 +1166,8 @@ class Scheduler(SchedulerInterface):
                     spec_decoding_stats,
                     num_draft_tokens=num_draft_tokens,
                     num_accepted_tokens=num_accepted,
+                    num_invalid_spec_tokens=scheduler_output.num_invalid_spec_tokens,
+                    request_id=req_id,
                 )
 
             stopped = False
@@ -1148,7 +1187,30 @@ class Scheduler(SchedulerInterface):
                 request.status = RequestStatus.FINISHED_STOPPED
                 stopped = True
 
+            routed_experts = None
             if stopped:
+                if self.vllm_config.model_config.enable_return_routed_experts:
+                    kv_blocks = self.kv_cache_manager.get_blocks(request.request_id)
+                    block_ids = kv_blocks.get_block_ids()[0]
+                    num_tokens = request.num_tokens - 1
+
+                    # compute slot mapping
+                    block_ids_array = np.array(block_ids, dtype=np.int32)
+                    num_blocks = len(block_ids)
+                    block_size = self.block_size
+
+                    # generate block offsets
+                    block_offsets = np.arange(0, block_size)
+
+                    # compute slot mapping: slot = block_id * block_size + offset
+                    slot_mapping = (
+                        block_offsets.reshape((1, block_size))
+                        + block_ids_array.reshape((num_blocks, 1)) * block_size
+                    ).flatten()[:num_tokens]
+
+                    routed_experts = self.routed_experts_reader.get_routed_experts(
+                        indices=slot_mapping
+                    )
                 kv_transfer_params = self._free_request(request)
                 if status_before_stop == RequestStatus.RUNNING:
                     stopped_running_reqs.add(request)
@@ -1167,7 +1229,13 @@ class Scheduler(SchedulerInterface):
                 struct_output_request = request.structured_output_request
                 assert struct_output_request is not None
                 assert struct_output_request.grammar is not None
-                struct_output_request.grammar.accept_tokens(req_id, new_token_ids)
+                ok = struct_output_request.grammar.accept_tokens(req_id, new_token_ids)
+                if not ok:
+                    logger.warning(
+                        "Unexpected: grammar rejected tokens %s for request %s.",
+                        new_token_ids,
+                        req_id,
+                    )
 
             if num_nans_in_logits is not None and req_id in num_nans_in_logits:
                 request.num_nans_in_logits = num_nans_in_logits[req_id]
@@ -1189,6 +1257,7 @@ class Scheduler(SchedulerInterface):
                         kv_transfer_params=kv_transfer_params,
                         trace_headers=request.trace_headers,
                         num_cached_tokens=request.num_cached_tokens,
+                        routed_experts=routed_experts,
                         num_nans_in_logits=request.num_nans_in_logits,
                     )
                 )
@@ -1262,7 +1331,7 @@ class Scheduler(SchedulerInterface):
 
         if (
             stats := self.make_stats(
-                spec_decoding_stats, kv_connector_stats, cudagraph_stats
+                spec_decoding_stats, kv_connector_stats, cudagraph_stats, perf_stats
             )
         ) is not None:
             # Return stats to only one of the front-ends.
@@ -1275,9 +1344,7 @@ class Scheduler(SchedulerInterface):
         return engine_core_outputs
 
     def _update_request_with_output(
-        self,
-        request: Request,
-        new_token_ids: list[int],
+        self, request: Request, new_token_ids: list[int]
     ) -> tuple[list[int], bool]:
         # Append generated tokens and check for stop. Note that if
         # a request is still being prefilled, we expect the model runner
@@ -1318,10 +1385,7 @@ class Scheduler(SchedulerInterface):
                 # in the decoder's KV cache.
                 self.encoder_cache_manager.free_encoder_input(request, input_id)
 
-    def update_draft_token_ids(
-        self,
-        draft_token_ids: DraftTokenIds,
-    ) -> None:
+    def update_draft_token_ids(self, draft_token_ids: DraftTokenIds) -> None:
         for req_id, spec_token_ids in zip(
             draft_token_ids.req_ids,
             draft_token_ids.draft_token_ids,
@@ -1334,11 +1398,46 @@ class Scheduler(SchedulerInterface):
             # Add newly generated spec token ids to the request.
             if self.structured_output_manager.should_advance(request):
                 metadata = request.structured_output_request
-                request.spec_token_ids = metadata.grammar.validate_tokens(  # type: ignore[union-attr]
-                    spec_token_ids
-                )
-            else:
-                request.spec_token_ids = spec_token_ids
+                spec_token_ids = metadata.grammar.validate_tokens(spec_token_ids)  # type: ignore[union-attr]
+            request.spec_token_ids = spec_token_ids
+
+    def update_draft_token_ids_in_output(
+        self, draft_token_ids: DraftTokenIds, scheduler_output: SchedulerOutput
+    ) -> None:
+        num_invalid_spec_tokens: dict[str, int] = {}
+
+        sched_spec_tokens = scheduler_output.scheduled_spec_decode_tokens
+        for req_id, spec_token_ids in zip(
+            draft_token_ids.req_ids,
+            draft_token_ids.draft_token_ids,
+        ):
+            request = self.requests.get(req_id)
+            if request is None or request.is_finished():
+                # The request may have been finished. Skip.
+                continue
+
+            placeholder_spec_tokens = sched_spec_tokens.get(req_id)
+            if not placeholder_spec_tokens:
+                continue
+
+            orig_num_spec_tokens = len(placeholder_spec_tokens)
+            # Trim drafts to scheduled number of spec tokens
+            # (needed for chunked prefill case for example).
+            del spec_token_ids[orig_num_spec_tokens:]
+            # Filter out spec tokens which do not adhere to the grammar.
+            if self.structured_output_manager.should_advance(request):
+                metadata = request.structured_output_request
+                assert metadata is not None and metadata.grammar is not None
+                spec_token_ids = metadata.grammar.validate_tokens(spec_token_ids)
+            # Pad to original number of spec tokens.
+            num_invalid_tokens = orig_num_spec_tokens - len(spec_token_ids)
+            if num_invalid_tokens:
+                spec_token_ids.extend([-1] * num_invalid_tokens)
+                num_invalid_spec_tokens[req_id] = num_invalid_tokens
+
+            sched_spec_tokens[req_id] = spec_token_ids
+
+        scheduler_output.num_invalid_spec_tokens = num_invalid_spec_tokens
 
     def get_request_counts(self) -> tuple[int, int]:
         """Returns (num_running_reqs, num_waiting_reqs)."""
@@ -1351,9 +1450,7 @@ class Scheduler(SchedulerInterface):
             request.record_event(EngineCoreEventType.QUEUED)
 
     def finish_requests(
-        self,
-        request_ids: str | Iterable[str],
-        finished_status: RequestStatus,
+        self, request_ids: str | Iterable[str], finished_status: RequestStatus
     ) -> None:
         """Handles the finish signal from outside the scheduler.
 
@@ -1485,6 +1582,7 @@ class Scheduler(SchedulerInterface):
         spec_decoding_stats: SpecDecodingStats | None = None,
         kv_connector_stats: KVConnectorStats | None = None,
         cudagraph_stats: CUDAGraphStat | None = None,
+        perf_stats: PerfStats | None = None,
     ) -> SchedulerStats | None:
         if not self.log_stats:
             return None
@@ -1510,6 +1608,7 @@ class Scheduler(SchedulerInterface):
             spec_decoding_stats=spec_stats,
             kv_connector_stats=connector_stats_payload,
             cudagraph_stats=cudagraph_stats,
+            perf_stats=perf_stats,
         )
 
     def make_spec_decoding_stats(
@@ -1517,11 +1616,15 @@ class Scheduler(SchedulerInterface):
         spec_decoding_stats: SpecDecodingStats | None,
         num_draft_tokens: int,
         num_accepted_tokens: int,
+        num_invalid_spec_tokens: dict[str, int] | None,
+        request_id: str,
     ) -> SpecDecodingStats | None:
-        if not self.log_stats:
+        if not self.log_stats or not num_draft_tokens:
             return None
         if spec_decoding_stats is None:
             spec_decoding_stats = SpecDecodingStats.new(self.num_spec_tokens)
+        if num_invalid_spec_tokens:
+            num_draft_tokens -= num_invalid_spec_tokens.get(request_id, 0)
         spec_decoding_stats.observe_draft(
             num_draft_tokens=num_draft_tokens, num_accepted_tokens=num_accepted_tokens
         )
@@ -1569,6 +1672,13 @@ class Scheduler(SchedulerInterface):
         if self.connector is None:
             return False, None
 
+        # Free any out-of-window prefix blocks before we hand the block table to
+        # the connector.
+        self.kv_cache_manager.remove_skipped_blocks(
+            request_id=request.request_id,
+            total_computed_tokens=request.num_tokens,
+        )
+
         block_ids = self.kv_cache_manager.get_block_ids(request.request_id)
 
         if not isinstance(self.connector, SupportsHMA):
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 4aeb17a156bb3c13673b8dfa740117ac4d22db16..aed5c0580b28629fee435f19d2747bbcd3a03455 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -15,6 +15,7 @@ from vllm.v1.kv_cache_interface import (
     KVCacheSpec,
     MambaSpec,
     MLAAttentionSpec,
+    SinkFullAttentionSpec,
     SlidingWindowSpec,
 )
 from vllm.v1.request import Request
@@ -30,6 +31,7 @@ class SingleTypeKVCacheManager(ABC):
         self,
         kv_cache_spec: KVCacheSpec,
         block_pool: BlockPool,
+        enable_caching: bool,
         kv_cache_group_id: int,
         dcp_world_size: int = 1,
         pcp_world_size: int = 1,
@@ -48,6 +50,7 @@ class SingleTypeKVCacheManager(ABC):
             self.block_size *= dcp_world_size * pcp_world_size
         self.kv_cache_spec = kv_cache_spec
         self.block_pool = block_pool
+        self.enable_caching = enable_caching
 
         # Mapping from request ID to blocks to track the blocks allocated
         # for each request, so that we can free the blocks when the request
@@ -68,6 +71,7 @@ class SingleTypeKVCacheManager(ABC):
         request_id: str,
         num_tokens: int,
         new_computed_blocks: Sequence[KVCacheBlock],
+        total_computed_tokens: int,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -78,46 +82,121 @@ class SingleTypeKVCacheManager(ABC):
                 tokens that are already allocated).
             new_computed_blocks: The new computed blocks just hitting the
                 prefix caching.
+            total_computed_tokens: Include both local and external computed
+                tokens.
 
         Returns:
-            The number of blocks.
+            The number of blocks to allocate.
         """
 
         num_required_blocks = cdiv(num_tokens, self.block_size)
-        num_new_blocks = (
-            num_required_blocks
-            - len(new_computed_blocks)
-            - len(self.req_to_blocks[request_id])
+        num_req_blocks = len(self.req_to_blocks.get(request_id, ()))
+
+        if request_id in self.num_cached_block:
+            # Fast-path: a running request won't have any new prefix-cache hits.
+            assert len(new_computed_blocks) == 0
+            # NOTE: With speculative decoding, request's blocks may be allocated
+            # for draft tokens which are later rejected. In this case,
+            # num_required_blocks may be smaller than num_req_blocks.
+            return max(num_required_blocks - num_req_blocks, 0)
+
+        num_skipped_tokens = self.get_num_skipped_tokens(total_computed_tokens)
+        num_local_computed_blocks = len(new_computed_blocks) + num_req_blocks
+        # Number of whole blocks that are skipped by the attention window.
+        # If nothing is skipped, this is 0.
+        num_skipped_blocks = num_skipped_tokens // self.block_size
+        # We need blocks for the non-skipped suffix. If there are still
+        # local-computed blocks inside the window, they contribute to the
+        # required capacity; otherwise, skipped blocks dominate.
+        num_new_blocks = max(
+            num_required_blocks - max(num_skipped_blocks, num_local_computed_blocks),
+            0,
         )
-        # If a computed block of a request is an eviction candidate (in the
-        # free queue and ref_cnt == 0), it will be changed from a free block
-        # to a computed block when the request is allocated, so we also count
-        # it as needed to be allocated.
-        num_evictable_computed_blocks = sum(
-            blk.ref_cnt == 0 and not blk.is_null for blk in new_computed_blocks
+
+        # Among the `new_computed_blocks`, the first `num_skipped_blocks` worth
+        # of blocks are skipped; `num_req_blocks` of those may already be in
+        # `req_to_blocks`, so only skip the remainder from `new_computed_blocks`.
+        num_skipped_new_computed_blocks = max(0, num_skipped_blocks - num_req_blocks)
+
+        # If a computed block is an eviction candidate (in the free queue and
+        # ref_cnt == 0), it will be removed from the free queue when touched by
+        # the allocated request, so we must count it in the free-capacity check.
+        num_evictable_blocks = sum(
+            blk.ref_cnt == 0 and not blk.is_null
+            for blk in new_computed_blocks[num_skipped_new_computed_blocks:]
         )
-        return num_new_blocks + num_evictable_computed_blocks
+        return num_new_blocks + num_evictable_blocks
 
-    def save_new_computed_blocks(
-        self, request_id: str, new_computed_blocks: Sequence[KVCacheBlock]
+    def allocate_new_computed_blocks(
+        self,
+        request_id: str,
+        new_computed_blocks: Sequence[KVCacheBlock],
+        num_local_computed_tokens: int,
+        num_external_computed_tokens: int,
     ) -> None:
         """
-        Add the new computed blocks to the request.
+        Add the new computed blocks to the request. This involves three steps:
+        1. Touch the computed blocks to make sure they won't be evicted.
+        1.5. (Optional) For sliding window, skip blocks are padded with null blocks.
+        2. Add the remaining computed blocks.
+        3. (Optional) For KV connectors, allocate new blocks for external computed
+            tokens (if any).
 
         Args:
             request_id: The request ID.
             new_computed_blocks: The new computed blocks just hitting the
                 prefix cache.
+            num_local_computed_tokens: The number of local computed tokens.
+            num_external_computed_tokens: The number of external computed tokens.
         """
-        if request_id not in self.num_cached_block:
-            # A new request.
-            req_blocks = self.req_to_blocks[request_id]
-            assert len(req_blocks) == 0
-            req_blocks.extend(new_computed_blocks)
-            self.num_cached_block[request_id] = len(new_computed_blocks)
-        else:
-            # A running request. Should not have new computed blocks.
+
+        if request_id in self.num_cached_block:
+            # Fast-path: a running request won't have any new prefix-cache hits.
+            # It should not have any new computed blocks.
             assert len(new_computed_blocks) == 0
+            return
+
+        # A new request.
+        req_blocks = self.req_to_blocks[request_id]
+        assert len(req_blocks) == 0
+        num_total_computed_tokens = (
+            num_local_computed_tokens + num_external_computed_tokens
+        )
+        num_skipped_tokens = self.get_num_skipped_tokens(num_total_computed_tokens)
+        num_skipped_blocks = num_skipped_tokens // self.block_size
+        if num_skipped_blocks > 0:
+            # It is possible that all new computed blocks are skipped when
+            # num_skipped_blocks > len(new_computed_blocks).
+            new_computed_blocks = new_computed_blocks[num_skipped_blocks:]
+            # Some external computed tokens may be skipped too.
+            num_external_computed_tokens = min(
+                num_total_computed_tokens - num_skipped_tokens,
+                num_external_computed_tokens,
+            )
+
+        # Touch the computed blocks to make sure they won't be evicted.
+        if self.enable_caching:
+            self.block_pool.touch(new_computed_blocks)
+        else:
+            assert not any(new_computed_blocks), (
+                "Computed blocks should be empty when prefix caching is disabled"
+            )
+
+        # Skip blocks are padded with null blocks.
+        req_blocks.extend([self._null_block] * num_skipped_blocks)
+        # Add the remaining computed blocks.
+        req_blocks.extend(new_computed_blocks)
+        # All cached hits (including skipped nulls) are already cached; mark
+        # them so cache_blocks() will not try to re-cache blocks that already
+        # have a block_hash set.
+        self.num_cached_block[request_id] = len(req_blocks)
+
+        if num_external_computed_tokens > 0:
+            # Allocate new blocks for external computed tokens.
+            allocated_blocks = self.block_pool.get_new_blocks(
+                cdiv(num_total_computed_tokens, self.block_size) - len(req_blocks)
+            )
+            req_blocks.extend(allocated_blocks)
 
     def allocate_new_blocks(
         self, request_id: str, num_tokens: int
@@ -252,7 +331,9 @@ class SingleTypeKVCacheManager(ABC):
 
         raise NotImplementedError
 
-    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
+    def remove_skipped_blocks(
+        self, request_id: str, total_computed_tokens: int
+    ) -> None:
         """
         Remove and free the blocks that are no longer needed for attention computation.
         The removed blocks should be replaced by null_block.
@@ -262,18 +343,24 @@ class SingleTypeKVCacheManager(ABC):
 
         Args:
             request_id: The request ID.
-            num_computed_tokens: The number of tokens that have been computed.
+            total_computed_tokens: The total number of computed tokens, including
+                local computed tokens and external computed tokens.
         """
         # Remove the blocks that will be skipped during attention computation.
-        num_skipped_tokens = self.get_num_skipped_tokens(num_computed_tokens)
+        num_skipped_tokens = self.get_num_skipped_tokens(total_computed_tokens)
         if num_skipped_tokens <= 0:
             # This indicates that ALL tokens are inside attention window.
             # Thus we do not need to free any blocks outside attention window.
             # A typical case is full attention that we never free any token
             # before the request is finished.
             return
-        num_skipped_blocks = num_skipped_tokens // self.block_size
         blocks = self.req_to_blocks[request_id]
+        num_skipped_blocks = num_skipped_tokens // self.block_size
+        # `num_skipped_tokens` may include tokens that haven't been allocated yet
+        # (e.g., when the attention window moves into the external computed tokens
+        # range), so we must cap to the number of blocks that currently exist for
+        # this request.
+        num_skipped_blocks = min(num_skipped_blocks, len(blocks))
         removed_blocks: list[KVCacheBlock] = []
         # Because the block starts from index 0, the num_skipped_block-th block
         # corresponds to index num_skipped_blocks - 1.
@@ -486,7 +573,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         Returns:
             The number of tokens that will be skipped for attention computation.
         """
-        return num_computed_tokens - self.sliding_window + 1
+        return max(0, num_computed_tokens - self.sliding_window + 1)
 
     def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         """
@@ -711,6 +798,7 @@ class MambaManager(SingleTypeKVCacheManager):
         request_id: str,
         num_tokens: int,
         new_computed_blocks: Sequence[KVCacheBlock],
+        total_computed_tokens: int,
     ) -> int:
         # Allocate extra `num_speculative_blocks` blocks for
         # speculative decoding (MTP/EAGLE) with linear attention.
@@ -721,7 +809,7 @@ class MambaManager(SingleTypeKVCacheManager):
                 * self.kv_cache_spec.num_speculative_blocks
             )
         return super().get_num_blocks_to_allocate(
-            request_id, num_tokens, new_computed_blocks
+            request_id, num_tokens, new_computed_blocks, total_computed_tokens
         )
 
     def allocate_new_blocks(
@@ -737,12 +825,24 @@ class MambaManager(SingleTypeKVCacheManager):
             )
         return super().allocate_new_blocks(request_id, num_tokens)
 
+    def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
+        """
+        Get the number of tokens whose mamba state are not needed anymore. Mamba only
+        need to keep the state of the last computed token, so we return
+        num_computed_tokens - 1.
+        """
+        return num_computed_tokens - 1
+
 
 class CrossAttentionManager(SingleTypeKVCacheManager):
     """Manager for cross-attention KV cache in encoder-decoder models."""
 
-    def save_new_computed_blocks(
-        self, request_id: str, new_computed_blocks: Sequence[KVCacheBlock]
+    def allocate_new_computed_blocks(
+        self,
+        request_id: str,
+        new_computed_blocks: Sequence[KVCacheBlock],
+        num_local_computed_tokens: int,
+        num_external_computed_tokens: int,
     ) -> None:
         # We do not cache blocks for cross-attention to be shared between
         # requests, so  `new_computed_blocks` should always be empty.
@@ -783,6 +883,30 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
         raise NotImplementedError("CrossAttentionManager does not support caching")
 
 
+class SinkFullAttentionManager(FullAttentionManager):
+    def __init__(
+        self,
+        kv_cache_spec: SinkFullAttentionSpec,
+        block_pool: BlockPool,
+        enable_caching: bool,
+        kv_cache_group_id: int,
+        dcp_world_size: int = 1,
+        pcp_world_size: int = 1,
+    ):
+        super().__init__(
+            kv_cache_spec,
+            block_pool,
+            enable_caching,
+            kv_cache_group_id,
+            dcp_world_size,
+            pcp_world_size,
+        )
+        sink_len = kv_cache_spec.sink_len
+        assert sink_len is not None and sink_len > 0 and sink_len % self.block_size == 0
+        num_sink_block = sink_len // self.block_size
+        self.sink_blocks = self.block_pool.free_block_queue.popleft_n(num_sink_block)
+
+
 spec_manager_map: dict[type[KVCacheSpec], type[SingleTypeKVCacheManager]] = {
     FullAttentionSpec: FullAttentionManager,
     MLAAttentionSpec: FullAttentionManager,
@@ -790,6 +914,7 @@ spec_manager_map: dict[type[KVCacheSpec], type[SingleTypeKVCacheManager]] = {
     ChunkedLocalAttentionSpec: ChunkedLocalAttentionManager,
     MambaSpec: MambaManager,
     CrossAttentionSpec: CrossAttentionManager,
+    SinkFullAttentionSpec: SinkFullAttentionManager,
 }
 
 
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 4f54d12f4b8d02e3ca6f4cb489e8fe9ea43abfa9..0ffb97206c66d39b021d730531ec9a8d1f27b93c 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -7,6 +7,7 @@ from collections.abc import Mapping
 from typing import Any
 
 import msgspec
+import numpy as np
 import torch
 
 from vllm.lora.request import LoRARequest
@@ -75,6 +76,12 @@ class EngineCoreRequest(
 
     trace_headers: Mapping[str, str] | None = None
 
+    # The user-provided request ID. This field is set internally,
+    # copied from the provided request_id that's originally assigned
+    # to the request_id field, see InputProcessor.assign_request_id().
+    # Used in outputs and to support abort(req_id, internal=False).
+    external_req_id: str | None = None
+
     @property
     def params(self) -> SamplingParams | PoolingParams:
         """Return the processed params (sampling or pooling)."""
@@ -133,7 +140,7 @@ class EngineCoreOutput(
     trace_headers: Mapping[str, str] | None = None
     # The number of tokens with prefix cache hits.
     num_cached_tokens: int = 0
-
+    routed_experts: np.ndarray | None = None
     # The number of NaNs in logits.
     # A value greater than 0 indicates that the output is corrupted.
     num_nans_in_logits: int = 0
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a6ee241c41151163538d42c40f692802e20e660b..454e20ad17b94fe94d6bcd8c5ba6376046a28e40 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -4,13 +4,12 @@ import asyncio
 import os
 import socket
 import time
+import warnings
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
 from typing import Any, cast
 
-import numpy as np
 import torch
-from typing_extensions import deprecated
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
@@ -32,7 +31,6 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils.async_utils import cancel_task_threadsafe
 from vllm.utils.collection_utils import as_list
-from vllm.utils.math_utils import cdiv
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
@@ -189,14 +187,6 @@ class AsyncLLM(EngineClient):
         else:
             self.profiler = None
 
-    @property
-    @deprecated(
-        "`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
-        "The old name will be removed in v0.14."
-    )
-    def processor(self):
-        return self.input_processor
-
     @classmethod
     def from_vllm_config(
         cls,
@@ -289,14 +279,39 @@ class AsyncLLM(EngineClient):
 
         is_pooling = isinstance(params, PoolingParams)
 
-        # Create a new output collector for the request.
-        queue = RequestOutputCollector(output_kind=params.output_kind)
+        if (
+            self.vllm_config.cache_config.kv_sharing_fast_prefill
+            and not is_pooling
+            and params.prompt_logprobs
+        ):
+            raise ValueError(
+                "--kv-sharing-fast-prefill produces incorrect logprobs for "
+                "prompt tokens, please disable it when the requests need "
+                "prompt logprobs"
+            )
+
+        if tokenization_kwargs is None:
+            tokenization_kwargs = {}
+        _validate_truncation_size(
+            self.model_config.max_model_len,
+            params.truncate_prompt_tokens,
+            tokenization_kwargs,
+        )
 
         # Convert Input --> Request.
         if isinstance(prompt, EngineCoreRequest):
             request = prompt
+            if request_id != request.request_id:
+                logger.warning_once(
+                    "AsyncLLM.add_request() was passed a request_id parameter that "
+                    "does not match the EngineCoreRequest.request_id attribute. The "
+                    "latter will be used, and the former will be ignored."
+                )
         else:
-            assert prompt_text is None
+            if prompt_text is not None:
+                raise ValueError(
+                    "should only provide prompt_text with EngineCoreRequest"
+                )
             request = self.input_processor.process_inputs(
                 request_id,
                 prompt,
@@ -313,6 +328,20 @@ class AsyncLLM(EngineClient):
             elif isinstance(prompt, Mapping):
                 prompt_text = cast(str | None, prompt.get("prompt"))
 
+        self.input_processor.assign_request_id(request)
+
+        # We start the output_handler on the first call to add_request() so
+        # we can call __init__ before the event loop, which enables us
+        # to handle startup failure gracefully in the OpenAI server.
+        self._run_output_handler()
+
+        # Respect pause state before accepting new requests.
+        async with self._pause_cond:
+            await self._pause_cond.wait_for(lambda: not self._paused)
+
+        # Create a new output collector for the request.
+        queue = RequestOutputCollector(params.output_kind, request.request_id)
+
         # Use cloned params that may have been updated in process_inputs()
         params = request.params
 
@@ -324,7 +353,7 @@ class AsyncLLM(EngineClient):
         assert isinstance(parent_params, SamplingParams)
 
         # Fan out child requests (for n>1).
-        parent_request = ParentRequest(request_id, parent_params)
+        parent_request = ParentRequest(request)
         for idx in range(parent_params.n):
             request_id, child_params = parent_request.get_child_info(idx)
             child_request = request if idx == parent_params.n - 1 else copy(request)
@@ -385,36 +414,8 @@ class AsyncLLM(EngineClient):
         returning the RequestOutput back to the caller.
         """
 
-        if (
-            self.vllm_config.cache_config.kv_sharing_fast_prefill
-            and sampling_params.prompt_logprobs
-        ):
-            raise ValueError(
-                "--kv-sharing-fast-prefill produces incorrect logprobs for "
-                "prompt tokens, please disable it when the requests need "
-                "prompt logprobs"
-            )
-
+        q: RequestOutputCollector | None = None
         try:
-            # We start the output_handler on the first call to generate() so
-            # we can call __init__ before the event loop, which enables us
-            # to handle startup failure gracefully in the OpenAI server.
-            self._run_output_handler()
-
-            # Wait until generation is resumed if the engine is paused.
-            async with self._pause_cond:
-                await self._pause_cond.wait_for(lambda: not self._paused)
-
-            if tokenization_kwargs is None:
-                tokenization_kwargs = {}
-                truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
-
-                _validate_truncation_size(
-                    self.model_config.max_model_len,
-                    truncate_prompt_tokens,
-                    tokenization_kwargs,
-                )
-
             q = await self.add_request(
                 request_id,
                 prompt,
@@ -445,7 +446,8 @@ class AsyncLLM(EngineClient):
         # is cancelled or the generator is garbage collected. So,
         # we abort the request if we end up here.
         except (asyncio.CancelledError, GeneratorExit):
-            await self.abort(request_id)
+            if q is not None:
+                await self.abort(q.request_id, internal=True)
             if self.log_requests:
                 logger.info("Request %s aborted.", request_id)
             raise
@@ -457,16 +459,25 @@ class AsyncLLM(EngineClient):
             raise
 
         # Request validation error.
-        except ValueError:
+        except ValueError as e:
             if self.log_requests:
-                logger.info("Request %s failed (bad request).", request_id)
+                logger.info("Request %s failed (bad request): %s.", request_id, e)
             raise
 
         # Unexpected error in the generate() task (possibly recoverable).
         except Exception as e:
-            await self.abort(request_id)
+            if q is not None:
+                await self.abort(q.request_id, internal=True)
             if self.log_requests:
-                logger.info("Request %s failed.", request_id)
+                try:
+                    s = f"{e.__class__.__name__}: {e}"
+                except Exception as e2:
+                    s = (
+                        f"{e.__class__.__name__}: "
+                        + "error during printing an exception of class"
+                        + e2.__class__.__name__
+                    )
+                logger.info("Request %s failed due to %s.", request_id, s)
             raise EngineGenerateError() from e
 
     def _run_output_handler(self):
@@ -482,6 +493,7 @@ class AsyncLLM(EngineClient):
         log_stats = self.log_stats
         logger_manager = self.logger_manager
         input_processor = self.input_processor
+        chunk_size = envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
 
         async def output_handler():
             try:
@@ -497,15 +509,10 @@ class AsyncLLM(EngineClient):
                     # Split outputs into chunks of at most
                     # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
                     # event loop for too long.
-                    if num_outputs <= envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
-                        slices = (outputs.outputs,)
-                    else:
-                        slices = np.array_split(
-                            outputs.outputs,
-                            cdiv(num_outputs, envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE),
-                        )
-
-                    for i, outputs_slice in enumerate(slices):
+                    engine_core_outputs = outputs.outputs
+                    for start in range(0, num_outputs, chunk_size):
+                        end = start + chunk_size
+                        outputs_slice = engine_core_outputs[start:end]
                         # 2) Process EngineCoreOutputs.
                         processed_outputs = output_processor.process_outputs(
                             outputs_slice, outputs.timestamp, iteration_stats
@@ -514,13 +521,14 @@ class AsyncLLM(EngineClient):
                         assert not processed_outputs.request_outputs
 
                         # Allow other asyncio tasks to run between chunks
-                        if i + 1 < len(slices):
+                        if end < num_outputs:
                             await asyncio.sleep(0)
 
                         # 3) Abort any reqs that finished due to stop strings.
-                        await engine_core.abort_requests_async(
-                            processed_outputs.reqs_to_abort
-                        )
+                        if processed_outputs.reqs_to_abort:
+                            await engine_core.abort_requests_async(
+                                processed_outputs.reqs_to_abort
+                            )
 
                     output_processor.update_scheduler_stats(outputs.scheduler_stats)
 
@@ -540,13 +548,15 @@ class AsyncLLM(EngineClient):
 
         self.output_handler = asyncio.create_task(output_handler())
 
-    async def abort(self, request_id: str | Iterable[str]) -> None:
+    async def abort(
+        self, request_id: str | Iterable[str], internal: bool = False
+    ) -> None:
         """Abort RequestId in OutputProcessor and EngineCore."""
 
         request_ids = (
             (request_id,) if isinstance(request_id, str) else as_list(request_id)
         )
-        all_request_ids = self.output_processor.abort_requests(request_ids)
+        all_request_ids = self.output_processor.abort_requests(request_ids, internal)
         await self.engine_core.abort_requests_async(all_request_ids)
 
         if self.log_requests:
@@ -580,7 +590,7 @@ class AsyncLLM(EngineClient):
         if not wait_for_inflight_requests:
             request_ids = list(self.output_processor.request_states.keys())
             if request_ids:
-                await self.abort(request_ids)
+                await self.abort(request_ids, internal=True)
 
         # Wait for running requests to drain before clearing cache.
         if self.output_processor.has_unfinished_requests():
@@ -627,25 +637,21 @@ class AsyncLLM(EngineClient):
 
         The caller of generate() iterates the returned AsyncGenerator,
         returning the RequestOutput back to the caller.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove truncate_prompt_tokens in v0.15.
         """
 
+        q: RequestOutputCollector | None = None
         try:
-            # We start the output_handler on the first call to generate() so
-            # we can call __init__ before the event loop, which enables us
-            # to handle startup failure gracefully in the OpenAI server.
-            self._run_output_handler()
-
-            # Respect pause state before accepting new requests.
-            async with self._pause_cond:
-                await self._pause_cond.wait_for(lambda: not self._paused)
-
-            if tokenization_kwargs is None:
-                tokenization_kwargs = {}
-            _validate_truncation_size(
-                self.model_config.max_model_len,
-                truncate_prompt_tokens,
-                tokenization_kwargs,
-            )
+            if truncate_prompt_tokens is not None:
+                warnings.warn(
+                    "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
+                    "is deprecated and will be removed in v0.15. "
+                    "Please use `pooling_params.truncate_prompt_tokens` instead.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
 
             q = await self.add_request(
                 request_id,
@@ -673,7 +679,8 @@ class AsyncLLM(EngineClient):
         # If the request is disconnected by the client, generate()
         # is cancelled. So, we abort the request if we end up here.
         except asyncio.CancelledError:
-            await self.abort(request_id)
+            if q is not None:
+                await self.abort(q.request_id, internal=True)
             if self.log_requests:
                 logger.info("Request %s aborted.", request_id)
             raise
@@ -692,7 +699,8 @@ class AsyncLLM(EngineClient):
 
         # Unexpected error in the generate() task (possibly recoverable).
         except Exception as e:
-            await self.abort(request_id)
+            if q is not None:
+                await self.abort(q.request_id, internal=True)
             if self.log_requests:
                 logger.info("Request %s failed.", request_id)
             raise EngineGenerateError() from e
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
index 953342cdd5d05dc1df5ef7ef0c7c014584e9c107..c2a9fe7c046ac6c6dd48bd6bd7d1752e47a951af 100644
--- a/vllm/v1/engine/coordinator.py
+++ b/vllm/v1/engine/coordinator.py
@@ -55,7 +55,9 @@ class DPCoordinator:
     request wave / running state changes.
     """
 
-    def __init__(self, parallel_config: ParallelConfig):
+    def __init__(
+        self, parallel_config: ParallelConfig, enable_wave_coordination: bool = True
+    ):
         dp_size = parallel_config.data_parallel_size
         assert dp_size > 1, "Coordinator only used for data parallel"
 
@@ -83,6 +85,7 @@ class DPCoordinator:
                 "front_publish_address": front_publish_address,
                 "back_output_address": back_output_address,
                 "back_publish_address": back_publish_address,
+                "enable_wave_coordination": enable_wave_coordination,
             },
             daemon=True,
         )
@@ -110,13 +113,19 @@ class EngineState:
 
 
 class DPCoordinatorProc:
-    def __init__(self, engine_count: int, min_stats_update_interval_ms: int = 100):
+    def __init__(
+        self,
+        engine_count: int,
+        min_stats_update_interval_ms: int = 100,
+        enable_wave_coordination: bool = True,
+    ):
         set_process_title("DPCoordinator")
         self.ctx = zmq.Context()
 
         self.engines = [EngineState() for _ in range(engine_count)]
 
         self.stats_update_interval_ms = min_stats_update_interval_ms
+        self.enable_wave_coordination = enable_wave_coordination
 
     @staticmethod
     def run_coordinator(
@@ -125,10 +134,12 @@ class DPCoordinatorProc:
         back_output_address: str,
         back_publish_address: str,
         min_stats_update_interval_ms: int = 100,
+        enable_wave_coordination: bool = True,
     ):
         coordinator = DPCoordinatorProc(
             engine_count=engine_count,
             min_stats_update_interval_ms=min_stats_update_interval_ms,
+            enable_wave_coordination=enable_wave_coordination,
         )
         try:
             coordinator.process_input_socket(
@@ -265,22 +276,25 @@ class DPCoordinatorProc:
                             )
                         continue  # Skip normal engine notification processing
 
-                    # We received a message on the front-end XPUB socket,
-                    # from an API server sending a new request while the
-                    # engines are paused, so that we can wake the other
-                    # engines.
-                    engine_to_exclude, wave = decoded
-                    if not engines_running:
-                        if wave < current_wave:
-                            # If the wave number is stale, ensure the message
-                            # is handled by all the engines.
-                            engine_to_exclude = None
-
-                        engines_running = True
-                        wave_state_changed = True
-                        self._send_start_wave(
-                            publish_back, current_wave, engine_to_exclude
-                        )
+                    # Wave coordination: handle new-request messages from front-end.
+                    # Only process these when wave coordination is enabled
+                    if self.enable_wave_coordination:
+                        # We received a message on the front-end XPUB socket,
+                        # from an API server sending a new request while the
+                        # engines are paused, so that we can wake the other
+                        # engines.
+                        engine_to_exclude, wave = decoded
+                        if not engines_running:
+                            if wave < current_wave:
+                                # If the wave number is stale, ensure the message
+                                # is handled by all the engines.
+                                engine_to_exclude = None
+
+                            engines_running = True
+                            wave_state_changed = True
+                            self._send_start_wave(
+                                publish_back, current_wave, engine_to_exclude
+                            )
 
                 if output_back in events:
                     # We received a message from one of the engines.
@@ -325,34 +339,39 @@ class DPCoordinatorProc:
                         stats[1] = scheduler_stats.num_running_reqs
                         stats_changed = True
 
-                    if (wave := outputs.wave_complete) is not None:
-                        # 2. Notification from rank 0 engine that we've
-                        # moved into the global paused state
-                        # (engines_running==False).
-                        if current_wave <= wave:
-                            new_wave = wave + 1
+                    # Wave coordination: handle wave completion and start notifications
+                    # Only process these when wave coordination is enabled
+                    if self.enable_wave_coordination:
+                        if (wave := outputs.wave_complete) is not None:
+                            # 2. Notification from rank 0 engine that we've
+                            # moved into the global paused state
+                            # (engines_running==False).
+                            if current_wave <= wave:
+                                new_wave = wave + 1
+                                logger.debug(
+                                    "Moving DP wave from %d to %d.",
+                                    current_wave,
+                                    new_wave,
+                                )
+                                current_wave = new_wave
+                                engines_running = False
+                                wave_state_changed = True
+                        elif (wave := outputs.start_wave) is not None and (
+                            wave > current_wave
+                            or (wave == current_wave and not engines_running)
+                        ):
+                            # 3. The engine received request for a non-current wave
+                            # so we must ensure that other engines progress to the
+                            # next wave (race condition handling).
                             logger.debug(
-                                "Moving DP wave from %d to %d.", current_wave, new_wave
+                                "Starting wave %d after notification of "
+                                "stale wave request from engine.",
+                                wave,
                             )
-                            current_wave = new_wave
-                            engines_running = False
+                            current_wave = wave
+                            engines_running = True
                             wave_state_changed = True
-                    elif (wave := outputs.start_wave) is not None and (
-                        wave > current_wave
-                        or (wave == current_wave and not engines_running)
-                    ):
-                        # 3. The engine received request for a non-current wave
-                        # so we must ensure that other engines progress to the
-                        # next wave (race condition handling).
-                        logger.debug(
-                            "Starting wave %d after notification of "
-                            "stale wave request from engine.",
-                            wave,
-                        )
-                        current_wave = wave
-                        engines_running = True
-                        wave_state_changed = True
-                        self._send_start_wave(publish_back, wave, eng_index)
+                            self._send_start_wave(publish_back, wave, eng_index)
 
                 if wave_state_changed:
                     message = (None, current_wave, engines_running)
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 0045b8c1dd3e772af9fe7f521f204bd4edee37dd..3ae0b31796929af8ff43a32ebd9c4e1868a808c7 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -43,9 +43,11 @@ from vllm.v1.core.kv_cache_utils import (
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.engine import (
+    EngineCoreOutput,
     EngineCoreOutputs,
     EngineCoreRequest,
     EngineCoreRequestType,
+    FinishReason,
     ReconfigureDistributedRequest,
     ReconfigureRankType,
     UtilityOutput,
@@ -63,6 +65,7 @@ from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.utils import compute_iteration_details
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
@@ -82,6 +85,7 @@ class EngineCore:
         executor_class: type[Executor],
         log_stats: bool,
         executor_fail_callback: Callable | None = None,
+        include_finished_set: bool = False,
     ):
         # plugins need to be loaded at the engine/scheduler level too
         from vllm.plugins import load_general_plugins
@@ -89,7 +93,7 @@ class EngineCore:
         load_general_plugins()
 
         self.vllm_config = vllm_config
-        if vllm_config.parallel_config.data_parallel_rank == 0:
+        if not vllm_config.parallel_config.data_parallel_rank_local:
             logger.info(
                 "Initializing a V1 LLM engine (v%s) with config: %s",
                 VLLM_VERSION,
@@ -136,7 +140,7 @@ class EngineCore:
             vllm_config=vllm_config,
             kv_cache_config=kv_cache_config,
             structured_output_manager=self.structured_output_manager,
-            include_finished_set=vllm_config.parallel_config.data_parallel_size > 1,
+            include_finished_set=include_finished_set,
             log_stats=self.log_stats,
             block_size=scheduler_block_size,
         )
@@ -176,10 +180,10 @@ class EngineCore:
         # to eliminate pipeline bubbles.
         self.batch_queue_size = self.model_executor.max_concurrent_batches
         self.batch_queue: (
-            deque[tuple[Future[ModelRunnerOutput], SchedulerOutput]] | None
+            deque[tuple[Future[ModelRunnerOutput], SchedulerOutput, Future[Any]]] | None
         ) = None
         if self.batch_queue_size > 1:
-            logger.info("Batch queue is enabled with size %d", self.batch_queue_size)
+            logger.debug("Batch queue is enabled with size %d", self.batch_queue_size)
             self.batch_queue = deque(maxlen=self.batch_queue_size)
 
         self.is_ec_producer = (
@@ -205,7 +209,6 @@ class EngineCore:
         self.async_scheduling = vllm_config.scheduler_config.async_scheduling
 
         self.aborts_queue = queue.Queue[list[str]]()
-
         # Mark the startup heap as static so that it's ignored by GC.
         # Reduces pause times of oldest generation collections.
         freeze_gc_heap()
@@ -245,9 +248,20 @@ class EngineCore:
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
 
+        # Track max_model_len before KV cache config to detect auto-fit changes
+        max_model_len_before = vllm_config.model_config.max_model_len
+
         kv_cache_configs = get_kv_cache_configs(
             vllm_config, kv_cache_specs, available_gpu_memory
         )
+
+        # If auto-fit reduced max_model_len, sync the new value to workers.
+        # This is needed because workers were spawned before memory profiling
+        # and have the original (larger) max_model_len cached.
+        max_model_len_after = vllm_config.model_config.max_model_len
+        if max_model_len_after != max_model_len_before:
+            self.collective_rpc("update_max_model_len", args=(max_model_len_after,))
+
         scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
         num_gpu_blocks = scheduler_kv_cache_config.num_blocks
         num_cpu_blocks = 0
@@ -323,15 +337,35 @@ class EngineCore:
             )
             raise err
 
-    def _log_err_callback(self, scheduler_output: SchedulerOutput):
-        """Log error details of a future that's not expected to return a result."""
-
-        def callback(f, sched_output=scheduler_output):
-            with self.log_error_detail(sched_output):
-                result = f.result()
-                assert result is None
-
-        return callback
+    @contextmanager
+    def log_iteration_details(self, scheduler_output: SchedulerOutput):
+        if not self.vllm_config.observability_config.enable_logging_iteration_details:
+            yield
+            return
+        self._iteration_index = getattr(self, "_iteration_index", 0)
+        iteration_details = compute_iteration_details(scheduler_output)
+        before = time.monotonic()
+        yield
+        logger.info(
+            "".join(
+                [
+                    "Iteration(",
+                    str(self._iteration_index),
+                    "): ",
+                    str(iteration_details.num_ctx_requests),
+                    " context requests, ",
+                    str(iteration_details.num_ctx_tokens),
+                    " context tokens, ",
+                    str(iteration_details.num_generation_requests),
+                    " generation requests, ",
+                    str(iteration_details.num_generation_tokens),
+                    " generation tokens, iteration elapsed time: ",
+                    format((time.monotonic() - before) * 1000, ".2f"),
+                    " ms",
+                ]
+            )
+        )
+        self._iteration_index += 1
 
     def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         """Schedule, execute, and make output.
@@ -347,7 +381,10 @@ class EngineCore:
         scheduler_output = self.scheduler.schedule()
         future = self.model_executor.execute_model(scheduler_output, non_block=True)
         grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
-        with self.log_error_detail(scheduler_output):
+        with (
+            self.log_error_detail(scheduler_output),
+            self.log_iteration_details(scheduler_output),
+        ):
             model_output = future.result()
             if model_output is None:
                 model_output = self.model_executor.sample_tokens(grammar_output)
@@ -409,8 +446,6 @@ class EngineCore:
                 # No sampling required (no requests scheduled).
                 future = cast(Future[ModelRunnerOutput], exec_future)
             else:
-                exec_future.add_done_callback(self._log_err_callback(scheduler_output))
-
                 if not scheduler_output.pending_structured_output_tokens:
                     # We aren't waiting for any tokens, get any grammar output
                     # and sample immediately.
@@ -427,7 +462,7 @@ class EngineCore:
 
             if not deferred_scheduler_output:
                 # Add this step's future to the queue.
-                batch_queue.appendleft((future, scheduler_output))
+                batch_queue.appendleft((future, scheduler_output, exec_future))
                 if (
                     model_executed
                     and len(batch_queue) < self.batch_queue_size
@@ -444,9 +479,17 @@ class EngineCore:
             return None, False
 
         # Block until the next result is available.
-        future, scheduler_output = batch_queue.pop()
-        with self.log_error_detail(scheduler_output):
+        future, scheduler_output, exec_model_fut = batch_queue.pop()
+        with (
+            self.log_error_detail(scheduler_output),
+            self.log_iteration_details(scheduler_output),
+        ):
             model_output = future.result()
+            if model_output is None:
+                # None from sample_tokens() implies that the original execute_model()
+                # call failed - raise that exception.
+                exec_model_fut.result()
+                raise RuntimeError("unexpected error")
 
         # Before processing the model output, process any aborts that happened
         # during the model execution.
@@ -459,13 +502,25 @@ class EngineCore:
         # in a field and do it immediately once step_with_batch_queue is
         # re-called. The latter slightly favors TTFT over TPOT/throughput.
         if deferred_scheduler_output:
+            # If we are doing speculative decoding with structured output,
+            # we need to get the draft token ids from the prior step before
+            # we can compute the grammar bitmask for the deferred request.
+            if self.use_spec_decode:
+                draft_token_ids = self.model_executor.take_draft_token_ids()
+                assert draft_token_ids is not None
+                # Update the draft token ids in the scheduler output to
+                # filter out the invalid spec tokens, which will be padded
+                # with -1 and skipped by the grammar bitmask computation.
+                self.scheduler.update_draft_token_ids_in_output(
+                    draft_token_ids, deferred_scheduler_output
+                )
             # We now have the tokens needed to compute the bitmask for the
             # deferred request. Get the bitmask and call sample tokens.
             grammar_output = self.scheduler.get_grammar_bitmask(
                 deferred_scheduler_output
             )
             future = self.model_executor.sample_tokens(grammar_output, non_block=True)
-            batch_queue.appendleft((future, deferred_scheduler_output))
+            batch_queue.appendleft((future, deferred_scheduler_output, exec_future))
 
         return engine_core_outputs, model_executed
 
@@ -474,10 +529,8 @@ class EngineCore:
             request_ids = []
             while not self.aborts_queue.empty():
                 ids = self.aborts_queue.get_nowait()
-                if isinstance(ids, str):
-                    # Should be a list here, but also handle string just in case.
-                    ids = (ids,)
-                request_ids.extend(ids)
+                # Should be a list here, but also handle string just in case.
+                request_ids.extend((ids,) if isinstance(ids, str) else ids)
             # More efficient to abort all as a single batch.
             self.abort_requests(request_ids)
 
@@ -594,6 +647,7 @@ class EngineCoreProc(EngineCore):
         executor_class: type[Executor],
         log_stats: bool,
         client_handshake_address: str | None = None,
+        *,
         engine_index: int = 0,
     ):
         self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
@@ -625,17 +679,22 @@ class EngineCoreProc(EngineCore):
                 self.has_coordinator,
                 self.frontend_stats_publish_address,
             )
-            # Only publish request queue stats to coordinator for "internal"
-            # and "hybrid" LB modes .
-            self.publish_dp_lb_stats = (
+            internal_dp_balancing = (
                 self.has_coordinator
                 and not vllm_config.parallel_config.data_parallel_external_lb
             )
+            # Only publish request queue stats to coordinator for "internal"
+            # and "hybrid" LB modes.
+            self.publish_dp_lb_stats = internal_dp_balancing
 
             self._init_data_parallel(vllm_config)
 
             super().__init__(
-                vllm_config, executor_class, log_stats, executor_fail_callback
+                vllm_config,
+                executor_class,
+                log_stats,
+                executor_fail_callback,
+                internal_dp_balancing,
             )
 
             # Background Threads and Queues for IO. These enable us to
@@ -843,18 +902,29 @@ class EngineCoreProc(EngineCore):
 
         engine_core: EngineCoreProc | None = None
         try:
-            parallel_config: ParallelConfig = kwargs["vllm_config"].parallel_config
-            if parallel_config.data_parallel_size > 1 or dp_rank > 0:
+            vllm_config: VllmConfig = kwargs["vllm_config"]
+            parallel_config: ParallelConfig = vllm_config.parallel_config
+            data_parallel = parallel_config.data_parallel_size > 1 or dp_rank > 0
+            if data_parallel:
+                parallel_config.data_parallel_rank_local = local_dp_rank
                 set_process_title("EngineCore", f"DP{dp_rank}")
-                decorate_logs()
+            else:
+                set_process_title("EngineCore")
+            decorate_logs()
+
+            parallel_config.data_parallel_index = dp_rank
+            if data_parallel and vllm_config.model_config.is_moe:
                 # Set data parallel rank for this engine process.
                 parallel_config.data_parallel_rank = dp_rank
-                parallel_config.data_parallel_rank_local = local_dp_rank
                 engine_core = DPEngineCoreProc(*args, **kwargs)
             else:
-                set_process_title("EngineCore")
-                decorate_logs()
-                engine_core = EngineCoreProc(*args, **kwargs)
+                # Non-MoE DP ranks are completely independent, so treat like DP=1.
+                # Note that parallel_config.data_parallel_index will still reflect
+                # the original DP rank.
+                parallel_config.data_parallel_size = 1
+                parallel_config.data_parallel_size_local = 1
+                parallel_config.data_parallel_rank = 0
+                engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
 
             engine_core.run_busy_loop()
 
@@ -923,6 +993,13 @@ class EngineCoreProc(EngineCore):
         # Post-step hook.
         self.post_step(model_executed)
 
+        # If no model execution happened but there are waiting requests
+        # (e.g., WAITING_FOR_REMOTE_KVS), yield the GIL briefly to allow
+        # background threads (like NIXL handshake) to make progress.
+        # Without this, the tight polling loop can starve background threads.
+        if not model_executed and self.scheduler.has_unfinished_requests():
+            time.sleep(0.001)
+
         return model_executed
 
     def _handle_client_request(
@@ -1048,9 +1125,14 @@ class EngineCoreProc(EngineCore):
                     request_type = EngineCoreRequestType(bytes(type_frame.buffer))
 
                     # Deserialize the request data.
+                    request: Any
                     if request_type == EngineCoreRequestType.ADD:
-                        request = add_request_decoder.decode(data_frames)
-                        request = self.preprocess_add_request(request)
+                        req: EngineCoreRequest = add_request_decoder.decode(data_frames)
+                        try:
+                            request = self.preprocess_add_request(req)
+                        except Exception:
+                            self._handle_request_preproc_error(req)
+                            continue
                     else:
                         request = generic_decoder.decode(data_frames)
 
@@ -1134,6 +1216,30 @@ class EngineCoreProc(EngineCore):
                     # Limit the number of buffers to reuse.
                     reuse_buffers.append(buffer)
 
+    def _handle_request_preproc_error(self, request: EngineCoreRequest) -> None:
+        """Log and return a request-scoped error response for exceptions raised
+        from the add request preprocessing in the input socket processing thread.
+        """
+        logger.exception(
+            "Unexpected error pre-processing request %s", request.request_id
+        )
+        self.output_queue.put_nowait(
+            (
+                request.client_index,
+                EngineCoreOutputs(
+                    engine_index=self.engine_index,
+                    finished_requests={request.request_id},
+                    outputs=[
+                        EngineCoreOutput(
+                            request_id=request.request_id,
+                            new_token_ids=[],
+                            finish_reason=FinishReason.ERROR,
+                        )
+                    ],
+                ),
+            )
+        )
+
 
 class DPEngineCoreProc(EngineCoreProc):
     """ZMQ-wrapper for running EngineCore in background process
@@ -1148,6 +1254,10 @@ class DPEngineCoreProc(EngineCoreProc):
         log_stats: bool,
         client_handshake_address: str | None = None,
     ):
+        assert vllm_config.model_config.is_moe, (
+            "DPEngineCoreProc should only be used for MoE models"
+        )
+
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
         self.step_counter = 0
@@ -1163,7 +1273,7 @@ class DPEngineCoreProc(EngineCoreProc):
             executor_class,
             log_stats,
             client_handshake_address,
-            dp_rank,
+            engine_index=dp_rank,
         )
 
     def _init_data_parallel(self, vllm_config: VllmConfig):
@@ -1344,7 +1454,7 @@ class DPEngineCoreProc(EngineCoreProc):
             )
 
 
-class DPEngineCoreActor(DPEngineCoreProc):
+class EngineCoreActorMixin:
     """
     Ray actor for running EngineCore in a data parallel context
     """
@@ -1352,15 +1462,12 @@ class DPEngineCoreActor(DPEngineCoreProc):
     def __init__(
         self,
         vllm_config: VllmConfig,
-        local_client: bool,
         addresses: EngineZmqAddresses,
-        executor_class: type[Executor],
-        log_stats: bool,
         dp_rank: int = 0,
         local_dp_rank: int = 0,
     ):
         self.addresses = addresses
-        vllm_config.parallel_config.data_parallel_rank = dp_rank
+        vllm_config.parallel_config.data_parallel_index = dp_rank
         vllm_config.parallel_config.data_parallel_rank_local = local_dp_rank
 
         # Set CUDA_VISIBLE_DEVICES as early as possible in actor life cycle
@@ -1382,8 +1489,6 @@ class DPEngineCoreActor(DPEngineCoreProc):
         # of ray.
         self._set_visible_devices(vllm_config, local_dp_rank)
 
-        super().__init__(vllm_config, local_client, "", executor_class, log_stats)
-
     def _set_visible_devices(self, vllm_config: VllmConfig, local_dp_rank: int):
         from vllm.platforms import current_platform
 
@@ -1444,7 +1549,7 @@ class DPEngineCoreActor(DPEngineCoreProc):
         Run the engine core busy loop.
         """
         try:
-            self.run_busy_loop()
+            self.run_busy_loop()  # type: ignore[attr-defined]
         except SystemExit:
             logger.debug("EngineCore exiting.")
             raise
@@ -1452,4 +1557,58 @@ class DPEngineCoreActor(DPEngineCoreProc):
             logger.exception("EngineCore encountered a fatal error.")
             raise
         finally:
-            self.shutdown()
+            self.shutdown()  # type: ignore[attr-defined]
+
+
+class DPMoEEngineCoreActor(EngineCoreActorMixin, DPEngineCoreProc):
+    """Used for MoE model data parallel cases."""
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_client: bool,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Executor],
+        log_stats: bool,
+        dp_rank: int = 0,
+        local_dp_rank: int = 0,
+    ):
+        vllm_config.parallel_config.data_parallel_rank = dp_rank
+
+        EngineCoreActorMixin.__init__(
+            self, vllm_config, addresses, dp_rank, local_dp_rank
+        )
+        DPEngineCoreProc.__init__(
+            self, vllm_config, local_client, "", executor_class, log_stats
+        )
+
+
+class EngineCoreActor(EngineCoreActorMixin, EngineCoreProc):
+    """Used for non-MoE and/or non-DP cases."""
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_client: bool,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Executor],
+        log_stats: bool,
+        dp_rank: int = 0,
+        local_dp_rank: int = 0,
+    ):
+        vllm_config.parallel_config.data_parallel_size = 1
+        vllm_config.parallel_config.data_parallel_size_local = 1
+        vllm_config.parallel_config.data_parallel_rank = 0
+
+        EngineCoreActorMixin.__init__(
+            self, vllm_config, addresses, dp_rank, local_dp_rank
+        )
+        EngineCoreProc.__init__(
+            self,
+            vllm_config,
+            local_client,
+            "",
+            executor_class,
+            log_stats,
+            engine_index=dp_rank,
+        )
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index c936646aa7993576a789d401696d9bd67ccb6122..905d8df4d35355a83f3d2b322bcfe9793f7fa647 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -20,6 +20,7 @@ import zmq
 import zmq.asyncio
 
 from vllm.config import VllmConfig
+from vllm.envs import VLLM_ENGINE_READY_TIMEOUT_S
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.tasks import SupportedTask
@@ -268,7 +269,8 @@ class InprocClient(EngineCoreClient):
         self.engine_core = EngineCore(*args, **kwargs)
 
     def get_output(self) -> EngineCoreOutputs:
-        outputs, _ = self.engine_core.step_fn()
+        outputs, model_executed = self.engine_core.step_fn()
+        self.engine_core.post_step(model_executed=model_executed)
         return outputs and outputs.get(0) or EngineCoreOutputs()
 
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
@@ -500,7 +502,7 @@ class MPClient(EngineCoreClient):
 
             parallel_config = vllm_config.parallel_config
             dp_size = parallel_config.data_parallel_size
-            dp_rank = parallel_config.data_parallel_rank
+            dp_rank = parallel_config.data_parallel_index
             dp_local_size = parallel_config.data_parallel_size_local
             offline_mode = parallel_config.data_parallel_rank_local is not None
             # Client manages local+remote EngineCores in pure internal LB case.
@@ -527,9 +529,11 @@ class MPClient(EngineCoreClient):
             identities = set(self.core_engines)
             sync_input_socket = zmq.Socket.shadow(self.input_socket)
             while identities:
-                if not sync_input_socket.poll(timeout=600_000):
+                if not sync_input_socket.poll(
+                    timeout=VLLM_ENGINE_READY_TIMEOUT_S * 1000  # convert to ms
+                ):
                     raise TimeoutError(
-                        "Timed out waiting for engines to send"
+                        "Timed out waiting for engines to send "
                         "initial message on input socket."
                     )
                 identity, _ = sync_input_socket.recv_multipart()
@@ -1339,7 +1343,9 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
         # Wait for ready messages from new engines on the input socket
         sync_input_socket = zmq.Socket.shadow(self.input_socket)
         while new_engine_identities:
-            if not sync_input_socket.poll(timeout=600_000):
+            if not sync_input_socket.poll(
+                timeout=VLLM_ENGINE_READY_TIMEOUT_S * 1000  # convert to ms
+            ):
                 raise TimeoutError(
                     "Timed out waiting for new engines to send initial "
                     "message on input socket."
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index 65e0c845b0afa160dc52aee7bc7e53389e6ed1f4..f9a77f581ab80156811698baf80062485d0c7deb 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -1,11 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
 import time
 from collections.abc import Mapping
 from typing import Any, Literal, cast
 
 from vllm.config import VllmConfig
+from vllm.exceptions import VLLMValidationError
 from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
 from vllm.inputs.parse import split_enc_dec_inputs
 from vllm.inputs.preprocess import InputPreprocessor
@@ -15,16 +17,20 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import processor_cache_from_config
 from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
 from vllm.multimodal.parse import MultiModalDataParser
-from vllm.multimodal.processing import EncDecMultiModalProcessor
+from vllm.multimodal.processing import EncDecMultiModalProcessor, set_request_id
 from vllm.multimodal.utils import argsort_mm_positions
 from vllm.pooling_params import PoolingParams
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import _SAMPLING_EPS, SamplingParams
 from vllm.tokenizers import TokenizerLike
 from vllm.tokenizers.mistral import MistralTokenizer
-from vllm.utils import length_from_prompt_token_ids_or_embeds
+from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
+from vllm.utils.torch_utils import set_default_torch_num_threads
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.metrics.stats import MultiModalCacheStats
-from vllm.v1.structured_output.backend_guidance import validate_guidance_grammar
+from vllm.v1.structured_output.backend_guidance import (
+    has_guidance_unsupported_json_features,
+    validate_guidance_grammar,
+)
 from vllm.v1.structured_output.backend_lm_format_enforcer import (
     validate_structured_output_request_lm_format_enforcer,
 )
@@ -57,6 +63,7 @@ class InputProcessor:
         self.input_preprocessor = InputPreprocessor(
             self.model_config,
             tokenizer,
+            self.vllm_config.observability_config,
             mm_registry,
             mm_processor_cache=self.mm_processor_cache,
         )
@@ -79,9 +86,11 @@ class InputProcessor:
             if num_logprobs == -1:
                 num_logprobs = self.model_config.get_vocab_size()
             if num_logprobs > max_logprobs:
-                raise ValueError(
+                raise VLLMValidationError(
                     f"Requested sample logprobs of {num_logprobs}, "
-                    f"which is greater than max allowed: {max_logprobs}"
+                    f"which is greater than max allowed: {max_logprobs}",
+                    parameter="logprobs",
+                    value=num_logprobs,
                 )
 
         # Validate prompt logprobs.
@@ -90,9 +99,11 @@ class InputProcessor:
             if num_prompt_logprobs == -1:
                 num_prompt_logprobs = self.model_config.get_vocab_size()
             if num_prompt_logprobs > max_logprobs:
-                raise ValueError(
+                raise VLLMValidationError(
                     f"Requested prompt logprobs of {num_prompt_logprobs}, "
-                    f"which is greater than max allowed: {max_logprobs}"
+                    f"which is greater than max allowed: {max_logprobs}",
+                    parameter="prompt_logprobs",
+                    value=num_prompt_logprobs,
                 )
 
     def _validate_sampling_params(
@@ -130,9 +141,11 @@ class InputProcessor:
                 invalid_token_ids.append(token_id)
 
         if invalid_token_ids:
-            raise ValueError(
+            raise VLLMValidationError(
                 f"token_id(s) {invalid_token_ids} in logit_bias contain "
-                f"out-of-vocab token ids. Vocabulary size: {vocab_size}"
+                f"out-of-vocab token ids. Vocabulary size: {vocab_size}",
+                parameter="logit_bias",
+                value=invalid_token_ids,
             )
 
     def _validate_supported_sampling_params(
@@ -142,24 +155,16 @@ class InputProcessor:
         # Logits processors not supported.
         if params.logits_processors:
             raise ValueError(
-                "vLLM V1 does not support per request user provided logits processors."
-            )
-        # Async scheduling + spec decode currently incompatible with some
-        # sampling parameters.
-        if (
-            self.vllm_config.speculative_config is not None
-            and self.vllm_config.scheduler_config.async_scheduling
-            and (
-                params.frequency_penalty != 0.0
-                or params.presence_penalty != 0.0
-                or params.repetition_penalty != 1.0
-                or params.bad_words_token_ids
-                or params.structured_outputs
+                "vLLM V1 does not support per request user-provided logits processors."
             )
+
+        # Some sampling parameters are not yet compatible with spec decoding.
+        if self.vllm_config.speculative_config is not None and (
+            params.min_tokens > 1 or params.min_p > _SAMPLING_EPS or params.logit_bias
         ):
             raise ValueError(
-                "async scheduling with spec decoding doesn't yet support "
-                "penalties, bad words or structured outputs in sampling parameters."
+                "The min_tokens, min_p, and logit_bias sampling parameters "
+                "are not yet supported with speculative decoding."
             )
 
     def _validate_params(
@@ -340,8 +345,22 @@ class InputProcessor:
                 # The request either failed validation
                 # or includes some jsonschema feature(s) that
                 # are not supported in xgrammar.
-                if isinstance(self.tokenizer, MistralTokenizer):
+
+                # Check if schema has features unsupported by guidance
+                so_params = params.structured_outputs
+                skip_guidance = False
+                if so_params.json:
+                    if isinstance(so_params.json, str):
+                        import json
+
+                        schema = json.loads(so_params.json)
+                    else:
+                        schema = so_params.json
+                    skip_guidance = has_guidance_unsupported_json_features(schema)
+
+                if isinstance(self.tokenizer, MistralTokenizer) or skip_guidance:
                     # Fall back to outlines if the tokenizer is Mistral
+                    # or if schema contains features unsupported by guidance
                     validate_structured_output_request_outlines(params)
                     params.structured_outputs._backend = "outlines"
                 else:
@@ -351,6 +370,10 @@ class InputProcessor:
             # Remember that this backend was set automatically
             params.structured_outputs._backend_was_auto = True
 
+        # Run post-init validation. This is also important to ensure subsequent
+        # roundtrip serialization/deserialization won't fail.
+        params.structured_outputs.__post_init__()
+
     def _maybe_build_mm_uuids(
         self,
         request_id: str,
@@ -389,6 +412,37 @@ class InputProcessor:
             mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)]
         return mm_uuids
 
+    def _get_mm_identifier(
+        self,
+        mm_hash: str,
+        lora_request: LoRARequest | None,
+    ) -> str:
+        """
+        When enable_tower_connector_lora is True, multi-modal embeddings
+        vary depending on the LoRA request. Therefore, the mm_hash must be
+        generated based on the LoRA request to prevent incorrect cache hits.
+        """
+        if (
+            lora_request is None
+            or self.lora_config is None
+            or not self.lora_config.enable_tower_connector_lora
+        ):
+            return mm_hash
+        return f"{lora_request.lora_name}:{mm_hash}"
+
+    @staticmethod
+    def assign_request_id(request: EngineCoreRequest):
+        """Replace the externally supplied request ID with an internal request ID
+        that adds 8 random characters in order to ensure uniquness.
+        """
+        if request.external_req_id is not None:
+            raise ValueError(
+                "The external_req_id field should not be set on EngineCoreRequests"
+                " passed to vLLM; use the request_id field."
+            )
+        request.external_req_id = request.request_id
+        request.request_id = f"{request.external_req_id}-{random_uuid():.8}"
+
     def process_inputs(
         self,
         request_id: str,
@@ -445,11 +499,21 @@ class InputProcessor:
         # 1. Tokenize text prompt, with LoRA request if one exists.
         # 2. For multimodal models with a merged preprocessor, preprocess
         #   multimodal data and expand prompt token ids accordingly.
-        processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
-            prompt,
-            tokenization_kwargs=tokenization_kwargs,
-            mm_uuids=mm_uuids,
-        )
+        num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
+        if "OMP_NUM_THREADS" not in os.environ:
+            logger.debug_once(
+                "OMP_NUM_THREADS is not set; defaulting Torch threads to %d for "
+                "input preprocessing.",
+                num_threads,
+            )
+
+        with set_request_id(request_id), set_default_torch_num_threads(num_threads):
+            processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
+                prompt,
+                tokenization_kwargs=tokenization_kwargs,
+                mm_uuids=mm_uuids,
+            )
+
         from vllm.platforms import current_platform
 
         current_platform.validate_request(
@@ -505,12 +569,17 @@ class InputProcessor:
 
             mm_features = []
             for modality, idx in sorted_mm_idxs:
+                base_mm_hash = decoder_mm_hashes[modality][idx]
                 mm_features.append(
                     MultiModalFeatureSpec(
                         data=decoder_mm_inputs[modality][idx],
                         modality=modality,
-                        identifier=decoder_mm_hashes[modality][idx],
+                        identifier=self._get_mm_identifier(
+                            base_mm_hash,
+                            lora_request,
+                        ),
                         mm_position=decoder_mm_positions[modality][idx],
+                        mm_hash=base_mm_hash,
                     )
                 )
 
@@ -567,7 +636,7 @@ class InputProcessor:
 
         tokenizer = self.tokenizer
         if tokenizer is not None:
-            max_input_id = max(prompt_ids or [], default=0)
+            max_input_id = max(prompt_ids or (), default=0)
 
             # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
             # self.model_config.get_vocab_size() is the model’s vocab size.
@@ -590,6 +659,7 @@ class InputProcessor:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
                     model_config,
+                    self.vllm_config.observability_config,
                     tokenizer=tokenizer,
                 )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 1011317b706d3ad154a70851f2b1e86343503a9c..c02143c7295eefa08c9b8fcabbe19177be3eb2e6 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -7,7 +7,7 @@ from copy import copy
 from typing import Any, cast
 
 import torch.nn as nn
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 
 import vllm.envs as envs
 from vllm.config import ParallelConfig, VllmConfig
@@ -65,8 +65,9 @@ class LLMEngine:
 
         self.log_stats = log_stats
 
-        executor_backend = self.vllm_config.parallel_config.distributed_executor_backend
         parallel_config = vllm_config.parallel_config
+        executor_backend = parallel_config.distributed_executor_backend
+
         self.external_launcher_dp = (
             parallel_config.data_parallel_size > 1
             and executor_backend == "external_launcher"
@@ -136,14 +137,6 @@ class LLMEngine:
         # Don't keep the dummy data in memory
         self.reset_mm_cache()
 
-    @property
-    @deprecated(
-        "`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
-        "The old name will be removed in v0.14."
-    )
-    def processor(self):
-        return self.input_processor
-
     @classmethod
     def from_vllm_config(
         cls,
@@ -213,10 +206,10 @@ class LLMEngine:
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         return self.engine_core.get_supported_tasks()
 
-    def abort_request(self, request_ids: list[str]) -> None:
+    def abort_request(self, request_ids: list[str], internal: bool = False) -> None:
         """Remove request_ids from EngineCore and Detokenizer."""
 
-        request_ids = self.output_processor.abort_requests(request_ids)
+        request_ids = self.output_processor.abort_requests(request_ids, internal)
         self.engine_core.abort_requests(request_ids)
 
     def add_request(
@@ -238,6 +231,12 @@ class LLMEngine:
         # Process raw inputs into the request.
         if isinstance(prompt, EngineCoreRequest):
             request = prompt
+            if request_id != request.request_id:
+                logger.warning_once(
+                    "AsyncLLM.add_request() was passed a request_id parameter that "
+                    "does not match the EngineCoreRequest.request_id attribute. The "
+                    "latter will be used, and the former will be ignored."
+                )
         else:
             assert prompt_text is None
             request = self.input_processor.process_inputs(
@@ -255,6 +254,8 @@ class LLMEngine:
             elif isinstance(prompt, Mapping):
                 prompt_text = cast(str | None, prompt.get("prompt"))
 
+        self.input_processor.assign_request_id(request)
+
         # Use cloned params that may have been updated in process_inputs()
         params = request.params
 
@@ -268,7 +269,7 @@ class LLMEngine:
             return
 
         # Fan out child requests (for n>1).
-        parent_req = ParentRequest(request_id, params)
+        parent_req = ParentRequest(request)
         for idx in range(n):
             request_id, child_params = parent_req.get_child_info(idx)
             child_request = request if idx == n - 1 else copy(request)
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index 599725b6de9188a2f5e4ca878abd0a5f8f77f2c3..64ac323126336f342fb7017eede3f1542f88623b 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
+from collections.abc import Iterable
 from dataclasses import dataclass
 
 from vllm.logger import init_logger
@@ -88,11 +89,16 @@ class LogprobsProcessor:
             logprobs = logprobs_np.tolist()
             token_ids = token_ids_np.tolist()
             # Detokenize (non-incrementally).
-            decoded_tokens = (
-                NONES
-                if self.tokenizer is None
-                else (convert_ids_list_to_tokens(self.tokenizer, token_ids))
-            )
+            decoded_tokens: list[str] | Iterable[None]
+            if self.tokenizer is None:
+                decoded_tokens = NONES
+            else:
+                decoded_tokens_list = convert_ids_list_to_tokens(
+                    self.tokenizer, token_ids
+                )
+                decoded_tokens = self._verify_tokens(
+                    decoded_tokens_list=decoded_tokens_list, tokens=token_ids
+                )
 
             # Sampler puts the sampled logprob in first.
             sampled_token_logprob = logprobs[0]
@@ -126,37 +132,45 @@ class LogprobsProcessor:
 
         token_ids, logprobs, ranks = prompt_logprobs_tensors
 
+        # Recover shapes.
+        num_prompt_tokens, num_logprobs = logprobs.shape
+
         # Detokenize non-incrementally.
         # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps]
-        decoded_tokens = (
+        all_decoded_tokens: list[str] | None = (
             None
             if self.tokenizer is None
-            else (
-                convert_ids_list_to_tokens(self.tokenizer, token_ids.flatten().tolist())
+            else convert_ids_list_to_tokens(
+                self.tokenizer, token_ids.flatten().tolist()
             )
         )
 
-        # Recover shapes.
-        num_prompt_tokens, num_logprobs = logprobs.shape
-
         # Pythonize the torch tensors.
         prompt_token_ranks = ranks.tolist()
         prompt_logprobs = logprobs.tolist()
-        token_ids = token_ids.tolist()
+        token_ids_list = token_ids.tolist()
 
         # Make Logprob for each position.
         for pos in range(num_prompt_tokens):
-            # Handle flattening.
+            # Handle flattening and UTF-8 correction per position
             offset = pos * num_logprobs
             offset_end = offset + num_logprobs
-            decoded_tokens_for_pos = (
-                NONES if decoded_tokens is None else decoded_tokens[offset:offset_end]
-            )
+
+            decoded_tokens_for_pos: list[str] | Iterable[None]
+            if all_decoded_tokens is None:
+                decoded_tokens_for_pos = NONES
+            else:
+                # Extract decoded tokens for this position
+                decoded_tokens_slice = all_decoded_tokens[offset:offset_end]
+                # Apply UTF-8 correction within this position's token boundaries
+                decoded_tokens_for_pos = self._verify_tokens(
+                    decoded_tokens_list=decoded_tokens_slice, tokens=token_ids_list[pos]
+                )
 
             # Update with the Logprob container for this pos.
             append_logprobs_for_next_position(
                 self.prompt_logprobs,
-                token_ids[pos],
+                token_ids_list[pos],
                 prompt_logprobs[pos],
                 decoded_tokens_for_pos,
                 prompt_token_ranks[pos],
@@ -182,6 +196,48 @@ class LogprobsProcessor:
             self.prompt_logprobs = []
         return plp
 
+    def _correct_decoded_token(self, idx: int, tokens: list[int]) -> str:
+        assert self.tokenizer is not None, "self.tokenizer should not be None"
+
+        # try with prev token id in same list
+        if idx > 0:
+            possible_decoded_token = self.tokenizer.decode(tokens[idx - 1 : idx + 1])
+            if not possible_decoded_token.endswith("�"):
+                return possible_decoded_token
+        # try with previous logprob token id
+        if self.logprobs:
+            latest_token_id = next(iter(self.logprobs[-1]))
+
+            decode_ids = [latest_token_id]
+            if idx > 0:
+                decode_ids.extend(tokens[idx - 1 : idx + 1])
+            else:
+                decode_ids.extend(tokens[idx : idx + 1])
+
+            possible_decoded_token = self.tokenizer.decode(decode_ids)
+            if not possible_decoded_token.endswith("�"):
+                return possible_decoded_token
+
+        # by default return empty string
+        return ""
+
+    def _verify_tokens(
+        self, decoded_tokens_list: list[str], tokens: list[int]
+    ) -> list[str]:
+        corrected_decoded_token_map = dict()
+        for idx, text in enumerate(decoded_tokens_list):
+            if text.endswith("�"):
+                # utf-8 char at the end means it's a potential unfinished byte sequence
+                # from byte fallback tokenization.
+                corrected_decoded_token_map[idx] = self._correct_decoded_token(
+                    idx, tokens
+                )
+
+        for idx, text in corrected_decoded_token_map.items():
+            decoded_tokens_list[idx] = text
+
+        return decoded_tokens_list
+
     def update_from_output(self, output: EngineCoreOutput) -> None:
         if output.new_logprobs is not None:
             self._update_sample_logprobs(output.new_logprobs)
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 9be3f4da7352d7bef61c472b1d958cb7aa69d2a6..f461e56fff076cdad1c14e7f5026b2a5f5bd5b28 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -2,12 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+from collections import defaultdict
 from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Any, cast
 
+import numpy as np
 import torch
 
+from vllm.lora.request import LoRARequest
 from vllm.outputs import (
     CompletionOutput,
     PoolingOutput,
@@ -29,6 +32,9 @@ from vllm.v1.metrics.stats import (
     SchedulerStats,
 )
 
+# shared empty CPU tensor used as a placeholder pooling output
+EMPTY_CPU_TENSOR = torch.empty(0, device="cpu")
+
 
 class RequestOutputCollector:
     """
@@ -39,8 +45,9 @@ class RequestOutputCollector:
     producer gets ahead of the consumer.
     """
 
-    def __init__(self, output_kind: RequestOutputKind):
+    def __init__(self, output_kind: RequestOutputKind, request_id: str):
         self.aggregate = output_kind == RequestOutputKind.DELTA
+        self.request_id = request_id
         self.output: RequestOutput | PoolingRequestOutput | Exception | None = None
         self.ready = asyncio.Event()
 
@@ -91,9 +98,10 @@ class RequestState:
     def __init__(
         self,
         request_id: str,
+        external_req_id: str,
         parent_req: ParentRequest | None,
         request_index: int,
-        lora_name: str | None,
+        lora_request: LoRARequest | None,
         output_kind: RequestOutputKind,
         prompt: str | None,
         prompt_token_ids: list[int] | None,
@@ -110,9 +118,11 @@ class RequestState:
         temperature: float | None = None,
     ):
         self.request_id = request_id
+        self.external_req_id = external_req_id
         self.parent_req = parent_req
         self.request_index = request_index
-        self.lora_name = lora_name
+        self.lora_request = lora_request
+        self.lora_name = lora_request.lora_name if lora_request is not None else None
         self.output_kind = output_kind
         self.prompt = prompt
         self.prompt_token_ids = prompt_token_ids
@@ -174,13 +184,13 @@ class RequestState:
             assert request.pooling_params is not None
             output_kind = request.pooling_params.output_kind
 
+        assert request.external_req_id is not None
         return cls(
             request_id=request.request_id,
+            external_req_id=request.external_req_id,
             parent_req=parent_req,
             request_index=request_index,
-            lora_name=(
-                request.lora_request.name if request.lora_request is not None else None
-            ),
+            lora_request=request.lora_request,
             output_kind=output_kind,
             prompt=prompt,
             prompt_token_ids=request.prompt_token_ids,
@@ -204,6 +214,7 @@ class RequestState:
         finish_reason: FinishReason | None,
         stop_reason: int | str | None,
         kv_transfer_params: dict[str, Any] | None = None,
+        routed_experts: np.ndarray | None = None,
     ) -> RequestOutput | PoolingRequestOutput | None:
         finished = finish_reason is not None
         final_only = self.output_kind == RequestOutputKind.FINAL_ONLY
@@ -235,30 +246,34 @@ class RequestState:
                 ]
                 self.sent_tokens_offset = len(self.detokenizer.output_token_ids)
 
-        request_id = self.request_id
+        external_req_id = self.external_req_id
+
         if pooling_output is not None:
             return self._new_request_output(
-                request_id, [self._new_pooling_output(pooling_output)], finished
+                external_req_id,
+                [self._new_pooling_output(pooling_output)],
+                finished,
             )
 
-        output = self._new_completion_output(new_token_ids, finish_reason, stop_reason)
+        output = self._new_completion_output(
+            new_token_ids, finish_reason, stop_reason, routed_experts
+        )
 
         if self.parent_req is None:
             outputs = [output]
         else:
-            request_id, outputs, finished = self.parent_req.get_outputs(
-                request_id, output
-            )
+            outputs, finished = self.parent_req.get_outputs(self.request_id, output)
             if not outputs:
                 return None
+            external_req_id = self.parent_req.external_req_id
 
         return self._new_request_output(
-            request_id, outputs, finished, kv_transfer_params
+            external_req_id, outputs, finished, kv_transfer_params
         )
 
     def _new_request_output(
         self,
-        request_id: str,
+        external_req_id: str,
         outputs: list[CompletionOutput] | list[PoolingOutput],
         finished: bool,
         kv_transfer_params: dict[str, Any] | None = None,
@@ -269,7 +284,7 @@ class RequestState:
             # Prompt embeddings are currently not supported by pooling requests.
             assert self.prompt_token_ids is not None
             return PoolingRequestOutput(
-                request_id=request_id,
+                request_id=external_req_id,
                 outputs=first_output,
                 num_cached_tokens=self.num_cached_tokens,
                 prompt_token_ids=self.prompt_token_ids,
@@ -288,7 +303,8 @@ class RequestState:
             prompt_token_ids = [0] * len(self.prompt_embeds)
 
         return RequestOutput(
-            request_id=request_id,
+            request_id=external_req_id,  # request_id is what was provided externally
+            lora_request=self.lora_request,
             prompt=self.prompt,
             prompt_token_ids=prompt_token_ids,
             prompt_logprobs=prompt_logprobs,
@@ -304,6 +320,7 @@ class RequestState:
         token_ids: list[int],
         finish_reason: FinishReason | None,
         stop_reason: int | str | None,
+        routed_experts: np.ndarray | None = None,
     ) -> CompletionOutput:
         assert self.detokenizer is not None
         assert self.logprobs_processor is not None
@@ -324,16 +341,14 @@ class RequestState:
             index=self.request_index,
             text=text,
             token_ids=token_ids,
+            routed_experts=routed_experts,
             logprobs=logprobs,
             cumulative_logprob=self.logprobs_processor.cumulative_logprob,
             finish_reason=str(finish_reason) if finished else None,
             stop_reason=stop_reason if finished else None,
         )
 
-    def _new_pooling_output(
-        self,
-        pooling_output: torch.Tensor,
-    ) -> PoolingOutput:
+    def _new_pooling_output(self, pooling_output: torch.Tensor) -> PoolingOutput:
         return PoolingOutput(data=pooling_output)
 
 
@@ -351,6 +366,7 @@ class OutputProcessor:
         self.stream_interval = stream_interval
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
+        self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
         self.lora_states = LoRARequestStates(log_stats)
         self.tracer: Tracer | None = None
         self._requests_drained = asyncio.Event()
@@ -374,12 +390,41 @@ class OutputProcessor:
             assert state.queue is not None
             state.queue.put(e)
 
-    def abort_requests(
-        self,
-        request_ids: Iterable[str],
-    ) -> list[str]:
-        request_ids_to_abort = []
+    def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]:
+        """Abort a list of requests.
+
+        The request_ids may be either external request IDs (those passed to
+        InputProcessor.process_inputs()) or internal request IDs (those randomly
+        generated when creating the EngineCoreRequest).
+
+        If an external request ID is provided, and that external request ID
+        was used for multiple requests, all requests associated with that external
+        request ID are aborted.
+
+        In the case of parallel sampling, a request ID may be used to identify
+        a parent request, in which case the associated child requests are aborted
+        also.
+        """
+
+        internal_req_ids = []
         for request_id in request_ids:
+            if internal:
+                # Internal ID - this may be a parent request
+                internal_req_ids.append(request_id)
+
+                # Remove internal ID from the external->internal mapping
+                if req_state := self.request_states.get(request_id):
+                    external_req_id = req_state.external_req_id
+                    internal_ids = self.external_req_ids[external_req_id]
+                    internal_ids.remove(request_id)
+                    if not internal_ids:
+                        del self.external_req_ids[external_req_id]
+            elif internal_ids := self.external_req_ids.pop(request_id, []):
+                # External ID - abort all requests in the external->internal mapping
+                internal_req_ids.extend(internal_ids)
+
+        request_ids_to_abort = []
+        for request_id in internal_req_ids:
             req_state = self.request_states.pop(request_id, None)
             if req_state is not None:
                 self.lora_states.request_finished(request_id, req_state.lora_name)
@@ -390,7 +435,7 @@ class OutputProcessor:
                         new_token_ids=[],
                         # Set pooling_output is not None to
                         # correctly enter the abort pooling branch
-                        pooling_output=torch.randn(0, device="cpu")
+                        pooling_output=EMPTY_CPU_TENSOR
                         if req_state.detokenizer is None
                         else None,
                         finish_reason=FinishReason.ABORT,
@@ -403,7 +448,7 @@ class OutputProcessor:
                 # Abort children prior to removing the parent.
                 if parent.child_requests:
                     child_reqs = list(parent.child_requests)
-                    child_reqs = self.abort_requests(child_reqs)
+                    child_reqs = self.abort_requests(child_reqs, internal=True)
                     request_ids_to_abort.extend(child_reqs)
                 self.parent_requests.pop(request_id, None)
         if not self.request_states:
@@ -438,6 +483,9 @@ class OutputProcessor:
         if parent_req:
             self.parent_requests[parent_req.request_id] = parent_req
 
+        # Track the external_req_id -> [internal_req_id, ...] mapping
+        self.external_req_ids[req_state.external_req_id].append(request_id)
+
     def process_outputs(
         self,
         engine_core_outputs: list[EngineCoreOutput],
@@ -485,6 +533,7 @@ class OutputProcessor:
             finish_reason = engine_core_output.finish_reason
             stop_reason = engine_core_output.stop_reason
             kv_transfer_params = engine_core_output.kv_transfer_params
+            routed_experts = engine_core_output.routed_experts
             req_state.num_cached_tokens = engine_core_output.num_cached_tokens
             req_state.is_prefilling = False
 
@@ -510,6 +559,7 @@ class OutputProcessor:
                 finish_reason,
                 stop_reason,
                 kv_transfer_params,
+                routed_experts,
             ):
                 if req_state.queue is not None:
                     # AsyncLLM: put into queue for handling by generate().
@@ -521,6 +571,12 @@ class OutputProcessor:
             # Free completed requests.
             if finish_reason is not None:
                 self.request_states.pop(req_id)
+
+                internal_ids = self.external_req_ids[req_state.external_req_id]
+                internal_ids.remove(req_id)
+                if not internal_ids:
+                    del self.external_req_ids[req_state.external_req_id]
+
                 # Remove parent request if applicable.
                 parent_req = req_state.parent_req
                 if parent_req and not parent_req.child_requests:
@@ -596,7 +652,9 @@ class OutputProcessor:
             )
 
             # meta
-            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_REQUEST_ID, req_state.external_req_id
+            )
             if req_state.top_p:
                 span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p)
             if req_state.max_tokens_param:
@@ -645,9 +703,7 @@ class OutputProcessor:
         assert req_state.stats is not None
         iteration_stats.update_from_finished_request(
             finish_reason=finish_reason,
-            num_prompt_tokens=length_from_prompt_token_ids_or_embeds(
-                req_state.prompt_token_ids, req_state.prompt_embeds
-            ),
+            num_prompt_tokens=req_state.prompt_len,
             max_tokens_param=req_state.max_tokens_param,
             req_stats=req_state.stats,
             num_cached_tokens=req_state.num_cached_tokens,
diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py
index 59aacd1963076ae2920ba417ad91121544610008..b7761970ba92f9c6535d7602af88417bc5361294 100644
--- a/vllm/v1/engine/parallel_sampling.py
+++ b/vllm/v1/engine/parallel_sampling.py
@@ -6,6 +6,7 @@ from typing import Optional, cast
 
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.metrics.stats import IterationStats
 
 
@@ -17,6 +18,7 @@ class ParentRequest:
     """
 
     request_id: str
+    external_req_id: str
     sampling_params: SamplingParams
 
     # To track the completion of child requests
@@ -31,8 +33,11 @@ class ParentRequest:
     # To efficiently obtain child sampling params
     cached_child_sampling_params: SamplingParams | None
 
-    def __init__(self, request_id: str, sampling_params: SamplingParams) -> None:
-        self.request_id = request_id
+    def __init__(self, request: EngineCoreRequest) -> None:
+        assert request.external_req_id is not None
+        sampling_params = request.params
+        self.request_id = request.request_id
+        self.external_req_id = request.external_req_id
         self.sampling_params = sampling_params
 
         self.child_requests = set()
@@ -96,7 +101,7 @@ class ParentRequest:
         self,
         child_request_id: str,
         completion_output: CompletionOutput,
-    ) -> tuple[str, list[CompletionOutput], bool]:
+    ) -> tuple[list[CompletionOutput], bool]:
         already_finished_and_returned: bool = False
         if completion_output.finished():
             if child_request_id in self.child_requests:
@@ -118,7 +123,7 @@ class ParentRequest:
             outputs = [] if self.child_requests else self.output_aggregator
 
         finished = not self.child_requests
-        return self.request_id, outputs, finished
+        return outputs, finished
 
     def observe_num_generation_tokens(self, num_generation_tokens: int):
         self.max_num_generation_tokens = max(
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
deleted file mode 100644
index a8c93499299d3cdde188c210af34636f1d8a34e9..0000000000000000000000000000000000000000
--- a/vllm/v1/engine/processor.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import warnings
-
-
-def __getattr__(name: str):
-    if name == "Processor":
-        from .input_processor import InputProcessor
-
-        warnings.warn(
-            "`vllm.v1.engine.processor.Processor` has been moved to "
-            "`vllm.v1.engine.input_processor.InputProcessor`. "
-            "The old name will be removed in v0.14.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return InputProcessor
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index 24bf66c42f312202595ff2d4988b9105fd237e07..66212ed7cd5efdeba10f57eba6e7a9face13b116 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -75,7 +75,6 @@ class EngineHandshakeMetadata:
 
     addresses: EngineZmqAddresses
     parallel_config: dict[str, int | str | list[int]]
-    parallel_config_hash: str | None = None
 
 
 class CoreEngineProcManager:
@@ -249,12 +248,19 @@ class CoreEngineActorManager:
         from ray.runtime_env import RuntimeEnv
         from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
-        from vllm.v1.engine.core import DPEngineCoreActor
+        from vllm.v1.engine.core import DPMoEEngineCoreActor, EngineCoreActor
+
+        dp_size = vllm_config.parallel_config.data_parallel_size
+        actor_class = (
+            DPMoEEngineCoreActor
+            if dp_size > 1 and vllm_config.model_config.is_moe
+            else EngineCoreActor
+        )
 
         self.local_engine_actors: list[ray.ActorHandle] = []
         self.remote_engine_actors: list[ray.ActorHandle] = []
 
-        env_vars_list = get_env_vars_to_copy(destination="DPEngineCoreActor")
+        env_vars_list = get_env_vars_to_copy(destination=actor_class.__name__)
         self.env_vars_dict = {
             name: os.environ[name] for name in env_vars_list if name in os.environ
         }
@@ -263,7 +269,6 @@ class CoreEngineActorManager:
         self.addresses = addresses
         self.executor_class = executor_class
         self.log_stats = log_stats
-        dp_size = vllm_config.parallel_config.data_parallel_size
         local_engine_count = vllm_config.parallel_config.data_parallel_size_local
         world_size = vllm_config.parallel_config.world_size
 
@@ -314,7 +319,7 @@ class CoreEngineActorManager:
                 runtime_env = RuntimeEnv(env_vars=actor_env_vars)
 
             actor = (
-                ray.remote(DPEngineCoreActor)
+                ray.remote(actor_class)
                 .options(
                     scheduling_strategy=PlacementGroupSchedulingStrategy(
                         placement_group=pg,
@@ -624,7 +629,13 @@ class CoreEngineActorManager:
         from ray.runtime_env import RuntimeEnv
         from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
-        from vllm.v1.engine.core import DPEngineCoreActor
+        from vllm.v1.engine.core import DPMoEEngineCoreActor, EngineCoreActor
+
+        actor_class = (
+            DPMoEEngineCoreActor
+            if cur_vllm_config.model_config.is_moe
+            else EngineCoreActor
+        )
 
         cur_data_parallel_size = len(self.local_engine_actors) + len(
             self.remote_engine_actors
@@ -667,7 +678,7 @@ class CoreEngineActorManager:
                 )
 
             actor = (
-                ray.remote(DPEngineCoreActor)
+                ray.remote(actor_class)
                 .options(
                     scheduling_strategy=PlacementGroupSchedulingStrategy(
                         placement_group=pg,
@@ -804,12 +815,19 @@ def launch_core_engines(
         ],
     )
 
-    # Run the DP Coordinator process with rank 0 when in
-    # online DP mode.
-    run_coordinator = dp_size > 1 and not offline_mode and dp_rank == 0
+    # Run the DP Coordinator process with rank 0 when in online DP mode.
+    # The coordinator is needed for:
+    # 1. Internal/hybrid LB: collecting and publishing queue stats for load balancing
+    # 2. MoE models: wave coordination in addition to stats
+    run_coordinator = (
+        vllm_config.needs_dp_coordinator and not offline_mode and dp_rank == 0
+    )
 
     if run_coordinator:
-        coordinator = DPCoordinator(parallel_config)
+        coordinator = DPCoordinator(
+            parallel_config,
+            enable_wave_coordination=vllm_config.model_config.is_moe,
+        )
 
         addresses.coordinator_input, addresses.coordinator_output = (
             coordinator.get_engine_socket_addresses()
@@ -905,6 +923,7 @@ def launch_core_engines(
             addresses,
             engines_to_handshake,
             parallel_config,
+            dp_size > 1 and vllm_config.model_config.is_moe,
             vllm_config.cache_config,
             local_engine_manager,
             coordinator.proc if coordinator else None,
@@ -916,6 +935,7 @@ def wait_for_engine_startup(
     addresses: EngineZmqAddresses,
     core_engines: list[CoreEngine],
     parallel_config: ParallelConfig,
+    coordinated_dp: bool,
     cache_config: CacheConfig,
     proc_manager: CoreEngineProcManager | None,
     coord_process: Process | None,
@@ -997,8 +1017,7 @@ def wait_for_engine_startup(
                 )
 
         if status == "HELLO" and engine.state == CoreEngineState.NEW:
-            # Send init message with DP config info and config hash.
-            # The config hash ensures all DP workers have compatible configs.
+            # Send init message with DP config info.
             init_message = msgspec.msgpack.encode(
                 EngineHandshakeMetadata(
                     addresses=addresses,
@@ -1010,10 +1029,9 @@ def wait_for_engine_startup(
                             "_data_parallel_master_port_list",
                             "data_parallel_size",
                         )
-                    },
-                    parallel_config_hash=parallel_config.compute_hash()
-                    if parallel_config.data_parallel_size > 1
-                    else None,
+                    }
+                    if coordinated_dp
+                    else {},
                 )
             )
             handshake_socket.send_multipart((eng_identity, init_message), copy=False)
@@ -1034,8 +1052,8 @@ def wait_for_engine_startup(
             if addresses.frontend_stats_publish_address is None:
                 addresses.frontend_stats_publish_address = msg.get("dp_stats_address")
 
-            # Validate config hash consistency across DP workers
-            if parallel_config.data_parallel_size > 1:
+            # Validate config hash consistency across DP workers for MoE models.
+            if coordinated_dp:
                 worker_config_hash = msg.get("parallel_config_hash")
                 expected_hash = parallel_config.compute_hash()
                 if worker_config_hash != expected_hash:
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 649875fe8b7c1626bd0c9e0eebd5f33c0708c9b7..7b427b4a6cdee394b7f0a630c3583b9e4e47fe23 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -519,9 +519,7 @@ class WorkerProc:
         shared_worker_lock: LockType,
     ):
         self.rank = rank
-        wrapper = WorkerWrapperBase(
-            vllm_config=vllm_config, rpc_rank=local_rank, global_rank=rank
-        )
+        wrapper = WorkerWrapperBase(rpc_rank=local_rank, global_rank=rank)
         # TODO: move `init_worker` to executor level as a collective rpc call
         all_kwargs: list[dict] = [
             {} for _ in range(vllm_config.parallel_config.world_size)
@@ -695,7 +693,7 @@ class WorkerProc:
         worker = None
         # tuple[Connection, Connection]
         reader, ready_writer = kwargs.pop("ready_pipe")
-        death_pipe = kwargs.pop("death_pipe", None)
+        death_pipe: Connection | None = kwargs.pop("death_pipe", None)
         shutdown_event = threading.Event()
         # Start death monitoring thread if death_pipe is provided
         if death_pipe is not None:
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
index 2fd64e5c2277c28fff0cc526560c0f8c98318e8b..292fa877f5a46a3d762760184b721b51eff0f469 100644
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -208,9 +208,7 @@ class RayDistributedExecutor(Executor):
                     num_gpus=num_gpus,
                     scheduling_strategy=scheduling_strategy,
                     **ray_remote_kwargs,
-                )(RayWorkerWrapper).remote(  # type: ignore[attr-defined]
-                    vllm_config=self.vllm_config, rpc_rank=rank
-                )
+                )(RayWorkerWrapper).remote(rpc_rank=rank)
             else:
                 worker = ray.remote(
                     num_cpus=0,
@@ -218,9 +216,8 @@ class RayDistributedExecutor(Executor):
                     resources={current_platform.ray_device_key: num_gpus},
                     scheduling_strategy=scheduling_strategy,
                     **ray_remote_kwargs,
-                )(RayWorkerWrapper).remote(  # type: ignore[attr-defined]
-                    vllm_config=self.vllm_config, rpc_rank=rank
-                )
+                )(RayWorkerWrapper).remote(rpc_rank=rank)
+
             worker_metadata.append(RayWorkerMetaData(worker=worker, created_rank=rank))
 
         worker_ips = ray.get(
diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 21910d1160bd490238dcd3f2295345cf6f2d2e96..dadf55006b264ed6cbd2f9a711cff7fd246c25c7 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -104,8 +104,11 @@ try:
                 scheduler_output, intermediate_tensors
             )
             if isinstance(output, IntermediateTensors):
-                output = scheduler_output, grammar_output, output
-            elif not get_pp_group().is_last_rank:
+                return scheduler_output, grammar_output, output
+
+            if isinstance(output, AsyncModelRunnerOutput):
+                output = output.get_output()
+            if not get_pp_group().is_last_rank:
                 # Case where there are no scheduled requests
                 # but may still be finished requests.
                 assert not output or not output.req_ids
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index b8ca92255430459d1945368518ae952542d950a3..b9c7b550170b095e78f48be0ad3d165868ef6f3d 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -26,7 +26,7 @@ logger = init_logger(__name__)
 class UniProcExecutor(Executor):
     def _init_executor(self) -> None:
         """Initialize the worker and load the model."""
-        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0)
+        self.driver_worker = WorkerWrapperBase(rpc_rank=0)
         distributed_init_method, rank, local_rank = self._distributed_args()
         kwargs = dict(
             vllm_config=self.vllm_config,
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 751862aa9c7673ef4b0012f7c35f7834b136709f..5c9913bb095bac96d2d50cddc246d6a2df765fba 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -61,14 +61,23 @@ class KVCacheSpec:
         return copy.deepcopy(specs[0])
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class AttentionSpec(KVCacheSpec):
     num_kv_heads: int
     head_size: int
     dtype: torch.dtype
+    page_size_padded: int | None = None
 
     @property
     def page_size_bytes(self) -> int:
+        real_page_size = self.real_page_size_bytes
+        if self.page_size_padded is not None:
+            assert self.page_size_padded >= real_page_size
+            return self.page_size_padded
+        return real_page_size
+
+    @property
+    def real_page_size_bytes(self) -> int:
         return (
             2
             * self.block_size
@@ -78,19 +87,28 @@ class AttentionSpec(KVCacheSpec):
         )
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class FullAttentionSpec(AttentionSpec):
-    sliding_window: int | None = None
-    attention_chunk_size: int | None = None
     """
-    When hybrid allocator is disabled and the model contains both full 
-    attention layers and sliding window attention layers, sliding 
-    window attention are regarded as full attention in KV cache manager 
-    (blocks are allocated for all tokens), while computed as sliding window 
+    When hybrid allocator is disabled and the model contains both full
+    attention layers and sliding window attention layers, sliding
+    window attention are regarded as full attention in KV cache manager
+    (blocks are allocated for all tokens), while computed as sliding window
     attention in model runner.
     In this case, we use FullAttentionSpec and record the sliding window size.
+    """
+
+    head_size_v: int | None = None
+
+    sliding_window: int | None = None
+    """
     Default to None for not using sliding window attention.
     """
+    attention_chunk_size: int | None = None
+
+    def __post_init__(self):
+        if self.head_size_v is None:
+            object.__setattr__(self, "head_size_v", self.head_size)
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_model_len = vllm_config.model_config.max_model_len
@@ -139,7 +157,9 @@ class FullAttentionSpec(AttentionSpec):
             block_size=specs[0].block_size,
             num_kv_heads=specs[0].num_kv_heads,
             head_size=specs[0].head_size,
+            head_size_v=specs[0].head_size_v,
             dtype=specs[0].dtype,
+            page_size_padded=specs[0].page_size_padded,
             sliding_window=cls.merge_window_sizes(sliding_window),
             attention_chunk_size=cls.merge_window_sizes(attention_chunk_size),
         )
@@ -157,14 +177,23 @@ class FullAttentionSpec(AttentionSpec):
         )
         return merged_spec
 
+    @property
+    def real_page_size_bytes(self) -> int:
+        return (
+            self.block_size
+            * self.num_kv_heads
+            * (self.head_size + self.head_size_v)
+            * get_dtype_size(self.dtype)
+        )
+
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class MLAAttentionSpec(FullAttentionSpec):
     # TODO(Lucas/Chen): less hacky way to do this
     cache_dtype_str: str | None = None
 
     @property
-    def page_size_bytes(self) -> int:
+    def real_page_size_bytes(self) -> int:
         if self.cache_dtype_str == "fp8_ds_mla":
             # See `vllm/v1/attention/backends/mla/flashmla_sparse.py`
             #  for details.
@@ -191,11 +220,12 @@ class MLAAttentionSpec(FullAttentionSpec):
             num_kv_heads=specs[0].num_kv_heads,
             head_size=specs[0].head_size,
             dtype=specs[0].dtype,
+            page_size_padded=specs[0].page_size_padded,
             cache_dtype_str=cache_dtype_str_set.pop(),
         )
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class ChunkedLocalAttentionSpec(AttentionSpec):
     attention_chunk_size: int
 
@@ -214,7 +244,7 @@ class ChunkedLocalAttentionSpec(AttentionSpec):
         return cdiv(num_tokens, self.block_size) * self.page_size_bytes
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class SlidingWindowSpec(AttentionSpec):
     sliding_window: int
 
@@ -284,6 +314,57 @@ class CrossAttentionSpec(AttentionSpec):
         return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes
 
 
+@dataclass(frozen=True)
+class SinkFullAttentionSpec(FullAttentionSpec):
+    sink_len: int | None = None
+
+    @classmethod
+    def merge(cls, specs: list[Self]) -> Self:
+        """
+        Merge a list of FullAttentionSpec objects into a single
+        FullAttentionSpec object.
+        """
+        assert all(isinstance(spec, FullAttentionSpec) for spec in specs), (
+            "All attention layers in the same KV cache group must be FullAttentionSpec."
+        )
+
+        sliding_window = set(
+            spec.sliding_window for spec in specs if spec.sliding_window is not None
+        )
+        attention_chunk_size = set(
+            spec.attention_chunk_size
+            for spec in specs
+            if spec.attention_chunk_size is not None
+        )
+        assert not any(isinstance(spec, MLAAttentionSpec) for spec in specs), (
+            "MLAAttentionSpec should be merged in MLAAttentionSpec.merge"
+        )
+        merged_spec = cls(
+            block_size=specs[0].block_size,
+            num_kv_heads=specs[0].num_kv_heads,
+            head_size=specs[0].head_size,
+            head_size_v=specs[0].head_size_v,
+            sink_len=specs[0].sink_len,
+            dtype=specs[0].dtype,
+            page_size_padded=specs[0].page_size_padded,
+            sliding_window=cls.merge_window_sizes(sliding_window),
+            attention_chunk_size=cls.merge_window_sizes(attention_chunk_size),
+        )
+        for spec in specs:
+            for f in fields(AttentionSpec):
+                assert getattr(spec, f.name) == getattr(merged_spec, f.name), (
+                    "All attention layers in the same KV cache group must have "
+                    "the same attention spec."
+                )
+        assert (merged_spec.sliding_window is not None) + (
+            merged_spec.attention_chunk_size is not None
+        ) <= 1, (
+            "Model with both sliding window layers and chunked local attention "
+            "layers is not supported."
+        )
+        return merged_spec
+
+
 @dataclass(frozen=True)
 class UniformTypeKVCacheSpecs(KVCacheSpec):
     """
@@ -390,10 +471,11 @@ class KVCacheConfig:
     The KV cache configuration of a model.
     """
 
-    """The number of KV cache blocks"""
     num_blocks: int
-    """How should model runner initialize the KV cache tensors for each layer"""
+    """The number of KV cache blocks"""
     kv_cache_tensors: list[KVCacheTensor]
+    """How should model runner initialize the KV cache tensors for each layer"""
+    kv_cache_groups: list[KVCacheGroupSpec]
     """
     The kv cache groups of the model.
     For models with only one type of attention, there is only one group that
@@ -401,4 +483,3 @@ class KVCacheConfig:
     For models with multiple types of attention, there will be multiple groups,
     see `_get_kv_cache_config_uniform_page_size` for more details.
     """
-    kv_cache_groups: list[KVCacheGroupSpec]
diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py
index e1cf7b14a785c4e07c40105434f8e0f4262eb3fa..d07ef8ad0d484cefb1432997e99539f9e3cda3d9 100644
--- a/vllm/v1/kv_offload/cpu.py
+++ b/vllm/v1/kv_offload/cpu.py
@@ -4,9 +4,10 @@ from collections.abc import Iterator
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
 from vllm.platforms import current_platform
+from vllm.v1.attention.backend import AttentionBackend
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
 from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager
 from vllm.v1.kv_offload.backends.cpu import CPUBackend
@@ -18,15 +19,37 @@ from vllm.v1.kv_offload.worker.worker import OffloadingHandler
 
 
 class CPUOffloadingSpec(OffloadingSpec):
-    def __init__(self, vllm_config: VllmConfig):
-        super().__init__(vllm_config)
+    def __init__(self, vllm_config: VllmConfig, kv_cache_config: KVCacheConfig):
+        super().__init__(vllm_config, kv_cache_config)
 
-        num_cpu_blocks = self.extra_config.get("num_cpu_blocks")
-        if not num_cpu_blocks:
+        cpu_bytes_to_use = self.extra_config.get("cpu_bytes_to_use")
+        if not cpu_bytes_to_use:
             raise Exception(
-                "num_cpu_blocks must be specified in kv_connector_extra_config"
+                "cpu_bytes_to_use must be specified in kv_connector_extra_config"
             )
-        self.num_cpu_blocks: int = num_cpu_blocks
+
+        # calculate kv_bytes_per_offloaded_block
+        assert kv_cache_config is not None
+        page_sizes = {
+            kv_cache_group.kv_cache_spec.page_size_bytes
+            for kv_cache_group in kv_cache_config.kv_cache_groups
+        }
+        assert len(page_sizes) == 1
+        page_size_bytes = page_sizes.pop()
+        kv_bytes_per_block = (
+            page_size_bytes
+            * len(kv_cache_config.kv_cache_tensors)
+            * vllm_config.parallel_config.world_size
+        )
+        kv_bytes_per_offloaded_block = kv_bytes_per_block * (
+            self.offloaded_block_size // self.gpu_block_size
+        )
+
+        self.num_blocks = (
+            int(cpu_bytes_to_use) // kv_bytes_per_offloaded_block
+            if kv_bytes_per_offloaded_block > 0
+            else 0
+        )
 
         # scheduler-side
         self._manager: OffloadingManager | None = None
@@ -44,7 +67,7 @@ class CPUOffloadingSpec(OffloadingSpec):
             )
 
             backend = CPUBackend(
-                block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks
+                block_size=self.offloaded_block_size, num_blocks=self.num_blocks
             )
 
             if self.eviction_policy == "lru":
@@ -77,7 +100,7 @@ class CPUOffloadingSpec(OffloadingSpec):
                 attn_backends=attn_backends,
                 gpu_block_size=self.gpu_block_size,
                 cpu_block_size=self.offloaded_block_size,
-                num_cpu_blocks=self.num_cpu_blocks,
+                num_cpu_blocks=self.num_blocks,
                 gpu_caches=kv_caches,
             )
 
diff --git a/vllm/v1/kv_offload/factory.py b/vllm/v1/kv_offload/factory.py
index b4d40cb48e1d18b0f192d55e7617e76c92e6dfe3..8fe018b89908ef1ae4807a49a9c6e2925ca5f5b3 100644
--- a/vllm/v1/kv_offload/factory.py
+++ b/vllm/v1/kv_offload/factory.py
@@ -9,6 +9,7 @@ from vllm.v1.kv_offload.spec import OffloadingSpec
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
 
 logger = init_logger(__name__)
 
@@ -32,6 +33,7 @@ class OffloadingSpecFactory:
     def create_spec(
         cls,
         config: "VllmConfig",
+        kv_cache_config: "KVCacheConfig | None",
     ) -> OffloadingSpec:
         kv_transfer_config = config.kv_transfer_config
         assert kv_transfer_config is not None
@@ -47,7 +49,7 @@ class OffloadingSpecFactory:
             spec_cls = getattr(spec_module, spec_name)
         assert issubclass(spec_cls, OffloadingSpec)
         logger.info("Creating offloading spec with name: %s", spec_name)
-        return spec_cls(config)
+        return spec_cls(config, kv_cache_config)
 
 
 # Register various specs here.
diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py
index 2cdd5ba5ffe5c58ed5b2b5c3cc8b9e65e33f045b..1d41ea71f46be5335c3011799e3f345542675e1a 100644
--- a/vllm/v1/kv_offload/spec.py
+++ b/vllm/v1/kv_offload/spec.py
@@ -6,13 +6,14 @@ from typing import TYPE_CHECKING
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
 
 logger = init_logger(__name__)
 
@@ -20,12 +21,15 @@ logger = init_logger(__name__)
 class OffloadingSpec(ABC):
     """Spec for an offloading connector"""
 
-    def __init__(self, vllm_config: "VllmConfig"):
+    def __init__(
+        self, vllm_config: "VllmConfig", kv_cache_config: "KVCacheConfig | None"
+    ):
         logger.warning(
             "Initializing OffloadingSpec. This API is experimental and "
             "subject to change in the future as we iterate the design."
         )
         self.vllm_config = vllm_config
+        self.kv_cache_config = kv_cache_config
 
         kv_transfer_config = vllm_config.kv_transfer_config
         assert kv_transfer_config is not None
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py
index 42ae4f1413ad0ccbf377560dcc1d72064c708600..c18c4a411cc91f124a7a6de902d5bde4506c4bd0 100644
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -6,9 +6,9 @@ import numpy as np
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.kv_offload.mediums import BlockIDsLoadStoreSpec
 from vllm.v1.kv_offload.worker.worker import (
     OffloadingHandler,
@@ -68,7 +68,6 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
         kv_dim_before_num_blocks: list[bool],
         src_block_size_factor: int,
         dst_block_size_factor: int,
-        priority: int,
     ):
         """
         Initialize a SingleDirectionOffloadingHandler.
@@ -85,8 +84,6 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
                 per KV block in a source tensor.
             dst_block_size_factor: The number of kernel blocks
                 per KV block in a destination tensor.
-            priority: The priority of the backing CUDA streams.
-                Lower numbers indicate higher priority.
         """
         assert len(src_tensors) == len(dst_tensors) == len(kv_dim_before_num_blocks)
 
@@ -95,8 +92,12 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
         self.kv_dim_before_num_blocks: list[bool] = kv_dim_before_num_blocks
         self.src_block_size_factor: int = src_block_size_factor
         self.dst_block_size_factor: int = dst_block_size_factor
-        self.priority = priority
 
+        assert len(src_tensors) > 0
+        self.gpu_to_cpu: bool = self.src_tensors[0].is_cuda
+
+        # job_id -> event
+        self._transfer_events: dict[int, torch.Event] = {}
         # queue of transfers (job_id, stream, event)
         self._transfers: deque[tuple[int, torch.cuda.Stream, torch.Event]] = deque()
         # list of CUDA streams available for re-use
@@ -130,12 +131,12 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
         expand_block_ids(dst_blocks, self.dst_block_size_factor, src_to_dst[:, 1])
         src_to_dst_tensor = torch.from_numpy(src_to_dst)
 
-        stream = (
-            self._stream_pool.pop()
-            if self._stream_pool
-            else torch.cuda.Stream(priority=self.priority)
-        )
+        stream = self._stream_pool.pop() if self._stream_pool else torch.cuda.Stream()
         event = self._event_pool.pop() if self._event_pool else torch.Event()
+
+        if self.gpu_to_cpu:
+            # wait for model computation to finish before offloading
+            stream.wait_stream(torch.cuda.current_stream())
         if self._transfers:
             _, _, last_event = self._transfers[-1]
             # assure job will start only after the previous one completes
@@ -153,6 +154,7 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
                     ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
             event.record(stream)
 
+        self._transfer_events[job_id] = event
         self._transfers.append((job_id, stream, event))
 
         # success
@@ -165,8 +167,15 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
             results.append((job_id, True))
             self._stream_pool.append(stream)
             self._event_pool.append(event)
+            del self._transfer_events[job_id]
         return results
 
+    def wait(self, job_ids: set[int]):
+        for job_id in job_ids:
+            event = self._transfer_events.get(job_id)
+            if event is not None:
+                event.synchronize()
+
 
 class CpuGpuOffloadingHandlers:
     def __init__(
@@ -267,7 +276,6 @@ class CpuGpuOffloadingHandlers:
             kv_dim_before_num_blocks=kv_dim_before_num_blocks,
             src_block_size_factor=gpu_block_size_factor,
             dst_block_size_factor=cpu_block_size_factor,
-            priority=1,
         )
 
         self.cpu_to_gpu_handler = SingleDirectionOffloadingHandler(
@@ -276,5 +284,4 @@ class CpuGpuOffloadingHandlers:
             kv_dim_before_num_blocks=kv_dim_before_num_blocks,
             src_block_size_factor=cpu_block_size_factor,
             dst_block_size_factor=gpu_block_size_factor,
-            priority=-1,
         )
diff --git a/vllm/v1/kv_offload/worker/worker.py b/vllm/v1/kv_offload/worker/worker.py
index 58ba082497fa88b1955e4b64457fddf5e80bef9f..d332f2e156a8cf6386afdfeca3a3a538bb5108ce 100644
--- a/vllm/v1/kv_offload/worker/worker.py
+++ b/vllm/v1/kv_offload/worker/worker.py
@@ -53,6 +53,15 @@ class OffloadingHandler(ABC):
         """
         pass
 
+    @abstractmethod
+    def wait(self, job_ids: set[int]) -> None:
+        """
+        Wait for jobs to finish (blocking).
+
+        Args:
+            job_ids: The set of job IDs to wait for.
+        """
+
 
 class OffloadingWorker:
     """
@@ -142,3 +151,13 @@ class OffloadingWorker:
         for handler in self.handlers:
             finished.extend(handler.get_finished())
         return finished
+
+    def wait(self, job_ids: set[int]) -> None:
+        """
+        Wait for jobs to finish (blocking).
+
+        Args:
+            job_ids: The set of job IDs to wait for.
+        """
+        for handler in self.handlers:
+            handler.wait(job_ids)
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 9eaee1bb97bb921a84314c9b3706e0d1cecf60c2..2213b952c7a89cd03daa248ef90861359ae6851d 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -19,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
 from vllm.logger import init_logger
 from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
 from vllm.v1.engine import FinishReason
+from vllm.v1.metrics.perf import PerfMetricsLogging
 from vllm.v1.metrics.prometheus import unregister_vllm_metrics
 from vllm.v1.metrics.stats import (
     CachingMetrics,
@@ -118,6 +119,9 @@ class LoggingStatLogger(StatLoggerBase):
         self.engine_is_idle = False
         self.aggregated = False
 
+        if self._enable_perf_stats():
+            self.perf_metrics_logging = PerfMetricsLogging(vllm_config)
+
     def _reset(self, now):
         self.last_log_time = now
 
@@ -127,6 +131,9 @@ class LoggingStatLogger(StatLoggerBase):
         self.num_corrupted_reqs: int = 0
         self.num_preemptions: int = 0
 
+    def _enable_perf_stats(self) -> bool:
+        return self.vllm_config.observability_config.enable_mfu_metrics
+
     def _track_iteration_stats(self, iteration_stats: IterationStats):
         # Save tracked stats for token counters.
         self.num_prompt_tokens += iteration_stats.num_prompt_tokens
@@ -175,6 +182,8 @@ class LoggingStatLogger(StatLoggerBase):
                 self.cudagraph_logging.observe(scheduler_stats.cudagraph_stats)
             if not self.aggregated:
                 self.last_scheduler_stats = scheduler_stats
+            if (perf_stats := scheduler_stats.perf_stats) and self._enable_perf_stats():
+                self.perf_metrics_logging.observe(perf_stats)
         if mm_cache_stats:
             self.mm_caching_metrics.observe(mm_cache_stats)
 
@@ -211,7 +220,7 @@ class LoggingStatLogger(StatLoggerBase):
             "Running: %d reqs",
             "Waiting: %d reqs",
         ]
-        log_args = [
+        log_args: list[int | float | str] = [
             self.last_prompt_throughput,
             self.last_generation_throughput,
             self.last_scheduler_stats.num_running_reqs,
@@ -254,6 +263,8 @@ class LoggingStatLogger(StatLoggerBase):
         self.kv_connector_logging.log(log_fn=log_fn)
         if self.cudagraph_logging is not None:
             self.cudagraph_logging.log(log_fn=log_fn)
+        if self._enable_perf_stats():
+            self.perf_metrics_logging.log(log_fn=log_fn, log_prefix=self.log_prefix)
 
     def log_engine_initialized(self):
         if self.vllm_config.cache_config.num_gpu_blocks:
@@ -282,6 +293,10 @@ class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase):
     def log_prefix(self):
         return "{} Engines Aggregated: ".format(len(self.engine_indexes))
 
+    def _enable_perf_stats(self) -> bool:
+        # Adding per_gpu perf stats across engines can lead to misleading numbers.
+        return False
+
     def record(
         self,
         scheduler_stats: SchedulerStats | None,
diff --git a/vllm/v1/metrics/perf.py b/vllm/v1/metrics/perf.py
new file mode 100644
index 0000000000000000000000000000000000000000..446a81fc4855decb55e99a10413af3357c7bb041
--- /dev/null
+++ b/vllm/v1/metrics/perf.py
@@ -0,0 +1,1244 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Analytic flops/memory estimation module for transformer components,
+to help derive MFU (Model Flops Utilization) stats for a running model.
+"""
+
+import json
+import time
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from dataclasses import asdict, dataclass
+from typing import Any, Protocol
+
+import torch
+from pydantic import BaseModel, Field, ValidationError, model_validator
+from typing_extensions import Self
+
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.utils.torch_utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    get_dtype_size,
+    get_kv_cache_torch_dtype,
+)
+from vllm.v1.core.sched.output import SchedulerOutput
+
+logger = init_logger(__name__)
+
+
+class InvalidComponent(Exception):
+    """
+    Custom exception to indicate that a certain ComponentMetric is not
+    applicable to the given VllmConfig.
+    """
+
+    pass
+
+
+#### Basic Data Types ####
+
+
+@dataclass
+class DebugPerfStats:
+    ## Stats for debugging the metrics calculation
+    calc_duration: float = 0.0  # time spent calculating these stats
+    num_prefill_requests: int = 0
+    num_decode_requests: int = 0
+    context_breakdown: dict[str, int] | None = None
+    num_flops_per_gpu_breakdown: dict[str, int] | None = None
+    num_read_bytes_per_gpu_breakdown: dict[str, int] | None = None
+    num_write_bytes_per_gpu_breakdown: dict[str, int] | None = None
+
+
+@dataclass
+class PerfStats:
+    num_flops_per_gpu: int = 0
+    num_read_bytes_per_gpu: int = 0
+    num_write_bytes_per_gpu: int = 0
+    debug_stats: DebugPerfStats | None = None
+
+
+@dataclass
+class ExecutionContext:
+    """
+    Represents an execution context for a batch of requests.
+
+    This class aggregates statistics across multiple requests in a batch,
+    separately tracking prefill and decode phases.
+
+    Example)
+    - Batch with one full prefill (2048 tokens) and one decode (1 token, 8192 context):
+      ctx = ExecutionContext()
+      ctx.add(2048, 2048, is_prefill=True)
+      ctx.add(1, 8192, is_prefill=False)
+    """
+
+    # Prefill phase statistics
+    num_prefill_requests: int = 0
+    prefill_num_tokens: int = 0  # sum of num_tokens for prefill requests
+    prefill_context_len: int = 0  # sum of context_len for prefill requests
+    prefill_token_context_product: int = 0  # sum of (num_tokens * context_len)
+
+    # Decode phase statistics
+    num_decode_requests: int = 0
+    decode_num_tokens: int = 0  # sum of num_tokens for decode requests
+    decode_context_len: int = 0  # sum of context_len for decode requests
+    decode_token_context_product: int = 0  # sum of (num_tokens * context_len)
+
+    def add(self, num_tokens: int, context_len: int, is_prefill: bool) -> None:
+        """Add a single request's statistics to this batch context."""
+        if is_prefill:
+            self.num_prefill_requests += 1
+            self.prefill_num_tokens += num_tokens
+            self.prefill_context_len += context_len
+            self.prefill_token_context_product += num_tokens * context_len
+        else:
+            self.num_decode_requests += 1
+            self.decode_num_tokens += num_tokens
+            self.decode_context_len += context_len
+            self.decode_token_context_product += num_tokens * context_len
+
+    def total_num_tokens(self) -> int:
+        """Total number of tokens across all requests in the batch."""
+        return self.prefill_num_tokens + self.decode_num_tokens
+
+    def total_token_context_product(self) -> int:
+        """Total sum of (num_tokens * context_len) across all requests."""
+        return self.prefill_token_context_product + self.decode_token_context_product
+
+    @classmethod
+    def from_single_request(
+        cls, num_tokens: int, context_len: int, is_prefill: bool
+    ) -> "ExecutionContext":
+        """Create an ExecutionContext from a single request.
+
+        This is a convenience method primarily for testing.
+        """
+        ctx = cls()
+        ctx.add(num_tokens, context_len, is_prefill)
+        return ctx
+
+
+class ParsedArgs:
+    """
+    Syntactic sugar so that Parsers can use dot notations
+    to access/update the parsed arguments.
+
+    e.g.)
+        args = ParsedArgs()
+        args.x = 3
+        args.y = args.x + 1
+    """
+
+    def __getattr__(self, name: str) -> Any:
+        raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        object.__setattr__(self, name, value)
+
+    def model_dump(self) -> dict[str, Any]:
+        return vars(self).copy()
+
+
+#### Abstract ####
+
+
+class Parser(Protocol):
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        """
+        Parse the vllm config and update the current ParsedArgs and pass it on.
+        If the parser isn't applicable to the vllm_config, it will do nothing.
+        """
+        ...
+
+
+class ParserChain:
+    """
+    Applies chain of parser in a sequential order.
+    Later parsers might overwrite results from previous parsers,
+    so parsers should be chained in the appropriate order if they
+    are not mutually exclusive.
+    """
+
+    def __init__(self, *parsers: Parser) -> None:
+        self.parsers = list(parsers)
+
+    def add_parser(self, parser: Parser) -> None:
+        self.parsers.append(parser)
+
+    def parse(self, vllm_config: VllmConfig) -> ParsedArgs:
+        args = ParsedArgs()
+        for parser in self.parsers:
+            args = parser.parse(args, vllm_config)
+        return args
+
+
+_COMPONENT_METRICS_REGISTRY: dict[str, type["ComponentMetrics"]] = {}
+
+
+class ComponentMetrics(BaseModel, ABC):
+    """
+    Each concrete ComponentMetrics class is associated with:
+    - fields that are required for metric derivation
+      (fields are specified/validated through pydantic model)
+    - parser to parse VllmConfig into fields
+    - metric methods that derive flops/bytes for a given execution context
+    """
+
+    @classmethod
+    @abstractmethod
+    def component_type(cls) -> str: ...
+
+    @classmethod
+    @abstractmethod
+    def get_parser(cls) -> ParserChain:
+        """
+        Return a ParserChain that provides values for all required fields.
+        The returned parser chain must populate ParsedArgs with values for every
+        field defined on this ComponentMetrics class. Missing fields will cause
+        a ValidationError when from_vllm_config() is called.
+        See individual Parser docstrings for which args they provide, and field
+        comments on ComponentMetrics subclasses for which parser provides each field.
+        """
+        ...
+
+    def __init_subclass__(cls):
+        _COMPONENT_METRICS_REGISTRY[cls.component_type()] = cls
+
+    @classmethod
+    def from_vllm_config(cls, vllm_config: VllmConfig) -> Self:
+        """
+        Instantiate this class from VllmConfig.
+        Raises ValidationError if parsing fails.
+        """
+
+        parser = cls.get_parser()
+        parsed_args = parser.parse(vllm_config)
+        try:
+            return cls.model_validate(parsed_args.model_dump())
+        except ValidationError as e:
+            raise InvalidComponent(f"Invalid {cls.component_type()} config: {e}") from e
+
+    @classmethod
+    def registered_metrics(cls) -> Iterable[type["ComponentMetrics"]]:
+        return iter(_COMPONENT_METRICS_REGISTRY.values())
+
+    @abstractmethod
+    def get_num_flops_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]: ...
+
+    @abstractmethod
+    def get_read_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]: ...
+
+    @abstractmethod
+    def get_write_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]: ...
+
+    def get_num_flops(self, ctx: ExecutionContext, per_gpu: bool = True) -> int:
+        return sum(self.get_num_flops_breakdown(ctx, per_gpu).values())
+
+    def get_read_bytes(self, ctx: ExecutionContext, per_gpu: bool = True) -> int:
+        return sum(self.get_read_bytes_breakdown(ctx, per_gpu).values())
+
+    def get_write_bytes(self, ctx: ExecutionContext, per_gpu: bool = True) -> int:
+        return sum(self.get_write_bytes_breakdown(ctx, per_gpu).values())
+
+
+#### parsers ####
+
+
+class BaseConfigParser(Parser):
+    """
+    Parses base model configuration.
+    Provides: vocab_size, hidden_size, num_attention_heads, num_hidden_layers,
+    weight_byte_size, activation_byte_size, dp_size, tp_size, pp_size, enable_ep
+    """
+
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        model_config = vllm_config.model_config
+
+        args.vocab_size = model_config.get_vocab_size()
+        args.hidden_size = model_config.get_hidden_size()
+        # NOTE: model_config.get_attention_heads() divide by TP
+        # so we access field manually here to get total num_heads
+        args.num_attention_heads = get_required(
+            model_config.hf_text_config, "num_attention_heads"
+        )
+        args.num_hidden_layers = get_required(
+            model_config.hf_text_config, "num_hidden_layers"
+        )
+
+        model_dtype = vllm_config.model_config.dtype
+
+        if isinstance(model_dtype, torch.dtype):
+            torch_dtype = model_dtype
+        elif isinstance(model_dtype, str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE:
+            torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
+        else:
+            # FIXME: handle this better
+            logger.warning(
+                "Unknown model_dtype %s, defaulting to bfloat16",
+                model_dtype,
+            )
+            torch_dtype = torch.bfloat16
+
+        args.weight_byte_size = get_dtype_size(torch_dtype)
+
+        # FIXME: handle this better by parsing whether activations use
+        # bf16, fp32, etc...
+        args.activation_byte_size = 2
+
+        args.dp_size = vllm_config.parallel_config.data_parallel_size
+        args.tp_size = vllm_config.parallel_config.tensor_parallel_size
+        args.pp_size = vllm_config.parallel_config.pipeline_parallel_size
+        args.enable_ep = vllm_config.parallel_config.enable_expert_parallel
+
+        return args
+
+
+#### Attention ####
+
+
+class BaseAttentionConfigParser(Parser):
+    """
+    Parses attention-specific configuration.
+    Provides: num_key_value_heads, head_dim, cache_byte_size
+    """
+
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        model_config = vllm_config.model_config
+
+        args.num_key_value_heads = model_config.get_total_num_kv_heads()
+        args.head_dim = model_config.get_head_size()
+
+        model_dtype = vllm_config.model_config.dtype
+        cache_dtype = vllm_config.cache_config.cache_dtype
+
+        kv_cache_torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+        args.cache_byte_size = get_dtype_size(kv_cache_torch_dtype)
+
+        return args
+
+
+class AttentionQuantizationConfigParser(Parser):
+    """
+    Parses quantization configuration for attention layers.
+    Overrides: weight_byte_size
+    """
+
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        cfg = vllm_config.quant_config
+
+        if cfg is None:
+            return args
+
+        quant_method = cfg.get_name()
+        if quant_method in ["fp8", "fbgemm_fp8"]:
+            # FIXME: This is a hacky coarse-grained fp8 quantization detection.
+            # FIXME: These configs also have concept of "ignored layers" and we
+            # need to solve the same problem as above.
+            args.weight_byte_size = 1
+        elif quant_method == "mxfp4":
+            # FIXME: Also has "ignored layers" issue above
+            args.weight_byte_size = 0.5
+        else:
+            # FIXME: Add more parsing logic for different quant methods.
+            raise InvalidComponent
+
+        return args
+
+
+class AttentionMetrics(ComponentMetrics):
+    # From BaseConfigParser
+    num_hidden_layers: int = Field(..., gt=0)
+    hidden_size: int = Field(..., gt=0)
+    num_attention_heads: int = Field(..., gt=0)
+    activation_byte_size: int = Field(..., gt=0)
+    tp_size: int = Field(..., gt=0)
+    pp_size: int = Field(..., gt=0)
+
+    # From BaseAttentionConfigParser
+    num_key_value_heads: int = Field(..., gt=0)
+    head_dim: int = Field(..., gt=0)
+    cache_byte_size: int = Field(..., gt=0)
+
+    # From BaseConfig Parser, overridden by AttentionQuantizationConfigParser
+    weight_byte_size: int | float = Field(..., gt=0)
+
+    # TODO: discern cases where we have mixture of different attention layer types
+    # such as SWA, MLA, etc.
+
+    @classmethod
+    def component_type(cls) -> str:
+        return "attn"
+
+    @classmethod
+    def get_parser(cls) -> ParserChain:
+        return ParserChain(
+            BaseConfigParser(),
+            BaseAttentionConfigParser(),
+            AttentionQuantizationConfigParser(),
+        )
+
+    def get_num_flops_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        L, D, q, kv, d = (
+            self.num_hidden_layers,
+            self.hidden_size,
+            self.num_attention_heads,
+            self.num_key_value_heads,
+            self.head_dim,
+        )
+        T = ctx.total_num_tokens()
+        TC = ctx.total_token_context_product()
+
+        if per_gpu:
+            L //= self.pp_size
+            # tensor parallel along heads
+            q = max(1, q // self.tp_size)
+            kv = max(1, kv // self.tp_size)
+
+        return {
+            "qkv_proj": 2 * T * D * (q + 2 * kv) * d * L,
+            "attn_qk": 2 * q * TC * d * L,
+            "attn_av": 2 * q * TC * d * L,
+            "out_proj": 2 * T * D * q * d * L,
+        }
+
+    def get_read_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        L, D, q, kv, d = (
+            self.num_hidden_layers,
+            self.hidden_size,
+            self.num_attention_heads,
+            self.num_key_value_heads,
+            self.head_dim,
+        )
+        T = ctx.total_num_tokens()
+
+        if per_gpu:
+            L //= self.pp_size
+            # tensor parallel along heads
+            q = max(1, q // self.tp_size)
+            kv = max(1, kv // self.tp_size)
+
+        read_bytes = {}
+
+        read_bytes["qkv_input"] = T * D * self.activation_byte_size * L
+        read_bytes["qkv_weight"] = int(D * (q + 2 * kv) * d * self.weight_byte_size * L)
+
+        # Attention input reads differ between prefill and decode
+        # Prefill: read Q, K, V activations (all in activation_byte_size)
+        if ctx.prefill_num_tokens > 0:
+            read_bytes["attn_input"] = (
+                (ctx.prefill_num_tokens * q + 2 * ctx.prefill_context_len * kv)
+                * d
+                * self.activation_byte_size
+                * L
+            )
+
+        # Decode: read Q activations + read K, V from cache (in cache_byte_size)
+        if ctx.decode_num_tokens > 0:
+            read_bytes["attn_input"] = read_bytes.get("attn_input", 0) + (
+                ctx.decode_num_tokens * q * d * self.activation_byte_size * L
+                + 2 * ctx.decode_context_len * kv * d * self.cache_byte_size * L
+            )
+
+        read_bytes["out_input"] = T * q * d * self.activation_byte_size * L
+        read_bytes["out_weight"] = int(q * d * D * self.weight_byte_size * L)
+
+        return read_bytes
+
+    def get_write_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        """Calculate write memory traffic for attention layers."""
+        L, D, q, kv, d = (
+            self.num_hidden_layers,
+            self.hidden_size,
+            self.num_attention_heads,
+            self.num_key_value_heads,
+            self.head_dim,
+        )
+        T = ctx.total_num_tokens()
+
+        if per_gpu:
+            L //= self.pp_size
+            # tensor parallel along heads
+            q = max(1, q // self.tp_size)
+            kv = max(1, kv // self.tp_size)
+
+        return {
+            "qkv_output": T * (q + 2 * kv) * d * self.activation_byte_size * L,
+            "kv_cache": 2 * T * kv * d * self.cache_byte_size * L,
+            "out_output": T * D * self.activation_byte_size * L,
+        }
+
+
+#### Ffn ####
+
+
+class BaseFfnConfigParser(Parser):
+    """
+    Parses FFN and MoE configuration.
+    Provides: intermediate_size, num_experts, num_experts_per_tok,
+    moe_intermediate_size, num_shared_experts, num_moe_layers
+    """
+
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        cfg = vllm_config.model_config.hf_config
+        if hasattr(cfg, "text_config") and cfg.text_config is not None:
+            cfg = cfg.text_config
+
+        args.intermediate_size = getattr(cfg, "intermediate_size", args.hidden_size * 4)
+
+        # Try different naming conventions.
+        args.num_experts = vllm_config.model_config.get_num_experts()
+        args.num_experts_per_tok = getattr_from_list(
+            cfg, ["num_experts_per_tok", "moe_topk"], 0
+        )
+        args.moe_intermediate_size = getattr_from_list(
+            cfg, ["moe_intermediate_size", "intermediate_size"], 0
+        )
+        args.num_shared_experts = getattr_from_list(
+            cfg, ["n_shared_experts", "num_shared_experts"], 0
+        )
+
+        is_moe = args.num_experts != 0
+        # Assume all MoE layers by default
+        args.num_moe_layers = args.num_hidden_layers if is_moe else 0
+
+        return args
+
+
+class FfnParallelParser(Parser):
+    """
+    Parses FFN parallelism configuration.
+
+    Provides: ffn_tp_size, ffn_ep_size
+    """
+
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        # NOTE: ffn tp_size does not equal the tp_size parameter directly.
+        # e.g.) If we use DP2TP4, ffn will use TP8 (or EP8 if EP is enabled.)
+        if args.enable_ep:
+            ffn_tp_size, ffn_ep_size = 1, args.dp_size * args.tp_size
+        else:
+            ffn_tp_size, ffn_ep_size = args.dp_size * args.tp_size, 1
+
+        args.ffn_tp_size = ffn_tp_size
+        args.ffn_ep_size = ffn_ep_size
+
+        return args
+
+
+class InterleaveMoeLayerStepParser(Parser):
+    """
+    Parses interleave_moe_layer_step field for models like Llama4.
+
+    Overrides: num_moe_layers
+    """
+
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        cfg = vllm_config.model_config.hf_config
+        if hasattr(cfg, "text_config") and cfg.text_config is not None:
+            cfg = cfg.text_config
+
+        if (
+            hasattr(cfg, "interleave_moe_layer_step")
+            and cfg.interleave_moe_layer_step > 0
+        ):
+            args.num_moe_layers = len(
+                [
+                    layer
+                    for layer in range(args.num_hidden_layers)
+                    if (layer + 1) % cfg.interleave_moe_layer_step == 0
+                ]
+            )
+
+        return args
+
+
+class MoeLayerFreqParser(Parser):
+    """
+    Parses moe_layer_freq and first_k_dense_replace fields for models like Deepseek.
+
+    Overrides: num_moe_layers
+    """
+
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        cfg = vllm_config.model_config.hf_config
+        if hasattr(cfg, "text_config") and cfg.text_config is not None:
+            cfg = cfg.text_config
+
+        if hasattr(cfg, "moe_layer_freq") and hasattr(cfg, "first_k_dense_replace"):
+            args.num_moe_layers = len(
+                [
+                    layer
+                    for layer in range(args.num_hidden_layers)
+                    if layer >= cfg.first_k_dense_replace
+                    and layer % cfg.moe_layer_freq == 0
+                ]
+            )
+
+        return args
+
+
+class FfnQuantizationConfigParser(Parser):
+    """
+    Parses quantization configuration for FFN layers.
+
+    Overrides: weight_byte_size
+    """
+
+    def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs:
+        cfg = vllm_config.quant_config
+
+        if cfg is None:
+            return args
+
+        quant_method = cfg.get_name()
+        if quant_method in ["fp8", "fbgemm_fp8"]:
+            # FIXME: This is a hacky coarse-grained fp8 quantization detection.
+            # (there might be more quantization methods for fp8).
+            # FIXME: These configs also have concept of "ignored layers" and we
+            # need to solve the same problem as above.
+            args.weight_byte_size = 1
+            pass
+        elif quant_method == "mxfp4":
+            # FIXME: Also has "ignored layers" issue above
+            args.weight_byte_size = 0.5
+        else:
+            # FIXME: Add more parsing logic for different quant methods.
+            raise InvalidComponent
+
+        return args
+
+
+class FfnMetrics(ComponentMetrics):
+    # From BaseConfigParser
+    num_hidden_layers: int = Field(..., gt=0)
+    hidden_size: int = Field(..., gt=0)
+    activation_byte_size: int = Field(..., gt=0)
+    pp_size: int = Field(..., gt=0)
+
+    # From FfnParallelParser
+    ffn_tp_size: int = Field(..., gt=0)
+    ffn_ep_size: int = Field(..., gt=0)
+
+    # From BaseFfnConfigParser
+    intermediate_size: int = Field(..., gt=0)
+    num_experts: int = Field(0)
+    num_experts_per_tok: int = Field(1)
+    moe_intermediate_size: int = Field(0)
+    num_shared_experts: int = Field(0)
+
+    # From BaseConfigParser, can be overridden InterleaveMoeLayerStep or MoeLayerFreq
+    num_moe_layers: int = Field(..., ge=0)
+
+    # FIXME: might have to make this more granular
+    # (i.e. dense_weight_byte_size, moe_routed_weight_byte_size,
+    # moe_shared_weight_byte_size)
+    # since it can differ from byte size of other components (e.g. attn)
+    # and can differ even from each other.
+
+    # From BaseConfigParser, can be overridden by FfnQuantizationConfigParser
+    weight_byte_size: int | float = Field(..., gt=0)
+
+    @model_validator(mode="after")
+    def validate_moe_fields(self) -> Self:
+        """Validate that MoE-related fields are properly set when num_moe_layers > 0."""
+        if self.num_moe_layers > 0:
+            assert self.num_experts, f"{self.num_experts=}"
+            assert self.num_experts_per_tok, f"{self.num_experts_per_tok=}"
+            assert self.moe_intermediate_size, f"{self.moe_intermediate_size=}"
+        return self
+
+    @classmethod
+    def component_type(cls) -> str:
+        return "ffn"
+
+    @classmethod
+    def get_parser(cls) -> ParserChain:
+        return ParserChain(
+            BaseConfigParser(),
+            FfnParallelParser(),
+            BaseFfnConfigParser(),
+            InterleaveMoeLayerStepParser(),
+            MoeLayerFreqParser(),
+            FfnQuantizationConfigParser(),
+        )
+
+    def get_num_flops_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        """Calculate flops breakdown for FFN layers."""
+        L, D, DI = self.num_hidden_layers, self.hidden_size, self.intermediate_size
+        Lm, E, MI, S = (
+            self.num_moe_layers,
+            self.num_experts_per_tok,
+            self.moe_intermediate_size,
+            self.num_shared_experts,
+        )
+        T = ctx.total_num_tokens()
+
+        Ld = L - Lm
+
+        num_activated_tokens = T * E if E else 0
+
+        if per_gpu:
+            Ld //= self.pp_size
+            Lm //= self.pp_size
+
+            DI //= self.ffn_tp_size
+            if MI is not None:
+                MI //= self.ffn_tp_size
+            if E:
+                num_activated_tokens //= self.ffn_ep_size
+
+        flops = {}
+
+        # Dense FFN layers (SwiGLU: 3 linear layers: up, gate, down)
+        if Ld:
+            flops["dense_ffn"] = 2 * D * 3 * DI * T * Ld
+
+        # MoE routed experts (each token activates E experts)
+        if Lm and E:
+            flops["routed_ffn"] = 2 * D * 3 * MI * num_activated_tokens * Lm
+
+        # MoE shared experts (all S shared experts run for every token)
+        if Lm and S:
+            flops["shared_ffn"] = 2 * D * 3 * MI * S * T * Lm
+
+        return flops
+
+    def get_read_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        """Calculate read memory traffic for FFN layers."""
+        L, D, DI = self.num_hidden_layers, self.hidden_size, self.intermediate_size
+        Lm, E, MI, S = (
+            self.num_moe_layers,
+            self.num_experts_per_tok,
+            self.moe_intermediate_size,
+            self.num_shared_experts,
+        )
+        T = ctx.total_num_tokens()
+        num_experts = self.num_experts
+
+        Ld = L - Lm
+
+        num_activated_tokens = T * E if E else 0
+
+        if per_gpu:
+            Ld //= self.pp_size
+            Lm //= self.pp_size
+
+            DI //= self.ffn_tp_size
+            if MI is not None:
+                MI //= self.ffn_tp_size
+            if E:
+                num_activated_tokens //= self.ffn_ep_size
+            if num_experts is not None:
+                num_experts //= self.ffn_ep_size
+
+        read_bytes = {}
+
+        # Dense FFN layers (3 GEMMs: up, gate, down projections + SiLU activation)
+        if Ld:
+            read_bytes["dense_up_gate_input"] = int(
+                T * D * self.activation_byte_size * Ld
+            )
+            read_bytes["dense_up_gate_weights"] = int(
+                2 * D * DI * self.weight_byte_size * Ld
+            )
+            read_bytes["dense_silu_input"] = int(
+                2 * T * DI * self.activation_byte_size * Ld
+            )
+            read_bytes["dense_down_input"] = int(
+                T * DI * self.activation_byte_size * Ld
+            )
+            read_bytes["dense_down_weights"] = int(D * DI * self.weight_byte_size * Ld)
+
+        if Lm:
+            # MoE routed expert reads
+            if E:
+                # FIXME: Assume perfect load balancing for now.
+                num_activated_experts = min(num_activated_tokens, num_experts)
+
+                read_bytes["routed_up_gate_input"] = int(
+                    num_activated_tokens * D * self.activation_byte_size * Lm
+                )
+                read_bytes["routed_up_gate_weights"] = int(
+                    2 * D * MI * num_activated_experts * self.weight_byte_size * Lm
+                )
+                read_bytes["routed_silu_input"] = int(
+                    2 * num_activated_tokens * MI * self.activation_byte_size * Lm
+                )
+                read_bytes["routed_down_input"] = int(
+                    num_activated_tokens * MI * self.activation_byte_size * Lm
+                )
+                read_bytes["routed_down_weights"] = int(
+                    D * MI * num_activated_experts * self.weight_byte_size * Lm
+                )
+
+            # MoE shared expert reads
+            if S:
+                read_bytes["shared_up_gate_input"] = int(
+                    T * D * self.activation_byte_size * Lm
+                )
+                read_bytes["shared_up_gate_weights"] = int(
+                    2 * D * MI * S * self.weight_byte_size * Lm
+                )
+                read_bytes["shared_silu_input"] = int(
+                    2 * T * MI * S * self.activation_byte_size * Lm
+                )
+                read_bytes["shared_down_input"] = int(
+                    T * MI * self.activation_byte_size * Lm
+                )
+                read_bytes["shared_down_weights"] = int(
+                    D * MI * S * self.weight_byte_size * Lm
+                )
+
+        return read_bytes
+
+    def get_write_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        """Calculate write memory traffic for FFN layers."""
+        L, D, DI = self.num_hidden_layers, self.hidden_size, self.intermediate_size
+        Lm, E, MI, S = (
+            self.num_moe_layers,
+            self.num_experts_per_tok,
+            self.moe_intermediate_size,
+            self.num_shared_experts,
+        )
+        T = ctx.total_num_tokens()
+
+        Ld = L - Lm
+
+        num_activated_tokens = T * E if E else 0
+
+        if per_gpu:
+            Ld //= self.pp_size
+            Lm //= self.pp_size
+
+            DI //= self.ffn_tp_size
+            if MI is not None:
+                MI //= self.ffn_tp_size
+            if E:
+                num_activated_tokens //= self.ffn_ep_size
+
+        write_bytes = {}
+
+        # Dense FFN layers
+        if Ld:
+            write_bytes["dense_up_gate_output"] = int(
+                2 * T * DI * self.activation_byte_size * Ld
+            )
+            write_bytes["dense_silu_output"] = int(
+                T * DI * self.activation_byte_size * Ld
+            )
+            write_bytes["dense_down_output"] = int(
+                T * D * self.activation_byte_size * Ld
+            )
+
+        # MoE outputs
+        if Lm:
+            if E:
+                write_bytes["routed_up_gate_output"] = int(
+                    2 * num_activated_tokens * MI * self.activation_byte_size * Lm
+                )
+                write_bytes["routed_silu_output"] = int(
+                    num_activated_tokens * MI * self.activation_byte_size * Lm
+                )
+                write_bytes["routed_down_output"] = int(
+                    num_activated_tokens * D * self.activation_byte_size * Lm
+                )
+            if S:
+                write_bytes["shared_up_gate_output"] = int(
+                    2 * T * S * MI * self.activation_byte_size * Lm
+                )
+                write_bytes["shared_silu_output"] = int(
+                    T * S * MI * self.activation_byte_size * Lm
+                )
+                write_bytes["shared_down_output"] = int(
+                    T * S * D * self.activation_byte_size * Lm
+                )
+
+        return write_bytes
+
+
+#### Unembed ####
+
+
+class UnembedMetrics(ComponentMetrics):
+    # From BaseConfigParser
+    hidden_size: int = Field(..., gt=0)
+    vocab_size: int = Field(..., gt=0)
+    weight_byte_size: int = Field(..., gt=0)
+    activation_byte_size: int = Field(..., gt=0)
+
+    tp_size: int
+
+    @classmethod
+    def component_type(cls) -> str:
+        return "unembed"
+
+    @classmethod
+    def get_parser(cls) -> ParserChain:
+        return ParserChain(
+            BaseConfigParser(),
+        )
+
+    def get_num_flops_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        """Calculate flops breakdown for unembedding layer."""
+        D, V = self.hidden_size, self.vocab_size
+        T = ctx.total_num_tokens()
+
+        if per_gpu:
+            V //= self.tp_size
+
+        return {
+            "unembed": 2 * T * D * V,
+        }
+
+    def get_read_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        """Calculate read memory traffic for unembedding layer."""
+        D, V = self.hidden_size, self.vocab_size
+        T = ctx.total_num_tokens()
+
+        if per_gpu:
+            V //= self.tp_size
+
+        return {
+            "input": T * D * self.activation_byte_size,
+            "weight": D * V * self.weight_byte_size,
+        }
+
+    def get_write_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        """Calculate write memory traffic for unembedding layer."""
+        V = self.vocab_size
+        T = ctx.total_num_tokens()
+
+        if per_gpu:
+            V //= self.tp_size
+
+        return {
+            "output": T * V * self.activation_byte_size,
+        }
+
+
+#### ModelMetrics ####
+
+
+class ModelMetrics:
+    def __init__(self, vllm_config: VllmConfig) -> None:
+        """
+        Parse vllm_config to instantiate metrics for each component.
+        is_enabled() will return False if no component metrics could be instantiated.
+        """
+
+        self.vllm_config = vllm_config
+
+        self.metrics: list[ComponentMetrics] = []
+        for metric_cls in ComponentMetrics.registered_metrics():
+            try:
+                metric = metric_cls.from_vllm_config(vllm_config)
+                self.metrics.append(metric)
+                logger.info(
+                    "Instantiated ComponentMetrics [%s] with (%s)",
+                    metric.component_type(),
+                    str(metric),
+                )
+            except InvalidComponent as e:
+                logger.debug(
+                    "Failed to instantiate %s from %s",
+                    metric_cls.component_type(),
+                    str(e),
+                )
+
+    def is_enabled(self) -> bool:
+        return len(self.metrics) > 0
+
+    def get_num_flops(self, ctx: ExecutionContext, per_gpu: bool = True) -> int:
+        return sum(metric.get_num_flops(ctx, per_gpu) for metric in self.metrics)
+
+    def get_read_bytes(self, ctx: ExecutionContext, per_gpu: bool = True) -> int:
+        return sum(metric.get_read_bytes(ctx, per_gpu) for metric in self.metrics)
+
+    def get_write_bytes(self, ctx: ExecutionContext, per_gpu: bool = True) -> int:
+        return sum(metric.get_write_bytes(ctx, per_gpu) for metric in self.metrics)
+
+    def get_num_flops_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        total = {}
+        for metric in self.metrics:
+            breakdown = metric.get_num_flops_breakdown(ctx, per_gpu)
+            component = metric.component_type()
+            prefixed = {f"{component}.{key}": val for key, val in breakdown.items()}
+            total.update(prefixed)
+        return total
+
+    def get_read_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        total = {}
+        for metric in self.metrics:
+            breakdown = metric.get_read_bytes_breakdown(ctx, per_gpu)
+            component = metric.component_type()
+            prefixed = {f"{component}.{key}": val for key, val in breakdown.items()}
+            total.update(prefixed)
+        return total
+
+    def get_write_bytes_breakdown(
+        self, ctx: ExecutionContext, per_gpu: bool = True
+    ) -> dict[str, int]:
+        total = {}
+        for metric in self.metrics:
+            breakdown = metric.get_write_bytes_breakdown(ctx, per_gpu)
+            component = metric.component_type()
+            prefixed = {f"{component}.{key}": val for key, val in breakdown.items()}
+            total.update(prefixed)
+        return total
+
+    def get_step_perf_stats_per_gpu(
+        self, scheduler_output: SchedulerOutput
+    ) -> PerfStats:
+        """
+        Calculate perf stats for the current step based on scheduled tokens.
+        """
+
+        t0 = time.monotonic()
+
+        # Build a single batch context
+        ctx = ExecutionContext()
+
+        # Process new requests (these are in prefill phase)
+        for new_req in scheduler_output.scheduled_new_reqs:
+            req_id = new_req.req_id
+            num_tokens = scheduler_output.num_scheduled_tokens.get(req_id, 0)
+            if num_tokens == 0:
+                continue
+
+            # For new requests, context_len = num_computed_tokens + num_tokens
+            # num_computed_tokens represents previously computed tokens in the sequence
+            context_len = new_req.num_computed_tokens + num_tokens
+            ctx.add(num_tokens, context_len, is_prefill=True)
+
+        # Process cached requests (continuing requests)
+        cached_reqs = scheduler_output.scheduled_cached_reqs
+        for i, req_id in enumerate(cached_reqs.req_ids):
+            num_tokens = scheduler_output.num_scheduled_tokens.get(req_id, 0)
+            if num_tokens == 0:
+                continue
+
+            # For cached requests, we have the current num_computed_tokens
+            num_computed_tokens = cached_reqs.num_computed_tokens[i]
+            context_len = num_computed_tokens + num_tokens
+
+            # Cached requests are typically in decode phase (num_tokens == 1)
+            # unless they're doing chunked prefill (num_tokens > 1)
+            is_prefill = num_tokens > 1
+            ctx.add(num_tokens, context_len, is_prefill)
+
+        num_flops_breakdown = self.get_num_flops_breakdown(ctx, True)
+        read_bytes_breakdown = self.get_read_bytes_breakdown(ctx, True)
+        write_bytes_breakdown = self.get_write_bytes_breakdown(ctx, True)
+        perf_stats = PerfStats(
+            sum(num_flops_breakdown.values()),
+            sum(read_bytes_breakdown.values()),
+            sum(write_bytes_breakdown.values()),
+        )
+
+        if envs.VLLM_DEBUG_MFU_METRICS:
+            perf_stats.debug_stats = DebugPerfStats(
+                time.monotonic() - t0,
+                ctx.num_prefill_requests,
+                ctx.num_decode_requests,
+                asdict(ctx),
+                num_flops_breakdown,
+                read_bytes_breakdown,
+                write_bytes_breakdown,
+            )
+
+        return perf_stats
+
+
+#### Logging ####
+
+
+class PerfMetricsDebugLogging:
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.total_calc_duration: float = 0.0
+        self.total_num_prefill_requests: int = 0
+        self.total_num_decode_requests: int = 0
+        self.total_num_batches: int = 0
+        self.total_context_breakdown: dict[str, int] = {}
+        self.total_num_flops_per_gpu_breakdown: dict[str, int] = {}
+        self.total_read_bytes_per_gpu_breakdown: dict[str, int] = {}
+        self.total_write_bytes_per_gpu_breakdown: dict[str, int] = {}
+
+    def observe(self, debug_stats: DebugPerfStats) -> None:
+        self.total_calc_duration += debug_stats.calc_duration
+        self.total_num_prefill_requests += debug_stats.num_prefill_requests
+        self.total_num_decode_requests += debug_stats.num_decode_requests
+        self.total_num_batches += 1
+
+        for dst, src in zip(
+            [
+                self.total_context_breakdown,
+                self.total_num_flops_per_gpu_breakdown,
+                self.total_read_bytes_per_gpu_breakdown,
+                self.total_write_bytes_per_gpu_breakdown,
+            ],
+            [
+                debug_stats.context_breakdown,
+                debug_stats.num_flops_per_gpu_breakdown,
+                debug_stats.num_read_bytes_per_gpu_breakdown,
+                debug_stats.num_write_bytes_per_gpu_breakdown,
+            ],
+        ):
+            assert isinstance(src, dict)
+            for key, val in src.items():
+                dst[key] = dst.get(key, 0) + val
+
+    def log(self, log_fn, log_prefix: str, delta_time: float):
+        # pretty print breakdowns
+        total_num_flops_per_gpu_breakdown = {
+            k: f"{v / 1e12:.1f}TF"
+            for k, v in self.total_num_flops_per_gpu_breakdown.items()
+        }
+        total_read_bytes_per_gpu_breakdown = {
+            k: f"{v / 1e9:.1f}GB"
+            for k, v in self.total_read_bytes_per_gpu_breakdown.items()
+        }
+        total_write_bytes_per_gpu_breakdown = {
+            k: f"{v / 1e9:.1f}GB"
+            for k, v in self.total_write_bytes_per_gpu_breakdown.items()
+        }
+
+        logger.debug(
+            "%sMFU details: %s",
+            log_prefix,
+            json.dumps(
+                {
+                    "prefill_reqs": self.total_num_prefill_requests,
+                    "decode_reqs": self.total_num_decode_requests,
+                    "num_batches": self.total_num_batches,
+                    "context_breakdown": self.total_context_breakdown,
+                    "flops_breakdown": total_num_flops_per_gpu_breakdown,
+                    "num_read_bytes_breakdown": total_read_bytes_per_gpu_breakdown,
+                    "num_write_bytes_breakdown": (total_write_bytes_per_gpu_breakdown),
+                    "duration": f"{delta_time:.1f}s",
+                    "mfu_calc_overhead": (
+                        f"{self.total_calc_duration / delta_time:.1%}"
+                    ),
+                },
+                indent=2,
+            ),
+        )
+
+
+class PerfMetricsLogging:
+    def __init__(self, vllm_config: VllmConfig):
+        self.vllm_config = vllm_config
+        self.pp_size = vllm_config.parallel_config.pipeline_parallel_size
+
+        self.debug_logging: PerfMetricsDebugLogging | None = None
+        if envs.VLLM_DEBUG_MFU_METRICS:
+            self.debug_logging = PerfMetricsDebugLogging()
+
+        self.reset()
+
+    def reset(self):
+        self.last_log_time = time.monotonic()
+
+        self.total_num_flops_per_gpu: int = 0
+        self.total_read_bytes_per_gpu: int = 0
+        self.total_write_bytes_per_gpu: int = 0
+
+        if self.debug_logging:
+            self.debug_logging.reset()
+
+    def observe(self, perf_stats: PerfStats) -> None:
+        self.total_num_flops_per_gpu += perf_stats.num_flops_per_gpu
+        self.total_read_bytes_per_gpu += perf_stats.num_read_bytes_per_gpu
+        self.total_write_bytes_per_gpu += perf_stats.num_write_bytes_per_gpu
+
+        if self.debug_logging:
+            assert perf_stats.debug_stats is not None
+            self.debug_logging.observe(perf_stats.debug_stats)
+
+    def log(self, log_fn=logger.info, log_prefix: str = "") -> None:
+        if not (
+            self.total_num_flops_per_gpu
+            or self.total_read_bytes_per_gpu
+            or self.total_write_bytes_per_gpu
+        ):
+            return
+
+        now = time.monotonic()
+        delta_time = now - self.last_log_time
+
+        if delta_time <= 0.0:
+            avg_tflops_per_gpu = 0.0
+            avg_gbps_per_gpu = 0.0
+        else:
+            avg_tflops_per_gpu = self.total_num_flops_per_gpu / delta_time / 1e12
+            avg_gbps_per_gpu = (
+                (self.total_read_bytes_per_gpu + self.total_write_bytes_per_gpu)
+                / delta_time
+                / 1e9
+            )
+
+        log_fn(
+            "%sMFU: %.1f TF/s/GPU %.1f GB/s/GPU",
+            log_prefix,
+            avg_tflops_per_gpu,
+            avg_gbps_per_gpu,
+        )
+
+        if self.debug_logging:
+            self.debug_logging.log(log_fn, log_prefix, delta_time)
+
+        self.reset()
+
+
+## util functions
+
+
+def get_required(obj: object, attr: str):
+    """Get an attr from an object, or throw a InvalidComponentError if it's not set."""
+    if not hasattr(obj, attr):
+        raise InvalidComponent(f"Missing required attr {attr} in config")
+    return getattr(obj, attr)
+
+
+def getattr_from_list(obj: object, attrs: list[str], default: object = None):
+    """Try to get the first attr that exists in the object
+    from a list of attrs. Otherwise return None."""
+    for attr in attrs:
+        if hasattr(obj, attr):
+            return getattr(obj, attr)
+    return default
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index a0cc58d0a64e86cc150b41550cb3895f11b1716a..cb1a860e38fbc090588f27eef61bd668060c8c98 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any
 
 import vllm.envs as envs
 from vllm.compilation.cuda_graph import CUDAGraphStat
+from vllm.v1.metrics.perf import PerfStats
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 
 if TYPE_CHECKING:
@@ -186,6 +187,8 @@ class SchedulerStats:
 
     cudagraph_stats: CUDAGraphStat | None = None
 
+    perf_stats: PerfStats | None = None
+
 
 @dataclass
 class RequestStateStats:
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index bea9e5846de1377b25bf04244230d0dfdf5d4bac..75ab787072029f34e25b64caba3da2484ded9875 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -3,7 +3,7 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, NamedTuple
+from typing import TYPE_CHECKING, NamedTuple, TypeAlias
 
 import numpy as np
 import torch
@@ -69,6 +69,14 @@ class LogprobsTensors(NamedTuple):
             self.selected_token_ranks.to("cpu", non_blocking=True),
         )
 
+    def filter(self, mask: torch.Tensor) -> "LogprobsTensors":
+        """Filter the logprobs tensors with the given bool mask."""
+        return LogprobsTensors(
+            self.logprob_token_ids[mask],
+            self.logprobs[mask],
+            self.selected_token_ranks[mask],
+        )
+
     @staticmethod
     def empty_cpu(
         num_positions: int, num_tokens_per_position: int
@@ -91,7 +99,7 @@ class LogprobsTensors(NamedTuple):
 
 # [num_reqs, <dynamic>]
 # The shape of each element depends on the pooler used
-PoolerOutput = list[torch.Tensor | None] | torch.Tensor | None
+PoolerOutput: TypeAlias = torch.Tensor | list[torch.Tensor] | list[torch.Tensor | None]
 
 
 @dataclass
@@ -151,21 +159,23 @@ class ModelRunnerOutput:
     # num_generated_tokens is the number of tokens
     # generated in the current step. It can be different for
     # each request due to speculative/jump decoding.
-    sampled_token_ids: list[list[int]]
+    sampled_token_ids: list[list[int]] = field(default_factory=list)
 
     # [num_reqs, max_num_logprobs + 1]
     # [num_reqs, max_num_logprobs + 1]
     # [num_reqs]
-    logprobs: LogprobsLists | None
+    logprobs: LogprobsLists | None = None
 
     # req_id -> (token_ids, logprobs, ranks)
     # [prompt_len, num_prompt_logprobs]
     # [prompt_len, num_prompt_logprobs]
     # [prompt_len]
-    prompt_logprobs_dict: dict[str, LogprobsTensors | None]
+    prompt_logprobs_dict: dict[str, LogprobsTensors | None] = field(
+        default_factory=dict
+    )
 
     # [num_reqs, hidden_size]
-    pooler_output: list[torch.Tensor | None]
+    pooler_output: list[torch.Tensor | None] | None = None
 
     kv_connector_output: KVConnectorOutput | None = None
 
@@ -225,21 +235,8 @@ def make_empty_encoder_model_runner_output(
         req_ids=req_ids,
         req_id_to_index=req_id_to_index,
         sampled_token_ids=sampled_token_ids,
-        logprobs=None,
-        prompt_logprobs_dict={},
         pooler_output=pooler_output,
-        kv_connector_output=None,
-        ec_connector_output=None,
-        num_nans_in_logits=None,
     )
 
 
-EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
-    req_ids=[],
-    req_id_to_index={},
-    sampled_token_ids=[],
-    logprobs=None,
-    prompt_logprobs_dict={},
-    pooler_output=[],
-    num_nans_in_logits=None,
-)
+EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(req_ids=[], req_id_to_index={})
diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py
index acd1a00e87553b33bf21e1889f415286871116f6..0764d5e6f7a705987db02a0e97f5373691729a7e 100644
--- a/vllm/v1/pool/metadata.py
+++ b/vllm/v1/pool/metadata.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
+import numpy as np
 import torch
 
 from vllm.pooling_params import PoolingParams
@@ -89,38 +90,35 @@ class PoolingMetadata:
 
         return [prompt_token_ids[i, :num] for i, num in enumerate(self.prompt_lens)]
 
+    def get_pooling_cursor(self) -> PoolingCursor:
+        pooling_cursor = self.pooling_cursor
+        assert pooling_cursor is not None, "Should call `build_pooling_cursor` first"
+
+        return pooling_cursor
+
     def build_pooling_cursor(
         self,
-        num_scheduled_tokens: list[int],
+        num_scheduled_tokens_np: np.ndarray,
         seq_lens_cpu: torch.Tensor,
         device: torch.device,
     ):
-        self.pooling_cursor = build_pooling_cursor(
-            num_scheduled_tokens, seq_lens_cpu, self.prompt_lens, device
-        )
+        n_seq = len(num_scheduled_tokens_np)
+        prompt_lens = self.prompt_lens
 
+        assert len(prompt_lens) == n_seq
 
-def build_pooling_cursor(
-    num_scheduled_tokens: list[int],
-    seq_lens_cpu: torch.Tensor,
-    prompt_lens: torch.Tensor,
-    device: torch.device,
-):
-    assert len(prompt_lens) == len(num_scheduled_tokens)
-
-    n_seq = len(num_scheduled_tokens)
-    index = list(range(n_seq))
-    num_scheduled_tokens_cpu = torch.tensor(num_scheduled_tokens, device="cpu")
-    cumsum = torch.zeros(
-        n_seq + 1, dtype=torch.int64, pin_memory=pin_memory, device="cpu"
-    )
-    torch.cumsum(num_scheduled_tokens_cpu, dim=0, out=cumsum[1:])
-    cumsum = cumsum.to(device, non_blocking=True)
-    return PoolingCursor(
-        index=index,
-        first_token_indices_gpu=cumsum[:n_seq],
-        last_token_indices_gpu=cumsum[1:] - 1,
-        prompt_lens_cpu=prompt_lens,
-        seq_lens_cpu=seq_lens_cpu,
-        num_scheduled_tokens_cpu=num_scheduled_tokens_cpu,
-    )
+        index = list(range(n_seq))
+        num_scheduled_tokens_cpu = torch.from_numpy(num_scheduled_tokens_np)
+        cumsum = torch.zeros(
+            n_seq + 1, dtype=torch.int64, pin_memory=pin_memory, device="cpu"
+        )
+        torch.cumsum(num_scheduled_tokens_cpu, dim=0, out=cumsum[1:])
+        cumsum = cumsum.to(device, non_blocking=True)
+        self.pooling_cursor = PoolingCursor(
+            index=index,
+            first_token_indices_gpu=cumsum[:n_seq],
+            last_token_indices_gpu=cumsum[1:] - 1,
+            prompt_lens_cpu=prompt_lens,
+            seq_lens_cpu=seq_lens_cpu,
+            num_scheduled_tokens_cpu=num_scheduled_tokens_cpu,
+        )
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index f33059b80b8945a48ef7c98b286a52813858d828..9c27e8c05cc1072861d18f03166f603f476ec073 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -123,7 +123,7 @@ class Request:
         # indicates that the output is corrupted
         self.num_nans_in_logits = 0
 
-        # The number of requests being preempted by the scheduler
+        # The number of times this request has been preempted by the scheduler.
         self.num_preemptions = 0
 
         # The number of tokens that have been computed remotely.
@@ -211,8 +211,7 @@ class Request:
 
     def get_num_encoder_embeds(self, input_id: int) -> int:
         assert input_id < len(self.mm_features)
-        num_embeds = self.mm_features[input_id].mm_position.get_num_embeds
-        return num_embeds
+        return self.mm_features[input_id].mm_position.get_num_embeds
 
     def record_event(
         self,
diff --git a/vllm/v1/sample/ops/bad_words.py b/vllm/v1/sample/ops/bad_words.py
index 8e2c798dd35ff1cb50d7a4bec09a0de308bf2b97..56972e517980e5bc9de97c379e2cc0f9edc355df 100644
--- a/vllm/v1/sample/ops/bad_words.py
+++ b/vllm/v1/sample/ops/bad_words.py
@@ -42,11 +42,16 @@ def apply_bad_words_with_drafts(
     num_draft_tokens: list[int],
 ) -> None:
     start_idx = 0
-    for i, bad_words_ids in bad_words_token_ids.items():
-        for draft_idx in range(num_draft_tokens[i]):
-            _apply_bad_words_single_batch(
-                logits[start_idx + draft_idx],
-                bad_words_ids,
-                past_tokens_ids[start_idx + draft_idx],
-            )
-        start_idx += num_draft_tokens[i]
+    remaining = len(bad_words_token_ids)
+    for i, n in enumerate(num_draft_tokens):
+        if (bad_words_ids := bad_words_token_ids.get(i)) is not None:
+            for draft_idx in range(start_idx, start_idx + n):
+                _apply_bad_words_single_batch(
+                    logits[draft_idx],
+                    bad_words_ids,
+                    past_tokens_ids[draft_idx],
+                )
+            remaining -= 1
+            if not remaining:
+                break
+        start_idx += n
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index c9229e788b6bfe1cbef3eeaafbade8464149be61..03da3e565e49912def7d8a367af91c0b7cfa1fcf 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -174,6 +174,8 @@ class TopKTopPSampler(nn.Module):
         k: torch.Tensor | None,
         p: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # FIXME: Fix aiter_sampler's accuracy issue and remove this flag
+        DISABLE_AITER_SAMPLER = True
         """Optimized ROCm/aiter path (same structure as forward_cuda)."""
         if (k is None and p is None) or generators:
             if generators:
@@ -186,6 +188,8 @@ class TopKTopPSampler(nn.Module):
             "processed_logits",
             "processed_logprobs",
         ), "aiter sampler does not support returning logits/logprobs."
+        if DISABLE_AITER_SAMPLER:
+            return self.forward_native(logits, generators, k, p)
         return self.aiter_sample(logits, k, p, generators), None
 
     def aiter_sample(
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 50b91d8292ee8984011e870d8ebdc0adea83ee9a..16d01d518fe66ba500e6fa3090082de98c5b4acb 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -9,7 +9,7 @@ import torch.nn as nn
 
 from vllm.logger import init_logger
 from vllm.triton_utils import tl, triton
-from vllm.v1.outputs import LogprobsTensors, SamplerOutput
+from vllm.v1.outputs import LogprobsLists, LogprobsTensors, SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.bad_words import apply_bad_words_with_drafts
 from vllm.v1.sample.ops.penalties import apply_all_penalties
@@ -119,8 +119,14 @@ class RejectionSampler(nn.Module):
         raw_target_logits = logits[target_logits_indices]
         # Use float32 for the target_logits.
         raw_target_logits = raw_target_logits.to(torch.float32)
+        target_logits = raw_target_logits
+        if not self.is_processed_logprobs_mode:
+            # Clone raw_target_logits before applying processors to preserve
+            # the original raw logits for logprobs computation, since
+            # apply_logits_processors modifies the tensor in-place.
+            target_logits = target_logits.clone()
         target_logits = self.apply_logits_processors(
-            raw_target_logits, sampling_metadata, metadata
+            target_logits, sampling_metadata, metadata
         )
         # [num_tokens, vocab_size]
         # NOTE(woosuk): `target_logits` can be updated in place inside the
@@ -179,13 +185,22 @@ class RejectionSampler(nn.Module):
         final_logits[target_logits_indices] = target_logits.to(torch.float32)
         final_logits[bonus_logits_indices] = bonus_logits.to(torch.float32)
 
-        # Compute accepted token indices.
-        accepted_mask = sampled_token_ids != PLACEHOLDER_TOKEN_ID
-        num_accepted_tokens = accepted_mask.sum(dim=-1)
-        accepted_logit_indices = accepted_mask.nonzero(as_tuple=True)[1]
-        accepted_logit_indices += cu_num_sampled_tokens.repeat_interleave(
-            num_accepted_tokens
+        # NOTE: To avoid cpu-gpu synchronization, we now simply compute indices for
+        # all draft tokens, including the rejected ones. The rejected tokens will
+        # be filtered out in the `parse_output`.
+        logit_start_indices = cu_num_sampled_tokens
+        offsets = torch.arange(
+            sampled_token_ids.shape[-1],
+            device=logit_start_indices.device,
+            dtype=logit_start_indices.dtype,
         )
+        accepted_logit_indices = (
+            logit_start_indices.unsqueeze(1) + offsets.unsqueeze(0)
+        ).flatten()
+        accepted_logit_indices.clamp_(max=final_logits.shape[0] - 1)
+        accepted_tokens = sampled_token_ids.clone().flatten()
+        # we replace rejected token ids with 0 to avoid gather_logprobs error
+        accepted_tokens[accepted_tokens == PLACEHOLDER_TOKEN_ID] = 0
 
         # Compute logprobs for accepted tokens.
         accepted_logits = final_logits[accepted_logit_indices]
@@ -194,7 +209,6 @@ class RejectionSampler(nn.Module):
             if self.is_logits_logprobs_mode
             else self.sampler.compute_logprobs(accepted_logits)
         )
-        accepted_tokens = sampled_token_ids[accepted_mask]
         return self.sampler.gather_logprobs(
             accepted_logprobs,
             max_num_logprobs,
@@ -206,8 +220,8 @@ class RejectionSampler(nn.Module):
         output_token_ids: torch.Tensor,
         vocab_size: int,
         discard_req_indices: Sequence[int] = (),
-        return_cu_num_tokens: bool = False,
-    ) -> tuple[list[list[int]], list[int] | None]:
+        logprobs_tensors: LogprobsTensors | None = None,
+    ) -> tuple[list[list[int]], LogprobsLists | None]:
         """Parse the output of the rejection sampler.
         Args:
             output_token_ids: The sampled token IDs in shape
@@ -216,7 +230,7 @@ class RejectionSampler(nn.Module):
                 and will be filtered out in this function.
             vocab_size: The size of the vocabulary.
             discard_req_indices: Optional row indices to discard tokens in.
-            return_cu_num_tokens: Whether to also return cumulative token counts.
+            logprobs_tensors: Optional logprobs tensors to filter.
         Returns:
             A list of lists of token IDs.
         """
@@ -225,15 +239,18 @@ class RejectionSampler(nn.Module):
         valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & (
             output_token_ids_np < vocab_size
         )
-        cu_num_tokens = None
-        if return_cu_num_tokens:
+        output_logprobs = None
+        if logprobs_tensors is not None:
             cu_num_tokens = [0] + valid_mask.sum(axis=1).cumsum().tolist()
+            filtered_tensors = logprobs_tensors.filter(valid_mask.flatten())
+            output_logprobs = filtered_tensors.tolists(cu_num_tokens)
+
         if len(discard_req_indices) > 0:
             valid_mask[discard_req_indices] = False
         outputs = [
             row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np)
         ]
-        return outputs, cu_num_tokens
+        return outputs, output_logprobs
 
     def apply_logits_processors(
         self,
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index d5624526a1ce3c924166df637ef9728a8f7c74d2..a2ec6cad1015c07b1c2dd6fae7dd92061a723dfd 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -10,7 +10,6 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     CompilationMode,
     CUDAGraphMode,
@@ -29,16 +28,16 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils.platform_utils import is_pin_memory_available
-from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
+from vllm.v1.attention.backend import (
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.backends.tree_attn import (
     TreeAttentionMetadata,
     TreeAttentionMetadataBuilder,
 )
 from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata
-from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder,
-    CommonAttentionMetadata,
-)
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.sampler import _SAMPLING_EPS
@@ -73,7 +72,6 @@ class EagleProposer:
         self.device = device
         self.dtype = vllm_config.model_config.dtype
         self.max_model_len = vllm_config.model_config.max_model_len
-        self.block_size = vllm_config.cache_config.block_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
         self.num_speculative_tokens = self.speculative_config.num_speculative_tokens
         self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
@@ -170,7 +168,12 @@ class EagleProposer:
         # Determine allowed attention backends once during initialization.
         self.allowed_attn_types: tuple | None = None
         if current_platform.is_rocm():
-            rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata]
+            from vllm.v1.attention.backends.rocm_attn import RocmAttentionMetadata
+
+            rocm_types = [
+                TritonAttentionMetadata,
+                RocmAttentionMetadata,
+            ]
             # ROCM_AITER_FA is an optional backend
             if find_spec(
                 AttentionBackendEnum.ROCM_AITER_FA.get_path(include_classname=False)
@@ -186,6 +189,11 @@ class EagleProposer:
 
             rocm_types.append(MLACommonMetadata)
 
+            # FlexAttention backend support
+            from vllm.v1.attention.backends.flex_attention import FlexAttentionMetadata
+
+            rocm_types.append(FlexAttentionMetadata)
+
             self.allowed_attn_types = tuple(rocm_types)
 
         # Parse the speculative token tree.
@@ -207,10 +215,7 @@ class EagleProposer:
             )
         # Precompute draft position offsets in flattened tree.
         self.tree_draft_pos_offsets = torch.arange(
-            1,
-            len(self.tree_choices) + 1,
-            device=device,
-            dtype=torch.int32,
+            1, len(self.tree_choices) + 1, device=device, dtype=torch.int32
         ).repeat(max_batch_size, 1)
 
     def _get_positions(self, num_tokens: int):
@@ -238,6 +243,7 @@ class EagleProposer:
         common_attn_metadata: CommonAttentionMetadata,
         sampling_metadata: SamplingMetadata,
         mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None,
+        num_rejected_tokens_gpu: torch.Tensor | None = None,
     ) -> torch.Tensor:
         num_tokens = target_token_ids.shape[0]
         batch_size = next_token_ids.shape[0]
@@ -289,8 +295,7 @@ class EagleProposer:
             per_layer_attn_metadata[layer_name] = draft_indexer_metadata
 
         num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp(
-            num_tokens_unpadded=num_tokens,
-            num_tokens_padded=num_tokens,
+            num_tokens_unpadded=num_tokens, num_tokens_padded=num_tokens
         )
 
         cudagraph_runtime_mode = CUDAGraphMode.NONE
@@ -393,8 +398,7 @@ class EagleProposer:
         draft_token_ids_list = [draft_token_ids]
 
         batch_size_dp_padded, batch_size_across_dp = self._pad_batch_across_dp(
-            num_tokens_unpadded=batch_size,
-            num_tokens_padded=batch_size,
+            num_tokens_unpadded=batch_size, num_tokens_padded=batch_size
         )
 
         if (
@@ -416,6 +420,17 @@ class EagleProposer:
         common_attn_metadata.query_start_loc_cpu = torch.from_numpy(
             self.token_arange_np[: batch_size + 1]
         ).clone()
+
+        # In padded drafter batch, we need to adjust the sequence lengths
+        # to remove the "padding" (i.e. rejected tokens).
+        # Only apply this adjustment when we have rejected tokens
+        # (i.e., not the first proposal).
+        if self.num_speculative_tokens > 1 and num_rejected_tokens_gpu is not None:
+            common_attn_metadata.seq_lens -= num_rejected_tokens_gpu
+            # Invalidate the CPU-side shadows to avoid H<>D sync.
+            common_attn_metadata._seq_lens_cpu = None
+            common_attn_metadata._num_computed_tokens_cpu = None
+
         for token_index in range(self.num_speculative_tokens - 1):
             # Update the inputs.
             # cast to int32 is crucial when eagle model is compiled.
@@ -460,22 +475,23 @@ class EagleProposer:
                 common_attn_metadata._num_computed_tokens_cpu += 1
 
             # Compute the slot mapping.
+            block_size = attn_metadata_builder.kv_cache_spec.block_size
             if self.uses_mrope:
                 # all dimensions of positions are the same
-                block_numbers = clamped_positions[0] // self.block_size
+                block_numbers = clamped_positions[0] // block_size
             else:
-                block_numbers = clamped_positions // self.block_size
+                block_numbers = clamped_positions // block_size
             block_ids = common_attn_metadata.block_table_tensor.gather(
                 dim=1, index=block_numbers.view(-1, 1)
             )
             block_ids = block_ids.view(-1)
             if self.uses_mrope:
                 common_attn_metadata.slot_mapping = (
-                    block_ids * self.block_size + clamped_positions[0] % self.block_size
+                    block_ids * block_size + clamped_positions[0] % block_size
                 )
             else:
                 common_attn_metadata.slot_mapping = (
-                    block_ids * self.block_size + clamped_positions % self.block_size
+                    block_ids * block_size + clamped_positions % block_size
                 )
             # Mask out the slot mappings that exceed the max model length.
             # Otherwise, the KV cache will be inadvertently updated with the
@@ -600,10 +616,8 @@ class EagleProposer:
         assert discard_request_mask.dtype == torch.bool
         assert backup_tokens_gpu.dtype == torch.int32
 
-        next_token_ids = torch.empty((batch_size,), dtype=torch.int32, device=device)
-        valid_sampled_tokens_count = torch.empty(
-            (batch_size,), dtype=torch.int32, device=device
-        )
+        next_token_ids = torch.empty(batch_size, dtype=torch.int32, device=device)
+        valid_sampled_tokens_count = next_token_ids.new_empty(batch_size)
 
         # Kernel grid: one program per request (row)
         grid = (batch_size,)
@@ -630,13 +644,14 @@ class EagleProposer:
         common_attn_metadata: CommonAttentionMetadata,
         spec_decode_metadata: SpecDecodeMetadata,
         valid_sampled_tokens_count: torch.Tensor,
-    ) -> tuple[CommonAttentionMetadata, torch.Tensor]:
+    ) -> tuple[CommonAttentionMetadata, torch.Tensor, torch.Tensor]:
         """
         This function is used to prepare the inputs for speculative decoding
         It updates the common_attn_metadata for speculative decoding,
         but does not consider the rejected tokens. Instead, all tokens
         are included as inputs to the speculator, with the rejected tokens
         used as padding and filtered out later by `token_indices_to_sample`.
+        No blocking CPU operations should be introduced in this function.
         """
         num_reqs = common_attn_metadata.num_reqs
         device = valid_sampled_tokens_count.device
@@ -644,14 +659,17 @@ class EagleProposer:
         token_indices_to_sample = torch.empty(
             (num_reqs,), dtype=torch.int32, device=device
         )
+        num_rejected_tokens_gpu = torch.empty(
+            (num_reqs,), dtype=torch.int32, device=device
+        )
 
-        # Kernel grid: one program per request (row)
         grid = (num_reqs,)
         eagle_prepare_inputs_padded_kernel[grid](
             spec_decode_metadata.cu_num_draft_tokens,
             valid_sampled_tokens_count,
             common_attn_metadata.query_start_loc,
             token_indices_to_sample,
+            num_rejected_tokens_gpu,
             num_reqs,
         )
 
@@ -676,7 +694,11 @@ class EagleProposer:
             dcp_local_seq_lens=common_attn_metadata.dcp_local_seq_lens,
         )
 
-        return spec_common_attn_metadata, token_indices_to_sample
+        return (
+            spec_common_attn_metadata,
+            token_indices_to_sample,
+            num_rejected_tokens_gpu,
+        )
 
     def propose_tree(
         self,
@@ -764,8 +786,7 @@ class EagleProposer:
                 max_query_len=query_len,
             )
             attn_metadata = tree_attn_metadata_builder.build_for_drafting(
-                common_attn_metadata=common_attn_metadata,
-                draft_index=level + 1,
+                common_attn_metadata=common_attn_metadata, draft_index=level + 1
             )
 
             # Apply new attention metadata to all layers.
@@ -782,12 +803,11 @@ class EagleProposer:
             attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
 
             # Compute the slot mapping.
+            block_size = tree_attn_metadata_builder.kv_cache_spec.block_size
             query_positions = flattened_draft_positions[:, level : level + query_len]
-            block_numbers = query_positions // self.block_size
+            block_numbers = query_positions // block_size
             block_ids = attn_metadata.block_table.gather(dim=1, index=block_numbers)
-            slot_mapping = (
-                block_ids * self.block_size + query_positions % self.block_size
-            )
+            slot_mapping = block_ids * block_size + query_positions % block_size
             # Mask out the slot mappings that exceed the max model length.
             # Otherwise, the KV cache will be inadvertently updated with the
             # padding tokens.
@@ -1144,8 +1164,8 @@ class EagleProposer:
     def dummy_run(
         self,
         num_tokens: int,
-        use_cudagraphs=True,
-        is_graph_capturing=False,
+        use_cudagraphs: bool = True,
+        is_graph_capturing: bool = False,
     ) -> None:
         # Determine if CUDA graphs should be used for this run.
         cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
@@ -1157,8 +1177,7 @@ class EagleProposer:
         ):
             if fwd_idx <= 1:
                 num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp(
-                    num_tokens_unpadded=num_tokens,
-                    num_tokens_padded=num_tokens,
+                    num_tokens_unpadded=num_tokens, num_tokens_padded=num_tokens
                 )
                 if (
                     cudagraphs_enabled
@@ -1325,9 +1344,5 @@ def compute_probs_and_sample_next_token(
     next_token_ids = probs.div(q).argmax(dim=-1).view(-1)
     if not sampling_metadata.all_random:
         greedy_token_ids = probs.argmax(dim=-1)
-        next_token_ids = torch.where(
-            is_greedy,
-            greedy_token_ids,
-            next_token_ids,
-        )
+        next_token_ids = torch.where(is_greedy, greedy_token_ids, next_token_ids)
     return next_token_ids, probs
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py
index 1273ca12c36002b901c50f1fa01ec5726dca5e8c..f97d54e632397c5c64ea4d97ac2cb8b864d293b1 100644
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -55,10 +55,8 @@ class NgramProposer:
         # This usually takes less than 1 second.
         self.propose(
             [[]] * 1024,
-            [""] * 1024,
             np.zeros(1024, dtype=np.int32),
             np.zeros((1024, self.max_model_len), dtype=np.int32),
-            set(),
         )
 
     def batch_propose(
@@ -132,10 +130,8 @@ class NgramProposer:
     def propose(
         self,
         sampled_token_ids: list[list[int]],
-        req_ids: list[str],
         num_tokens_no_spec: np.ndarray,
         token_ids_cpu: np.ndarray,
-        spec_decode_unsupported_reqs: set,
     ) -> list[list[int]]:
         # find which requests need ngram proposals
         valid_ngram_requests = []
@@ -145,12 +141,6 @@ class NgramProposer:
                 # Skip speculative decoding.
                 continue
 
-            # Skip requests that require sampling parameters that are not
-            # supported with speculative decoding.
-            req_id = req_ids[i]
-            if req_id in spec_decode_unsupported_reqs:
-                continue
-
             num_tokens = num_tokens_no_spec[i]
             if num_tokens >= self.max_model_len:
                 # Skip requests that have already reached the max model length.
diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py
index 049e335db3254810eab771631cea83907e53c5c7..5d6dcc552f6731debe998f2d9402cd403a3e3dfe 100644
--- a/vllm/v1/spec_decode/suffix_decoding.py
+++ b/vllm/v1/spec_decode/suffix_decoding.py
@@ -46,13 +46,7 @@ class SuffixDecodingProposer:
                 draft_token_ids.append([])
                 continue
 
-            # Skip requests that require sampling parameters that are not
-            # supported with speculative decoding.
             req_id = input_batch.req_ids[i]
-            if req_id in input_batch.spec_decode_unsupported_reqs:
-                draft_token_ids.append([])
-                continue
-
             num_tokens = input_batch.num_tokens_no_spec[i]
             if num_tokens >= self.max_model_len:
                 # Skip requests that have already reached the max model length.
diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py
index 9d4399d00487a1f9dfce777895e090e15e84156b..524714db37a749c1553e3c9f226e8139fc54b893 100644
--- a/vllm/v1/spec_decode/utils.py
+++ b/vllm/v1/spec_decode/utils.py
@@ -1,21 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.sampling_params import SamplingParams
 from vllm.triton_utils import tl, triton
 
-_SAMPLING_EPS = 1e-5
-
-
-def is_spec_decode_unsupported(sampling_params: SamplingParams) -> bool:
-    """True if request is incompatible with speculative decoding"""
-    return (
-        sampling_params.frequency_penalty != 0.0
-        or sampling_params.presence_penalty != 0.0
-        or sampling_params.repetition_penalty != 1.0
-        or sampling_params.min_p > _SAMPLING_EPS
-        or sampling_params.logprobs is not None
-    )
-
 
 @triton.jit
 def eagle_prepare_inputs_padded_kernel(
@@ -23,6 +9,7 @@ def eagle_prepare_inputs_padded_kernel(
     valid_sampled_tokens_count_ptr,  # [num_reqs]
     query_start_loc_gpu_ptr,  # [num_reqs + 1]
     token_indices_to_sample_ptr,  # [num_reqs] (output)
+    num_rejected_tokens_gpu_ptr,  # [num_reqs] (output)
     num_reqs,  # tl.int32
 ):
     """
@@ -56,6 +43,7 @@ def eagle_prepare_inputs_padded_kernel(
 
     index_to_sample = q_last_tok_idx - num_rejected_tokens
     tl.store(token_indices_to_sample_ptr + req_idx, index_to_sample)
+    tl.store(num_rejected_tokens_gpu_ptr + req_idx, num_rejected_tokens)
 
 
 @triton.jit
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 79ee4161e9dfa46f22f2e38bf3cc943cc8419184..4c1d38110d7e4ad5ee825cf0740e5c871618c302 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
 import multiprocessing
+from collections.abc import Iterable
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING
 
@@ -26,8 +28,6 @@ if TYPE_CHECKING:
 else:
     torch = LazyLoader("torch", globals(), "torch")
 
-    ReasoningParser = object
-    Request = object
 
 logger = init_logger(__name__)
 
@@ -96,7 +96,7 @@ class StructuredOutputManager:
             self.vllm_config.structured_outputs_config.enable_in_reasoning
         )
 
-    def grammar_init(self, request: Request) -> None:
+    def grammar_init(self, request: "Request") -> None:
         if request.structured_output_request is None:
             return
 
@@ -154,10 +154,7 @@ class StructuredOutputManager:
             grammar = self._create_grammar(request)  # type: ignore[assignment]
         request.structured_output_request.grammar = grammar  # type: ignore[assignment]
 
-    def _create_grammar(
-        self,
-        request: Request,
-    ) -> StructuredOutputGrammar:
+    def _create_grammar(self, request: "Request") -> StructuredOutputGrammar:
         key = request.structured_output_request.structured_output_key  # type: ignore[union-attr]
 
         # Note that the request was validated in the engine core client,
@@ -171,8 +168,7 @@ class StructuredOutputManager:
         return self.backend.compile_grammar(request_type, grammar_spec)
 
     def _fill_bitmasks(
-        self,
-        batch: list[tuple[StructuredOutputGrammar, int, bool]],
+        self, batch: Iterable[tuple[StructuredOutputGrammar, int, bool]]
     ) -> None:
         assert self._grammar_bitmask is not None
         for grammar, index, apply_bitmask in batch:
@@ -185,14 +181,13 @@ class StructuredOutputManager:
                 self._grammar_bitmask[index].fill_(self._full_mask)
 
     def _async_submit_fill_bitmask(
-        self,
-        batch: list[tuple[StructuredOutputGrammar, int, bool]],
+        self, batch: list[tuple[StructuredOutputGrammar, int, bool]]
     ) -> Future:
         return self.executor_for_fillmask.submit(self._fill_bitmasks, batch)
 
     def grammar_bitmask(
         self,
-        requests: dict[str, Request],
+        requests: dict[str, "Request"],
         structured_output_request_ids: list[str],
         scheduled_spec_decode_tokens: dict[str, list[int]],
     ) -> "npt.NDArray[np.int32] | None":
@@ -237,11 +232,10 @@ class StructuredOutputManager:
                 if TYPE_CHECKING:
                     assert structured_output_request is not None
                     assert structured_output_request.grammar is not None
+                grammar = structured_output_request.grammar
 
                 apply_bitmask = self.should_fill_bitmask(request)
-                batch.append(
-                    (structured_output_request.grammar, cumulative_index, apply_bitmask)
-                )
+                batch.append((grammar, cumulative_index, apply_bitmask))
                 if len(batch) == self.fill_bitmask_parallel_batch_size:
                     promises.append(self._async_submit_fill_bitmask(batch))
                     batch = []
@@ -262,34 +256,23 @@ class StructuredOutputManager:
                 if TYPE_CHECKING:
                     assert structured_output_request is not None
                     assert structured_output_request.grammar is not None
+                grammar = structured_output_request.grammar
                 apply_bitmask = self.should_fill_bitmask(request)
 
                 state_advancements = 0
-                req_tokens = scheduled_spec_decode_tokens.get(req_id, [])
-                for i, token in enumerate(req_tokens + [None]):
-                    self._fill_bitmasks(
-                        [
-                            (
-                                structured_output_request.grammar,
-                                cumulative_index,
-                                apply_bitmask,
-                            )
-                        ]
-                    )
-
-                    if (
-                        apply_bitmask
-                        and token is not None
-                        and not structured_output_request.grammar.is_terminated()
-                    ):
-                        accepted = structured_output_request.grammar.accept_tokens(
-                            req_id, [token]
-                        )
+                req_tokens = scheduled_spec_decode_tokens.get(req_id, ())
+                for token in itertools.chain(req_tokens, (-1,)):
+                    self._fill_bitmasks(((grammar, cumulative_index, apply_bitmask),))
+                    if token == -1:
+                        # Stop advancing the grammar once we hit a padding token.
+                        apply_bitmask = False
+                    if apply_bitmask and not grammar.is_terminated():
+                        accepted = grammar.accept_tokens(req_id, [token])
                         assert accepted, (token, req_id, scheduled_spec_decode_tokens)
                         state_advancements += 1
                     cumulative_index += 1
                 if state_advancements > 0:
-                    structured_output_request.grammar.rollback(state_advancements)
+                    grammar.rollback(state_advancements)
 
         bitmask_tensor = self._grammar_bitmask
         if cumulative_index < bitmask_tensor.shape[0]:
@@ -300,7 +283,7 @@ class StructuredOutputManager:
         # and deserialization when sending this to the GPU workers.
         return bitmask_tensor.numpy()
 
-    def should_fill_bitmask(self, request: Request) -> bool:
+    def should_fill_bitmask(self, request: "Request") -> bool:
         # NOTE (Hanchen) if enable_in_reasoning is True, it means that
         # the model needs to be constrained in reasoning. So we should always
         # enable the bitmask filling.
@@ -316,7 +299,7 @@ class StructuredOutputManager:
             return request.structured_output_request.reasoning_ended
         return True
 
-    def should_advance(self, request: Request) -> bool:
+    def should_advance(self, request: "Request") -> bool:
         if not request.use_structured_output:
             return False
 
@@ -339,8 +322,9 @@ class StructuredOutputManager:
             return True
 
         # Check if reasoning ends in *this* step
+        delta_from = request.num_computed_tokens - request.num_output_placeholders
         if self.reasoner.is_reasoning_end_streaming(
-            request.all_token_ids, request.all_token_ids[request.num_computed_tokens :]
+            request.all_token_ids, request.all_token_ids[delta_from:]
         ):
             # Reasoning just ended, so we shouldn't advance til
             # next pass
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index 2962a439dcb3e940043414abe9b4d37c79ad4985..727a67333bd7177a52a7fbee3451a1d95dab5561 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -44,6 +44,32 @@ def _walk_json_for_additional_properties(data: object):
             _walk_json_for_additional_properties(item)
 
 
+def has_guidance_unsupported_json_features(schema: dict[str, Any]) -> bool:
+    """Check if JSON schema contains features unsupported by guidance/llguidance."""
+
+    def check_object(obj: dict[str, Any]) -> bool:
+        if not isinstance(obj, dict):
+            return False
+
+        # patternProperties is not supported by llguidance
+        if "patternProperties" in obj:
+            return True
+
+        # Recursively check all nested objects and arrays
+        for value in obj.values():
+            if isinstance(value, dict):
+                if check_object(value):
+                    return True
+            elif isinstance(value, list):
+                for item in value:
+                    if isinstance(item, dict) and check_object(item):
+                        return True
+
+        return False
+
+    return check_object(schema)
+
+
 def process_for_additional_properties(
     guide_json: str | dict[str, Any],
 ) -> dict[str, Any]:
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index 94ae36a1abb4ffd2223ae64a493f3a8a6bcbdd3b..b921a71b3cf19203a838456432e84e6c29b9662a 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -28,12 +28,9 @@ class StructuredOutputRequest:
         if sampling_params is None:
             return None
         params = sampling_params.structured_outputs
-        if params:
-            if params.all_constraints_none():
-                return None
-            else:
-                return StructuredOutputRequest(params=params)
-        return None
+        if not params or params.all_constraints_none():
+            return None
+        return StructuredOutputRequest(params=params)
 
     def _check_grammar_completion(self) -> bool:
         # NOTE: We have to lazy import to gate circular imports
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index ae42b33f80f8805f7add4eaf6aab3fc601cb2d8d..3c98538f8d7328b0742c101615fa6ee116aad31c 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 import hashlib
 import importlib.metadata
 import os
+import tempfile
 from typing import TYPE_CHECKING
 
 import numpy as np
@@ -20,8 +21,8 @@ from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 
 if TYPE_CHECKING:
     import outlines_core as oc
+    import transformers.convert_slow_tokenizer as convert_slow_tokenizer
     import transformers.file_utils as file_utils
-    import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2
     import xgrammar as xgr
 
     from vllm.tokenizers import TokenizerLike
@@ -30,15 +31,10 @@ else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")
     oc = LazyLoader("oc", globals(), "outlines_core")
     file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils")
-    tokenization_gpt2 = LazyLoader(
-        "tokenization_gpt2",
-        globals(),
-        "transformers.models.gpt2.tokenization_gpt2",
+    convert_slow_tokenizer = LazyLoader(
+        "convert_slow_tokenizer", globals(), "transformers.convert_slow_tokenizer"
     )
 
-    TokenizerLike = object
-    SchedulerOutput = object
-    InputBatch = object
 
 logger = init_logger(__name__)
 
@@ -74,13 +70,12 @@ def apply_grammar_bitmask(
     # request in the batch, as the logit indices are offset by this amount.
     struct_out_req_batch_indices: dict[str, int] = {}
     cumulative_offset = 0
-    seq = sorted(input_batch.req_id_to_index.items(), key=lambda x: x[1])
-    for req_id, batch_index in seq:
+    spec_tokens = scheduler_output.scheduled_spec_decode_tokens
+    struct_out_req_ids = set(grammar_output.structured_output_request_ids)
+    for batch_index, req_id in enumerate(input_batch.req_ids):
         logit_index = batch_index + cumulative_offset
-        cumulative_offset += len(
-            scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])
-        )
-        if req_id in grammar_output.structured_output_request_ids:
+        cumulative_offset += len(spec_tokens.get(req_id, ()))
+        if req_id in struct_out_req_ids:
             struct_out_req_batch_indices[req_id] = logit_index
 
     out_indices = []
@@ -93,14 +88,12 @@ def apply_grammar_bitmask(
     )
     cumulative_index = 0
     for req_id in grammar_output.structured_output_request_ids:
-        num_spec_tokens = len(
-            scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])
-        )
-        if req_id in struct_out_req_batch_indices:
-            logit_index = struct_out_req_batch_indices[req_id]
+        num_spec_tokens = len(spec_tokens.get(req_id, ()))
+        if (logit_idx := struct_out_req_batch_indices.get(req_id)) is not None:
             for i in range(1 + num_spec_tokens):
-                sorted_bitmask[logit_index + i] = grammar_bitmask[cumulative_index + i]
-                out_indices.append(logit_index + i)
+                bitmask_index = logit_idx + i
+                sorted_bitmask[bitmask_index] = grammar_bitmask[cumulative_index + i]
+                out_indices.append(bitmask_index)
         cumulative_index += 1 + num_spec_tokens
 
     # Copy async to device as tensor.
@@ -151,21 +144,19 @@ def get_outlines_cache_path() -> str:
     if outlines_cache_dir:
         # OUTLINES_CACHE_DIR takes precedence
         return outlines_cache_dir
-    elif xdg_cache_home:
+    if xdg_cache_home:
         return os.path.join(xdg_cache_home, ".cache", "outlines")
     # If homedir is "/", we may be inside a container, and thus writing to
     # root would be problematic, so we fall back to using a tempfile.
     # Also validate the path exists, since os.path.expanduser does
     # not guarantee existence.
-    elif os.path.isdir(home_dir) and home_dir != "/":
+    if os.path.isdir(home_dir) and home_dir != "/":
         # Default Unix fallback: ~/.cache/outlines
         return os.path.join(home_dir, ".cache", "outlines")
-    else:
-        import tempfile
 
-        # home_dir may be / inside a docker container without existing user
-        tempdir = tempfile.gettempdir()
-        return os.path.join(tempdir, ".cache", "outlines")
+    # home_dir may be / inside a docker container without existing user
+    tempdir = tempfile.gettempdir()
+    return os.path.join(tempdir, ".cache", "outlines")
 
 
 def get_outlines_cache():
@@ -186,8 +177,8 @@ def get_outlines_cache():
             cache.clear()
         cache.set("__version__", outlines_version)
         return cache
-    else:
-        return LRUCache(maxsize=128)
+
+    return LRUCache(maxsize=128)
 
 
 re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$")
@@ -195,8 +186,7 @@ re_replacement_seq = re.compile(r"^.{0,6}�+.{0,6}$")
 
 
 def _reduced_vocabulary(
-    tokenizer: TokenizerLike,
-    eos_token_id: int,
+    tokenizer: TokenizerLike, eos_token_id: int
 ) -> dict[bytes, list[int]]:
     """Create a map from vocabulary tokens to lists of equivalent token ids.
 
@@ -204,7 +194,9 @@ def _reduced_vocabulary(
         A Dict of token string -> equivalent token ids
     """
 
-    unicode_to_bytes = {v: k for k, v in tokenization_gpt2.bytes_to_unicode().items()}
+    unicode_to_bytes = {
+        v: k for k, v in convert_slow_tokenizer.bytes_to_unicode().items()
+    }
 
     def convert_token_to_string(token: str) -> str:
         string = tokenizer.convert_tokens_to_string([token])
@@ -267,17 +259,13 @@ def get_outlines_vocabulary(tokenizer: TokenizerLike) -> oc.Vocabulary:
         return tokenizer._outlines_vocabulary  # type: ignore
 
     try:
-        if (
-            hasattr(
-                tokenizer,
-                "eos_token_id",
-            )
-            and tokenizer.eos_token_id is not None
-        ):
+        if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id is not None:
             eos_token_id = tokenizer.eos_token_id
         else:
             raise ValueError(
-                f"Error during structured outputs setup for outlines: Tokenizer ({type(tokenizer)}) has no `eos_token_id` property, but `eos_token_id` is required for structured outputs to work properly."  # noqa: E501
+                "Error during structured outputs setup for outlines: Tokenizer "
+                f"({type(tokenizer)}) has no `eos_token_id` property, but "
+                "`eos_token_id` is required for structured outputs to work properly."
             )
 
         reduced_vocab = _reduced_vocabulary(
@@ -290,7 +278,7 @@ def get_outlines_vocabulary(tokenizer: TokenizerLike) -> oc.Vocabulary:
         return vocabulary
     except AttributeError as e:
         raise ValueError(
-            f"Cannot get the vocabulary of the tokenizer "
+            "Cannot get the vocabulary of the tokenizer "
             f"({type(tokenizer)}). The tokenizer should have a "
             "get_vocab method."
         ) from e
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 29099d1e9b17ef4e1d16cc81f8fc249269fbe805..75ad304ddf1a72d9e15545055c29654681e7a15c 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -7,6 +7,7 @@ import time
 import weakref
 from collections.abc import Callable, Sequence
 from contextlib import AbstractContextManager
+from dataclasses import dataclass
 from multiprocessing import connection
 from multiprocessing.process import BaseProcess
 from typing import (
@@ -27,6 +28,7 @@ from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext, is_usage_stats_enabled, usage_message
 from vllm.utils.network_utils import get_open_port, get_open_zmq_ipc_path, get_tcp_uri
 from vllm.utils.system_utils import kill_process_tree
+from vllm.v1.core.sched.output import SchedulerOutput
 
 if TYPE_CHECKING:
     import numpy as np
@@ -412,3 +414,53 @@ def tensor_data(tensor: torch.Tensor) -> memoryview:
         A memoryview of the tensor data as uint8.
     """
     return tensor.flatten().contiguous().view(torch.uint8).numpy().data
+
+
+@dataclass
+class IterationDetails:
+    num_ctx_requests: int
+    num_ctx_tokens: int
+    num_generation_requests: int
+    num_generation_tokens: int
+
+    def __repr__(self) -> str:
+        return f"IterationDetails(num_ctx_requests={self.num_ctx_requests},\
+                 num_ctx_tokens={self.num_ctx_tokens}, \
+                 num_generation_requests={self.num_generation_requests}, \
+                 num_generation_tokens={self.num_generation_tokens})"
+
+
+def compute_iteration_details(scheduler_output: SchedulerOutput) -> IterationDetails:
+    """
+    Compute the number of context/generation requests and tokens
+    for the current iteration's scheduler output. A requests is regarded
+    as a context request if its output tokens are still 0, an extended chunk
+    of chunked prefill falls into this category.
+
+    Args:
+        scheduler_output: The scheduler output for the current iteration.
+
+    Returns:
+        An IterationDetails object containing the number of
+        context/generation requests and tokens.
+    """
+    num_context_requests = 0
+    num_context_tokens = 0
+    num_generation_requests = 0
+    num_generation_tokens = 0
+    new_req_ids = {new_req.req_id for new_req in scheduler_output.scheduled_new_reqs}
+    for req_id, num_tokens in scheduler_output.num_scheduled_tokens.items():
+        if scheduler_output.scheduled_cached_reqs.is_context_phase(req_id) or (
+            req_id in new_req_ids
+        ):
+            num_context_requests += 1
+            num_context_tokens += num_tokens
+        else:
+            num_generation_requests += 1
+            num_generation_tokens += num_tokens
+    return IterationDetails(
+        num_context_requests,
+        num_context_tokens,
+        num_generation_requests,
+        num_generation_tokens,
+    )
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index e54b995ab908f136f6014acd1bee7afba0ac721f..654f58834a154e9f5a3d7177d0bb20766c71c5c8 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -10,10 +10,10 @@ import torch
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
 from vllm.profiler.wrapper import TorchProfilerWrapper
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.cpu_model_runner import CPUModelRunner
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
index 1b9646e1980a8161611c6298e84e68628ae5ff2c..82de0cba9194b9c6d8ad87e8651762ffd3659e03 100644
--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
@@ -11,7 +11,7 @@ from vllm.distributed.parallel_state import get_dp_group
 from vllm.logger import init_logger
 from vllm.v1.worker.ubatch_utils import (
     check_ubatch_thresholds,
-    is_second_ubatch_empty,
+    is_last_ubatch_empty,
 )
 
 logger = init_logger(__name__)
@@ -56,7 +56,7 @@ def _run_ar(
     return tensor
 
 
-def _post_process_ubatch(tensor: torch.Tensor) -> bool:
+def _post_process_ubatch(tensor: torch.Tensor, num_ubatches: int) -> bool:
     orig_num_tokens_tensor = tensor[0, :]
     padded_num_tokens_tensor = tensor[1, :]
 
@@ -68,7 +68,7 @@ def _post_process_ubatch(tensor: torch.Tensor) -> bool:
     # there are no "empty" second ubatches
     orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
     padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
-    if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
+    if is_last_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens, num_ubatches):
         logger.debug(
             "Aborting ubatching %s %s", orig_min_num_tokens, padded_max_num_tokens
         )
@@ -146,7 +146,7 @@ def _synchronize_dp_ranks(
     assert should_attempt_dp_padding == should_dp_pad
 
     # Check conditions for microbatching
-    should_ubatch = _post_process_ubatch(tensor)
+    should_ubatch = _post_process_ubatch(tensor, parallel_config.num_ubatches)
 
     if should_ubatch and not should_dp_pad:
         logger.debug_once(
diff --git a/vllm/v1/worker/ec_connector_model_runner_mixin.py b/vllm/v1/worker/ec_connector_model_runner_mixin.py
index 08a41532ea8e11bbf0f7d3778915a695737470ce..1a347a0b98ab23e5e8e9304eb418db64fff54b48 100644
--- a/vllm/v1/worker/ec_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/ec_connector_model_runner_mixin.py
@@ -6,9 +6,7 @@ Define EC connector functionality mixin for model runners.
 
 from collections.abc import Generator
 from contextlib import AbstractContextManager, contextmanager, nullcontext
-from typing import (
-    TYPE_CHECKING,  # noqa: UP035
-)
+from typing import TYPE_CHECKING
 
 import torch
 
diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py
index 6386f1a08b446c55204e4352fe3496776d6211f0..70c622fc026abbda5050d3090845e29d39619c73 100644
--- a/vllm/v1/worker/gpu/attn_utils.py
+++ b/vllm/v1/worker/gpu/attn_utils.py
@@ -3,13 +3,12 @@
 from collections.abc import Sequence
 from typing import Any, cast
 
-import numpy as np
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
 )
@@ -147,16 +146,13 @@ def build_attn_metadata(
     query_start_loc_gpu: torch.Tensor,
     query_start_loc_cpu: torch.Tensor,
     seq_lens: torch.Tensor,
-    seq_lens_np: np.ndarray,
-    num_computed_tokens_cpu: torch.Tensor | None,
+    max_seq_len: int,
     block_tables: Sequence[torch.Tensor],
     slot_mappings: torch.Tensor,
     kv_cache_config: KVCacheConfig,
 ) -> dict[str, Any]:
     max_query_len = int(query_start_loc_cpu.max())
     seq_lens = seq_lens[:num_reqs]
-    seq_lens_cpu = torch.from_numpy(seq_lens_np)
-    max_seq_len = int(seq_lens_np.max())
 
     attn_metadata: dict[str, Any] = {}
     kv_cache_groups = kv_cache_config.kv_cache_groups
@@ -168,9 +164,7 @@ def build_attn_metadata(
             query_start_loc=query_start_loc_gpu,
             query_start_loc_cpu=query_start_loc_cpu,
             seq_lens=seq_lens,
-            _seq_lens_cpu=seq_lens_cpu,
             max_seq_len=max_seq_len,
-            _num_computed_tokens_cpu=num_computed_tokens_cpu,
             num_reqs=num_reqs,
             num_actual_tokens=num_tokens,
             max_query_len=max_query_len,
diff --git a/vllm/v1/worker/gpu/block_table.py b/vllm/v1/worker/gpu/block_table.py
index b31e9b179d26cdc18f9fd56e6ea6a8e7e17809c8..d45917d4bf2c7d5c20bbe01b4d24afe2bd812604 100644
--- a/vllm/v1/worker/gpu/block_table.py
+++ b/vllm/v1/worker/gpu/block_table.py
@@ -4,10 +4,10 @@ from collections.abc import Iterable
 
 import torch
 
-from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.triton_utils import tl, triton
 from vllm.utils.math_utils import cdiv
-from vllm.v1.utils import CpuGpuBuffer
+from vllm.v1.attention.backends.utils import PAD_SLOT_ID
+from vllm.v1.worker.gpu.buffer_utils import StagedWriteTensor, UvaBackedTensor
 
 
 class BlockTables:
@@ -18,51 +18,49 @@ class BlockTables:
         max_num_batched_tokens: int,
         max_model_len: int,
         device: torch.device,
-        pin_memory: bool,
     ):
         self.block_sizes = block_sizes
         self.max_num_reqs = max_num_reqs
         self.max_num_batched_tokens = max_num_batched_tokens
         self.max_model_len = max_model_len
         self.device = device
-        self.pin_memory = pin_memory
 
         self.num_kv_cache_groups = len(self.block_sizes)
         # num_kv_cache_groups x [max_num_reqs, max_num_blocks]
-        self.block_tables: list[torch.Tensor] = []
+        self.block_tables: list[StagedWriteTensor] = []
         for i in range(self.num_kv_cache_groups):
             block_size = self.block_sizes[i]
             max_num_blocks = cdiv(self.max_model_len, block_size)
-            block_table = torch.zeros(
-                self.max_num_reqs,
-                max_num_blocks,
+            block_table = StagedWriteTensor(
+                (self.max_num_reqs, max_num_blocks),
                 dtype=torch.int32,
-                device=self.device,
+                device=device,
             )
             self.block_tables.append(block_table)
-        self.block_table_ptrs = self._make_ptr_tensor(self.block_tables)
-
-        # Block tables used for model's forward pass.
-        # num_kv_cache_groups x [max_num_reqs, max_num_blocks]
-        self.input_block_tables: list[torch.Tensor] = [
-            torch.zeros_like(block_table) for block_table in self.block_tables
-        ]
-        self.input_block_table_ptrs = self._make_ptr_tensor(self.input_block_tables)
-
+        self.block_table_ptrs = self._make_ptr_tensor(
+            [b.gpu for b in self.block_tables]
+        )
         self.block_table_strides = torch.tensor(
-            [b.stride(0) for b in self.block_tables],
+            [b.gpu.stride(0) for b in self.block_tables],
             dtype=torch.int64,
             device=self.device,
         )
+
         self.block_sizes_tensor = torch.tensor(
             self.block_sizes, dtype=torch.int32, device=self.device
         )
-        self.num_blocks = torch.zeros(
-            self.num_kv_cache_groups,
-            self.max_num_reqs,
+        self.num_blocks = UvaBackedTensor(
+            (self.num_kv_cache_groups, self.max_num_reqs),
             dtype=torch.int32,
-            device=self.device,
         )
+
+        # Block tables used for model's forward pass.
+        # num_kv_cache_groups x [max_num_reqs, max_num_blocks]
+        self.input_block_tables: list[torch.Tensor] = [
+            torch.zeros_like(b.gpu) for b in self.block_tables
+        ]
+        self.input_block_table_ptrs = self._make_ptr_tensor(self.input_block_tables)
+
         self.slot_mappings = torch.zeros(
             self.num_kv_cache_groups,
             self.max_num_batched_tokens,
@@ -70,74 +68,32 @@ class BlockTables:
             device=self.device,
         )
 
-        # Misc buffers.
-        self.req_indices = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
-        self.overwrite = self._make_buffer(self.max_num_reqs, dtype=torch.bool)
-        self.cu_num_new_blocks = self._make_buffer(
-            self.num_kv_cache_groups, self.max_num_reqs + 1, dtype=torch.int32
-        )
-
-    def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer:
-        return CpuGpuBuffer(
-            *args, dtype=dtype, pin_memory=self.pin_memory, device=self.device
-        )
-
     def _make_ptr_tensor(self, x: Iterable[torch.Tensor]) -> torch.Tensor:
         # NOTE(woosuk): Use uint64 instead of int64 to cover all possible addresses.
-        ptrs_tensor_cpu = torch.tensor(
+        return torch.tensor(
             [t.data_ptr() for t in x],
             dtype=torch.uint64,
-            device="cpu",
-            pin_memory=self.pin_memory,
+            device=self.device,
         )
-        return ptrs_tensor_cpu.to(self.device, non_blocking=True)
 
     def append_block_ids(
         self,
-        # [num_reqs]
-        req_indices: list[int],
-        # [num_kv_cache_groups, num_reqs + 1]
-        cu_num_new_blocks: tuple[list[int], ...],
-        # [num_kv_cache_groups, num_new_blocks]
+        req_index: int,
         new_block_ids: tuple[list[int], ...],
-        # [num_reqs]
-        overwrite: list[bool],
+        overwrite: bool,
     ) -> None:
-        num_reqs = len(req_indices)
-        self.req_indices.np[:num_reqs] = req_indices
-        self.overwrite.np[:num_reqs] = overwrite
-        for i in range(self.num_kv_cache_groups):
-            self.cu_num_new_blocks.np[i, : num_reqs + 1] = cu_num_new_blocks[i]
-
-        # NOTE(woosuk): Here, we cannot use a fixed-size buffer because there's
-        # no clear upper bound to the number of new blocks in a single step.
-        # NOTE(woosuk): The buffer has to be cached, because otherwise we cannot
-        # guarantee that the buffer is not freed before the copy is completed.
-        self.new_block_ids_cpu = torch.empty(
-            self.num_kv_cache_groups,
-            max(len(x) for x in new_block_ids),
-            dtype=torch.int32,
-            device="cpu",
-            pin_memory=self.pin_memory,
-        )
-        new_block_ids_np = self.new_block_ids_cpu.numpy()
         for i in range(self.num_kv_cache_groups):
-            new_block_ids_np[i, : len(new_block_ids[i])] = new_block_ids[i]
-        new_block_ids_gpu = self.new_block_ids_cpu.to(self.device, non_blocking=True)
-
-        _append_block_ids_kernel[(self.num_kv_cache_groups, num_reqs)](
-            self.req_indices.copy_to_gpu(num_reqs),
-            self.cu_num_new_blocks.copy_to_gpu(),
-            self.cu_num_new_blocks.gpu.stride(0),
-            new_block_ids_gpu,
-            new_block_ids_gpu.stride(0),
-            self.overwrite.copy_to_gpu(num_reqs),
-            self.block_table_strides,
-            self.block_table_ptrs,
-            self.num_blocks,
-            self.num_blocks.stride(0),
-            BLOCK_SIZE=1024,  # type: ignore
-        )
+            start = self.num_blocks.np[i, req_index] if not overwrite else 0
+            block_ids = new_block_ids[i]
+            self.block_tables[i].stage_write(req_index, start, block_ids)
+            self.num_blocks.np[i, req_index] = start + len(block_ids)
+
+    def apply_staged_writes(self) -> None:
+        # TODO(woosuk): This can be inefficient since it launches one kernel per
+        # block table. Implement a kernel to handle all block tables at once.
+        for block_table in self.block_tables:
+            block_table.apply_write()
+        self.num_blocks.copy_to_uva()
 
     def gather_block_tables(
         self,
@@ -149,8 +105,8 @@ class BlockTables:
             self.block_table_ptrs,
             self.input_block_table_ptrs,
             self.block_table_strides,
-            self.num_blocks,
-            self.num_blocks.stride(0),
+            self.num_blocks.gpu,
+            self.num_blocks.gpu.stride(0),
             BLOCK_SIZE=1024,  # type: ignore
         )
         return tuple(block_table[:num_reqs] for block_table in self.input_block_tables)
@@ -186,54 +142,6 @@ class BlockTables:
         return self.slot_mappings[:, :num_tokens]
 
 
-@triton.jit
-def _append_block_ids_kernel(
-    # Inputs
-    req_indices,  # [num_reqs]
-    cu_num_new_blocks_ptr,  # [num_kv_cache_groups, num_reqs + 1]
-    cu_num_new_blocks_stride,
-    new_block_ids_ptr,  # [num_kv_cache_groups, num_new_blocks]
-    new_block_ids_stride,
-    overwrite,  # [num_reqs]
-    block_table_strides,  # [num_kv_cache_groups]
-    # Outputs
-    block_table_ptrs,  # [num_kv_cache_groups]
-    num_blocks_ptr,  # [num_kv_cache_groups, max_num_reqs]
-    num_blocks_stride,
-    # Constants
-    BLOCK_SIZE: tl.constexpr,
-):
-    group_id = tl.program_id(0)
-    batch_idx = tl.program_id(1)
-    req_idx = tl.load(req_indices + batch_idx)
-    do_overwrite = tl.load(overwrite + batch_idx)
-
-    group_new_blocks_ptr = cu_num_new_blocks_ptr + group_id * cu_num_new_blocks_stride
-    start_idx = tl.load(group_new_blocks_ptr + batch_idx)
-    end_idx = tl.load(group_new_blocks_ptr + batch_idx + 1)
-    num_new_blocks = end_idx - start_idx
-
-    group_num_blocks_ptr = num_blocks_ptr + group_id * num_blocks_stride
-    dst_start_idx = tl.load(group_num_blocks_ptr + req_idx) if not do_overwrite else 0
-    dst_end_idx = dst_start_idx + num_new_blocks
-    tl.store(group_num_blocks_ptr + req_idx, dst_end_idx)
-
-    # Destination
-    block_table_ptr = _load_ptr(block_table_ptrs + group_id, tl.int32)
-    block_table_stride = tl.load(block_table_strides + group_id)
-    row_ptr = block_table_ptr + req_idx * block_table_stride
-
-    group_new_block_ids_ptr = new_block_ids_ptr + group_id * new_block_ids_stride
-    for i in range(0, num_new_blocks, BLOCK_SIZE):
-        offset = i + tl.arange(0, BLOCK_SIZE)
-        block_ids = tl.load(
-            group_new_block_ids_ptr + start_idx + offset, mask=offset < num_new_blocks
-        )
-        tl.store(
-            row_ptr + dst_start_idx + offset, block_ids, mask=offset < num_new_blocks
-        )
-
-
 @triton.jit
 def _gather_block_tables_kernel(
     batch_idx_to_req_idx,  # [batch_size]
diff --git a/vllm/v1/worker/gpu/buffer_utils.py b/vllm/v1/worker/gpu/buffer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dced199aacc652a29f21dc2510c740266f767a8f
--- /dev/null
+++ b/vllm/v1/worker/gpu/buffer_utils.py
@@ -0,0 +1,224 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable, Sequence
+
+import numpy as np
+import torch
+
+from vllm.triton_utils import tl, triton
+from vllm.utils.math_utils import next_power_of_2
+from vllm.utils.platform_utils import is_uva_available
+from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
+
+
+class UvaBuffer:
+    def __init__(self, size: int | Sequence[int], dtype: torch.dtype):
+        if not is_uva_available():
+            raise RuntimeError("UVA is not available")
+        self.cpu = torch.zeros(size, dtype=dtype, device="cpu", pin_memory=True)
+        self.np = self.cpu.numpy()
+        self.uva = get_cuda_view_from_cpu_tensor(self.cpu)
+
+
+class UvaBufferPool:
+    def __init__(
+        self,
+        size: int | Sequence[int],
+        dtype: torch.dtype,
+        max_concurrency: int = 2,
+    ):
+        self.size = size
+        self.dtype = dtype
+        self.max_concurrency = max_concurrency
+
+        # UVA buffers for concurrency
+        self._uva_bufs = [UvaBuffer(size, dtype) for _ in range(max_concurrency)]
+        # Current buffer index
+        self._curr = 0
+
+    def copy_to_uva(self, x: torch.Tensor | np.ndarray | list) -> torch.Tensor:
+        # Round robin to the next buffer.
+        self._curr = (self._curr + 1) % self.max_concurrency
+        buf = self._uva_bufs[self._curr]
+        # CPU-to-CPU copy
+        dst = buf.cpu if isinstance(x, torch.Tensor) else buf.np
+        n = len(x)
+        dst[:n] = x
+        return buf.uva[:n]
+
+    def copy_to_gpu(
+        self,
+        x: torch.Tensor | np.ndarray,
+        out: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        uva = self.copy_to_uva(x)
+        if out is None:
+            # CPU-to-GPU copy
+            return uva.clone()
+        # CPU-to-GPU copy
+        return out.copy_(uva, non_blocking=True)
+
+
+class UvaBackedTensor:
+    def __init__(
+        self,
+        size: int | Sequence[int],
+        dtype: torch.dtype,
+        max_concurrency: int = 2,
+    ):
+        self.dtype = dtype
+        self.max_concurrency = max_concurrency
+
+        # Source of truth
+        self.cpu = torch.zeros(size, dtype=dtype, device="cpu", pin_memory=False)
+        self.np = self.cpu.numpy()
+
+        # Buffers for concurrency
+        self.pool = UvaBufferPool(size, dtype, max_concurrency)
+        self.gpu = self.pool.copy_to_uva(self.np)
+
+    def copy_to_uva(self, n: int | None = None) -> torch.Tensor:
+        # CPU-to-CPU copy
+        self.gpu = self.pool.copy_to_uva(self.np[:n] if n is not None else self.np)
+        return self.gpu
+
+
+class StagedWriteTensor:
+    def __init__(
+        self,
+        size: int | Sequence[int],
+        dtype: torch.dtype,
+        device: torch.device,
+        max_concurrency: int = 2,
+        uva_instead_of_gpu: bool = False,
+    ):
+        supported_dtypes = [torch.int32, torch.int64, torch.float32]
+        if dtype not in supported_dtypes:
+            raise ValueError(
+                f"Unsupported dtype {dtype}: should be one of {supported_dtypes}"
+            )
+        self.num_rows = size if isinstance(size, int) else size[0]
+        self.dtype = dtype
+        self.max_concurrency = max_concurrency
+
+        if not uva_instead_of_gpu:
+            # Create a GPU tensor (default)
+            self.gpu = torch.zeros(size, dtype=dtype, device=device)
+        else:
+            # For a large but not-frequently-accessed tensor, we can use UVA instead of
+            # GPU to save GPU memory
+            self._uva_buf = UvaBuffer(size, dtype)
+            self.gpu = self._uva_buf.uva
+
+        self._staged_write_indices: list[int] = []
+        self._staged_write_starts: list[int] = []
+        self._staged_write_contents: list[int | float] = []
+        self._staged_write_cu_lens: list[int] = []
+
+        self.write_indices = UvaBufferPool(
+            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
+        )
+        self.write_starts = UvaBufferPool(
+            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
+        )
+        init_size = next_power_of_2(self.num_rows)
+        self.write_contents = UvaBufferPool(
+            init_size, dtype=dtype, max_concurrency=max_concurrency
+        )
+        self.write_cu_lens = UvaBufferPool(
+            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
+        )
+
+    def stage_write(
+        self,
+        index: int,
+        start: int,
+        x: Iterable[int] | Iterable[float],
+    ) -> None:
+        assert index >= 0
+        assert start >= 0
+        if not x:
+            return
+        self._staged_write_indices.append(index)
+        self._staged_write_starts.append(start)
+        self._staged_write_contents.extend(x)
+        self._staged_write_cu_lens.append(len(self._staged_write_contents))
+
+    def stage_write_elem(self, index: int, x: int) -> None:
+        assert index >= 0
+        self._staged_write_indices.append(index)
+        self._staged_write_starts.append(0)
+        self._staged_write_contents.append(x)
+        self._staged_write_cu_lens.append(len(self._staged_write_contents))
+
+    def apply_write(self) -> None:
+        n = len(self._staged_write_indices)
+        if n == 0:
+            return
+
+        indices_uva = self.write_indices.copy_to_uva(self._staged_write_indices)
+        starts_uva = self.write_starts.copy_to_uva(self._staged_write_starts)
+        cu_lens_uva = self.write_cu_lens.copy_to_uva(self._staged_write_cu_lens)
+
+        # Special handling for write_contents
+        diff_len = len(self._staged_write_contents)
+        assert isinstance(self.write_contents.size, int)
+        if diff_len > self.write_contents.size:
+            # Re-allocate a larger buffer for the write_contents
+            new_size = next_power_of_2(diff_len)
+            self.write_contents = UvaBufferPool(
+                new_size, dtype=self.dtype, max_concurrency=self.max_concurrency
+            )
+            # NOTE(woosuk): Since the previous write_contents buffer is released,
+            # we perform a synchronization here to ensure that all data transfers
+            # involving the old buffer have finished before allocating a new one.
+            # This prevents potential race conditions. The slight overhead is
+            # negligible because the reallocations are infrequent in practice.
+            torch.cuda.synchronize()
+        contents_uva = self.write_contents.copy_to_uva(self._staged_write_contents)
+
+        # Write diffs to the GPU buffer
+        _apply_write_kernel[(n,)](
+            self.gpu,
+            self.gpu.stride(0),
+            indices_uva,
+            starts_uva,
+            contents_uva,
+            cu_lens_uva,
+            BLOCK_SIZE=1024,
+        )
+        # Clear the staged writes
+        self.clear_staged_writes()
+
+    def clear_staged_writes(self) -> None:
+        self._staged_write_indices.clear()
+        self._staged_write_starts.clear()
+        self._staged_write_contents.clear()
+        self._staged_write_cu_lens.clear()
+
+
+@triton.jit
+def _apply_write_kernel(
+    output_ptr,
+    output_stride,
+    write_indices_ptr,
+    write_starts_ptr,
+    write_contents_ptr,
+    write_cu_lens_ptr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    row_idx = tl.load(write_indices_ptr + pid)
+    start_idx = tl.load(write_starts_ptr + pid)
+
+    cu_start = tl.load(write_cu_lens_ptr + pid - 1) if pid > 0 else 0
+    cu_end = tl.load(write_cu_lens_ptr + pid)
+    content_len = cu_end - cu_start
+
+    for i in range(0, content_len, BLOCK_SIZE):
+        block = i + tl.arange(0, BLOCK_SIZE)
+        mask = block < content_len
+        content = tl.load(write_contents_ptr + cu_start + block, mask=mask)
+        tl.store(
+            output_ptr + row_idx * output_stride + start_idx + block, content, mask=mask
+        )
diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py
index 7f2994eeca008ef71272669c1781eb1d86e364ee..d5095af18346f79246faf61476bac80c3533cd2a 100644
--- a/vllm/v1/worker/gpu/cudagraph_utils.py
+++ b/vllm/v1/worker/gpu/cudagraph_utils.py
@@ -25,10 +25,12 @@ class CudaGraphManager:
     def __init__(
         self,
         vllm_config: VllmConfig,
+        uses_mrope: bool,
         device: torch.device,
     ):
         self.vllm_config = vllm_config
         self.scheduler_config = vllm_config.scheduler_config
+        self.uses_mrope = uses_mrope
         self.device = device
 
         self.max_model_len = vllm_config.model_config.max_model_len
@@ -79,7 +81,10 @@ class CudaGraphManager:
     ) -> None:
         num_reqs = min(num_tokens, self.max_num_reqs)
         input_ids = input_buffers.input_ids[:num_tokens]
-        positions = input_buffers.positions[:num_tokens]
+        if not self.uses_mrope:
+            positions = input_buffers.positions[:num_tokens]
+        else:
+            positions = input_buffers.mrope_positions[:, :num_tokens]
         attn_metadata = prepare_inputs_to_capture(
             num_reqs,
             num_tokens,
@@ -228,15 +233,16 @@ def prepare_inputs_to_capture(
     kv_cache_config: KVCacheConfig,
 ) -> dict[str, Any]:
     num_tokens_per_req = num_tokens // num_reqs
-    query_start_loc = input_buffers.query_start_loc
-    query_start_loc.np[: num_reqs + 1] = np.arange(num_reqs + 1) * num_tokens_per_req
-    query_start_loc.np[num_reqs:] = num_tokens
-    query_start_loc.copy_to_gpu()
-    seq_lens_np = np.full(num_reqs, max_model_len, dtype=np.int32)
+
+    query_start_loc_np = np.arange(num_reqs + 1, dtype=np.int32) * num_tokens_per_req
+    query_start_loc_np[-1] = num_tokens
+    query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
+    input_buffers.query_start_loc[: num_reqs + 1] = query_start_loc_cpu
+    input_buffers.query_start_loc[num_reqs + 1 :] = num_tokens
+    query_start_loc = input_buffers.query_start_loc[: num_reqs + 1]
+
     # HACK(woosuk): For faster warmup, we set seq_lens (GPU) to num_tokens
-    # rather than max_model_len. This introduces a discrepancy between
-    # seq_lens (on GPU) and seq_lens_np (on CPU), which may cause issues for
-    # certain attention backends.
+    # rather than max_model_len.
     input_buffers.seq_lens[:num_reqs] = num_tokens
     input_buffers.seq_lens[num_reqs:] = 0
 
@@ -247,11 +253,10 @@ def prepare_inputs_to_capture(
         attn_metadata_builders=attn_metadata_builders,
         num_reqs=num_reqs,
         num_tokens=num_tokens,
-        query_start_loc_gpu=query_start_loc.gpu[: num_reqs + 1],
-        query_start_loc_cpu=query_start_loc.cpu[: num_reqs + 1],
+        query_start_loc_gpu=query_start_loc,
+        query_start_loc_cpu=query_start_loc_cpu,
         seq_lens=input_buffers.seq_lens,
-        seq_lens_np=seq_lens_np,
-        num_computed_tokens_cpu=None,  # FIXME
+        max_seq_len=max_model_len,
         block_tables=input_block_tables,
         slot_mappings=slot_mappings,
         kv_cache_config=kv_cache_config,
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 1b78734fba78f8b01a236bededba9f208879cc4b..8f9552e3f8768ecadf69d526cb2e0bff48fa2e31 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -8,8 +8,6 @@ import torch
 
 from vllm.triton_utils import tl, triton
 from vllm.utils import random_uuid
-from vllm.utils.math_utils import cdiv
-from vllm.v1.utils import CpuGpuBuffer
 
 
 class InputBuffers:
@@ -21,29 +19,29 @@ class InputBuffers:
         vocab_size: int,
         dtype: torch.dtype,
         device: torch.device,
-        pin_memory: bool,
     ):
         self.max_num_reqs = max_num_reqs
         self.max_num_tokens = max_num_tokens
         self.device = device
-        self.pin_memory = pin_memory
 
-        self.idx_mapping = self._make_buffer(max_num_reqs, dtype=torch.int32)
         self.input_ids = torch.zeros(max_num_tokens, dtype=torch.int32, device=device)
         self.positions = torch.zeros(max_num_tokens, dtype=torch.int64, device=device)
-        self.query_start_loc = self._make_buffer(max_num_reqs + 1, dtype=torch.int32)
-        self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device)
-        self.cu_num_logits = self._make_buffer(max_num_reqs + 1, dtype=torch.int32)
-
-        # Structured outputs.
-        self.bitmask_indices = self._make_buffer(max_num_reqs, dtype=torch.int32)
-        self.grammar_bitmask = self._make_buffer(
-            max_num_reqs, cdiv(vocab_size, 32), dtype=torch.int32
+        self.query_start_loc = torch.zeros(
+            max_num_reqs + 1, dtype=torch.int32, device=device
         )
+        self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device)
 
-    def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer:
-        return CpuGpuBuffer(
-            *args, dtype=dtype, pin_memory=self.pin_memory, device=self.device
+        # NOTE: `mrope_positions` is implemented with one additional dummy
+        # position on purpose to make it non-contiguous so that it can work
+        # with torch compile.
+        # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923
+        # NOTE: When M-RoPE is enabled, position ids are 3D regardless of
+        # the modality of inputs. For text-only inputs, each dimension has
+        # identical position IDs, making M-RoPE functionally equivalent to
+        # 1D-RoPE.
+        # See page 5 of https://arxiv.org/abs/2409.12191
+        self.mrope_positions = torch.zeros(
+            (3, max_num_tokens + 1), dtype=torch.int64, device=device
         )
 
 
@@ -56,6 +54,8 @@ class InputBatch:
     # batch_idx -> req_state_idx
     idx_mapping: torch.Tensor
     idx_mapping_np: np.ndarray
+    # Identical to idx_mapping except for spec decoding.
+    expanded_idx_mapping: torch.Tensor
 
     # [num_reqs]
     # batch_idx -> num_scheduled_tokens
@@ -70,12 +70,13 @@ class InputBatch:
     query_start_loc_np: np.ndarray
     # [num_reqs]
     seq_lens: torch.Tensor
-    seq_lens_np: np.ndarray
 
     # [num_tokens_after_padding]
     input_ids: torch.Tensor
     # [num_tokens_after_padding]
     positions: torch.Tensor
+    # [3, num_tokens_after_padding]
+    mrope_positions: torch.Tensor
 
     # layer_name -> Metadata
     attn_metadata: dict[str, Any]
@@ -84,6 +85,7 @@ class InputBatch:
     logits_indices: torch.Tensor
     # [num_reqs + 1]
     cu_num_logits: torch.Tensor
+    cu_num_logits_np: np.ndarray
 
     @classmethod
     def make_dummy(
@@ -97,35 +99,44 @@ class InputBatch:
         req_ids = [f"req_{i}_{random_uuid()}" for i in range(num_reqs)]
         idx_mapping_np = np.arange(num_reqs, dtype=np.int32)
         idx_mapping = torch.arange(num_reqs, dtype=torch.int32, device=device)
+        expanded_idx_mapping = idx_mapping
         num_scheduled_tokens = np.full(num_reqs, num_tokens // num_reqs, dtype=np.int32)
         num_scheduled_tokens[-1] += num_tokens % num_reqs
         assert int(num_scheduled_tokens.sum()) == num_tokens
 
-        input_buffers.query_start_loc.np[0] = 0
-        input_buffers.query_start_loc.np[1 : num_reqs + 1] = np.cumsum(
-            num_scheduled_tokens
-        )
-        input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens
-        query_start_loc_np = input_buffers.query_start_loc.np[: num_reqs + 1]
-        query_start_loc = input_buffers.query_start_loc.copy_to_gpu()[: num_reqs + 1]
         # seq_len equals to query_len
-        seq_lens_np = np.full(num_reqs, num_tokens // num_reqs, dtype=np.int32)
-        seq_lens_np[-1] += num_tokens % num_reqs
         input_buffers.seq_lens[:num_reqs] = num_tokens // num_reqs
         input_buffers.seq_lens[num_reqs - 1] += num_tokens % num_reqs
+        # Pad for full CUDA graph mode.
         input_buffers.seq_lens[num_reqs:] = 0
         seq_lens = input_buffers.seq_lens[:num_reqs]
 
-        input_ids = input_buffers.input_ids[:num_tokens]
-        positions = input_buffers.positions[:num_tokens]
+        query_start_loc_np = np.empty(num_reqs + 1, dtype=np.int32)
+        query_start_loc_np[0] = 0
+        np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:])
+        input_buffers.query_start_loc[0] = 0
+        torch.cumsum(
+            seq_lens, dim=0, out=input_buffers.query_start_loc[1 : num_reqs + 1]
+        )
+        # Pad for full CUDA graph mode.
+        input_buffers.query_start_loc[num_reqs + 1 :] = num_tokens
+        query_start_loc = input_buffers.query_start_loc[: num_reqs + 1]
+
+        input_ids = input_buffers.input_ids[:num_tokens].zero_()
+        positions = input_buffers.positions[:num_tokens].zero_()
+        input_buffers.mrope_positions.zero_()
+        mrope_positions = input_buffers.mrope_positions[:, :num_tokens]
+
         # attn_metadata = defaultdict(lambda: None)
         logits_indices = query_start_loc[1:] - 1
         cu_num_logits = torch.arange(num_reqs + 1, device=device, dtype=torch.int32)
+        cu_num_logits_np = np.arange(num_reqs + 1, dtype=np.int32)
         return cls(
             req_ids=req_ids,
             num_reqs=num_reqs,
             idx_mapping=idx_mapping,
             idx_mapping_np=idx_mapping_np,
+            expanded_idx_mapping=expanded_idx_mapping,
             num_scheduled_tokens=num_scheduled_tokens,
             num_tokens=num_tokens,
             num_tokens_after_padding=num_tokens,
@@ -133,12 +144,13 @@ class InputBatch:
             query_start_loc=query_start_loc,
             query_start_loc_np=query_start_loc_np,
             seq_lens=seq_lens,
-            seq_lens_np=seq_lens_np,
             input_ids=input_ids,
             positions=positions,
+            mrope_positions=mrope_positions,
             attn_metadata=None,  # type: ignore
             logits_indices=logits_indices,
             cu_num_logits=cu_num_logits,
+            cu_num_logits_np=cu_num_logits_np,
         )
 
 
@@ -477,3 +489,38 @@ def post_update(
         query_start_loc,
         num_warps=1,
     )
+
+
+@triton.jit
+def _expand_idx_mapping_kernel(
+    idx_mapping_ptr,
+    expanded_idx_mapping_ptr,
+    cu_num_logits_ptr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    req_idx = tl.program_id(0)
+    start_idx = tl.load(cu_num_logits_ptr + req_idx)
+    end_idx = tl.load(cu_num_logits_ptr + req_idx + 1)
+    num_tokens = end_idx - start_idx
+
+    block = tl.arange(0, BLOCK_SIZE)
+    mask = block < num_tokens
+    req_state_idx = tl.load(idx_mapping_ptr + req_idx)
+    tl.store(expanded_idx_mapping_ptr + start_idx + block, req_state_idx, mask=mask)
+
+
+def expand_idx_mapping(
+    idx_mapping: torch.Tensor,
+    total_num_logits: int,
+    cu_num_logits: torch.Tensor,
+    max_expand_len: int,
+) -> torch.Tensor:
+    num_reqs = idx_mapping.shape[0]
+    expanded_idx_mapping = idx_mapping.new_empty(total_num_logits)
+    _expand_idx_mapping_kernel[(num_reqs,)](
+        idx_mapping,
+        expanded_idx_mapping,
+        cu_num_logits,
+        BLOCK_SIZE=triton.next_power_of_2(max_expand_len),
+    )
+    return expanded_idx_mapping
diff --git a/vllm/v1/worker/gpu/mm/__init__.py b/vllm/v1/worker/gpu/mm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vllm/v1/worker/gpu/mm/mrope_utils.py b/vllm/v1/worker/gpu/mm/mrope_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c18b9c82e2d486b0dd43877fdec020cece4411f4
--- /dev/null
+++ b/vllm/v1/worker/gpu/mm/mrope_utils.py
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.model_executor.models.interfaces import SupportsMRoPE
+from vllm.triton_utils import tl, triton
+from vllm.v1.worker.gpu.buffer_utils import StagedWriteTensor, UvaBackedTensor
+
+
+class MRopeState:
+    def __init__(
+        self,
+        max_num_reqs: int,
+        max_model_len: int,
+        device: torch.device,
+    ):
+        self.max_num_reqs = max_num_reqs
+        self.max_model_len = max_model_len
+        self.device = device
+
+        # NOTE(woosuk): This tensor can be extremely large (e.g., several GBs)
+        # wasting a lot of CPU memory.
+        self.prefill_mrope_positions = StagedWriteTensor(
+            (max_num_reqs, 3 * max_model_len),
+            dtype=torch.int32,
+            device=device,
+            uva_instead_of_gpu=True,
+        )
+        self.prefill_mrope_delta = UvaBackedTensor(max_num_reqs, dtype=torch.int32)
+
+    def init_prefill_mrope_positions(
+        self,
+        req_idx: int,
+        mrope_model: SupportsMRoPE,
+        prefill_token_ids: list[int],
+        mm_features: list,
+    ) -> None:
+        prefill_mrope_positions, prefill_mrope_delta = (
+            mrope_model.get_mrope_input_positions(
+                prefill_token_ids,
+                mm_features,
+            )
+        )
+        for i in range(3):
+            pos = prefill_mrope_positions[i].tolist()
+            self.prefill_mrope_positions.stage_write(
+                req_idx, i * self.max_model_len, pos
+            )
+        self.prefill_mrope_delta.np[req_idx] = prefill_mrope_delta
+
+    def apply_staged_writes(self) -> None:
+        self.prefill_mrope_positions.apply_write()
+        self.prefill_mrope_delta.copy_to_uva()
+
+    def prepare_mrope_positions(
+        self,
+        idx_mapping: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        prefill_lens: torch.Tensor,
+        num_computed_tokens: torch.Tensor,
+        mrope_positions: torch.Tensor,
+    ) -> None:
+        num_reqs = idx_mapping.shape[0]
+        _prepare_mrope_positions_kernel[(num_reqs,)](
+            mrope_positions,
+            mrope_positions.stride(0),
+            self.prefill_mrope_positions.gpu,
+            self.prefill_mrope_positions.gpu.stride(0),
+            self.max_model_len,
+            self.prefill_mrope_delta.gpu,
+            idx_mapping,
+            query_start_loc,
+            prefill_lens,
+            num_computed_tokens,
+            BLOCK_SIZE=1024,
+        )
+
+
+@triton.jit
+def _prepare_mrope_positions_kernel(
+    mrope_positions_ptr,
+    mrope_positions_stride,
+    prefill_mrope_positions_ptr,
+    prefill_mrope_positions_stride0,
+    prefill_mrope_positions_stride1,
+    prefill_mrope_delta_ptr,
+    idx_mapping_ptr,
+    query_start_loc_ptr,
+    prefill_lens_ptr,
+    num_computed_tokens_ptr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    batch_idx = tl.program_id(0)
+    req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
+
+    prefill_len = tl.load(prefill_lens_ptr + req_state_idx)
+    num_computed = tl.load(num_computed_tokens_ptr + req_state_idx)
+    is_prefill = num_computed < prefill_len
+
+    query_start = tl.load(query_start_loc_ptr + batch_idx)
+    query_end = tl.load(query_start_loc_ptr + batch_idx + 1)
+    query_len = query_end - query_start
+
+    mrope_delta = tl.load(prefill_mrope_delta_ptr + req_state_idx)
+    for i in range(0, query_len, BLOCK_SIZE):
+        block = i + tl.arange(0, BLOCK_SIZE)
+        mask = block < query_len
+        orig_pos = num_computed + block
+
+        for j in tl.static_range(3):
+            if is_prefill:
+                # Read from pre-computed M-RoPE positions.
+                pos = tl.load(
+                    prefill_mrope_positions_ptr
+                    + req_state_idx * prefill_mrope_positions_stride0
+                    + j * prefill_mrope_positions_stride1
+                    + orig_pos,
+                    mask=mask,
+                )
+            else:
+                # Apply M-RoPE delta.
+                pos = orig_pos + mrope_delta
+            tl.store(
+                mrope_positions_ptr + j * mrope_positions_stride + query_start + block,
+                pos,
+                mask=mask,
+            )
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 9f4c6edfb6aa9bd8b3a011fad9f199ef7f22a795..7300357a167636a28216f81f8e75ff08ad467617 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -14,9 +14,7 @@ from vllm.config.compilation import CUDAGraphMode
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model_loader
-from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import DeviceMemoryProfiler
-from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig
@@ -25,7 +23,7 @@ from vllm.v1.outputs import (
     LogprobsTensors,
     ModelRunnerOutput,
 )
-from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier
+from vllm.v1.worker.gpu.async_utils import AsyncOutput
 from vllm.v1.worker.gpu.attn_utils import (
     build_attn_metadata,
     get_kv_cache_spec,
@@ -33,6 +31,7 @@ from vllm.v1.worker.gpu.attn_utils import (
     init_kv_cache,
 )
 from vllm.v1.worker.gpu.block_table import BlockTables
+from vllm.v1.worker.gpu.buffer_utils import UvaBufferPool
 from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
 from vllm.v1.worker.gpu.dp_utils import (
     get_batch_metadata_across_dp,
@@ -42,22 +41,21 @@ from vllm.v1.worker.gpu.input_batch import (
     InputBatch,
     InputBuffers,
     combine_sampled_and_draft_tokens,
+    expand_idx_mapping,
     get_num_sampled_and_rejected,
     post_update,
     prepare_pos_seq_lens,
     prepare_prefill_inputs,
 )
+from vllm.v1.worker.gpu.mm.mrope_utils import MRopeState
 from vllm.v1.worker.gpu.sample.logprob import compute_prompt_logprobs
-from vllm.v1.worker.gpu.sample.metadata import (
-    SamplingMetadata,
-    expand_sampling_metadata,
-)
+from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu.sample.output import SamplerOutput
 from vllm.v1.worker.gpu.sample.sampler import Sampler
 from vllm.v1.worker.gpu.spec_decode import init_speculator
 from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
 from vllm.v1.worker.gpu.states import RequestState
-from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask
+from vllm.v1.worker.gpu.structured_outputs import StructuredOutputsWorker
 from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
@@ -82,7 +80,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.observability_config = vllm_config.observability_config
 
         self.device = device
-        self.pin_memory = is_pin_memory_available()
         self.dtype = self.model_config.dtype
         self.kv_cache_dtype = self.dtype
         if self.cache_config.cache_dtype != "auto":
@@ -98,8 +95,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.max_num_reqs = self.scheduler_config.max_num_seqs
         self.inputs_embeds_size = self.model_config.get_inputs_embeds_size()
 
-        self.dp_size = self.parallel_config.data_parallel_size
-        self.dp_rank = self.parallel_config.data_parallel_rank
+        # Multimodal
+        self.uses_mrope = self.model_config.uses_mrope
+        if self.uses_mrope:
+            self.mrope_states = MRopeState(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.max_model_len,
+                device=self.device,
+            )
 
         self.use_async_scheduling = self.scheduler_config.async_scheduling
         self.output_copy_stream = torch.cuda.Stream(self.device)
@@ -127,7 +130,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_speculative_steps=self.num_speculative_steps,
             vocab_size=self.vocab_size,
             device=self.device,
-            pin_memory=self.pin_memory,
         )
         self.input_buffers = InputBuffers(
             max_num_reqs=self.max_num_reqs,
@@ -136,12 +138,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             vocab_size=self.vocab_size,
             dtype=self.dtype,
             device=self.device,
-            pin_memory=self.pin_memory,
         )
         self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode)
 
         # CUDA graphs.
-        self.cudagraph_manager = CudaGraphManager(self.vllm_config, self.device)
+        self.cudagraph_manager = CudaGraphManager(
+            self.vllm_config, self.uses_mrope, self.device
+        )
+        # Structured outputs worker.
+        self.structured_outputs_worker = StructuredOutputsWorker(
+            max_num_logits=self.max_num_reqs * (self.num_speculative_steps + 1),
+            vocab_size=self.vocab_size,
+        )
+
+        # Buffers for CPU-to-GPU copies.
+        self.tmp_idx_mapping = UvaBufferPool(self.max_num_reqs, torch.int32)
+        self.tmp_cu_num_logits = UvaBufferPool(self.max_num_reqs + 1, torch.int32)
+        self.tmp_query_start_loc = UvaBufferPool(self.max_num_reqs + 1, torch.int32)
+
+    def update_max_model_len(self, max_model_len: int) -> None:
+        self.max_model_len = max_model_len
+        self.req_states.max_model_len = max_model_len
 
     def get_supported_tasks(self) -> tuple[str]:
         return ("generate",)
@@ -168,8 +185,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         self.model_memory_usage = m.consumed_memory
         logger.info(
-            "Model loading took %.4f GiB and %.6f seconds",
-            m.consumed_memory / GiB_bytes,
+            "Model loading took %s GiB and %.6f seconds",
+            format_gib(m.consumed_memory),
             time_after_load - time_before_load,
         )
 
@@ -193,7 +210,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             max_num_batched_tokens=self.max_num_tokens,
             max_model_len=self.max_model_len,
             device=self.device,
-            pin_memory=self.pin_memory,
         )
 
         self.attn_backends, self.attn_metadata_builders = init_attn_backend(
@@ -229,21 +245,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         slot_mappings = self.block_tables.get_dummy_slot_mappings(
             input_batch.num_tokens
         )
-        num_computed_tokens = torch.zeros(
-            input_batch.num_reqs, dtype=torch.int32, device=self.device
-        )
-        query_start_loc = self.input_buffers.query_start_loc
-        query_start_loc_gpu = query_start_loc.gpu[: input_batch.num_reqs + 1]
-        query_start_loc_cpu = query_start_loc.cpu[: input_batch.num_reqs + 1]
         attn_metadata = build_attn_metadata(
             attn_metadata_builders=self.attn_metadata_builders,
             num_reqs=input_batch.num_reqs,
             num_tokens=input_batch.num_tokens,
-            query_start_loc_gpu=query_start_loc_gpu,
-            query_start_loc_cpu=query_start_loc_cpu,
-            seq_lens=self.input_buffers.seq_lens,
-            seq_lens_np=input_batch.seq_lens_np,
-            num_computed_tokens_cpu=num_computed_tokens,
+            query_start_loc_gpu=input_batch.query_start_loc,
+            query_start_loc_cpu=torch.from_numpy(input_batch.query_start_loc_np),
+            seq_lens=input_batch.seq_lens,
+            max_seq_len=self.max_model_len,
             block_tables=block_tables,
             slot_mappings=slot_mappings,
             kv_cache_config=self.kv_cache_config,
@@ -268,8 +277,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if not skip_attn:
             self.prepare_dummy_attn_metadata(input_batch)
 
-        num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, num_tokens)
+        dp_size = self.parallel_config.data_parallel_size
+        num_tokens_across_dp = make_num_tokens_across_dp(dp_size, num_tokens)
         num_sampled_tokens = np.ones(input_batch.num_reqs, dtype=np.int32)
+        if not self.uses_mrope:
+            positions = input_batch.positions
+        else:
+            positions = input_batch.mrope_positions
         with (
             self.maybe_dummy_run_with_lora(
                 self.lora_config,
@@ -285,7 +299,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         ):
             hidden_states = self.model(
                 input_ids=input_batch.input_ids,
-                positions=input_batch.positions,
+                positions=positions,
             )
             sample_hidden_states = hidden_states[input_batch.logits_indices]
         return hidden_states, sample_hidden_states
@@ -312,7 +326,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self._dummy_sampler_run(sample_hidden_states)
         if self.do_spec_decode:
             num_tokens_across_dp = make_num_tokens_across_dp(
-                self.dp_size, self.max_num_tokens
+                self.parallel_config.data_parallel_size, self.max_num_tokens
             )
             self.speculator.run_model(
                 self.max_num_tokens,
@@ -381,16 +395,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         for req_id in scheduler_output.finished_req_ids:
             self.req_states.remove_request(req_id)
 
-        # TODO(woosuk): Change SchedulerOutput.
-        req_indices: list[int] = []
-        cu_num_new_blocks = tuple(
-            [0] for _ in range(self.block_tables.num_kv_cache_groups)
-        )
-        new_block_ids: tuple[list[int], ...] = tuple(
-            [] for _ in range(self.block_tables.num_kv_cache_groups)
-        )
-        overwrite: list[bool] = []
-
         # Add new requests.
         for new_req_data in scheduler_output.scheduled_new_reqs:
             assert new_req_data.prompt_token_ids is not None
@@ -405,38 +409,35 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 sampling_params=new_req_data.sampling_params,
                 lora_request=new_req_data.lora_request,
             )
-
             req_index = self.req_states.req_id_to_index[req_id]
-            req_indices.append(req_index)
-            for i, block_ids in enumerate(new_req_data.block_ids):
-                x = cu_num_new_blocks[i][-1]
-                cu_num_new_blocks[i].append(x + len(block_ids))
-                new_block_ids[i].extend(block_ids)
-            overwrite.append(True)
-        if scheduler_output.scheduled_new_reqs:
-            self.req_states.prefill_len.copy_to_gpu()
+
+            # Pre-compute M-RoPE positions for prefill.
+            if self.uses_mrope:
+                self.mrope_states.init_prefill_mrope_positions(
+                    req_index,
+                    self.model,  # type: ignore
+                    new_req_data.prefill_token_ids,
+                    mm_features=[],  # TODO
+                )
+
+            self.block_tables.append_block_ids(
+                req_index, new_req_data.block_ids, overwrite=True
+            )
 
         # Add new blocks for the existing requests.
         cached_reqs = scheduler_output.scheduled_cached_reqs
         for i, req_id in enumerate(cached_reqs.req_ids):
             req_index = self.req_states.req_id_to_index[req_id]
-
             req_new_block_ids = cached_reqs.new_block_ids[i]
             if req_new_block_ids is not None:
-                req_indices.append(req_index)
-                for group_id, block_ids in enumerate(req_new_block_ids):
-                    x = cu_num_new_blocks[group_id][-1]
-                    cu_num_new_blocks[group_id].append(x + len(block_ids))
-                    new_block_ids[group_id].extend(block_ids)
-                overwrite.append(False)
-
-        if req_indices:
-            self.block_tables.append_block_ids(
-                req_indices=req_indices,
-                cu_num_new_blocks=cu_num_new_blocks,
-                new_block_ids=new_block_ids,
-                overwrite=overwrite,
-            )
+                self.block_tables.append_block_ids(
+                    req_index, req_new_block_ids, overwrite=False
+                )
+
+        self.req_states.apply_staged_writes()
+        self.block_tables.apply_staged_writes()
+        if self.uses_mrope:
+            self.mrope_states.apply_staged_writes()
 
     def prepare_inputs(
         self,
@@ -460,19 +461,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         idx_mapping_list = [
             self.req_states.req_id_to_index[req_id] for req_id in req_ids
         ]
-        idx_mapping = self.input_buffers.idx_mapping
-        idx_mapping.np[:num_reqs] = idx_mapping_list
-        idx_mapping_np = idx_mapping.np[:num_reqs]
-        idx_mapping = idx_mapping.copy_to_gpu(num_reqs)
+        idx_mapping_np = np.array(idx_mapping_list, dtype=np.int32)
+        idx_mapping = self.tmp_idx_mapping.copy_to_gpu(idx_mapping_np)
 
         # Get the number of draft tokens for each request.
         if not scheduler_output.scheduled_spec_decode_tokens:
             # No draft token scheduled (common case).
             total_num_draft_tokens = 0
             total_num_logits = num_reqs
+            cu_num_logits_np = np.arange(num_reqs + 1, dtype=np.int32)
             cu_num_logits = torch.arange(
                 num_reqs + 1, device=self.device, dtype=torch.int32
             )
+            expanded_idx_mapping = idx_mapping
         else:
             draft_tokens = scheduler_output.scheduled_spec_decode_tokens
             num_draft_tokens = np.array(
@@ -485,56 +486,75 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             total_num_draft_tokens = int(num_draft_tokens.sum())
             total_num_logits = num_reqs + total_num_draft_tokens
 
-            np.cumsum(
-                num_draft_tokens + 1,
-                out=self.input_buffers.cu_num_logits.np[1 : num_reqs + 1],
+            num_logits = num_draft_tokens + 1
+            cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32)
+            cu_num_logits_np[0] = 0
+            np.cumsum(num_logits, out=cu_num_logits_np[1:])
+            cu_num_logits = self.tmp_cu_num_logits.copy_to_gpu(cu_num_logits_np)
+
+            expanded_idx_mapping = expand_idx_mapping(
+                idx_mapping,
+                total_num_logits,
+                cu_num_logits,
+                max_expand_len=self.num_speculative_steps + 1,
             )
-            cu_num_logits = self.input_buffers.cu_num_logits.copy_to_gpu(num_reqs + 1)
 
         # Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks]
         block_tables = self.block_tables.gather_block_tables(idx_mapping)
 
         # Get query_start_loc.
-        np.cumsum(
-            num_scheduled_tokens,
-            out=self.input_buffers.query_start_loc.np[1 : num_reqs + 1],
-        )
+        query_start_loc_np = np.empty(self.max_num_reqs + 1, dtype=np.int32)
+        query_start_loc_np[0] = 0
+        np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1 : num_reqs + 1])
         # Pad for full CUDA graph mode.
         # Some attention backends like FA3 require query_start_loc to be non-decreasing.
-        self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens
-        self.input_buffers.query_start_loc.copy_to_gpu()
-        query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
-        query_start_loc_cpu = self.input_buffers.query_start_loc.cpu[: num_reqs + 1]
-        query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1]
+        query_start_loc_np[num_reqs + 1 :] = num_tokens
+        self.tmp_query_start_loc.copy_to_gpu(
+            query_start_loc_np,
+            out=self.input_buffers.query_start_loc,
+        )
+        query_start_loc_np = query_start_loc_np[: num_reqs + 1]
+        query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
+        query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
 
         # Get prefill tokens.
         prepare_prefill_inputs(
             self.input_buffers.input_ids,
             self.req_states.next_prefill_tokens,
             idx_mapping,
-            query_start_loc_gpu,
+            query_start_loc,
             self.req_states.prefill_token_ids.gpu,
             self.req_states.prefill_len.gpu,
-            self.req_states.num_computed_tokens,
+            self.req_states.num_computed_tokens.gpu,
         )
 
         # Prepare positions and seq_lens.
         prepare_pos_seq_lens(
             idx_mapping,
-            query_start_loc_gpu,
-            self.req_states.num_computed_tokens,
+            query_start_loc,
+            self.req_states.num_computed_tokens.gpu,
             self.input_buffers.positions,
             self.input_buffers.seq_lens,
         )
         seq_lens = self.input_buffers.seq_lens[:num_reqs]
 
+        # Prepare M-RoPE positions.
+        if self.uses_mrope:
+            self.mrope_states.prepare_mrope_positions(
+                idx_mapping,
+                query_start_loc,
+                self.req_states.prefill_len.gpu,
+                self.req_states.num_computed_tokens.gpu,
+                self.input_buffers.mrope_positions,
+            )
+
         # Some input token ids are directly read from the last sampled tokens
         # and draft tokens. Also, get the logits indices to sample tokens from.
         logits_indices = combine_sampled_and_draft_tokens(
             self.input_buffers.input_ids,
             idx_mapping,
             self.req_states.last_sampled_tokens,
-            query_start_loc_gpu,
+            query_start_loc,
             seq_lens,
             self.req_states.prefill_len.gpu,
             self.req_states.draft_tokens,
@@ -544,29 +564,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Compute slot mappings: [num_kv_cache_groups, num_tokens]
         slot_mappings = self.block_tables.compute_slot_mappings(
-            query_start_loc_gpu, self.input_buffers.positions[:num_tokens]
+            query_start_loc, self.input_buffers.positions[:num_tokens]
         )
 
-        # Get num_computed_tokens.
-        # HACK(woosuk): Here, we use num_computed_tokens on GPU instead of
-        # num_computed_tokens_cpu. This works for most cases.
-        num_computed_tokens = self.req_states.num_computed_tokens[idx_mapping]
-        # HACK(woosuk): Only GPU has the exact seq_lens because at this point
-        # CPU does not know how many draft tokens are accepted/rejected in the
-        # previous step. Therefore, we use max_model_len to be safe.
-        # NOTE(woosuk): This only works for FA3 backend.
-        seq_lens_np = np.full(num_reqs, self.max_model_len, dtype=np.int32)
-
         # Layer name -> attention metadata.
         attn_metadata = build_attn_metadata(
             attn_metadata_builders=self.attn_metadata_builders,
             num_reqs=num_reqs,
             num_tokens=num_tokens,
-            query_start_loc_gpu=query_start_loc_gpu,
+            query_start_loc_gpu=query_start_loc,
             query_start_loc_cpu=query_start_loc_cpu,
             seq_lens=self.input_buffers.seq_lens,
-            seq_lens_np=seq_lens_np,
-            num_computed_tokens_cpu=num_computed_tokens,
+            max_seq_len=self.max_model_len,
             block_tables=block_tables,
             slot_mappings=slot_mappings,
             kv_cache_config=self.kv_cache_config,
@@ -574,24 +583,29 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         input_ids = self.input_buffers.input_ids[:num_tokens_after_padding]
         positions = self.input_buffers.positions[:num_tokens_after_padding]
+        mrope_positions = self.input_buffers.mrope_positions[
+            :, :num_tokens_after_padding
+        ]
         return InputBatch(
             req_ids=req_ids,
             num_reqs=num_reqs,
             idx_mapping=idx_mapping,
             idx_mapping_np=idx_mapping_np,
+            expanded_idx_mapping=expanded_idx_mapping,
             num_scheduled_tokens=num_scheduled_tokens,
             num_tokens=num_tokens,
             num_tokens_after_padding=num_tokens_after_padding,
             num_draft_tokens=total_num_draft_tokens,
-            query_start_loc=query_start_loc_gpu,
+            query_start_loc=query_start_loc,
             query_start_loc_np=query_start_loc_np,
             seq_lens=seq_lens,
-            seq_lens_np=seq_lens_np,
             input_ids=input_ids,
             positions=positions,
+            mrope_positions=mrope_positions,
             attn_metadata=attn_metadata,
             logits_indices=logits_indices,
             cu_num_logits=cu_num_logits,
+            cu_num_logits_np=cu_num_logits_np,
         )
 
     def sample(
@@ -605,16 +619,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         logits = self.model.compute_logits(sample_hidden_states)
         if grammar_output is not None:
             # Apply grammar bitmask to the logits in-place.
-            # TODO(woosuk): Make compatible with spec decoding.
-            assert input_batch.num_draft_tokens == 0
-            with async_barrier(self.structured_outputs_event):
-                apply_grammar_bitmask(
-                    logits,
-                    input_batch.req_ids,
-                    grammar_output.structured_output_request_ids,
-                    grammar_output.grammar_bitmask,
-                    self.input_buffers,
-                )
+            self.structured_outputs_worker.apply_grammar_bitmask(
+                logits,
+                input_batch,
+                grammar_output.structured_output_request_ids,
+                grammar_output.grammar_bitmask,
+            )
 
         # Sample tokens and compute logprobs (if needed).
         sampler_output = self.sampler(logits, sampling_metadata)
@@ -682,8 +692,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Handle chunked prompts.
         pos_after_step = computed_prefill + input_batch.num_scheduled_tokens
         is_prompt_chunked = pos_after_step < prompt_lens
-        prefill_token_ids = self.req_states.prefill_token_ids.np
-        query_start_loc = self.input_buffers.query_start_loc.np
+        prefill_token_ids = self.req_states.prefill_token_ids.gpu
+        query_start_loc_np = input_batch.query_start_loc_np
         for i, req_id in enumerate(input_batch.req_ids):
             if not needs_prompt_logprobs[i]:
                 continue
@@ -691,10 +701,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 continue
             # The prompt is chunked. Get the next prompt token.
             req_idx = input_batch.idx_mapping_np[i]
-            next_prompt_token = int(prefill_token_ids[req_idx, pos_after_step[i]])
-            idx = int(query_start_loc[i + 1] - 1)
-            # Set the next prompt token.
-            # NOTE(woosuk): This triggers a GPU operation.
+            idx = int(query_start_loc_np[i + 1] - 1)
+            # NOTE(woosuk): This triggers two GPU operations.
+            next_prompt_token = prefill_token_ids[req_idx, pos_after_step[i]]
             token_ids[idx] = next_prompt_token
 
         # NOTE(woosuk): We mask out logprobs for negative tokens.
@@ -710,8 +719,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             if not needs_prompt_logprobs[i]:
                 continue
 
-            start_idx = query_start_loc[i]
-            end_idx = query_start_loc[i + 1]
+            start_idx = query_start_loc_np[i]
+            end_idx = query_start_loc_np[i + 1]
             assert start_idx < end_idx, (
                 f"start_idx ({start_idx}) >= end_idx ({end_idx})"
             )
@@ -755,7 +764,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Update the number of computed tokens.
         post_update(
             input_batch.idx_mapping,
-            self.req_states.num_computed_tokens,
+            self.req_states.num_computed_tokens.gpu,
             self.req_states.last_sampled_tokens,
             self.req_states.output_bin_counts,
             sampled_tokens,
@@ -807,7 +816,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         scheduler_output: SchedulerOutput,
     ) -> tuple[CUDAGraphMode, int, torch.Tensor | None]:
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        if self.dp_size == 1:
+        dp_size = self.parallel_config.data_parallel_size
+        if dp_size == 1:
             # No DP. Only consider CUDA graphs.
             if total_num_scheduled_tokens == 0:
                 # Special case: no tokens to run.
@@ -835,11 +845,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 cudagraph_size_before_dp = -1
 
         assert cudagraph_size_before_dp is not None
+        dp_rank = self.parallel_config.data_parallel_rank
         num_tokens_across_dp, cudagraph_size_across_dp = get_batch_metadata_across_dp(
             total_num_scheduled_tokens,
             cudagraph_size_before_dp,
-            self.dp_size,
-            self.dp_rank,
+            dp_size,
+            dp_rank,
         )
         if all(cudagraph_size_across_dp >= 0):
             # If all ranks can use CUDA graph, pad to the maximum number of tokens
@@ -850,7 +861,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # If any of the ranks cannot use CUDA graph, use eager mode for all ranks.
             # No padding is needed except for ranks that have no tokens to run.
             num_tokens_across_dp = torch.clamp(num_tokens_across_dp, min=1)
-            num_tokens_after_padding = num_tokens_across_dp[self.dp_rank]
+            num_tokens_after_padding = num_tokens_across_dp[dp_rank]
             cudagraph_mode = CUDAGraphMode.NONE
         return cudagraph_mode, num_tokens_after_padding, num_tokens_across_dp
 
@@ -864,61 +875,49 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         assert intermediate_tensors is None
         if scheduler_output.total_num_scheduled_tokens == 0 and not dummy_run:
             # No need to run the model.
-            with async_barrier(self.input_prep_event):
-                self.update_states(scheduler_output)
-                return EMPTY_MODEL_RUNNER_OUTPUT
+            self.update_states(scheduler_output)
+            return EMPTY_MODEL_RUNNER_OUTPUT
 
-        # NOTE: Call this before the async barrier so CPU all-reduce and
-        # GPU execution can overlap.
         cudagraph_mode, num_tokens_after_padding, num_tokens_across_dp = (
             self.get_cudagraph_and_dp_padding(scheduler_output)
         )
-        with async_barrier(self.input_prep_event):
-            self.update_states(scheduler_output)
-            if num_tokens_after_padding == 0:
-                # All DP ranks have zero tokens to run.
-                return EMPTY_MODEL_RUNNER_OUTPUT
-
-            if not dummy_run:
-                # Common case.
-                # Prepare all the inputs and copy to the input buffers.
-                input_batch = self.prepare_inputs(
-                    scheduler_output,
-                    num_tokens_after_padding,
-                )
+        self.update_states(scheduler_output)
+        if num_tokens_after_padding == 0:
+            # All DP ranks have zero tokens to run.
+            return EMPTY_MODEL_RUNNER_OUTPUT
+
+        if not dummy_run:
+            # Common case.
+            # Prepare all the inputs and copy to the input buffers.
+            input_batch = self.prepare_inputs(
+                scheduler_output,
+                num_tokens_after_padding,
+            )
 
-                # NOTE(woosuk): Sampling metadata should be built under the async
-                # barrier to avoid race conditions.
-                pos = input_batch.positions[input_batch.logits_indices]
-                sampling_metadata = self.req_states.make_sampling_metadata(
-                    input_batch.idx_mapping, input_batch.idx_mapping_np, pos
-                )
-                if input_batch.num_draft_tokens > 0:
-                    sampling_metadata = expand_sampling_metadata(
-                        sampling_metadata,
-                        input_batch.cu_num_logits,
-                        max_expand_len=self.num_speculative_steps + 1,
-                    )
-
-                if self.lora_config:
-                    # Activate LoRA adapters.
-                    lora_inputs = self.req_states.make_lora_inputs(
-                        input_batch.req_ids,
-                        input_batch.idx_mapping_np,
-                        input_batch.num_scheduled_tokens,
-                    )
-                    self._set_active_loras(*lora_inputs)
-            else:
-                # No actual tokens to run. A dummy run for DP.
-                num_reqs = min(num_tokens_after_padding, self.max_num_reqs)
-                input_batch = InputBatch.make_dummy(
-                    num_reqs=num_reqs,
-                    num_tokens=num_tokens_after_padding,
-                    input_buffers=self.input_buffers,
-                    device=self.device,
+            pos = input_batch.positions[input_batch.logits_indices]
+            sampling_metadata = self.req_states.make_sampling_metadata(
+                input_batch.expanded_idx_mapping, input_batch.idx_mapping_np, pos
+            )
+
+            if self.lora_config:
+                # Activate LoRA adapters.
+                lora_inputs = self.req_states.make_lora_inputs(
+                    input_batch.req_ids,
+                    input_batch.idx_mapping_np,
+                    input_batch.num_scheduled_tokens,
                 )
-                self.prepare_dummy_attn_metadata(input_batch)
-                sampling_metadata = None
+                self._set_active_loras(*lora_inputs)
+        else:
+            # No actual tokens to run. A dummy run for DP.
+            num_reqs = min(num_tokens_after_padding, self.max_num_reqs)
+            input_batch = InputBatch.make_dummy(
+                num_reqs=num_reqs,
+                num_tokens=num_tokens_after_padding,
+                input_buffers=self.input_buffers,
+                device=self.device,
+            )
+            self.prepare_dummy_attn_metadata(input_batch)
+            sampling_metadata = None
 
         # Run model.
         if cudagraph_mode == CUDAGraphMode.FULL:
@@ -931,6 +930,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         else:
             # Run PyTorch model in eager mode.
             # TODO(woosuk): Support piecewise CUDA graph.
+            if not self.uses_mrope:
+                positions = input_batch.positions
+            else:
+                positions = input_batch.mrope_positions
             with set_forward_context(
                 input_batch.attn_metadata,
                 self.vllm_config,
@@ -940,7 +943,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             ):
                 hidden_states = self.model(
                     input_ids=input_batch.input_ids,
-                    positions=input_batch.positions,
+                    positions=positions,
                 )
 
         self.execute_model_state = hidden_states, input_batch, sampling_metadata
@@ -968,11 +971,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # Only for compatibility with the existing model runner and scheduler.
             req_id_to_index={req_id: i for i, req_id in enumerate(input_batch.req_ids)},
             sampled_token_ids=None,  # type: ignore
-            logprobs=None,
-            prompt_logprobs_dict=prompt_logprobs_dict,  # type: ignore
-            pooler_output=[],
-            kv_connector_output=None,
-            num_nans_in_logits=None,
+            prompt_logprobs_dict=prompt_logprobs_dict,  # type: ignore[arg-type]
         )
         async_output = AsyncOutput(
             model_runner_output=model_runner_output,
diff --git a/vllm/v1/worker/gpu/sample/gumbel.py b/vllm/v1/worker/gpu/sample/gumbel.py
index a95bf1e7a37a7198516e4f91b92d1c13ff58a859..0cb80f8331513d39b6f21298199c95a7b1a8370b 100644
--- a/vllm/v1/worker/gpu/sample/gumbel.py
+++ b/vllm/v1/worker/gpu/sample/gumbel.py
@@ -13,6 +13,7 @@ def _gumbel_sample_kernel(
     local_max_stride,
     logits_ptr,
     logits_stride,
+    idx_mapping_ptr,
     seeds_ptr,
     pos_ptr,
     temp_ptr,
@@ -20,22 +21,24 @@ def _gumbel_sample_kernel(
     BLOCK_SIZE: tl.constexpr,
     APPLY_TEMPERATURE: tl.constexpr,
 ):
-    req_idx = tl.program_id(0)
+    batch_idx = tl.program_id(0)
+    req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
+
     block_idx = tl.program_id(1)
     block = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
     mask = block < vocab_size
     logits = tl.load(
-        logits_ptr + req_idx * logits_stride + block,
+        logits_ptr + batch_idx * logits_stride + block,
         mask=mask,
         other=float("-inf"),
     )
     logits = logits.to(tl.float32)
 
-    temp = tl.load(temp_ptr + req_idx).to(tl.float32)
+    temp = tl.load(temp_ptr + req_state_idx).to(tl.float32)
     if temp != 0.0:
         # Calculate the seed for gumbel noise.
-        seed = tl.load(seeds_ptr + req_idx)
-        pos = tl.load(pos_ptr + req_idx)
+        seed = tl.load(seeds_ptr + req_state_idx)
+        pos = tl.load(pos_ptr + batch_idx)
         gumbel_seed = tl.randint(seed, pos)
 
         # Generate gumbel noise.
@@ -55,12 +58,13 @@ def _gumbel_sample_kernel(
     idx = tl.argmax(logits, axis=0)
     token_id = block_idx * BLOCK_SIZE + idx
     value = tl.max(logits, axis=0)
-    tl.store(local_argmax_ptr + req_idx * local_argmax_stride + block_idx, token_id)
-    tl.store(local_max_ptr + req_idx * local_max_stride + block_idx, value)
+    tl.store(local_argmax_ptr + batch_idx * local_argmax_stride + block_idx, token_id)
+    tl.store(local_max_ptr + batch_idx * local_max_stride + block_idx, value)
 
 
 def gumbel_sample(
     logits: torch.Tensor,  # [num_reqs, vocab_size]
+    idx_mapping: torch.Tensor,  # [num_reqs]
     temperature: torch.Tensor,  # [num_reqs]
     seed: torch.Tensor,  # [num_reqs]
     pos: torch.Tensor,  # [num_reqs]
@@ -88,6 +92,7 @@ def gumbel_sample(
         local_max.stride(0),
         logits,
         logits.stride(0),
+        idx_mapping,
         seed,
         pos,
         temperature,
diff --git a/vllm/v1/worker/gpu/sample/logit_bias.py b/vllm/v1/worker/gpu/sample/logit_bias.py
new file mode 100644
index 0000000000000000000000000000000000000000..f959b36e4529578f322b4197d543a7bfa2b4e9e1
--- /dev/null
+++ b/vllm/v1/worker/gpu/sample/logit_bias.py
@@ -0,0 +1,270 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.sampling_params import SamplingParams
+from vllm.triton_utils import tl, triton
+from vllm.v1.worker.gpu.buffer_utils import StagedWriteTensor, UvaBackedTensor
+
+MAX_NUM_ALLOWED_TOKEN_IDS = 1024
+MAX_NUM_LOGIT_BIAS_TOKENS = 1024
+MAX_NUM_STOP_TOKEN_IDS = 128
+
+
+class LogitBiasState:
+    def __init__(
+        self,
+        max_num_reqs: int,
+        device: torch.device,
+    ):
+        self.max_num_reqs = max_num_reqs
+
+        # Allowed token IDs.
+        self.num_allowed_token_ids = UvaBackedTensor(
+            self.max_num_reqs, dtype=torch.int32
+        )
+        self.allowed_token_ids = StagedWriteTensor(
+            (self.max_num_reqs, MAX_NUM_ALLOWED_TOKEN_IDS),
+            dtype=torch.int32,
+            device=device,
+        )
+        # Logit bias.
+        self.num_logit_bias = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)
+        self.logit_bias_token_ids = StagedWriteTensor(
+            (self.max_num_reqs, MAX_NUM_LOGIT_BIAS_TOKENS),
+            dtype=torch.int32,
+            device=device,
+        )
+        self.logit_bias = StagedWriteTensor(
+            (self.max_num_reqs, MAX_NUM_LOGIT_BIAS_TOKENS),
+            dtype=torch.float32,
+            device=device,
+        )
+        # Min tokens.
+        self.min_lens = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)
+        self.num_stop_token_ids = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)
+        self.stop_token_ids = StagedWriteTensor(
+            (self.max_num_reqs, MAX_NUM_STOP_TOKEN_IDS),
+            dtype=torch.int32,
+            device=device,
+        )
+
+    def add_request(
+        self,
+        req_idx: int,
+        prompt_len: int,
+        sampling_params: SamplingParams,
+    ) -> None:
+        # Allowed token IDs.
+        allowed_token_ids = sampling_params.allowed_token_ids
+        if allowed_token_ids:
+            num_allowed_token_ids = len(allowed_token_ids)
+            if num_allowed_token_ids > MAX_NUM_ALLOWED_TOKEN_IDS:
+                raise ValueError(
+                    f"Too many allowed token IDs: {num_allowed_token_ids}. "
+                    f"The max size is {MAX_NUM_ALLOWED_TOKEN_IDS}."
+                )
+            self.num_allowed_token_ids.np[req_idx] = num_allowed_token_ids
+            self.allowed_token_ids.stage_write(req_idx, 0, allowed_token_ids)
+        else:
+            self.num_allowed_token_ids.np[req_idx] = 0
+
+        # Logit bias.
+        logit_bias = sampling_params.logit_bias
+        if logit_bias:
+            num_logit_bias = len(logit_bias)
+            if num_logit_bias > MAX_NUM_LOGIT_BIAS_TOKENS:
+                raise ValueError(
+                    f"Too many logit bias tokens: {num_logit_bias}. "
+                    f"The max size is {MAX_NUM_LOGIT_BIAS_TOKENS}."
+                )
+            self.num_logit_bias.np[req_idx] = num_logit_bias
+            self.logit_bias_token_ids.stage_write(req_idx, 0, logit_bias.keys())
+            self.logit_bias.stage_write(req_idx, 0, logit_bias.values())
+        else:
+            self.num_logit_bias.np[req_idx] = 0
+
+        # Min tokens.
+        min_tokens = sampling_params.min_tokens
+        min_len = prompt_len + min_tokens
+        self.min_lens.np[req_idx] = min_len
+        stop_token_ids = sampling_params.all_stop_token_ids
+        if stop_token_ids:
+            num_stop_token_ids = len(stop_token_ids)
+            if num_stop_token_ids > MAX_NUM_STOP_TOKEN_IDS:
+                raise ValueError(
+                    f"Too many stop tokens: {num_stop_token_ids}. "
+                    f"The max size is {MAX_NUM_STOP_TOKEN_IDS}."
+                )
+            self.num_stop_token_ids.np[req_idx] = num_stop_token_ids
+            self.stop_token_ids.stage_write(req_idx, 0, stop_token_ids)
+        else:
+            self.num_stop_token_ids.np[req_idx] = 0
+
+    def apply_staged_writes(self) -> None:
+        self.num_allowed_token_ids.copy_to_uva()
+        self.allowed_token_ids.apply_write()
+
+        self.num_logit_bias.copy_to_uva()
+        self.logit_bias_token_ids.apply_write()
+        self.logit_bias.apply_write()
+
+        self.min_lens.copy_to_uva()
+        self.num_stop_token_ids.copy_to_uva()
+        self.stop_token_ids.apply_write()
+
+    def apply_logit_bias(
+        self,
+        logits: torch.Tensor,
+        idx_mapping: torch.Tensor,
+        pos: torch.Tensor,
+    ) -> None:
+        apply_logit_bias(
+            logits,
+            idx_mapping,
+            pos,
+            self.num_allowed_token_ids.gpu,
+            self.allowed_token_ids.gpu,
+            self.num_logit_bias.gpu,
+            self.logit_bias_token_ids.gpu,
+            self.logit_bias.gpu,
+            self.min_lens.gpu,
+            self.num_stop_token_ids.gpu,
+            self.stop_token_ids.gpu,
+        )
+
+
+@triton.jit
+def _bias_kernel(
+    logits_ptr,
+    logits_stride,
+    vocab_size,
+    idx_mapping_ptr,
+    # Allowed token IDs.
+    num_allowed_token_ids_ptr,
+    allowed_token_ids_ptr,
+    allowed_token_ids_stride,
+    # Logit bias.
+    num_logit_bias_ptr,
+    bias_token_ids_ptr,
+    bias_token_ids_stride,
+    bias_ptr,
+    bias_stride,
+    # Min tokens.
+    pos_ptr,
+    min_lens_ptr,
+    num_stop_token_ids_ptr,
+    stop_token_ids_ptr,
+    stop_token_ids_stride,
+    BLOCK_SIZE: tl.constexpr,
+    LOGITS_BLOCK_SIZE: tl.constexpr,
+):
+    batch_idx = tl.program_id(0)
+    req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
+
+    block = tl.arange(0, BLOCK_SIZE)
+
+    # Allowed token IDs.
+    num_allowed_token_ids = tl.load(num_allowed_token_ids_ptr + req_state_idx)
+    if num_allowed_token_ids > 0:
+        block = tl.arange(0, BLOCK_SIZE)
+        mask = block < num_allowed_token_ids
+
+        # Save logits for allowed token IDs.
+        allowed_token_ids = tl.load(
+            allowed_token_ids_ptr + req_state_idx * allowed_token_ids_stride + block,
+            mask=mask,
+        )
+        logits = tl.load(
+            logits_ptr + batch_idx * logits_stride + allowed_token_ids, mask=mask
+        )
+
+        # Set logits to -inf for all tokens.
+        for i in range(0, vocab_size, LOGITS_BLOCK_SIZE):
+            offset = i + tl.arange(0, LOGITS_BLOCK_SIZE)
+            tl.store(
+                logits_ptr + batch_idx * logits_stride + offset,
+                -float("inf"),
+                mask=offset < vocab_size,
+            )
+
+        # Restore logits for allowed token IDs.
+        tl.store(
+            logits_ptr + batch_idx * logits_stride + allowed_token_ids,
+            logits,
+            mask=mask,
+        )
+
+    # Logit bias.
+    num_logit_bias = tl.load(num_logit_bias_ptr + req_state_idx)
+    if num_logit_bias > 0:
+        mask = block < num_logit_bias
+        token_ids = tl.load(
+            bias_token_ids_ptr + req_state_idx * bias_token_ids_stride + block,
+            mask=mask,
+        )
+        bias = tl.load(bias_ptr + req_state_idx * bias_stride + block, mask=mask)
+        logits = tl.load(logits_ptr + batch_idx * logits_stride + token_ids, mask=mask)
+        logits += bias
+        tl.store(logits_ptr + batch_idx * logits_stride + token_ids, logits, mask=mask)
+
+    # Apply min tokens.
+    num_stop_token_ids = tl.load(num_stop_token_ids_ptr + req_state_idx)
+    pos = tl.load(pos_ptr + batch_idx)
+    min_len = tl.load(min_lens_ptr + req_state_idx)
+    if num_stop_token_ids > 0 and pos < min_len:
+        mask = block < num_stop_token_ids
+        stop_token_ids = tl.load(
+            stop_token_ids_ptr + req_state_idx * stop_token_ids_stride + block,
+            mask=mask,
+        )
+        tl.store(
+            logits_ptr + batch_idx * logits_stride + stop_token_ids,
+            -float("inf"),
+            mask=mask,
+        )
+
+
+def apply_logit_bias(
+    logits: torch.Tensor,
+    idx_mapping: torch.Tensor,
+    pos: torch.Tensor,
+    num_allowed_token_ids: torch.Tensor,
+    allowed_token_ids: torch.Tensor,
+    num_logit_bias: torch.Tensor,
+    logit_bias_token_ids: torch.Tensor,
+    logit_bias: torch.Tensor,
+    min_lens: torch.Tensor,
+    num_stop_token_ids: torch.Tensor,
+    stop_token_ids: torch.Tensor,
+) -> None:
+    num_reqs, vocab_size = logits.shape
+    BLOCK_SIZE = triton.next_power_of_2(
+        max(
+            allowed_token_ids.shape[-1],
+            logit_bias_token_ids.shape[-1],
+            stop_token_ids.shape[-1],
+        )
+    )
+    LOGITS_BLOCK_SIZE = 8192
+    _bias_kernel[(num_reqs,)](
+        logits,
+        logits.stride(0),
+        vocab_size,
+        idx_mapping,
+        num_allowed_token_ids,
+        allowed_token_ids,
+        allowed_token_ids.stride(0),
+        num_logit_bias,
+        logit_bias_token_ids,
+        logit_bias_token_ids.stride(0),
+        logit_bias,
+        logit_bias.stride(0),
+        pos,
+        min_lens,
+        num_stop_token_ids,
+        stop_token_ids,
+        stop_token_ids.stride(0),
+        BLOCK_SIZE=BLOCK_SIZE,
+        LOGITS_BLOCK_SIZE=LOGITS_BLOCK_SIZE,
+    )
diff --git a/vllm/v1/worker/gpu/sample/metadata.py b/vllm/v1/worker/gpu/sample/metadata.py
index f10c72049cbaedd1f5bbdcc2445da74b4ed17def..27167fd20c5e3e5a1b524391085cf3ecbc910e0e 100644
--- a/vllm/v1/worker/gpu/sample/metadata.py
+++ b/vllm/v1/worker/gpu/sample/metadata.py
@@ -4,20 +4,23 @@ from dataclasses import dataclass
 
 import torch
 
-from vllm.triton_utils import tl, triton
-
 
 @dataclass
 class SamplingMetadata:
+    idx_mapping: torch.Tensor
+
     temperature: torch.Tensor
 
     top_p: torch.Tensor | None
     top_k: torch.Tensor | None
     min_p: torch.Tensor | None
 
+    # For penalties
     repetition_penalty: torch.Tensor
     frequency_penalty: torch.Tensor
     presence_penalty: torch.Tensor
+    prompt_bin_mask: torch.Tensor
+    output_bin_counts: torch.Tensor
 
     seeds: torch.Tensor
     pos: torch.Tensor
@@ -25,11 +28,6 @@ class SamplingMetadata:
     # None means no logprobs, 0 means sampled token logprobs only
     max_num_logprobs: int | None
 
-    # For penalties
-    idx_mapping: torch.Tensor
-    prompt_bin_mask: torch.Tensor
-    output_bin_counts: torch.Tensor
-
     @classmethod
     def make_dummy(
         cls,
@@ -37,6 +35,8 @@ class SamplingMetadata:
         device: torch.device,
     ) -> "SamplingMetadata":
         assert num_reqs > 0
+        idx_mapping = torch.arange(num_reqs, dtype=torch.int32, device=device)
+
         temperature = torch.zeros(num_reqs, dtype=torch.float32, device=device)
         temperature[0] = 0.5
         # TODO(woosuk): Use top-p and top-k for dummy sampler.
@@ -51,18 +51,19 @@ class SamplingMetadata:
         repetition_penalty = torch.ones(num_reqs, dtype=torch.float32, device=device)
         frequency_penalty = torch.zeros(num_reqs, dtype=torch.float32, device=device)
         presence_penalty = torch.zeros(num_reqs, dtype=torch.float32, device=device)
-        seeds = torch.zeros(num_reqs, dtype=torch.int64, device=device)
-        pos = torch.zeros(num_reqs, dtype=torch.int64, device=device)
-        max_num_logprobs = 20
 
-        idx_mapping = torch.arange(num_reqs, dtype=torch.int32, device=device)
         # NOTE(woosuk): These are placeholder tensors to avoid None checks in the
         # penalties kernel. We use 2 instead of 1 as vocab_size to avoid Triton
         # specialization and re-compilation at runtime.
         prompt_bin_mask = torch.zeros(num_reqs, 2, dtype=torch.int32, device=device)
         output_bin_counts = torch.zeros(num_reqs, 2, dtype=torch.int32, device=device)
 
+        seeds = torch.zeros(num_reqs, dtype=torch.int64, device=device)
+        pos = torch.zeros(num_reqs, dtype=torch.int64, device=device)
+        max_num_logprobs = 20
+
         return cls(
+            idx_mapping=idx_mapping,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
@@ -70,123 +71,9 @@ class SamplingMetadata:
             repetition_penalty=repetition_penalty,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
+            prompt_bin_mask=prompt_bin_mask,
+            output_bin_counts=output_bin_counts,
             seeds=seeds,
             pos=pos,
             max_num_logprobs=max_num_logprobs,
-            idx_mapping=idx_mapping,
-            prompt_bin_mask=prompt_bin_mask,
-            output_bin_counts=output_bin_counts,
         )
-
-
-# NOTE(woosuk): Re-compilation can happen at runtime since top_p and top_k can be None.
-@triton.jit
-def _expand_sampling_metadata_kernel(
-    temp_ptr,
-    expanded_temp_ptr,
-    top_p_ptr,
-    expanded_top_p_ptr,
-    top_k_ptr,
-    expanded_top_k_ptr,
-    min_p_ptr,
-    expanded_min_p_ptr,
-    rep_penalty_ptr,
-    expanded_rep_penalty_ptr,
-    freq_penalty_ptr,
-    expanded_freq_penalty_ptr,
-    pres_penalty_ptr,
-    expanded_pres_penalty_ptr,
-    seeds_ptr,
-    expanded_seeds_ptr,
-    cu_num_logits_ptr,
-    BLOCK_SIZE: tl.constexpr,
-):
-    req_idx = tl.program_id(0)
-    start_idx = tl.load(cu_num_logits_ptr + req_idx)
-    end_idx = tl.load(cu_num_logits_ptr + req_idx + 1)
-    num_tokens = end_idx - start_idx
-
-    block = tl.arange(0, BLOCK_SIZE)
-    mask = block < num_tokens
-
-    temp = tl.load(temp_ptr + req_idx)
-    tl.store(expanded_temp_ptr + start_idx + block, temp, mask=mask)
-
-    if top_p_ptr is not None:
-        top_p = tl.load(top_p_ptr + req_idx)
-        tl.store(expanded_top_p_ptr + start_idx + block, top_p, mask=mask)
-
-    if top_k_ptr is not None:
-        top_k = tl.load(top_k_ptr + req_idx)
-        tl.store(expanded_top_k_ptr + start_idx + block, top_k, mask=mask)
-
-    if min_p_ptr is not None:
-        min_p = tl.load(min_p_ptr + req_idx)
-        tl.store(expanded_min_p_ptr + start_idx + block, min_p, mask=mask)
-
-    rep_penalty = tl.load(rep_penalty_ptr + req_idx)
-    tl.store(expanded_rep_penalty_ptr + start_idx + block, rep_penalty, mask=mask)
-
-    freq_penalty = tl.load(freq_penalty_ptr + req_idx)
-    tl.store(expanded_freq_penalty_ptr + start_idx + block, freq_penalty, mask=mask)
-
-    pres_penalty = tl.load(pres_penalty_ptr + req_idx)
-    tl.store(expanded_pres_penalty_ptr + start_idx + block, pres_penalty, mask=mask)
-
-    seed = tl.load(seeds_ptr + req_idx)
-    tl.store(expanded_seeds_ptr + start_idx + block, seed, mask=mask)
-
-
-def expand_sampling_metadata(
-    sampling_metadata: SamplingMetadata,
-    cu_num_logits: torch.Tensor,
-    max_expand_len: int,
-) -> SamplingMetadata:
-    total_num_logits = sampling_metadata.pos.shape[0]
-    create_empty = lambda x: x.new_empty(total_num_logits) if x is not None else None
-    expanded_temp = create_empty(sampling_metadata.temperature)
-    expanded_top_p = create_empty(sampling_metadata.top_p)
-    expanded_top_k = create_empty(sampling_metadata.top_k)
-    expanded_min_p = create_empty(sampling_metadata.min_p)
-    expanded_repetition_penalty = create_empty(sampling_metadata.repetition_penalty)
-    expanded_frequency_penalty = create_empty(sampling_metadata.frequency_penalty)
-    expanded_presence_penalty = create_empty(sampling_metadata.presence_penalty)
-    expanded_seeds = create_empty(sampling_metadata.seeds)
-
-    num_reqs = cu_num_logits.shape[0] - 1
-    _expand_sampling_metadata_kernel[(num_reqs,)](
-        sampling_metadata.temperature,
-        expanded_temp,
-        sampling_metadata.top_p,
-        expanded_top_p,
-        sampling_metadata.top_k,
-        expanded_top_k,
-        sampling_metadata.min_p,
-        expanded_min_p,
-        sampling_metadata.repetition_penalty,
-        expanded_repetition_penalty,
-        sampling_metadata.frequency_penalty,
-        expanded_frequency_penalty,
-        sampling_metadata.presence_penalty,
-        expanded_presence_penalty,
-        sampling_metadata.seeds,
-        expanded_seeds,
-        cu_num_logits,
-        BLOCK_SIZE=triton.next_power_of_2(max_expand_len),
-    )
-    return SamplingMetadata(
-        temperature=expanded_temp,
-        top_p=expanded_top_p,
-        top_k=expanded_top_k,
-        min_p=expanded_min_p,
-        seeds=expanded_seeds,
-        repetition_penalty=expanded_repetition_penalty,
-        frequency_penalty=expanded_frequency_penalty,
-        presence_penalty=expanded_presence_penalty,
-        pos=sampling_metadata.pos,
-        max_num_logprobs=sampling_metadata.max_num_logprobs,
-        # TODO(woosuk): Support penalties with spec decoding.
-        idx_mapping=sampling_metadata.idx_mapping,
-        prompt_bin_mask=sampling_metadata.prompt_bin_mask,
-        output_bin_counts=sampling_metadata.output_bin_counts,
-    )
diff --git a/vllm/v1/worker/gpu/sample/min_p.py b/vllm/v1/worker/gpu/sample/min_p.py
index c98a42cb2b1bb469d683cf328b52f37561852b32..26c3e5905760370d2bc1a79e11879dfe987475c3 100644
--- a/vllm/v1/worker/gpu/sample/min_p.py
+++ b/vllm/v1/worker/gpu/sample/min_p.py
@@ -9,12 +9,14 @@ from vllm.triton_utils import tl, triton
 def _min_p_kernel(
     logits_ptr,
     logits_stride,
+    idx_mapping_ptr,
     min_p_ptr,
     vocab_size,
     BLOCK_SIZE: tl.constexpr,
 ):
     req_idx = tl.program_id(0)
-    min_p = tl.load(min_p_ptr + req_idx).to(tl.float32)
+    req_state_idx = tl.load(idx_mapping_ptr + req_idx)
+    min_p = tl.load(min_p_ptr + req_state_idx).to(tl.float32)
     if min_p == 0.0:
         return
 
@@ -39,12 +41,17 @@ def _min_p_kernel(
         tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask)
 
 
-def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor) -> None:
+def apply_min_p(
+    logits: torch.Tensor,
+    idx_mapping: torch.Tensor,
+    min_p: torch.Tensor,
+) -> None:
     num_reqs, vocab_size = logits.shape
     BLOCK_SIZE = 1024
     _min_p_kernel[(num_reqs,)](
         logits,
         logits.stride(0),
+        idx_mapping,
         min_p,
         vocab_size,
         BLOCK_SIZE=BLOCK_SIZE,
diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py
index b4fcc822ecfce18ad2caa28c216747d28f0a4781..26b0346b29d818766ea0ce8622c402aa985794a0 100644
--- a/vllm/v1/worker/gpu/sample/penalties.py
+++ b/vllm/v1/worker/gpu/sample/penalties.py
@@ -10,11 +10,11 @@ from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
 def _penalties_and_temperature_kernel(
     logits_ptr,
     logits_stride,
+    idx_mapping_ptr,
     repetition_penalty_ptr,
     frequency_penalty_ptr,
     presence_penalty_ptr,
     temperature_ptr,
-    idx_mapping_ptr,
     prompt_bin_mask_ptr,
     prompt_bin_mask_stride,
     output_bin_counts_ptr,
@@ -23,10 +23,11 @@ def _penalties_and_temperature_kernel(
     BLOCK_SIZE: tl.constexpr,
 ):
     batch_idx = tl.program_id(0)
-    rep_penalty = tl.load(repetition_penalty_ptr + batch_idx)
-    freq_penalty = tl.load(frequency_penalty_ptr + batch_idx)
-    pres_penalty = tl.load(presence_penalty_ptr + batch_idx)
-    temperature = tl.load(temperature_ptr + batch_idx)
+    req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
+    rep_penalty = tl.load(repetition_penalty_ptr + req_state_idx)
+    freq_penalty = tl.load(frequency_penalty_ptr + req_state_idx)
+    pres_penalty = tl.load(presence_penalty_ptr + req_state_idx)
+    temperature = tl.load(temperature_ptr + req_state_idx)
     temperature = tl.where(temperature == 0.0, 1.0, temperature)
 
     use_rep_penalty = rep_penalty != 1.0
@@ -45,7 +46,6 @@ def _penalties_and_temperature_kernel(
     logits = logits.to(tl.float32)
 
     if use_penalty:
-        req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
         output_bin_counts = tl.load(
             output_bin_counts_ptr + req_state_idx * output_bin_counts_stride + block,
             mask=mask,
@@ -92,11 +92,11 @@ def apply_penalties_and_temperature(
     _penalties_and_temperature_kernel[(num_reqs, num_blocks)](
         logits,
         logits.stride(0),
+        sampling_metadata.idx_mapping,
         sampling_metadata.repetition_penalty,
         sampling_metadata.frequency_penalty,
         sampling_metadata.presence_penalty,
         sampling_metadata.temperature,
-        sampling_metadata.idx_mapping,
         sampling_metadata.prompt_bin_mask,
         sampling_metadata.prompt_bin_mask.stride(0),
         sampling_metadata.output_bin_counts,
diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py
index 84a3e18671b2cdea5ae83dea9f26221f3bd8db1c..6ed849ec8a1d444a5c9b58a1014af595fc949b95 100644
--- a/vllm/v1/worker/gpu/sample/sampler.py
+++ b/vllm/v1/worker/gpu/sample/sampler.py
@@ -71,7 +71,7 @@ class Sampler:
         apply_penalties_and_temperature(logits, sampling_metadata)
         # Apply min_p in place.
         if sampling_metadata.min_p is not None:
-            apply_min_p(logits, sampling_metadata.min_p)
+            apply_min_p(logits, sampling_metadata.idx_mapping, sampling_metadata.min_p)
         # Apply top_k and/or top_p. This might return a new tensor.
         logits = apply_top_k_top_p(
             logits, sampling_metadata.top_k, sampling_metadata.top_p
@@ -79,6 +79,7 @@ class Sampler:
 
         sampled = gumbel_sample(
             logits,
+            sampling_metadata.idx_mapping,
             sampling_metadata.temperature,
             sampling_metadata.seeds,
             sampling_metadata.pos,
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py
index 8848e220eb5b89d67c148f671daaf84f66c5baa0..ed92601202079aeac810651692844226d2ad5443 100644
--- a/vllm/v1/worker/gpu/spec_decode/eagle.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
-import numpy as np
 import torch
 import torch.nn as nn
 
@@ -12,7 +11,6 @@ from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
 from vllm.triton_utils import tl, triton
-from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.worker.gpu.attn_utils import build_attn_metadata
@@ -46,7 +44,6 @@ class EagleSpeculator:
         self.hidden_size = self.draft_model_config.get_hidden_size()
         self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()
         self.vocab_size = self.draft_model_config.get_vocab_size()
-        self.pin_memory = is_pin_memory_available()
         self.dtype = vllm_config.model_config.dtype
 
         self.input_buffers = InputBuffers(
@@ -56,7 +53,6 @@ class EagleSpeculator:
             vocab_size=self.vocab_size,
             dtype=self.dtype,
             device=device,
-            pin_memory=self.pin_memory,
         )
         self.hidden_states = torch.zeros(
             self.max_num_tokens,
@@ -64,6 +60,11 @@ class EagleSpeculator:
             dtype=self.dtype,
             device=device,
         )
+        self.idx_mapping = torch.zeros(
+            self.max_num_reqs,
+            dtype=torch.int32,
+            device=device,
+        )
         self.temperature = torch.zeros(
             self.max_num_reqs,
             dtype=torch.float32,
@@ -140,7 +141,7 @@ class EagleSpeculator:
         num_tokens_across_dp: torch.Tensor | None,
     ) -> None:
         pos = self.input_buffers.positions[:num_reqs]
-        query_start_loc = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
+        query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
         for step in range(1, self.num_speculative_steps):
             # Run the eagle model.
             last_hidden_states, hidden_states = self.run_model(
@@ -152,8 +153,9 @@ class EagleSpeculator:
             # used for draft and target sampling.
             draft_tokens = gumbel_sample(
                 logits,
-                self.temperature[:num_reqs],
-                self.seeds[:num_reqs],
+                self.idx_mapping[:num_reqs],
+                self.temperature,
+                self.seeds,
                 pos + 1,
                 apply_temperature=True,
             )
@@ -237,23 +239,27 @@ class EagleSpeculator:
         logits = self.model.compute_logits(sample_hidden_states)
 
         num_reqs = input_batch.num_reqs
-        cu_num_logits = input_batch.cu_num_logits[:num_reqs]
         # NOTE(woosuk): For draft sampling, we only consider the temperature
         # and ignore the other sampling parameters such as top_k and top_p,
         # for simplicity and performance.
         # While this may slightly degrade the acceptance rate, it does not
         # affect the output distribution after rejection sampling.
-        temperature = self.temperature[:num_reqs]
-        seeds = self.seeds[:num_reqs]
-        pos = self.input_buffers.positions[:num_reqs]
+        idx_mapping = self.idx_mapping[:num_reqs]
+        idx_mapping.copy_(input_batch.idx_mapping)
+        self.temperature.copy_(sampling_metadata.temperature)
+        self.seeds.copy_(sampling_metadata.seeds)
         # Gather the values and copy them to the pre-allocated buffers.
-        torch.gather(sampling_metadata.temperature, 0, cu_num_logits, out=temperature)
-        torch.gather(sampling_metadata.seeds, 0, cu_num_logits, out=seeds)
+        pos = self.input_buffers.positions[:num_reqs]
         torch.gather(input_batch.positions, 0, last_token_indices, out=pos)
         # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
         # used for draft and target sampling.
         draft_tokens = gumbel_sample(
-            logits, temperature, seeds, pos + 1, apply_temperature=True
+            logits,
+            idx_mapping,
+            self.temperature,
+            self.seeds,
+            pos + 1,
+            apply_temperature=True,
         )
         if self.num_speculative_steps == 1:
             # Early exit.
@@ -273,11 +279,8 @@ class EagleSpeculator:
             self.max_model_len,
             self.max_num_reqs,
         )
-        query_start_loc = self.input_buffers.query_start_loc
-        query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1]
-        slot_mappings = self.block_tables.compute_slot_mappings(
-            query_start_loc_gpu, pos
-        )
+        query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
+        slot_mappings = self.block_tables.compute_slot_mappings(query_start_loc, pos)
 
         cudagraph_size = self.cudagraph_manager.get_cudagraph_size(num_reqs)
         if cudagraph_size is not None:
@@ -286,10 +289,9 @@ class EagleSpeculator:
             return self.draft_tokens[:num_reqs]
 
         # Run eager mode.
-        query_start_loc.np[: num_reqs + 1] = np.arange(num_reqs + 1)
-        query_start_loc_cpu = query_start_loc.cpu[: num_reqs + 1]
-        # HACK(woosuk)
-        seq_lens_np = np.full(num_reqs, self.max_model_len, dtype=np.int32)
+        query_start_loc_cpu = torch.arange(
+            num_reqs + 1, dtype=torch.int32, device="cpu"
+        )
         block_tables = [x[:num_reqs] for x in self.block_tables.input_block_tables]
 
         # FIXME(woosuk): This is UNSAFE!!
@@ -297,11 +299,10 @@ class EagleSpeculator:
             attn_metadata_builders=self.attn_metadata_builders,
             num_reqs=num_reqs,
             num_tokens=num_reqs,
-            query_start_loc_gpu=query_start_loc_gpu,
+            query_start_loc_gpu=query_start_loc,
             query_start_loc_cpu=query_start_loc_cpu,
             seq_lens=self.input_buffers.seq_lens[:num_reqs],
-            seq_lens_np=seq_lens_np,
-            num_computed_tokens_cpu=None,  # FIXME
+            max_seq_len=self.max_model_len,
             block_tables=block_tables,
             slot_mappings=slot_mappings,
             kv_cache_config=self.kv_cache_config,
@@ -487,7 +488,7 @@ def prepare_eagle_decode(
         input_buffers.positions,
         input_hidden_states,
         input_hidden_states.stride(0),
-        input_buffers.query_start_loc.gpu,
+        input_buffers.query_start_loc,
         input_buffers.seq_lens,
         hidden_size,
         max_model_len,
diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py
index 6823c0c8ee5c7522f68a9a3a499a8b2433c8655e..abfc88405c96f8d9b9eb157332ced270ac5b8154 100644
--- a/vllm/v1/worker/gpu/states.py
+++ b/vllm/v1/worker/gpu/states.py
@@ -8,10 +8,8 @@ import torch
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.utils.math_utils import cdiv
-from vllm.utils.platform_utils import is_uva_available
-from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
 from vllm.v1.outputs import LogprobsTensors
-from vllm.v1.utils import CpuGpuBuffer
+from vllm.v1.worker.gpu.buffer_utils import StagedWriteTensor, UvaBackedTensor
 from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu.sample.penalties import bincount
 
@@ -29,7 +27,6 @@ class RequestState:
         num_speculative_steps: int,
         vocab_size: int,
         device: torch.device,
-        pin_memory: bool,
     ):
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
@@ -37,7 +34,6 @@ class RequestState:
         self.num_speculative_steps = num_speculative_steps
         self.vocab_size = vocab_size
         self.device = device
-        self.pin_memory = pin_memory
 
         self.req_id_to_index: dict[str, int] = {}
         self.index_to_req_id: dict[int, str] = {}
@@ -47,16 +43,18 @@ class RequestState:
         self.prompt_len = np.zeros(self.max_num_reqs, dtype=np.int32)
         # NOTE(woosuk): This tensor can be extremely large (e.g., several GBs)
         # depending on the configured max_num_reqs and max_model_len.
-        self.prefill_token_ids = UvaBuffer(
-            self.max_num_reqs, self.max_model_len, dtype=torch.int32
+        # To save GPU memory, we use UVA instead of GPU for this tensor.
+        self.prefill_token_ids = StagedWriteTensor(
+            (self.max_num_reqs, self.max_model_len),
+            dtype=torch.int32,
+            device=device,
+            uva_instead_of_gpu=True,
         )
-        # NOTE(woosuk): We don't use UVA for prefill_len because its GPU view
-        # can be used outside of update_states and prepare_inputs.
-        # Without async barrier, using UVA can cause race conditions.
-        self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
+        self.prefill_len = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)
+
         # Number of computed tokens.
         self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
-        self.num_computed_tokens = torch.zeros(
+        self.num_computed_tokens = StagedWriteTensor(
             self.max_num_reqs, dtype=torch.int32, device=device
         )
 
@@ -84,14 +82,16 @@ class RequestState:
         self.lora_ids.fill(NO_LORA_ID)
 
         # Sampling parameters.
-        self.temperature = self._make_param(self.max_num_reqs, torch.float32)
-        self.top_p = self._make_param(self.max_num_reqs, torch.float32)
-        self.top_k = self._make_param(self.max_num_reqs, torch.int32)
-        self.min_p = self._make_param(self.max_num_reqs, torch.float32)
-        self.repetition_penalty = self._make_param(self.max_num_reqs, torch.float32)
-        self.frequency_penalty = self._make_param(self.max_num_reqs, torch.float32)
-        self.presence_penalty = self._make_param(self.max_num_reqs, torch.float32)
-        self.seeds = self._make_param(self.max_num_reqs, torch.int64)
+        self.temperature = UvaBackedTensor(self.max_num_reqs, dtype=torch.float32)
+        self.top_p = UvaBackedTensor(self.max_num_reqs, dtype=torch.float32)
+        self.top_k = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)
+        self.min_p = UvaBackedTensor(self.max_num_reqs, dtype=torch.float32)
+        self.repetition_penalty = UvaBackedTensor(
+            self.max_num_reqs, dtype=torch.float32
+        )
+        self.frequency_penalty = UvaBackedTensor(self.max_num_reqs, dtype=torch.float32)
+        self.presence_penalty = UvaBackedTensor(self.max_num_reqs, dtype=torch.float32)
+        self.seeds = UvaBackedTensor(self.max_num_reqs, dtype=torch.int64)
 
         self.num_logprobs = np.empty(self.max_num_reqs, dtype=np.int32)
         # -1 means no logprobs are requested.
@@ -111,13 +111,7 @@ class RequestState:
             self.max_num_reqs, self.vocab_size, dtype=torch.int32, device=self.device
         )
 
-    def _make_param(self, size: int, dtype: torch.dtype) -> "Param":
-        return Param(size, dtype=dtype, device=self.device, pin_memory=self.pin_memory)
-
-    def _make_buffer(self, size: int, dtype: torch.dtype) -> CpuGpuBuffer:
-        return CpuGpuBuffer(
-            size, dtype=dtype, device=self.device, pin_memory=self.pin_memory
-        )
+        self._penalties_reqs: list[int] = []
 
     @property
     def num_reqs(self) -> int:
@@ -144,12 +138,9 @@ class RequestState:
             f"prefill_len {prefill_len} < prompt_len {prompt_len}"
         )
         self.prefill_len.np[req_idx] = prefill_len
-        self.prefill_token_ids.np[req_idx, :prefill_len] = prefill_token_ids
-
+        self.prefill_token_ids.stage_write(req_idx, 0, prefill_token_ids)
         self.num_computed_prefill_tokens[req_idx] = num_computed_tokens
-        # FIXME(woosuk): This triggers a GPU operation whenever adding a new request.
-        # Optimize this.
-        self.num_computed_tokens[req_idx] = num_computed_tokens
+        self.num_computed_tokens.stage_write_elem(req_idx, num_computed_tokens)
 
         if lora_request is not None:
             self.lora_ids[req_idx] = lora_request.lora_int_id
@@ -169,13 +160,7 @@ class RequestState:
         self.presence_penalty.np[req_idx] = sampling_params.presence_penalty
 
         if use_penalty(sampling_params):
-            bincount(
-                self.prefill_token_ids.gpu[req_idx],
-                prefill_len,
-                prompt_len,
-                self.prompt_bin_mask[req_idx],
-                self.output_bin_counts[req_idx],
-            )
+            self._penalties_reqs.append(req_idx)
 
         if sampling_params.seed is not None:
             seed = sampling_params.seed
@@ -193,6 +178,22 @@ class RequestState:
         needs_prompt_logprobs = sampling_params.prompt_logprobs is not None
         self.needs_prompt_logprobs[req_idx] = needs_prompt_logprobs
 
+    def apply_staged_writes(self) -> None:
+        self.prefill_len.copy_to_uva()
+        self.prefill_token_ids.apply_write()
+        self.num_computed_tokens.apply_write()
+
+        # TODO(woosuk): Optimize this.
+        for req_idx in self._penalties_reqs:
+            bincount(
+                self.prefill_token_ids.gpu[req_idx],
+                int(self.prefill_len.np[req_idx]),
+                int(self.prompt_len[req_idx]),
+                self.prompt_bin_mask[req_idx],
+                self.output_bin_counts[req_idx],
+            )
+        self._penalties_reqs.clear()
+
     def remove_request(self, req_id: str) -> None:
         self.extra_data.pop(req_id, None)
         req_idx = self.req_id_to_index.pop(req_id, None)
@@ -208,30 +209,25 @@ class RequestState:
         idx_mapping_np: np.ndarray,
         pos: torch.Tensor,
     ) -> SamplingMetadata:
-        temperature = self.temperature.np[idx_mapping_np]
-        temperature = self.temperature.copy_np_to_gpu(temperature)
+        temperature = self.temperature.copy_to_uva()
 
         top_p = self.top_p.np[idx_mapping_np]
         no_top_p = np.all(top_p == 1.0)
-        top_p = self.top_p.copy_np_to_gpu(top_p) if not no_top_p else None
+        top_p = self.top_p.copy_to_uva()[idx_mapping] if not no_top_p else None
 
         top_k = self.top_k.np[idx_mapping_np]
         no_top_k = np.all(top_k == self.vocab_size)
-        top_k = self.top_k.copy_np_to_gpu(top_k) if not no_top_k else None
+        top_k = self.top_k.copy_to_uva()[idx_mapping] if not no_top_k else None
 
         min_p = self.min_p.np[idx_mapping_np]
         no_min_p = np.all(min_p == 0.0)
-        min_p = self.min_p.copy_np_to_gpu(min_p) if not no_min_p else None
+        min_p = self.min_p.copy_to_uva() if not no_min_p else None
 
-        rep_penalty = self.repetition_penalty.np[idx_mapping_np]
-        rep_penalty = self.repetition_penalty.copy_np_to_gpu(rep_penalty)
-        freq_penalty = self.frequency_penalty.np[idx_mapping_np]
-        freq_penalty = self.frequency_penalty.copy_np_to_gpu(freq_penalty)
-        pres_penalty = self.presence_penalty.np[idx_mapping_np]
-        pres_penalty = self.presence_penalty.copy_np_to_gpu(pres_penalty)
+        rep_penalty = self.repetition_penalty.copy_to_uva()
+        freq_penalty = self.frequency_penalty.copy_to_uva()
+        pres_penalty = self.presence_penalty.copy_to_uva()
 
-        seeds = self.seeds.np[idx_mapping_np]
-        seeds = self.seeds.copy_np_to_gpu(seeds)
+        seeds = self.seeds.copy_to_uva()
 
         num_logprobs = self.num_logprobs[idx_mapping_np]
         max_num_logprobs: int | None = int(np.max(num_logprobs))
@@ -239,6 +235,7 @@ class RequestState:
             max_num_logprobs = None
 
         return SamplingMetadata(
+            idx_mapping=idx_mapping,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
@@ -246,12 +243,11 @@ class RequestState:
             repetition_penalty=rep_penalty,
             frequency_penalty=freq_penalty,
             presence_penalty=pres_penalty,
+            prompt_bin_mask=self.prompt_bin_mask,
+            output_bin_counts=self.output_bin_counts,
             seeds=seeds,
             pos=pos,
             max_num_logprobs=max_num_logprobs,
-            idx_mapping=idx_mapping,
-            prompt_bin_mask=self.prompt_bin_mask,
-            output_bin_counts=self.output_bin_counts,
         )
 
     def make_lora_inputs(
@@ -272,42 +268,12 @@ class RequestState:
         return prompt_lora_mapping, token_lora_mapping, active_lora_requests
 
 
-class Param:
-    def __init__(
-        self,
-        size: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        pin_memory: bool,
-    ):
-        self.buffer = CpuGpuBuffer(
-            size,
-            dtype=dtype,
-            device=device,
-            pin_memory=pin_memory,
-        )
-        self.np = np.zeros_like(self.buffer.np)
-
-    def copy_np_to_gpu(self, x: np.ndarray) -> torch.Tensor:
-        n = x.shape[0]
-        self.buffer.np[:n] = x
-        return self.buffer.copy_to_gpu(n)
-
-
 @dataclass
 class ExtraData:
     lora_request: LoRARequest | None
     in_progress_prompt_logprobs: list[LogprobsTensors] = field(default_factory=list)
 
 
-class UvaBuffer:
-    def __init__(self, *size: int | torch.SymInt, dtype: torch.dtype):
-        assert is_uva_available()
-        self.cpu = torch.zeros(*size, dtype=dtype, device="cpu", pin_memory=True)
-        self.np = self.cpu.numpy()
-        self.gpu = get_cuda_view_from_cpu_tensor(self.cpu)
-
-
 def use_penalty(sampling_params: SamplingParams) -> bool:
     return (
         sampling_params.repetition_penalty != 1.0
diff --git a/vllm/v1/worker/gpu/structured_outputs.py b/vllm/v1/worker/gpu/structured_outputs.py
index 83051b0ed33ffda3f98a0361c4b5a0ed4d0bb2e8..2eaadbb0b221877c0bcebe3705b600b6c6b73e2a 100644
--- a/vllm/v1/worker/gpu/structured_outputs.py
+++ b/vllm/v1/worker/gpu/structured_outputs.py
@@ -4,38 +4,65 @@ import numpy as np
 import torch
 
 from vllm.triton_utils import tl, triton
-from vllm.v1.worker.gpu.input_batch import InputBuffers
+from vllm.utils.math_utils import cdiv
+from vllm.v1.worker.gpu.buffer_utils import UvaBufferPool
+from vllm.v1.worker.gpu.input_batch import InputBatch
 
 
-def apply_grammar_bitmask(
-    logits: torch.Tensor,
-    req_ids: list[str],
-    grammar_req_ids: list[str],
-    grammar_bitmask: np.ndarray,
-    input_buffers: InputBuffers,
-) -> None:
-    input_buffers.grammar_bitmask.np[: grammar_bitmask.shape[0]] = grammar_bitmask
-    input_buffers.grammar_bitmask.copy_to_gpu(grammar_bitmask.shape[0])
+class StructuredOutputsWorker:
+    def __init__(
+        self,
+        max_num_logits: int,
+        vocab_size: int,
+    ):
+        # NOTE(woosuk): Here, we use UvaBufferPool instead of UvaBackedTensor
+        # to save a unnecessary CPU-to-CPU copy.
+        self.logits_indices = UvaBufferPool(max_num_logits, torch.int32)
+        self.grammar_bitmask = UvaBufferPool(
+            (max_num_logits, cdiv(vocab_size, 32)), torch.int32
+        )
 
-    batch_size = logits.shape[0]
-    grammar_req_id_to_idx = {req_id: i for i, req_id in enumerate(grammar_req_ids)}
-    # logits -> bitmask mapping
-    mapping = [grammar_req_id_to_idx.get(req_id, -1) for req_id in req_ids]
-    input_buffers.bitmask_indices.np[:batch_size] = mapping
-    input_buffers.bitmask_indices.copy_to_gpu(batch_size)
+    def apply_grammar_bitmask(
+        self,
+        logits: torch.Tensor,
+        input_batch: InputBatch,
+        grammar_req_ids: list[str],
+        grammar_bitmask: np.ndarray,
+    ) -> None:
+        if not grammar_req_ids:
+            return
 
-    vocab_size = logits.shape[-1]
-    BLOCK_SIZE = 8192
-    grid = (batch_size, triton.cdiv(vocab_size, BLOCK_SIZE))
-    _apply_grammar_bitmask_kernel[grid](
-        logits,
-        logits.stride(0),
-        input_buffers.grammar_bitmask.gpu,
-        input_buffers.grammar_bitmask.gpu.stride(0),
-        input_buffers.bitmask_indices.gpu,
-        vocab_size,
-        BLOCK_SIZE=BLOCK_SIZE,
-    )
+        # Construct bitmask -> logits mapping
+        mapping: list[int] = []
+        req_ids = input_batch.req_ids
+        cu_num_logits = input_batch.cu_num_logits_np.tolist()
+        req_id_to_idx = {req_id: i for i, req_id in enumerate(req_ids)}
+        for grammar_req_id in grammar_req_ids:
+            req_idx = req_id_to_idx[grammar_req_id]
+            logits_start_idx = cu_num_logits[req_idx]
+            logits_end_idx = cu_num_logits[req_idx + 1]
+            mapping.extend(range(logits_start_idx, logits_end_idx))
+        # Copy the mapping.
+        mapping_np = np.array(mapping, dtype=np.int32)
+        logits_indices = self.logits_indices.copy_to_uva(mapping_np)
+
+        # Copy the bitmask.
+        bitmask = self.grammar_bitmask.copy_to_uva(grammar_bitmask)
+
+        num_masks = bitmask.shape[0]
+        assert num_masks == len(mapping)
+        vocab_size = logits.shape[-1]
+        BLOCK_SIZE = 8192
+        grid = (num_masks, triton.cdiv(vocab_size, BLOCK_SIZE))
+        _apply_grammar_bitmask_kernel[grid](
+            logits,
+            logits.stride(0),
+            logits_indices,
+            bitmask,
+            bitmask.stride(0),
+            vocab_size,
+            BLOCK_SIZE=BLOCK_SIZE,
+        )
 
 
 # Adapted from
@@ -44,17 +71,14 @@ def apply_grammar_bitmask(
 def _apply_grammar_bitmask_kernel(
     logits_ptr,
     logits_stride,
+    logits_indices_ptr,
     bitmask_ptr,
     bitmask_stride,
-    bitmask_indices_ptr,
     vocab_size,
     BLOCK_SIZE: tl.constexpr,
 ):
-    logits_idx = tl.program_id(0)
-    bitmask_idx = tl.load(bitmask_indices_ptr + logits_idx)
-    if bitmask_idx == -1:
-        # No bitmask to apply.
-        return
+    bitmask_idx = tl.program_id(0)
+    logits_idx = tl.load(logits_indices_ptr + bitmask_idx)
 
     # Load the bitmask.
     block_id = tl.program_id(1)
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index ead7a3619dea598229a8165c3e2d9f6635eef458..662badeb5f1a73e26fbb4b4edd23ce59619c6ec8 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -22,7 +22,6 @@ from vllm.v1.sample.logits_processor import (
     MoveDirectionality,
 )
 from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice
 from vllm.v1.worker.block_table import MultiGroupBlockTable
 
@@ -128,7 +127,6 @@ class InputBatch:
         # allocation if max_model_len is big.
         # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
         self.req_prompt_embeds: dict[int, torch.Tensor] = {}
-        self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_computed_tokens_cpu_tensor = torch.zeros(
@@ -177,9 +175,6 @@ class InputBatch:
         self.top_k_cpu = self.top_k_cpu_tensor.numpy()
         self.top_k_reqs: set[str] = set()
 
-        # IDs of requests which do not support spec decoding
-        self.spec_decode_unsupported_reqs: set[str] = set()
-
         # Frequency penalty related data structures
         self.frequency_penalties = torch.empty(
             (max_num_reqs,), dtype=torch.float, device=device
@@ -340,9 +335,6 @@ class InputBatch:
             self.req_prompt_embeds[req_index] = request.prompt_embeds
         self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids
         self.is_token_ids[req_index, start_idx:end_idx] = True
-        # Number of token ids in prompt (token_ids_cpu or prompt_embeds).
-        # NOTE(woosuk): This may include spec decode tokens.
-        self.num_tokens[req_index] = request.num_tokens
         # Number of tokens without spec decode tokens.
         self.num_tokens_no_spec[req_index] = request.num_tokens
 
@@ -350,8 +342,6 @@ class InputBatch:
         self.block_table.add_row(request.block_ids, req_index)
 
         if sampling_params := request.sampling_params:
-            if self.is_spec_decode and is_spec_decode_unsupported(sampling_params):
-                self.spec_decode_unsupported_reqs.add(req_id)
             if sampling_params.sampling_type == SamplingType.GREEDY:
                 # Should avoid division by zero later when apply_temperature.
                 self.temperature_cpu[req_index] = 0.0
@@ -450,6 +440,32 @@ class InputBatch:
 
         return req_index
 
+    def update_req_spec_token_ids(
+        self, request: CachedRequestState, scheduled_spec_tokens: dict[str, list[int]]
+    ) -> None:
+        req_id = request.req_id
+        req_index = self.req_id_to_index[req_id]
+        cur_spec_token_ids = self.spec_token_ids[req_index]
+        # When speculative decoding is used with structured output,
+        # the scheduler can drop draft tokens that do not
+        # conform to the schema. This can result in
+        # scheduler_output.scheduled_spec_decode_tokens being empty,
+        # even when speculative decoding is enabled.
+        cur_spec_token_ids.clear()
+        spec_token_ids = scheduled_spec_tokens.get(req_id, ())
+        num_spec_tokens = len(spec_token_ids)
+        request.prev_num_draft_len = num_spec_tokens
+        if not spec_token_ids:
+            return
+
+        # For async scheduling, token_ids_cpu assigned from
+        # spec_token_ids are placeholders and will be overwritten in
+        # _prepare_input_ids.
+        start_index = self.num_tokens_no_spec[req_index]
+        end_token_index = start_index + num_spec_tokens
+        self.token_ids_cpu[req_index, start_index:end_token_index] = spec_token_ids
+        cur_spec_token_ids.extend(spec_token_ids)
+
     def remove_request(self, req_id: str) -> int | None:
         """This method must always be followed by a call to condense().
 
@@ -488,7 +504,6 @@ class InputBatch:
         self.random_reqs.discard(req_id)
         self.top_p_reqs.discard(req_id)
         self.top_k_reqs.discard(req_id)
-        self.spec_decode_unsupported_reqs.discard(req_id)
         self.frequency_penalties_reqs.discard(req_id)
         self.presence_penalties_reqs.discard(req_id)
         self.repetition_penalties_reqs.discard(req_id)
@@ -522,10 +537,6 @@ class InputBatch:
             self.req_id_to_index[old_id_i2],
             self.req_id_to_index[old_id_i1],
         )
-        self.num_tokens[i1], self.num_tokens[i2] = (
-            self.num_tokens[i2],
-            self.num_tokens[i1],
-        )
         self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] = (
             self.num_tokens_no_spec[i2],
             self.num_tokens_no_spec[i1],
@@ -661,17 +672,16 @@ class InputBatch:
             self.req_output_token_ids[last_req_index] = None
             self.req_id_to_index[req_id] = empty_index
 
-            if last_req_index != empty_index:
-                (
-                    self.spec_token_ids[last_req_index],
-                    self.spec_token_ids[empty_index],
-                ) = (
-                    self.spec_token_ids[empty_index],
-                    self.spec_token_ids[last_req_index],
-                )
-                self.spec_token_ids[last_req_index].clear()
+            num_tokens = self.num_tokens_no_spec[last_req_index] + len(
+                self.spec_token_ids[last_req_index]
+            )
+
+            (self.spec_token_ids[last_req_index], self.spec_token_ids[empty_index]) = (
+                self.spec_token_ids[empty_index],
+                self.spec_token_ids[last_req_index],
+            )
+            self.spec_token_ids[last_req_index].clear()
 
-            num_tokens = self.num_tokens[last_req_index]
             self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
                 last_req_index, :num_tokens
             ]
@@ -682,7 +692,6 @@ class InputBatch:
                 self.req_prompt_embeds[empty_index] = self.req_prompt_embeds.pop(
                     last_req_index
                 )
-            self.num_tokens[empty_index] = num_tokens
             self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[
                 last_req_index
             ]
@@ -839,7 +848,7 @@ class InputBatch:
             presence_penalties=self.presence_penalties[:num_reqs],
             repetition_penalties=self.repetition_penalties[:num_reqs],
             output_token_ids=output_token_ids,
-            spec_token_ids=cast(list[list[int]], self.spec_token_ids),
+            spec_token_ids=self.spec_token_ids,
             no_penalties=self.no_penalties,
             allowed_token_ids_mask=allowed_token_ids_mask,
             bad_words_token_ids=self.bad_words_token_ids,
@@ -949,9 +958,40 @@ class InputBatch:
             if sampled_token_ids is None:
                 assert self.async_copy_ready_event is not None
                 self.async_copy_ready_event.synchronize()
-                sampled_token_ids = self.sampled_token_ids_cpu.squeeze(-1).tolist()
-            # Replace placeholder token id with actual sampled id.
-            req_output_token_ids[-1] = sampled_token_ids[prev_index]
+                sampled_token_ids = self.sampled_token_ids_cpu.tolist()
+            # Replace placeholder token id(s) with actual sampled id(s).
+            new_ids: list[int] = sampled_token_ids[prev_index]
+            if not new_ids:
+                continue
+            num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1)
+            # Also account for case where there may be a smaller number of
+            # output placeholders (tokens can be discarded after a kv-load failure).
+            first_placeholder = req_output_token_ids.index(-1)
+            num_placeholders = len(req_output_token_ids) - first_placeholder
+            num_to_replace = min(num_sampled_ids, num_placeholders)
+            del new_ids[num_to_replace:]
+            end_index = first_placeholder + num_to_replace
+            req_output_token_ids[first_placeholder:end_index] = new_ids
+
+    def update_async_spec_token_ids(self, draft_token_ids: list[list[int]]) -> None:
+        """
+        In async scheduling case, update spec_token_ids in sampling metadata with
+        real draft token ids from prior step. This is called right before they are
+        needed by the rejection sampler for penalty/bad_words computation.
+        """
+        if not draft_token_ids or not self.prev_req_id_to_index:
+            return
+
+        if (spec_token_ids := self.sampling_metadata.spec_token_ids) is not None:
+            for req_id, spec_ids in zip(self.req_ids, spec_token_ids):
+                if spec_ids:
+                    prev_index = self.prev_req_id_to_index.get(req_id)
+                    if prev_index is not None:
+                        draft_ids = draft_token_ids[prev_index]
+                        if draft_ids:
+                            del draft_ids[len(spec_ids) :]
+                            spec_ids.clear()
+                            spec_ids.extend(draft_ids)
 
     @property
     def num_reqs(self) -> int:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index fd403ccd4df9d0a194e25dc98994df225c073b5d..0867a18799ec814db2de1689dd13e10860e02eed 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -20,12 +20,6 @@ import torch.nn as nn
 from tqdm import tqdm
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionMetadata,
-    AttentionType,
-    MultipleOf,
-)
 from vllm.attention.layer import Attention, MLAAttention
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper
@@ -54,18 +48,24 @@ from vllm.forward_context import (
     set_forward_context,
 )
 from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping, LoRAMappingType
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
+    RoutedExpertsCapturer,
+)
 from vllm.model_executor.layers.rotary_embedding import (
     MRotaryEmbedding,
     XDRotaryEmbedding,
 )
 from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
 from vllm.model_executor.models.interfaces import (
+    MultiModalEmbeddings,
     SupportsMRoPE,
     SupportsMultiModal,
     SupportsXDRoPE,
     is_mixture_of_experts,
     supports_eagle3,
+    supports_mm_encoder_only,
     supports_mrope,
     supports_multimodal_pruning,
     supports_transcription,
@@ -90,8 +90,7 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.math_utils import cdiv, round_up
-from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
 from vllm.utils.nvtx_pytorch_hooks import PytHooks
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import (
@@ -99,11 +98,17 @@ from vllm.utils.torch_utils import (
     kv_cache_dtype_str_to_dtype,
     supports_dynamo,
 )
-from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
+    AttentionMetadata,
     AttentionMetadataBuilder,
+    AttentionType,
     CommonAttentionMetadata,
+    MultipleOf,
+)
+from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
+from vllm.v1.attention.backends.utils import (
     create_fast_prefill_custom_backend,
     get_dcp_local_seq_lens,
     reorder_batch_to_split_decodes_and_prefills,
@@ -235,22 +240,65 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
             valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist()
             for i in self._invalid_req_indices:
                 valid_sampled_token_ids[i].clear()
-            cu_num_tokens = None
+            logprobs_lists = None
+            if self._logprobs_tensors_cpu is not None:
+                logprobs_lists = self._logprobs_tensors_cpu.tolists()
         else:
-            valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
+            valid_sampled_token_ids, logprobs_lists = RejectionSampler.parse_output(
                 self.sampled_token_ids_cpu,
                 self.vocab_size,
                 self._invalid_req_indices,
-                return_cu_num_tokens=self._logprobs_tensors_cpu is not None,
+                logprobs_tensors=self._logprobs_tensors_cpu,
             )
 
         output = self._model_runner_output
         output.sampled_token_ids = valid_sampled_token_ids
-        if self._logprobs_tensors_cpu:
-            output.logprobs = self._logprobs_tensors_cpu.tolists(cu_num_tokens)
+        output.logprobs = logprobs_lists
         return output
 
 
+class AsyncGPUPoolingModelRunnerOutput(AsyncModelRunnerOutput):
+    def __init__(
+        self,
+        model_runner_output: ModelRunnerOutput,
+        raw_pooler_output: PoolerOutput,
+        finished_mask: list[bool],
+        async_output_copy_stream: torch.cuda.Stream,
+    ):
+        self._model_runner_output = model_runner_output
+
+        # Event on the copy stream so we can synchronize the non-blocking copy.
+        self.async_copy_ready_event = torch.Event()
+
+        # Keep a reference to the device tensors to avoid them being
+        # deallocated until we finish copying it to the host.
+        self._raw_pooler_output = raw_pooler_output
+
+        # Initiate the copy on a separate stream, but do not synchronize it.
+        default_stream = torch.cuda.current_stream()
+        with torch.cuda.stream(async_output_copy_stream):
+            async_output_copy_stream.wait_stream(default_stream)
+            raw_pooler_output_cpu = json_map_leaves(
+                lambda x: None if x is None else x.to("cpu", non_blocking=True),
+                self._raw_pooler_output,
+            )
+            self.async_copy_ready_event.record()
+            self._model_runner_output.pooler_output = [
+                out if include else None
+                for out, include in zip(raw_pooler_output_cpu, finished_mask)
+            ]
+
+    def get_output(self) -> ModelRunnerOutput:
+        """Copy the device tensors to the host and return a ModelRunnerOutput.
+        This function blocks until the copy is finished.
+        """
+        self.async_copy_ready_event.synchronize()
+
+        # Release the device tensors once the copy has completed.
+        del self._raw_pooler_output
+        return self._model_runner_output
+
+
 class ExecuteModelState(NamedTuple):
     """Ephemeral cached state transferred between execute_model() and
     sample_tokens(), after execute_model() returns None."""
@@ -323,7 +371,7 @@ class GPUModelRunner(
         # https://github.com/vllm-project/vllm/issues/18019
         self.broadcast_pp_output = (
             self.parallel_config.distributed_executor_backend == "external_launcher"
-            and len(get_pp_group().ranks) > 0
+            and len(get_pp_group().ranks) > 1
         )
 
         # Model-related.
@@ -351,6 +399,9 @@ class GPUModelRunner(
         else:
             self.max_encoder_len = 0
 
+        # Async scheduling
+        self.use_async_scheduling = self.scheduler_config.async_scheduling
+
         # Sampler
         self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode)
 
@@ -408,6 +459,11 @@ class GPUModelRunner(
         self.num_spec_tokens = 0
         if self.speculative_config:
             self.num_spec_tokens = self.speculative_config.num_speculative_tokens
+            draft_config = self.speculative_config.draft_model_config
+            if draft_config is not None and draft_config.max_model_len is not None:
+                self.effective_drafter_max_model_len = draft_config.max_model_len
+            else:
+                self.effective_drafter_max_model_len = self.max_model_len
 
         # Request states.
         self.requests: dict[str, CachedRequestState] = {}
@@ -455,7 +511,6 @@ class GPUModelRunner(
             cp_kv_cache_interleave_size=self.parallel_config.cp_kv_cache_interleave_size,
         )
 
-        self.use_async_scheduling = self.scheduler_config.async_scheduling
         # Separate cuda stream for overlapping transfer of sampled token ids from
         # GPU to CPU when async scheduling is enabled.
         self.async_output_copy_stream: torch.cuda.Stream | None = None
@@ -509,7 +564,13 @@ class GPUModelRunner(
 
         # Only relevant for multimodal models
         if self.supports_mm_inputs:
-            self.is_mm_embed = self._make_buffer(self.max_num_tokens, dtype=torch.bool)
+            # Double buffer to avoid race condition: previous iteration's async
+            # copy may still be reading from CPU while current iteration writes.
+            self.is_mm_embed_buffers = [
+                self._make_buffer(self.max_num_tokens, dtype=torch.bool),
+                self._make_buffer(self.max_num_tokens, dtype=torch.bool),
+            ]
+            self.is_mm_embed_idx = 0
 
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
@@ -581,6 +642,7 @@ class GPUModelRunner(
 
         # Cached outputs.
         self._draft_token_ids: list[list[int]] | torch.Tensor | None = None
+        self._draft_token_req_ids: list[str] | None = None
         self.transfer_event = torch.Event()
         self.sampled_token_ids_pinned_cpu = torch.empty(
             (self.max_num_reqs, 1),
@@ -593,21 +655,43 @@ class GPUModelRunner(
         # with dedicated stream for overlapping and event for coordination.
         self.valid_sampled_token_count_event: torch.Event | None = None
         self.valid_sampled_token_count_copy_stream: torch.cuda.Stream | None = None
-        if self.use_async_scheduling and self.num_spec_tokens:
-            self.valid_sampled_token_count_event = torch.Event()
-            self.valid_sampled_token_count_copy_stream = torch.cuda.Stream()
-        self.valid_sampled_token_count_cpu = torch.empty(
-            self.max_num_reqs,
-            dtype=torch.int64,
-            device="cpu",
-            pin_memory=self.pin_memory,
-        )
+        # We also copy the drafted tokens to the CPU asynchronously,
+        # in case we need them for structured outputs.
+        self.draft_token_ids_event: torch.Event | None = None
+        self.draft_token_ids_copy_stream: torch.cuda.Stream | None = None
+        self.valid_sampled_token_count_cpu: torch.Tensor | None = None
+        self.draft_token_ids_cpu: torch.Tensor | None = None
+        if self.num_spec_tokens:
+            self.draft_token_ids_event = torch.Event()
+            self.draft_token_ids_copy_stream = torch.cuda.Stream()
+            self.draft_token_ids_cpu = torch.empty(
+                (self.max_num_reqs, self.num_spec_tokens),
+                dtype=torch.int64,
+                device="cpu",
+                pin_memory=self.pin_memory,
+            )
+            if self.use_async_scheduling:
+                self.valid_sampled_token_count_event = torch.Event()
+                self.valid_sampled_token_count_copy_stream = torch.cuda.Stream()
+                self.valid_sampled_token_count_cpu = torch.empty(
+                    self.max_num_reqs,
+                    dtype=torch.int64,
+                    device="cpu",
+                    pin_memory=self.pin_memory,
+                )
 
         # Ephemeral state transferred between execute_model() and sample_tokens().
         self.execute_model_state: ExecuteModelState | None = None
         self.kv_connector_output: KVConnectorOutput | None = None
         self.layerwise_nvtx_hooks_registered = False
 
+    def update_max_model_len(self, max_model_len: int) -> None:
+        self.max_model_len = max_model_len
+        if self.speculative_config:
+            draft_config = self.speculative_config.draft_model_config
+            if draft_config is None or draft_config.max_model_len is None:
+                self.effective_drafter_max_model_len = self.max_model_len
+
     def reset_mm_cache(self) -> None:
         if self.mm_budget:
             self.mm_budget.reset_cache()
@@ -681,7 +765,7 @@ class GPUModelRunner(
             with_numpy=numpy,
         )
 
-    def _init_model_kwargs(self, num_tokens: int):
+    def _init_model_kwargs(self):
         model_kwargs = dict[str, Any]()
 
         if not self.is_pooling_model:
@@ -859,6 +943,7 @@ class GPUModelRunner(
         # Update the states of the running/resumed requests.
         is_last_rank = get_pp_group().is_last_rank
         req_data = scheduler_output.scheduled_cached_reqs
+        scheduled_spec_tokens = scheduler_output.scheduled_spec_decode_tokens
 
         # Wait until valid_sampled_tokens_count is copied to cpu,
         # then use it to update actual num_computed_tokens of each request.
@@ -872,20 +957,20 @@ class GPUModelRunner(
             num_output_tokens = req_data.num_output_tokens[i]
             req_index = self.input_batch.req_id_to_index.get(req_id)
 
-            # prev_num_draft_len is used in async scheduling mode with
-            # spec decode. it indicates if need to update num_computed_tokens
-            # of the request. for example:
-            # fist step: num_computed_tokens = 0, spec_tokens = [],
-            # prev_num_draft_len = 0.
-            # second step: num_computed_tokens = 100(prompt lenth),
-            # spec_tokens = [a,b], prev_num_draft_len = 0.
-            # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d],
-            # prev_num_draft_len = 2.
-            # num_computed_tokens in first step and second step does't contain
-            # the spec tokens length, but in third step it contains the
-            # spec tokens length. we only need to update num_computed_tokens
-            # when prev_num_draft_len > 0.
-            if req_state.prev_num_draft_len:
+            if req_state.prev_num_draft_len and self.use_async_scheduling:
+                # prev_num_draft_len is used in async scheduling mode with
+                # spec decode. it indicates if need to update num_computed_tokens
+                # of the request. for example:
+                # fist step: num_computed_tokens = 0, spec_tokens = [],
+                # prev_num_draft_len = 0.
+                # second step: num_computed_tokens = 100(prompt lenth),
+                # spec_tokens = [a,b], prev_num_draft_len = 0.
+                # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d],
+                # prev_num_draft_len = 2.
+                # num_computed_tokens in first step and second step does't contain
+                # the spec tokens length, but in third step it contains the
+                # spec tokens length. we only need to update num_computed_tokens
+                # when prev_num_draft_len > 0.
                 if req_index is None:
                     req_state.prev_num_draft_len = 0
                 else:
@@ -923,7 +1008,6 @@ class GPUModelRunner(
                         self.input_batch.num_prompt_tokens[req_index]
                         + num_output_tokens
                     )
-                    self.input_batch.num_tokens[req_index] = end_idx
                     self.input_batch.num_tokens_no_spec[req_index] = end_idx
 
             # Update the block IDs.
@@ -968,46 +1052,15 @@ class GPUModelRunner(
                     req_index, start_token_index:end_token_index
                 ] = new_token_ids
                 self.input_batch.num_tokens_no_spec[req_index] = end_token_index
-                self.input_batch.num_tokens[req_index] = end_token_index
 
             # Add spec_token_ids to token_ids_cpu.
-            spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
-                req_id, []
-            )
-            num_spec_tokens = len(spec_token_ids)
-            # For async scheduling, token_ids_cpu assigned from
-            # spec_token_ids are placeholders and will be overwritten in
-            # _prepare_input_ids.
-            if num_spec_tokens:
-                start_index = self.input_batch.num_tokens_no_spec[req_index]
-                end_token_index = start_index + num_spec_tokens
-                self.input_batch.token_ids_cpu[
-                    req_index, start_index:end_token_index
-                ] = spec_token_ids
-                # NOTE(woosuk): `num_tokens` here may include spec tokens.
-                self.input_batch.num_tokens[req_index] += num_spec_tokens
-
-            # When speculative decoding is used with structured output,
-            # the scheduler can drop draft tokens that do not
-            # conform to the schema. This can result in
-            # scheduler_output.scheduled_spec_decode_tokens being empty,
-            # even when speculative decoding is enabled.
-            self.input_batch.spec_token_ids[req_index].clear()
-            self.input_batch.spec_token_ids[req_index].extend(spec_token_ids)
-
-            # there are no draft tokens with async scheduling,
-            # we clear the spec_decoding info in scheduler_output and
-            # use normal sampling but rejection_sampling.
-            if self.use_async_scheduling:
-                req_state.prev_num_draft_len = num_spec_tokens
-                if num_spec_tokens and self._draft_token_ids is None:
-                    scheduler_output.total_num_scheduled_tokens -= num_spec_tokens
-                    scheduler_output.num_scheduled_tokens[req_id] -= num_spec_tokens
-                    scheduler_output.scheduled_spec_decode_tokens.pop(req_id, None)
+            self.input_batch.update_req_spec_token_ids(req_state, scheduled_spec_tokens)
+
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
         for request in reqs_to_add:
             self.input_batch.add_request(request)
+            self.input_batch.update_req_spec_token_ids(request, scheduled_spec_tokens)
 
         # Condense the batched states if there are gaps left by removed requests
         self.input_batch.condense()
@@ -1027,7 +1080,7 @@ class GPUModelRunner(
         each sequence, and a shifting is done during the next iteration
         based on the number of accepted tokens.
         """
-        if not self.model_config.is_hybrid or not self.speculative_config:
+        if not self.speculative_config or not self.model_config.is_hybrid:
             return
 
         # Find the number of accepted tokens for each sequence.
@@ -1096,13 +1149,11 @@ class GPUModelRunner(
                     mm_kwargs.append(feature.data)
 
         # Input all modalities at once
-        model = cast(SupportsMultiModal, self.model)
         mm_kwargs_combined: BatchedTensorInputs = {}
         for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
             mm_kwargs,
             device=self.device,
             pin_memory=self.pin_memory,
-            merge_by_field_config=model.merge_by_field_config,
         ):
             mm_kwargs_combined.update(mm_kwargs_group)
 
@@ -1252,7 +1303,6 @@ class GPUModelRunner(
         # because input_ids dtype is torch.int32,
         # so convert draft_token_ids to torch.int32 here.
         draft_token_ids = self._draft_token_ids.to(dtype=torch.int32)
-        self._draft_token_ids = None
 
         self.input_ids.gpu.scatter_(
             dim=0,
@@ -1467,7 +1517,6 @@ class GPUModelRunner(
             # We will ignore the sampled tokens from the partial requests.
             # TODO: Support prompt logprobs.
             logits_indices = query_start_loc[1:] - 1
-            num_draft_tokens = None
             spec_decode_metadata = None
             num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
         else:
@@ -1484,14 +1533,11 @@ class GPUModelRunner(
             ) in scheduler_output.scheduled_spec_decode_tokens.items():
                 req_idx = self.input_batch.req_id_to_index[req_id]
                 num_draft_tokens[req_idx] = len(draft_token_ids)
-                num_decode_draft_tokens[req_idx] = (
-                    len(draft_token_ids)
-                    if (
-                        self.input_batch.num_computed_tokens_cpu[req_idx]
-                        >= self.input_batch.num_prompt_tokens[req_idx]
-                    )
-                    else -1
-                )
+                if (
+                    self.input_batch.num_computed_tokens_cpu[req_idx]
+                    >= self.input_batch.num_prompt_tokens[req_idx]
+                ):
+                    num_decode_draft_tokens[req_idx] = len(draft_token_ids)
             spec_decode_metadata = self._calc_spec_decode_metadata(
                 num_draft_tokens, cu_num_tokens
             )
@@ -1590,6 +1636,8 @@ class GPUModelRunner(
             return blk_table_tensor, slot_mapping
 
         block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
+        if self.model_config.enable_return_routed_experts:
+            self.slot_mapping = slot_mapping_gid_0[:num_tokens].cpu().numpy()
         cm_base = CommonAttentionMetadata(
             query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
             query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
@@ -1628,6 +1676,15 @@ class GPUModelRunner(
                 logits_indices
             )
 
+        # Cache attention metadata builds across hybrid KV-cache groups
+        # The only thing that changes between different hybrid KV-cache groups when the
+        # same metadata builder and KVCacheSpec is the same is the block table, so we
+        # can cache the attention metadata builds and just update the block table using
+        # `builder.update_block_table` if the builder supports it.
+        cached_attn_metadata: dict[
+            tuple[KVCacheSpec, type[AttentionMetadataBuilder]], AttentionMetadata
+        ] = {}
+
         def _build_attn_group_metadata(
             kv_cache_gid: int,
             attn_gid: int,
@@ -1635,13 +1692,18 @@ class GPUModelRunner(
             ubid: int | None = None,
         ) -> None:
             attn_group = self.attn_groups[kv_cache_gid][attn_gid]
+            builder = attn_group.get_metadata_builder(ubid or 0)
+            kv_cache_spec = kv_cache_groups[kv_cache_gid].kv_cache_spec
+            if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
+                kv_cache_spec = kv_cache_spec.kv_cache_specs[attn_group.layer_names[0]]
+            cache_key = (kv_cache_spec, type(builder))
+
             cascade_attn_prefix_len = (
                 cascade_attn_prefix_lens[kv_cache_gid][attn_gid]
                 if cascade_attn_prefix_lens
                 else 0
             )
 
-            builder = attn_group.get_metadata_builder(ubid or 0)
             extra_attn_metadata_args = {}
             if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
                 assert ubid is None, "UBatching not supported with GDN yet"
@@ -1656,12 +1718,23 @@ class GPUModelRunner(
                 attn_metadata_i = builder.build_for_cudagraph_capture(
                     common_attn_metadata
                 )
+            elif (
+                cache_key in cached_attn_metadata
+                and builder.supports_update_block_table
+            ):
+                attn_metadata_i = builder.update_block_table(
+                    cached_attn_metadata[cache_key],
+                    common_attn_metadata.block_table_tensor,
+                    common_attn_metadata.slot_mapping,
+                )
             else:
                 attn_metadata_i = builder.build(
                     common_prefix_len=cascade_attn_prefix_len,
                     common_attn_metadata=common_attn_metadata,
                     **extra_attn_metadata_args,
                 )
+                if builder.supports_update_block_table:
+                    cached_attn_metadata[cache_key] = attn_metadata_i
 
             if ubid is None:
                 assert isinstance(attn_metadata, dict)
@@ -2075,28 +2148,35 @@ class GPUModelRunner(
         ]
         return logits_indices_padded
 
-    def _batch_mm_kwargs_from_scheduler(
+    def _batch_mm_inputs_from_scheduler(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
-        """Batch multimodal kwargs from scheduled encoder inputs.
+    ) -> tuple[
+        list[str],
+        list[MultiModalKwargsItem],
+        list[tuple[str, PlaceholderRange]],
+    ]:
+        """Batch multimodal inputs from scheduled encoder inputs.
 
         Args:
             scheduler_output: The scheduler output containing scheduled encoder
                 inputs.
 
         Returns:
-            A tuple of (mm_kwargs, req_ids_pos) where:
-            - mm_kwargs: List of multimodal kwargs items to be batched
-            - mm_hashes_pos: List of (mm_hash, position_info) tuples
+            A tuple of (mm_hashes, mm_kwargs, mm_lora_refs) where:
+            - mm_hashes: List of multimodal hashes for each item
+            - mm_kwargs: List of multimodal kwargs for each item
+            - mm_lora_refs: List of (req_id, placeholder_range) for each item
         """
         scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
         if not scheduled_encoder_inputs:
-            return [], []
-        # Batch the multi-modal inputs.
+            return [], [], []
+
+        mm_hashes = list[str]()
         mm_kwargs = list[MultiModalKwargsItem]()
-        # list of tuple (mm_hash, position_info)
-        mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
+        # Multimodal LoRA reference info to map each multimodal item
+        # back to its request & position
+        mm_lora_refs = list[tuple[str, PlaceholderRange]]()
         for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
             req_state = self.requests[req_id]
 
@@ -2104,17 +2184,17 @@ class GPUModelRunner(
                 mm_feature = req_state.mm_features[mm_input_id]
                 if mm_feature.data is None:
                     continue
-                mm_hash = mm_feature.identifier
+
+                mm_hashes.append(mm_feature.identifier)
                 mm_kwargs.append(mm_feature.data)
-                mm_hashes_pos.append((mm_hash, mm_feature.mm_position))
+                mm_lora_refs.append((req_id, mm_feature.mm_position))
 
-        return mm_kwargs, mm_hashes_pos
+        return mm_hashes, mm_kwargs, mm_lora_refs
 
     def _execute_mm_encoder(
         self, scheduler_output: "SchedulerOutput"
     ) -> list[torch.Tensor]:
-        # Batch the multi-modal inputs using the helper method.
-        mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
+        mm_hashes, mm_kwargs, mm_lora_refs = self._batch_mm_inputs_from_scheduler(
             scheduler_output
         )
 
@@ -2129,13 +2209,70 @@ class GPUModelRunner(
         # multimodal inputs. The proper solution should be reordering the
         # encoder outputs.
         model = cast(SupportsMultiModal, self.model)
+
+        if self.lora_config and self.lora_manager.supports_tower_connector_lora():
+            # Build LoRA mappings independently for encoder inputs
+            # (encoder batch structure is different from main batch)
+            prompt_lora_mapping = []
+            token_lora_mapping = []
+            lora_requests = set()
+            encoder_token_counts = []
+
+            for req_id, pos_info in mm_lora_refs:
+                req_idx = self.input_batch.req_id_to_index[req_id]
+                lora_id = int(self.input_batch.request_lora_mapping[req_idx])
+
+                # Prefer pos_info.get_num_embeds to count precise MM embedding tokens.
+                num_tokens = self.model.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
+                    pos_info.get_num_embeds
+                )
+                prompt_lora_mapping.append(lora_id)
+                token_lora_mapping.extend([lora_id] * num_tokens)
+                encoder_token_counts.append(num_tokens)
+
+                if lora_id > 0:
+                    lora_request = self.input_batch.lora_id_to_lora_request.get(lora_id)
+                    if lora_request is not None:
+                        lora_requests.add(lora_request)
+
+            # Set tower adapter mapping
+            tower_mapping = LoRAMapping(
+                tuple(token_lora_mapping),
+                tuple(prompt_lora_mapping),
+                is_prefill=True,
+                type=LoRAMappingType.TOWER,
+            )
+            self.lora_manager.set_active_adapters(lora_requests, tower_mapping)
+
+            if hasattr(self.model, "get_num_mm_connector_tokens"):
+                post_op_counts = [
+                    self.model.get_num_mm_connector_tokens(num_tokens)  # type: ignore[attr-defined]
+                    for num_tokens in encoder_token_counts
+                ]
+
+                connector_token_mapping = np.repeat(
+                    np.array(prompt_lora_mapping, dtype=np.int32),
+                    np.array(post_op_counts, dtype=np.int32),
+                )
+                connector_mapping = LoRAMapping(
+                    index_mapping=tuple(connector_token_mapping.tolist()),
+                    prompt_mapping=tuple(prompt_lora_mapping),
+                    is_prefill=True,
+                    type=LoRAMappingType.CONNECTOR,
+                )
+
+                self.lora_manager.set_active_adapters(
+                    lora_requests,
+                    connector_mapping,
+                )
+
         encoder_outputs: list[torch.Tensor] = []
         for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
             mm_kwargs,
             device=self.device,
             pin_memory=self.pin_memory,
         ):
-            curr_group_outputs: list[torch.Tensor] = []
+            curr_group_outputs: MultiModalEmbeddings
 
             # EVS-related change.
             # (ekhvedchenia): Temporary hack to limit peak memory usage when
@@ -2151,6 +2288,7 @@ class GPUModelRunner(
                 and modality == "video"
                 and num_items > 1
             ):
+                curr_group_outputs_lst = list[torch.Tensor]()
                 for video_mm_kwargs_item in filter(
                     lambda item: item.modality == "video", mm_kwargs
                 ):
@@ -2166,7 +2304,9 @@ class GPUModelRunner(
                         **micro_batch_mm_inputs
                     )
 
-                    curr_group_outputs.extend(micro_batch_outputs)
+                    curr_group_outputs_lst.extend(micro_batch_outputs)
+
+                curr_group_outputs = curr_group_outputs_lst
             else:
                 # Run the encoder.
                 # `curr_group_outputs` is either of the following:
@@ -2175,7 +2315,7 @@ class GPUModelRunner(
                 # 2. A list or tuple (length: num_items) of tensors,
                 # each of shape (feature_size, hidden_size) in case the feature
                 # size is dynamic depending on the input multimodal items.
-                curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)  # type: ignore[assignment]
+                curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
 
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs,
@@ -2184,7 +2324,7 @@ class GPUModelRunner(
             encoder_outputs.extend(curr_group_outputs)
 
         # Cache the encoder outputs by mm_hash
-        for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs):
+        for mm_hash, output in zip(mm_hashes, encoder_outputs):
             self.encoder_cache[mm_hash] = output
             logger.debug("Finish execute for mm hash %s", mm_hash)
             self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash)
@@ -2198,8 +2338,13 @@ class GPUModelRunner(
     ) -> tuple[list[torch.Tensor], torch.Tensor]:
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
 
+        # Swap to the other buffer to avoid race condition with previous
+        # iteration's async copy that may still be reading from CPU.
+        self.is_mm_embed_idx = 1 - self.is_mm_embed_idx
+        is_mm_embed_buf = self.is_mm_embed_buffers[self.is_mm_embed_idx]
+
         mm_embeds = list[torch.Tensor]()
-        is_mm_embed = self.is_mm_embed.cpu
+        is_mm_embed = is_mm_embed_buf.cpu
         is_mm_embed[:total_num_scheduled_tokens] = False
 
         req_start_idx = 0
@@ -2277,7 +2422,7 @@ class GPUModelRunner(
             mm_embeds.extend(mm_embeds_req)
             req_start_idx += num_scheduled_tokens
 
-        is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens)
+        is_mm_embed = is_mm_embed_buf.copy_to_gpu(total_num_scheduled_tokens)
 
         if should_sync_mrope_positions:
             self._calc_mrope_positions(scheduler_output)
@@ -2387,45 +2532,60 @@ class GPUModelRunner(
         hidden_states: torch.Tensor,
         num_scheduled_tokens: int,
         num_scheduled_tokens_np: np.ndarray,
-    ) -> ModelRunnerOutput:
-        assert self.input_batch.num_reqs == len(self.input_batch.pooling_params), (
+        kv_connector_output: KVConnectorOutput | None,
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput:
+        num_reqs = self.input_batch.num_reqs
+        assert num_reqs == len(self.input_batch.pooling_params), (
             "Either all or none of the requests in a batch must be pooling request"
         )
 
         hidden_states = hidden_states[:num_scheduled_tokens]
-        seq_lens_cpu = self.seq_lens.cpu[: self.input_batch.num_reqs]
+        seq_lens_cpu = self.seq_lens.cpu[:num_reqs]
 
         pooling_metadata = self.input_batch.get_pooling_metadata()
         pooling_metadata.build_pooling_cursor(
-            num_scheduled_tokens_np.tolist(), seq_lens_cpu, device=hidden_states.device
+            num_scheduled_tokens_np, seq_lens_cpu, device=hidden_states.device
         )
 
         model = cast(VllmModelForPooling, self.model)
         raw_pooler_output: PoolerOutput = model.pooler(
-            hidden_states=hidden_states,
-            pooling_metadata=pooling_metadata,
+            hidden_states=hidden_states, pooling_metadata=pooling_metadata
         )
+
+        finished_mask = [
+            seq_len == prompt_len
+            for seq_len, prompt_len in zip(seq_lens_cpu, pooling_metadata.prompt_lens)
+        ]
+
+        model_runner_output = ModelRunnerOutput(
+            req_ids=self.input_batch.req_ids.copy(),
+            req_id_to_index=self.input_batch.req_id_to_index.copy(),
+            kv_connector_output=kv_connector_output,
+        )
+
+        if raw_pooler_output is None or not any(finished_mask):
+            model_runner_output.pooler_output = [None] * num_reqs
+            return model_runner_output
+
+        if self.use_async_scheduling:
+            return AsyncGPUPoolingModelRunnerOutput(
+                model_runner_output=model_runner_output,
+                raw_pooler_output=raw_pooler_output,
+                finished_mask=finished_mask,
+                async_output_copy_stream=self.async_output_copy_stream,
+            )
+
         raw_pooler_output = json_map_leaves(
-            lambda x: x.to("cpu", non_blocking=True) if x is not None else x,
+            lambda x: None if x is None else x.to("cpu", non_blocking=True),
             raw_pooler_output,
         )
+        model_runner_output.pooler_output = [
+            out if include else None
+            for out, include in zip(raw_pooler_output, finished_mask)
+        ]
         self._sync_device()
 
-        pooler_output: list[torch.Tensor | None] = []
-        for raw_output, seq_len, prompt_len in zip(
-            raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens
-        ):
-            output = raw_output if seq_len == prompt_len else None
-            pooler_output.append(output)
-
-        return ModelRunnerOutput(
-            req_ids=self.input_batch.req_ids,
-            req_id_to_index=self.input_batch.req_id_to_index,
-            sampled_token_ids=[],
-            logprobs=None,
-            prompt_logprobs_dict={},
-            pooler_output=pooler_output,
-        )
+        return model_runner_output
 
     def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int:
         # Pad tokens to multiple of tensor_parallel_size when
@@ -2435,6 +2595,17 @@ class GPUModelRunner(
             return round_up(num_scheduled_tokens, tp_size)
         return num_scheduled_tokens
 
+    def _prepare_mm_inputs(
+        self, num_tokens: int
+    ) -> tuple[torch.Tensor | None, torch.Tensor]:
+        if self.model.requires_raw_input_tokens:
+            input_ids = self.input_ids.gpu[:num_tokens]
+        else:
+            input_ids = None
+
+        inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
+        return input_ids, inputs_embeds
+
     def _preprocess(
         self,
         scheduler_output: "SchedulerOutput",
@@ -2477,10 +2648,9 @@ class GPUModelRunner(
             # TODO(woosuk): Avoid the copy. Optimize.
             self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(inputs_embeds_scheduled)
 
-            input_ids = None
-            inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
+            input_ids, inputs_embeds = self._prepare_mm_inputs(num_input_tokens)
             model_kwargs = {
-                **self._init_model_kwargs(num_scheduled_tokens),
+                **self._init_model_kwargs(),
                 **self._extract_mm_kwargs(scheduler_output),
             }
         elif self.enable_prompt_embeds and is_first_rank:
@@ -2508,7 +2678,7 @@ class GPUModelRunner(
                 self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds
 
             inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
-            model_kwargs = self._init_model_kwargs(num_input_tokens)
+            model_kwargs = self._init_model_kwargs()
             input_ids = None
         else:
             # For text-only models, we use token ids as input.
@@ -2517,7 +2687,7 @@ class GPUModelRunner(
             # then the embedding layer is not included in the CUDA graph.
             input_ids = self.input_ids.gpu[:num_input_tokens]
             inputs_embeds = None
-            model_kwargs = self._init_model_kwargs(num_input_tokens)
+            model_kwargs = self._init_model_kwargs()
 
         if self.uses_mrope:
             positions = self.mrope_positions.gpu[:, :num_input_tokens]
@@ -2559,15 +2729,21 @@ class GPUModelRunner(
     ) -> SamplerOutput:
         # Sample the next token and get logprobs if needed.
         sampling_metadata = self.input_batch.sampling_metadata
+        # Update output token ids with tokens sampled in last step
+        # if async scheduling and required by current sampling params.
+        self.input_batch.update_async_output_token_ids()
         if spec_decode_metadata is None:
-            # Update output token ids with tokens sampled in last step
-            # if async scheduling and required by current sampling params.
-            self.input_batch.update_async_output_token_ids()
             return self.sampler(
                 logits=logits,
                 sampling_metadata=sampling_metadata,
             )
 
+        # Update spec_token_ids with real draft tokens from pre step only when
+        # output_token_ids is needed (penalties or bad_words are in use).
+        if self.use_async_scheduling and self._draft_token_req_ids is not None:
+            draft_token_ids_cpu, _ = self._get_draft_token_ids_cpu()
+            self.input_batch.update_async_spec_token_ids(draft_token_ids_cpu)
+
         sampler_output = self.rejection_sampler(
             spec_decode_metadata,
             None,  # draft_probs
@@ -2616,7 +2792,7 @@ class GPUModelRunner(
         sampled_token_ids = sampler_output.sampled_token_ids
         logprobs_tensors = sampler_output.logprobs_tensors
         invalid_req_indices = []
-        cu_num_tokens: list[int] | None = None
+        logprobs_lists = None
         if not self.use_async_scheduling:
             # Get the valid generated tokens.
             max_gen_len = sampled_token_ids.shape[-1]
@@ -2626,13 +2802,16 @@ class GPUModelRunner(
                 # Mask out the sampled tokens that should not be sampled.
                 for i in discard_sampled_tokens_req_indices:
                     valid_sampled_token_ids[int(i)].clear()
+
+                if logprobs_tensors is not None:
+                    logprobs_lists = logprobs_tensors.tolists()
             else:
                 # Includes spec decode tokens.
-                valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
+                valid_sampled_token_ids, logprobs_lists = RejectionSampler.parse_output(
                     sampled_token_ids,
                     self.input_batch.vocab_size,
                     discard_sampled_tokens_req_indices,
-                    return_cu_num_tokens=logprobs_tensors is not None,
+                    logprobs_tensors=logprobs_tensors,
                 )
         else:
             valid_sampled_token_ids = []
@@ -2680,18 +2859,11 @@ class GPUModelRunner(
             self.input_batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids
             self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True
             self.input_batch.num_tokens_no_spec[req_idx] = end_idx
-            self.input_batch.num_tokens[req_idx] = end_idx
 
             req_id = req_ids[req_idx]
             req_state = self.requests[req_id]
             req_state.output_token_ids.extend(sampled_ids)
 
-        logprobs_lists = (
-            logprobs_tensors.tolists(cu_num_tokens)
-            if not self.use_async_scheduling and logprobs_tensors is not None
-            else None
-        )
-
         # Compute prompt logprobs if needed.
         prompt_logprobs_dict = self._get_prompt_logprobs_dict(
             hidden_states[:num_scheduled_tokens],
@@ -2755,6 +2927,27 @@ class GPUModelRunner(
             **model_kwargs,
         )
 
+    @staticmethod
+    def _is_uniform_decode(
+        max_num_scheduled_tokens: int,
+        uniform_decode_query_len: int,
+        num_tokens: int,
+        num_reqs: int,
+        force_uniform_decode: bool | None = None,
+    ) -> bool:
+        """
+        Checks if it's a decode batch with same amount scheduled tokens
+        across all requests.
+        """
+        return (
+            (
+                (max_num_scheduled_tokens == uniform_decode_query_len)
+                and (num_tokens == max_num_scheduled_tokens * num_reqs)
+            )
+            if force_uniform_decode is None
+            else force_uniform_decode
+        )
+
     def _determine_batch_execution_and_padding(
         self,
         num_tokens: int,
@@ -2776,14 +2969,12 @@ class GPUModelRunner(
         torch.Tensor | None,
         CUDAGraphStat | None,
     ]:
-        num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
-        uniform_decode = (
-            (
-                (max_num_scheduled_tokens == self.uniform_decode_query_len)
-                and (num_tokens_padded == max_num_scheduled_tokens * num_reqs)
-            )
-            if force_uniform_decode is None
-            else force_uniform_decode
+        uniform_decode = self._is_uniform_decode(
+            max_num_scheduled_tokens=max_num_scheduled_tokens,
+            uniform_decode_query_len=self.uniform_decode_query_len,
+            num_tokens=num_tokens,
+            num_reqs=num_reqs,
+            force_uniform_decode=force_uniform_decode,
         )
         # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
         # is present). Also, chunked-prefill is disabled, so batch are uniform.
@@ -2797,6 +2988,7 @@ class GPUModelRunner(
             else force_has_lora
         )
 
+        num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
         dispatch_cudagraph = (
             lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch(
                 num_tokens=num_tokens,
@@ -2812,6 +3004,15 @@ class GPUModelRunner(
             num_tokens_padded, use_cascade_attn or has_encoder_output
         )
         num_tokens_padded = batch_descriptor.num_tokens
+        if self.compilation_config.pass_config.enable_sp:
+            assert (
+                batch_descriptor.num_tokens
+                % self.vllm_config.parallel_config.tensor_parallel_size
+                == 0
+            ), (
+                "Sequence parallelism requires num_tokens to be "
+                "a multiple of tensor parallel size"
+            )
 
         # Extra coordination when running data-parallel since we need to coordinate
         # across ranks
@@ -2909,146 +3110,149 @@ class GPUModelRunner(
         self,
         scheduler_output: "SchedulerOutput",
         intermediate_tensors: IntermediateTensors | None = None,
-    ) -> ModelRunnerOutput | IntermediateTensors | None:
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors | None:
         if self.execute_model_state is not None:
             raise RuntimeError(
                 "State error: sample_tokens() must be called "
                 "after execute_model() returns None."
             )
 
-        # self._draft_token_ids is None when `input_fits_in_drafter=False`
-        # and there is no draft tokens scheduled. so it need to update the
-        # spec_decoding info in scheduler_output with async_scheduling.
-        # use deepcopy to avoid the modification has influence on the
-        # scheduler_output in engine core process.
-        # TODO(Ronald1995): deepcopy is expensive when there is a large
-        # number of requests, optimize it later.
-        if (
-            self.use_async_scheduling
-            and self.num_spec_tokens
-            and self._draft_token_ids is None
-        ):
-            scheduler_output = deepcopy(scheduler_output)
+        if self.vllm_config.model_config.enable_return_routed_experts:
+            capturer = RoutedExpertsCapturer.get_instance()
+            if capturer is not None:
+                capturer.clear_buffer()  # noqa
+            else:
+                logger.error("RoutedExpertsCapturer not initialized.")
 
-        num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        with record_function_or_nullcontext("gpu_model_runner: preprocess"):
-            with self.synchronize_input_prep():
-                # Update persistent batch states.
-                self._update_states(scheduler_output)
-
-                if has_ec_transfer() and get_ec_transfer().is_producer:
-                    with self.maybe_get_ec_connector_output(
-                        scheduler_output,
-                        encoder_cache=self.encoder_cache,
-                    ) as ec_connector_output:
-                        self._execute_mm_encoder(scheduler_output)
-                        return make_empty_encoder_model_runner_output(scheduler_output)
-
-                if not num_scheduled_tokens:
-                    if (
-                        self.parallel_config.distributed_executor_backend
-                        == "external_launcher"
-                        and self.parallel_config.data_parallel_size > 1
-                    ):
-                        # this is a corner case when both external launcher
-                        # and DP are enabled, num_scheduled_tokens could be
-                        # 0, and has_unfinished_requests in the outer loop
-                        # returns True. before returning early here we call
-                        # dummy run to ensure coordinate_batch_across_dp
-                        # is called into to avoid out of sync issues.
-                        self._dummy_run(1)
-                    if not has_kv_transfer_group():
-                        # Return empty ModelRunnerOutput if no work to do.
-                        return EMPTY_MODEL_RUNNER_OUTPUT
-                    return self.kv_connector_no_forward(
-                        scheduler_output, self.vllm_config
-                    )
-                if self.cache_config.kv_sharing_fast_prefill:
-                    assert not self.num_prompt_logprobs, (
-                        "--kv-sharing-fast-prefill produces incorrect "
-                        "logprobs for prompt tokens, tokens, please disable "
-                        "it when the requests need prompt logprobs"
-                    )
+        if scheduler_output.preempted_req_ids and has_kv_transfer_group():
+            get_kv_transfer_group().handle_preemptions(
+                scheduler_output.preempted_req_ids
+            )
 
-                num_reqs = self.input_batch.num_reqs
-                req_ids = self.input_batch.req_ids
-                tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
-                num_scheduled_tokens_np = np.array(tokens, dtype=np.int32)
-                max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
-                num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
+        num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+        with (
+            record_function_or_nullcontext("gpu_model_runner: preprocess"),
+            self.synchronize_input_prep(),
+        ):
+            # Update persistent batch states.
+            self._update_states(scheduler_output)
 
-                (
-                    logits_indices,
-                    spec_decode_metadata,
-                ) = self._prepare_inputs(
+            if has_ec_transfer() and get_ec_transfer().is_producer:
+                with self.maybe_get_ec_connector_output(
                     scheduler_output,
-                    num_scheduled_tokens_np,
+                    encoder_cache=self.encoder_cache,
+                ) as ec_connector_output:
+                    self._execute_mm_encoder(scheduler_output)
+                    return make_empty_encoder_model_runner_output(scheduler_output)
+
+            if not num_scheduled_tokens:
+                if (
+                    self.parallel_config.distributed_executor_backend
+                    == "external_launcher"
+                    and self.parallel_config.data_parallel_size > 1
+                ):
+                    # this is a corner case when both external launcher
+                    # and DP are enabled, num_scheduled_tokens could be
+                    # 0, and has_unfinished_requests in the outer loop
+                    # returns True. before returning early here we call
+                    # dummy run to ensure coordinate_batch_across_dp
+                    # is called into to avoid out of sync issues.
+                    self._dummy_run(1)
+                if not has_kv_transfer_group():
+                    # Return empty ModelRunnerOutput if no work to do.
+                    return EMPTY_MODEL_RUNNER_OUTPUT
+                return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
+
+            if self.cache_config.kv_sharing_fast_prefill:
+                assert not self.num_prompt_logprobs, (
+                    "--kv-sharing-fast-prefill produces incorrect "
+                    "logprobs for prompt tokens, tokens, please disable "
+                    "it when the requests need prompt logprobs"
                 )
 
-                cascade_attn_prefix_lens = None
-                # Disable cascade attention when using microbatching (DBO)
-                if self.cascade_attn_enabled and not self.parallel_config.enable_dbo:
-                    # Pre-compute cascade attention prefix lengths
-                    cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
-                        num_scheduled_tokens_np,
-                        self.input_batch.num_computed_tokens_cpu[:num_reqs],
-                        scheduler_output.num_common_prefix_blocks,
-                    )
-
-                (
-                    cudagraph_mode,
-                    batch_desc,
-                    should_ubatch,
-                    num_tokens_across_dp,
-                    cudagraph_stats,
-                ) = self._determine_batch_execution_and_padding(
-                    num_tokens=num_tokens_unpadded,
-                    num_reqs=num_reqs,
-                    num_scheduled_tokens_np=num_scheduled_tokens_np,
-                    max_num_scheduled_tokens=max_num_scheduled_tokens,
-                    use_cascade_attn=cascade_attn_prefix_lens is not None,
-                    num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
-                )
+            num_reqs = self.input_batch.num_reqs
+            req_ids = self.input_batch.req_ids
+            tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
+            num_scheduled_tokens_np = np.array(tokens, dtype=np.int32)
+            max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
+            num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
 
-                logger.debug(
-                    "Running batch with cudagraph_mode: %s, batch_descriptor: %s, "
-                    "should_ubatch: %s, num_tokens_across_dp: %s",
-                    cudagraph_mode,
-                    batch_desc,
-                    should_ubatch,
-                    num_tokens_across_dp,
-                )
+            logits_indices, spec_decode_metadata = self._prepare_inputs(
+                scheduler_output,
+                num_scheduled_tokens_np,
+            )
 
-                num_tokens_padded = batch_desc.num_tokens
-                num_reqs_padded = (
-                    batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
-                )
-                ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
-                    should_ubatch,
+            cascade_attn_prefix_lens = None
+            # Disable cascade attention when using microbatching (DBO)
+            if self.cascade_attn_enabled and not self.parallel_config.use_ubatching:
+                # Pre-compute cascade attention prefix lengths
+                cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
                     num_scheduled_tokens_np,
-                    num_tokens_padded,
-                    num_reqs_padded,
+                    self.input_batch.num_computed_tokens_cpu[:num_reqs],
+                    scheduler_output.num_common_prefix_blocks,
                 )
 
-                pad_attn = cudagraph_mode == CUDAGraphMode.FULL
-
-                use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
-                ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
-
-                (attn_metadata, spec_decode_common_attn_metadata) = (
-                    self._build_attention_metadata(
-                        num_tokens=num_tokens_unpadded,
-                        num_tokens_padded=num_tokens_padded if pad_attn else None,
-                        num_reqs=num_reqs,
-                        num_reqs_padded=num_reqs_padded if pad_attn else None,
-                        max_query_len=max_num_scheduled_tokens,
-                        ubatch_slices=ubatch_slices_attn,
-                        logits_indices=logits_indices,
-                        use_spec_decode=use_spec_decode,
-                        num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
-                        cascade_attn_prefix_lens=cascade_attn_prefix_lens,
-                    )
+            (
+                cudagraph_mode,
+                batch_desc,
+                should_ubatch,
+                num_tokens_across_dp,
+                cudagraph_stats,
+            ) = self._determine_batch_execution_and_padding(
+                num_tokens=num_tokens_unpadded,
+                num_reqs=num_reqs,
+                num_scheduled_tokens_np=num_scheduled_tokens_np,
+                max_num_scheduled_tokens=max_num_scheduled_tokens,
+                use_cascade_attn=cascade_attn_prefix_lens is not None,
+                num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
+            )
+
+            logger.debug(
+                "Running batch with cudagraph_mode: %s, batch_descriptor: %s, "
+                "should_ubatch: %s, num_tokens_across_dp: %s",
+                cudagraph_mode,
+                batch_desc,
+                should_ubatch,
+                num_tokens_across_dp,
+            )
+
+            num_tokens_padded = batch_desc.num_tokens
+            num_reqs_padded = (
+                batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
+            )
+            ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
+                should_ubatch,
+                num_scheduled_tokens_np,
+                num_tokens_padded,
+                num_reqs_padded,
+                self.parallel_config.num_ubatches,
+            )
+
+            logger.debug(
+                "ubatch_slices: %s, ubatch_slices_padded: %s",
+                ubatch_slices,
+                ubatch_slices_padded,
+            )
+
+            pad_attn = cudagraph_mode == CUDAGraphMode.FULL
+
+            use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
+            ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
+
+            attn_metadata, spec_decode_common_attn_metadata = (
+                self._build_attention_metadata(
+                    num_tokens=num_tokens_unpadded,
+                    num_tokens_padded=num_tokens_padded if pad_attn else None,
+                    num_reqs=num_reqs,
+                    num_reqs_padded=num_reqs_padded if pad_attn else None,
+                    max_query_len=max_num_scheduled_tokens,
+                    ubatch_slices=ubatch_slices_attn,
+                    logits_indices=logits_indices,
+                    use_spec_decode=use_spec_decode,
+                    num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
+                    cascade_attn_prefix_lens=cascade_attn_prefix_lens,
                 )
+            )
 
             (
                 input_ids,
@@ -3112,11 +3316,12 @@ class GPUModelRunner(
 
                 if self.is_pooling_model:
                     # Return the pooling output.
-                    output = self._pool(
-                        hidden_states, num_scheduled_tokens, num_scheduled_tokens_np
+                    return self._pool(
+                        hidden_states,
+                        num_scheduled_tokens,
+                        num_scheduled_tokens_np,
+                        kv_connector_output,
                     )
-                    output.kv_connector_output = kv_connector_output
-                    return output
 
                 sample_hidden_states = hidden_states[logits_indices]
                 logits = self.model.compute_logits(sample_hidden_states)
@@ -3209,6 +3414,8 @@ class GPUModelRunner(
         with record_function_or_nullcontext("gpu_model_runner: sample"):
             sampler_output = self._sample(logits, spec_decode_metadata)
 
+        self._draft_token_ids = None
+        self._draft_token_req_ids = None
         self.input_batch.prev_sampled_token_ids = None
 
         def propose_draft_token_ids(sampled_token_ids):
@@ -3224,50 +3431,44 @@ class GPUModelRunner(
                     spec_decode_metadata,
                     spec_decode_common_attn_metadata,
                 )
+                self._copy_draft_token_ids_to_cpu(scheduler_output)
 
         spec_config = self.speculative_config
-        use_padded_batch_for_eagle = (
-            spec_config is not None
-            and spec_config.use_eagle()
-            and not spec_config.disable_padded_drafter_batch
-        )
-        effective_drafter_max_model_len = self.max_model_len
-        if effective_drafter_max_model_len is None:
-            effective_drafter_max_model_len = self.model_config.max_model_len
-        if (
-            spec_config is not None
-            and spec_config.draft_model_config is not None
-            and spec_config.draft_model_config.max_model_len is not None
-        ):
-            effective_drafter_max_model_len = (
-                spec_config.draft_model_config.max_model_len
+        propose_drafts_after_bookkeeping = False
+        if spec_config is not None:
+            input_fits_in_drafter = spec_decode_common_attn_metadata is not None and (
+                spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens
+                <= self.effective_drafter_max_model_len
             )
-        input_fits_in_drafter = spec_decode_common_attn_metadata and (
-            spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens
-            <= effective_drafter_max_model_len
-        )
-        if use_padded_batch_for_eagle:
-            assert self.speculative_config is not None
-            assert isinstance(self.drafter, EagleProposer)
-            sampled_token_ids = sampler_output.sampled_token_ids
-            if input_fits_in_drafter:
+            if spec_config.use_eagle() and not spec_config.disable_padded_drafter_batch:
                 # EAGLE speculative decoding can use the GPU sampled tokens
                 # as inputs, and does not need to wait for bookkeeping to finish.
-                propose_draft_token_ids(sampled_token_ids)
-            elif self.valid_sampled_token_count_event is not None:
-                assert spec_decode_common_attn_metadata is not None
-                next_token_ids, valid_sampled_tokens_count = (
-                    self.drafter.prepare_next_token_ids_padded(
-                        spec_decode_common_attn_metadata,
-                        sampled_token_ids,
-                        self.requests,
-                        self.input_batch,
-                        self.discard_request_mask.gpu,
+                assert isinstance(self.drafter, EagleProposer)
+                sampled_token_ids = sampler_output.sampled_token_ids
+                if input_fits_in_drafter:
+                    propose_draft_token_ids(sampled_token_ids)
+                elif self.valid_sampled_token_count_event is not None:
+                    assert spec_decode_common_attn_metadata is not None
+                    next_token_ids, valid_sampled_tokens_count = (
+                        self.drafter.prepare_next_token_ids_padded(
+                            spec_decode_common_attn_metadata,
+                            sampled_token_ids,
+                            self.requests,
+                            self.input_batch,
+                            self.discard_request_mask.gpu,
+                        )
                     )
-                )
-                self._copy_valid_sampled_token_count(
-                    next_token_ids, valid_sampled_tokens_count
-                )
+                    self._copy_valid_sampled_token_count(
+                        next_token_ids, valid_sampled_tokens_count
+                    )
+                    # Since we couldn't run the drafter,
+                    # just use zeros for the draft tokens.
+                    self._draft_token_ids = torch.zeros(
+                        1, device=self.device, dtype=torch.int32
+                    ).expand(len(self.input_batch.req_ids), self.num_spec_tokens)
+                    self._copy_draft_token_ids_to_cpu(scheduler_output, zeros_only=True)
+            else:
+                propose_drafts_after_bookkeeping = input_fits_in_drafter
 
         with record_function_or_nullcontext("gpu_model_runner: bookkeep"):
             (
@@ -3287,25 +3488,28 @@ class GPUModelRunner(
                 spec_decode_metadata,
             )
 
-        if (
-            self.speculative_config
-            and not use_padded_batch_for_eagle
-            and input_fits_in_drafter
-        ):
+        if propose_drafts_after_bookkeeping:
             # ngram and other speculative decoding methods use the sampled
             # tokens on the CPU, so they are run after bookkeeping.
             propose_draft_token_ids(valid_sampled_token_ids)
 
         with record_function_or_nullcontext("gpu_model_runner: eplb"):
             self.eplb_step()
+
         with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"):
+            if self.model_config.enable_return_routed_experts:
+                capturer = RoutedExpertsCapturer.get_instance()
+                if capturer is not None:
+                    capturer.save_captured_experts(indices=self.slot_mapping)  # noqa
+                else:
+                    logger.error("RoutedExpertsCapturer not initialized.")
+
             output = ModelRunnerOutput(
                 req_ids=req_ids_output_copy,
                 req_id_to_index=req_id_to_index_output_copy,
                 sampled_token_ids=valid_sampled_token_ids,
                 logprobs=logprobs_lists,
                 prompt_logprobs_dict=prompt_logprobs_dict,
-                pooler_output=[],
                 kv_connector_output=kv_connector_output,
                 ec_connector_output=ec_connector_output
                 if self.supports_mm_inputs
@@ -3316,6 +3520,7 @@ class GPUModelRunner(
 
         if not self.use_async_scheduling:
             return output
+
         with record_function_or_nullcontext(
             "gpu_model_runner: AsyncGPUModelRunnerOutput"
         ):
@@ -3340,16 +3545,55 @@ class GPUModelRunner(
         return async_output
 
     def take_draft_token_ids(self) -> DraftTokenIds | None:
-        if self._draft_token_ids is None:
+        if not self.num_spec_tokens or not self._draft_token_req_ids:
             return None
-        req_ids = self.input_batch.req_ids
-        if isinstance(self._draft_token_ids, torch.Tensor):
-            draft_token_ids = self._draft_token_ids.tolist()
-        else:
-            draft_token_ids = self._draft_token_ids
-        self._draft_token_ids = None
+        draft_token_ids, req_ids = self._get_draft_token_ids_cpu()
         return DraftTokenIds(req_ids, draft_token_ids)
 
+    def _copy_draft_token_ids_to_cpu(
+        self, scheduler_output: "SchedulerOutput", zeros_only: bool = False
+    ) -> None:
+        # Check if we need to copy draft tokens to CPU. In async scheduling,
+        # we only copy when needed for structured output, penalties or bad_words.
+        if self.use_async_scheduling and not (
+            scheduler_output.has_structured_output_requests
+            or self.input_batch.sampling_metadata.output_token_ids
+        ):
+            return
+        # We must also set the corresponding request ids.
+        self._draft_token_req_ids = self.input_batch.req_ids.copy()
+
+        draft_token_ids: torch.Tensor = self._draft_token_ids
+        if not torch.is_tensor(draft_token_ids):
+            return
+        assert self.draft_token_ids_event is not None
+        assert self.draft_token_ids_copy_stream is not None
+        assert self.draft_token_ids_cpu is not None
+        default_stream = torch.cuda.current_stream()
+        num_reqs = draft_token_ids.shape[0]
+        with torch.cuda.stream(self.draft_token_ids_copy_stream):
+            if not zeros_only:
+                # Trigger async copy of draft token ids to cpu.
+                self.draft_token_ids_copy_stream.wait_stream(default_stream)
+                self.draft_token_ids_cpu[:num_reqs].copy_(
+                    draft_token_ids, non_blocking=True
+                )
+            else:
+                # No copy needed, just zero-out cpu tensor.
+                self.draft_token_ids_cpu[:num_reqs] = 0
+            self.draft_token_ids_event.record()
+
+    def _get_draft_token_ids_cpu(self) -> tuple[list[list[int]], list[str]]:
+        if isinstance(self._draft_token_ids, list):
+            return self._draft_token_ids, self.input_batch.req_ids
+        req_ids = self._draft_token_req_ids
+        if req_ids is None:
+            return [], []
+        assert self.draft_token_ids_event is not None
+        assert self.draft_token_ids_cpu is not None
+        self.draft_token_ids_event.synchronize()
+        return self.draft_token_ids_cpu[: len(req_ids)].tolist(), req_ids
+
     def _copy_valid_sampled_token_count(
         self, next_token_ids: torch.Tensor, valid_sampled_tokens_count: torch.Tensor
     ) -> None:
@@ -3363,6 +3607,7 @@ class GPUModelRunner(
             self.valid_sampled_token_count_copy_stream.wait_stream(default_stream)  # type: ignore
             counts = valid_sampled_tokens_count
             counts_cpu = self.valid_sampled_token_count_cpu
+            assert counts_cpu is not None
             counts_cpu[: counts.shape[0]].copy_(counts, non_blocking=True)
             self.valid_sampled_token_count_event.record()
 
@@ -3371,14 +3616,13 @@ class GPUModelRunner(
     def _get_valid_sampled_token_count(self) -> list[int]:
         # Wait until valid_sampled_tokens_count is copied to cpu,
         prev_sampled_token_ids = self.input_batch.prev_sampled_token_ids
-        if (
-            self.valid_sampled_token_count_event is None
-            or prev_sampled_token_ids is None
-        ):
+        sampled_count_event = self.valid_sampled_token_count_event
+        if sampled_count_event is None or prev_sampled_token_ids is None:
             return []
 
         counts_cpu = self.valid_sampled_token_count_cpu
-        self.valid_sampled_token_count_event.synchronize()
+        assert counts_cpu is not None
+        sampled_count_event.synchronize()
         return counts_cpu[: prev_sampled_token_ids.shape[0]].tolist()
 
     def propose_draft_token_ids(
@@ -3400,10 +3644,8 @@ class GPUModelRunner(
             assert isinstance(self.drafter, NgramProposer)
             draft_token_ids = self.drafter.propose(
                 sampled_token_ids,
-                self.input_batch.req_ids,
                 self.input_batch.num_tokens_no_spec,
                 self.input_batch.token_ids_cpu,
-                self.input_batch.spec_decode_unsupported_reqs,
             )
         elif spec_config.method == "suffix":
             assert isinstance(sampled_token_ids, list)
@@ -3473,6 +3715,7 @@ class GPUModelRunner(
                     next_token_ids, valid_sampled_tokens_count
                 )
 
+            num_rejected_tokens_gpu = None
             if spec_decode_metadata is None:
                 token_indices_to_sample = None
                 # input_ids can be None for multimodal models.
@@ -3503,12 +3746,14 @@ class GPUModelRunner(
                     else:
                         target_hidden_states = hidden_states[token_indices]
                 else:
-                    common_attn_metadata, token_indices_to_sample = (
-                        self.drafter.prepare_inputs_padded(
-                            common_attn_metadata,
-                            spec_decode_metadata,
-                            valid_sampled_tokens_count,
-                        )
+                    (
+                        common_attn_metadata,
+                        token_indices_to_sample,
+                        num_rejected_tokens_gpu,
+                    ) = self.drafter.prepare_inputs_padded(
+                        common_attn_metadata,
+                        spec_decode_metadata,
+                        valid_sampled_tokens_count,
                     )
                     total_num_tokens = common_attn_metadata.num_actual_tokens
                     # When padding the batch, token_indices is just a range
@@ -3539,6 +3784,7 @@ class GPUModelRunner(
                 sampling_metadata=sampling_metadata,
                 common_attn_metadata=common_attn_metadata,
                 mm_embed_inputs=mm_embed_inputs,
+                num_rejected_tokens_gpu=num_rejected_tokens_gpu,
             )
 
         return draft_token_ids
@@ -3657,8 +3903,8 @@ class GPUModelRunner(
             logger.error(combined_msg)
             raise e
         logger.info_once(
-            "Model loading took %.4f GiB memory and %.6f seconds",
-            self.model_memory_usage / GiB_bytes,
+            "Model loading took %s GiB memory and %.6f seconds",
+            format_gib(self.model_memory_usage),
             time_after_load - time_before_load,
             scope="local",
         )
@@ -3710,11 +3956,14 @@ class GPUModelRunner(
         # wrap the model with full cudagraph wrapper if needed.
         cudagraph_mode = self.compilation_config.cudagraph_mode
         assert cudagraph_mode is not None
-        if cudagraph_mode.has_full_cudagraphs() and not self.parallel_config.enable_dbo:
+        if (
+            cudagraph_mode.has_full_cudagraphs()
+            and not self.parallel_config.use_ubatching
+        ):
             self.model = CUDAGraphWrapper(
                 self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
             )
-        elif self.parallel_config.enable_dbo:
+        elif self.parallel_config.use_ubatching:
             if cudagraph_mode.has_full_cudagraphs():
                 self.model = UBatchWrapper(
                     self.model, self.vllm_config, CUDAGraphMode.FULL, self.device
@@ -4004,6 +4253,11 @@ class GPUModelRunner(
             remove_lora: If False, dummy LoRAs are not destroyed after the run
             activate_lora: If False, dummy_run is performed without LoRAs.
         """
+        if supports_mm_encoder_only(self.model):
+            # The current dummy run only covers LM execution, so we can skip it.
+            # mm encoder dummy run may need to add in the future.
+            return torch.tensor([]), torch.tensor([])
+
         assert (
             cudagraph_runtime_mode is None
             or cudagraph_runtime_mode.valid_runtime_modes()
@@ -4095,7 +4349,16 @@ class GPUModelRunner(
             batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
         )
         ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
-            should_ubatch, num_scheduled_tokens, num_tokens_padded, num_reqs_padded
+            should_ubatch,
+            num_scheduled_tokens,
+            num_tokens_padded,
+            num_reqs_padded,
+            self.vllm_config.parallel_config.num_ubatches,
+        )
+        logger.debug(
+            "ubatch_slices: %s, ubatch_slices_padded: %s",
+            ubatch_slices,
+            ubatch_slices_padded,
         )
 
         attn_metadata: PerLayerAttnMetadata | None = None
@@ -4140,10 +4403,10 @@ class GPUModelRunner(
         ):
             # Make sure padding doesn't exceed max_num_tokens
             assert num_tokens_padded <= self.max_num_tokens
-            model_kwargs = self._init_model_kwargs(num_tokens_padded)
+            model_kwargs = self._init_model_kwargs()
             if self.supports_mm_inputs and not self.model_config.is_encoder_decoder:
-                input_ids = None
-                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
+                input_ids, inputs_embeds = self._prepare_mm_inputs(num_tokens_padded)
+
                 model_kwargs = {
                     **model_kwargs,
                     **self._dummy_mm_kwargs(num_reqs),
@@ -4151,7 +4414,7 @@ class GPUModelRunner(
             elif self.enable_prompt_embeds:
                 input_ids = None
                 inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
-                model_kwargs = self._init_model_kwargs(num_tokens_padded)
+                model_kwargs = self._init_model_kwargs()
             else:
                 input_ids = self.input_ids.gpu[:num_tokens_padded]
                 inputs_embeds = None
@@ -4276,6 +4539,11 @@ class GPUModelRunner(
         # The dummy hidden states may contain special values,
         # like `inf` or `nan`.
         # To avoid breaking the sampler, we use a random tensor here instead.
+
+        if supports_mm_encoder_only(self.model):
+            # MM Encoder only model no need to run sampler.
+            return torch.tensor([])
+
         hidden_states = torch.rand_like(hidden_states)
 
         logits = self.model.compute_logits(hidden_states)
@@ -4350,17 +4618,14 @@ class GPUModelRunner(
         max_num_reqs = self.scheduler_config.max_num_seqs
         num_reqs = min(num_tokens, max_num_reqs)
         min_tokens_per_req = num_tokens // num_reqs
-        num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
-        num_scheduled_tokens_list[-1] += num_tokens % num_reqs
-        assert sum(num_scheduled_tokens_list) == num_tokens
-        assert len(num_scheduled_tokens_list) == num_reqs
+        num_scheduled_tokens_np = np.full(num_reqs, min_tokens_per_req)
+        num_scheduled_tokens_np[-1] += num_tokens % num_reqs
+        assert np.sum(num_scheduled_tokens_np) == num_tokens
+        assert len(num_scheduled_tokens_np) == num_reqs
 
         req_num_tokens = num_tokens // num_reqs
 
-        dummy_prompt_lens = torch.tensor(
-            num_scheduled_tokens_list,
-            device="cpu",
-        )
+        dummy_prompt_lens = torch.from_numpy(num_scheduled_tokens_np)
         dummy_token_ids = torch.zeros(
             (num_reqs, req_num_tokens), dtype=torch.int32, device=self.device
         )
@@ -4379,7 +4644,7 @@ class GPUModelRunner(
         )
 
         dummy_metadata.build_pooling_cursor(
-            num_scheduled_tokens_list,
+            num_scheduled_tokens_np,
             seq_lens_cpu=dummy_prompt_lens,
             device=hidden_states.device,
         )
@@ -4404,6 +4669,10 @@ class GPUModelRunner(
         self,
         hidden_states: torch.Tensor,
     ) -> PoolerOutput:
+        if supports_mm_encoder_only(self.model):
+            # MM Encoder only model not need to run pooler.
+            return torch.tensor([])
+
         # Find the task that has the largest output for subsequent steps
         supported_pooling_tasks = self.get_supported_pooling_tasks()
 
@@ -4419,7 +4688,7 @@ class GPUModelRunner(
         for task in supported_pooling_tasks:
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            output_size[task] = sum(o.nbytes for o in output)
+            output_size[task] = sum(o.nbytes for o in output if o is not None)
             del output  # Allow GC
 
         max_task = max(output_size.items(), key=lambda x: x[1])[0]
@@ -4625,7 +4894,7 @@ class GPUModelRunner(
             # is above the threshold. Otherwise we just capture a non-ubatched
             # version of the graph
             allow_microbatching = (
-                self.parallel_config.enable_dbo
+                self.parallel_config.use_ubatching
                 and cudagraph_runtime_mode == CUDAGraphMode.FULL
                 and uniform_decode
                 and check_ubatch_thresholds(
@@ -4760,8 +5029,8 @@ class GPUModelRunner(
                     if kv_cache_group_id < len(kernel_block_sizes)
                     else None,
                     num_metadata_builders=1
-                    if not self.parallel_config.enable_dbo
-                    else 2,
+                    if not self.parallel_config.use_ubatching
+                    else self.parallel_config.num_ubatches,
                 )
         # Calculate reorder batch threshold (if needed)
         # Note (tdoublep): do this *after* constructing builders,
@@ -5453,6 +5722,28 @@ class GPUModelRunner(
                 kv_transfer_group.register_kv_caches(kv_caches)
             kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)
 
+        if self.model_config.enable_return_routed_experts:
+            self.init_routed_experts_capturer()
+
+    def init_routed_experts_capturer(self):
+        logger.info(
+            "Initializing routed experts capturer, enable_return_routed_experts: %s",
+            self.model_config.enable_return_routed_experts,
+        )
+        routed_experts_capturer = RoutedExpertsCapturer.create()
+        block_size = self.cache_config.block_size
+        self.max_num_kv_tokens = (
+            self.kv_cache_config.num_blocks // len(self.kv_cache_config.kv_cache_groups)
+            + 1
+        ) * block_size
+
+        routed_experts_capturer.init_buffer(
+            max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens,
+            max_num_kv_tokens=self.max_num_kv_tokens,
+            model_config=self.model_config,
+            instance_id=self.vllm_config.instance_id,
+        )
+
     def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
         """
         Add encoder-only layers to the KV cache config.
diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py
index 2ce2b64512560053c4cd3f570079c7203d242124..af09129e67b1e4fc6aa59783c0e0b3a019e11f99 100644
--- a/vllm/v1/worker/gpu_ubatch_wrapper.py
+++ b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -103,8 +103,10 @@ class UBatchWrapper:
         self.vllm_config = vllm_config
         self.compilation_config = vllm_config.compilation_config
         self.comm_stream = torch.cuda.Stream(device=device)
-        # Two ubatch threads plus the main thread
-        self.ready_barrier = threading.Barrier(3)
+        # Ubatch threads plus the main thread
+        self.ready_barrier = threading.Barrier(
+            self.vllm_config.parallel_config.num_ubatches + 1
+        )
 
         self.cudagraphs: dict[int, CUDAGraphMetaData] = {}
 
@@ -309,7 +311,7 @@ class UBatchWrapper:
                 create_forward_context(
                     attn_metadata[i] if attn_metadata is not None else None,
                     self.vllm_config,
-                    dp_metadata=dp_metadata,
+                    dp_metadata=dp_metadata[i],
                     batch_descriptor=batch_descriptor,
                     cudagraph_runtime_mode=cudagraph_runtime_mode,
                 )
@@ -417,18 +419,19 @@ class UBatchWrapper:
 
         # We shouldn't be here unless we are running with multiple DP ranks
         assert dp_metadata is not None
-        num_tokens_per_ubatch = (
-            ubatch_slices[0].token_slice.stop - ubatch_slices[0].token_slice.start
-        )
-        dp_size = self.vllm_config.parallel_config.data_parallel_size
-        ubatch_num_tokens_across_dp = torch.tensor(
-            [num_tokens_per_ubatch] * dp_size, device="cpu", dtype=torch.int32
-        )
-        ubatch_dp_metadata = DPMetadata.make(
-            self.vllm_config.parallel_config,
-            num_tokens_per_ubatch,
-            ubatch_num_tokens_across_dp,
-        )
+        ubatch_dp_metadata = []
+        for ubatch_slice in ubatch_slices:
+            dp_size = self.vllm_config.parallel_config.data_parallel_size
+            ubatch_num_tokens_across_dp = torch.tensor(
+                [ubatch_slice.num_tokens] * dp_size, device="cpu", dtype=torch.int32
+            )
+            ubatch_dp_metadata.append(
+                DPMetadata.make(
+                    self.vllm_config.parallel_config,
+                    ubatch_slice.num_tokens,
+                    ubatch_num_tokens_across_dp,
+                )
+            )
 
         if (
             num_tokens not in self.cudagraphs
@@ -464,7 +467,7 @@ class UBatchWrapper:
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
                 compute_stream=compute_stream,
-                dp_metadata=dp_metadata,
+                dp_metadata=ubatch_dp_metadata,
                 batch_descriptor=batch_descriptor,
                 cudagraph_runtime_mode=CUDAGraphMode.NONE,
             )
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 98ca3e0a1f01523c3b0f43f91c9670cfe40a8b29..a8d4631759e9519053ea7b0522cfe977f5b1eb1d 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -14,7 +14,7 @@ import torch.distributed
 import torch.nn as nn
 
 import vllm.envs as envs
-from vllm.config import CUDAGraphMode, VllmConfig
+from vllm.config import CUDAGraphMode, VllmConfig, set_current_vllm_config
 from vllm.config.compilation import CompilationMode
 from vllm.distributed import (
     ensure_model_parallel_initialized,
@@ -34,15 +34,14 @@ from vllm.distributed.parallel_state import (
 )
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.model_executor import set_random_seed
 from vllm.model_executor.models.interfaces import is_mixture_of_experts
 from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
 from vllm.platforms import current_platform
 from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
-from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
+from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
@@ -51,11 +50,13 @@ from vllm.v1.outputs import (
     DraftTokenIds,
     ModelRunnerOutput,
 )
-from vllm.v1.utils import report_usage_stats
+from vllm.v1.utils import compute_iteration_details, report_usage_stats
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
 from vllm.v1.worker.worker_base import WorkerBase
 from vllm.v1.worker.workspace import init_workspace_manager
 
+from .utils import request_memory
+
 logger = init_logger(__name__)
 
 if TYPE_CHECKING:
@@ -82,13 +83,7 @@ class Worker(WorkerBase):
 
         # configure float32 matmul precision according to vLLM env.
         # precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
-        # torch.backends.cuda.matmul.fp32_precision = precision
-
-        if self.model_config.trust_remote_code:
-            # note: lazy import to avoid importing torch before initializing
-            from vllm.utils.import_utils import init_cached_hf_modules
-
-            init_cached_hf_modules()
+        # torch.set_float32_matmul_precision(precision)
 
         # Buffers saved before sleep
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
@@ -130,9 +125,9 @@ class Worker(WorkerBase):
         used_bytes = total - free_bytes_after_sleep
         assert freed_bytes >= 0, "Memory usage increased after sleeping."
         logger.info(
-            "Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.",
-            freed_bytes / GiB_bytes,
-            used_bytes / GiB_bytes,
+            "Sleep mode freed %s GiB memory, %s GiB memory is still in use.",
+            format_gib(freed_bytes),
+            format_gib(used_bytes),
         )
 
     def wake_up(self, tags: list[str] | None = None) -> None:
@@ -177,22 +172,20 @@ class Worker(WorkerBase):
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
     def init_device(self):
-        device = self.device_config.device
-        if isinstance(device, torch.device) and device.type == "cuda":
+        if self.device_config.device_type == "cuda":
             # This env var set by Ray causes exceptions with graph building.
             os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+            parallel_config = self.parallel_config
             if (
-                self.parallel_config.data_parallel_size > 1
-                and self.parallel_config.data_parallel_size_local > 0
-                and self.parallel_config.distributed_executor_backend
-                not in ["ray", "external_launcher"]
-                and self.vllm_config.parallel_config.data_parallel_backend != "ray"
-                and self.vllm_config.parallel_config.nnodes_within_dp == 1
+                parallel_config.distributed_executor_backend
+                not in ("ray", "external_launcher")
+                and parallel_config.data_parallel_backend != "ray"
+                and parallel_config.nnodes_within_dp == 1
             ):
                 # Use local DP rank if available, otherwise use global DP rank.
                 dp_local_rank = self.parallel_config.data_parallel_rank_local
                 if dp_local_rank is None:
-                    dp_local_rank = self.parallel_config.data_parallel_rank
+                    dp_local_rank = self.parallel_config.data_parallel_index
 
                 tp_pp_world_size = (
                     self.parallel_config.pipeline_parallel_size
@@ -237,22 +230,12 @@ class Worker(WorkerBase):
             torch.cuda.empty_cache()
 
             # take current memory snapshot
-            self.init_snapshot = MemorySnapshot()
-            self.requested_memory = (
-                self.init_snapshot.total_memory
-                * self.cache_config.gpu_memory_utilization
+            self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
+            self.requested_memory = request_memory(init_snapshot, self.cache_config)
+            logger.debug("worker init memory snapshot: %r", self.init_snapshot)
+            logger.debug(
+                "worker requested memory: %sGiB", format_gib(self.requested_memory)
             )
-            if self.init_snapshot.free_memory < self.requested_memory:
-                GiB = lambda b: round(b / GiB_bytes, 2)
-                raise ValueError(
-                    f"Free memory on device "
-                    f"({GiB(self.init_snapshot.free_memory)}/"
-                    f"{GiB(self.init_snapshot.total_memory)} GiB) on startup "
-                    f"is less than desired GPU memory utilization "
-                    f"({self.cache_config.gpu_memory_utilization}, "
-                    f"{GiB(self.requested_memory)} GiB). Decrease GPU memory "
-                    f"utilization or reduce GPU memory used by other processes."
-                )
         else:
             raise RuntimeError(f"Not support device type: {self.device_config.device}")
 
@@ -285,7 +268,9 @@ class Worker(WorkerBase):
     # to hijack tensor allocation.
     def load_model(self) -> None:
         eep_scale_up = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1"
-        with self._maybe_get_memory_pool_context(tag="weights"):
+        with self._maybe_get_memory_pool_context(
+            tag="weights"
+        ) and set_current_vllm_config(self.vllm_config):
             self.model_runner.load_model(eep_scale_up=eep_scale_up)
 
     def update_config(self, overrides: dict[str, Any]) -> None:
@@ -307,15 +292,14 @@ class Worker(WorkerBase):
             You may limit the usage of GPU memory
             by adjusting the `gpu_memory_utilization` parameter.
         """
-        GiB = lambda b: b / GiB_bytes
         if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
             # still need a profile run which compiles the model for
             # max_num_batched_tokens
             self.model_runner.profile_run()
 
             msg = (
-                f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} "
-                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for "
+                f"Initial free memory {format_gib(self.init_snapshot.free_memory)} "
+                f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
                 "KV Cache as specified by kv_cache_memory_bytes config and "
                 "skipped memory profiling. This does not respect the "
                 "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
@@ -347,8 +331,8 @@ class Worker(WorkerBase):
         # GPU did not change their memory usage during the profiling.
         assert self.init_snapshot.free_memory > free_gpu_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
-            f"current free memory {GiB(free_gpu_memory)} GiB. "
+            f"Initial free memory {format_gib(self.init_snapshot.free_memory)} GiB, "
+            f"current free memory {format_gib(free_gpu_memory)} GiB. "
             "This happens when other processes sharing the same container "
             "release GPU memory while vLLM is profiling during initialization. "
             "To fix this, ensure consistent GPU memory allocation or "
@@ -360,21 +344,20 @@ class Worker(WorkerBase):
 
         unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
         logger.debug(
-            "Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB",
-            GiB(self.init_snapshot.free_memory),
+            "Initial free memory: %s GiB; Requested memory: %f (util), %s GiB",
+            format_gib(self.init_snapshot.free_memory),
             self.cache_config.gpu_memory_utilization,
-            GiB(self.requested_memory),
+            format_gib(self.requested_memory),
         )
         logger.debug(
-            "Free memory after profiling: %.2f GiB (total), "
-            "%.2f GiB (within requested)",
-            GiB(free_gpu_memory),
-            GiB(free_gpu_memory - unrequested_memory),
+            "Free memory after profiling: %s GiB (total), %s GiB (within requested)",
+            format_gib(free_gpu_memory),
+            format_gib(free_gpu_memory - unrequested_memory),
         )
         logger.debug(profile_result)
         logger.info_once(
-            "Available KV cache memory: %.2f GiB",
-            GiB(self.available_kv_cache_memory_bytes),
+            "Available KV cache memory: %s GiB",
+            format_gib(self.available_kv_cache_memory_bytes),
             scope="local",
         )
         gc.collect()
@@ -399,6 +382,19 @@ class Worker(WorkerBase):
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         return self.model_runner.get_kv_cache_spec()
 
+    def update_max_model_len(self, max_model_len: int) -> None:
+        """Update max_model_len after auto-fit to GPU memory.
+
+        This is called when max_model_len=-1 is used and the engine
+        automatically determines the maximum context length that fits
+        in GPU memory. Workers need to update their cached max_model_len
+        to match the engine's decision.
+        """
+        self.model_config.max_model_len = max_model_len
+        if self.model_runner is not None:
+            self.model_runner.update_max_model_len(max_model_len)
+        logger.debug("Updated max_model_len to %d", max_model_len)
+
     def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
         """Allocate GPU KV cache with the specified kv_cache_config."""
 
@@ -468,7 +464,6 @@ class Worker(WorkerBase):
             # CUDAGraph memory size and may not utilize all gpu memory.
             # Users may want fine-grained control to specify kv cache
             # memory size.
-            GiB = lambda b: round(b / GiB_bytes, 2)
 
             # empirically observed that the memory profiling may
             # slightly underestimate the memory consumption.
@@ -493,24 +488,24 @@ class Worker(WorkerBase):
 
             msg = (
                 f"Free memory on device "
-                f"({GiB(self.init_snapshot.free_memory)}/"
-                f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. "
+                f"({format_gib(self.init_snapshot.free_memory)}/"
+                f"{format_gib(self.init_snapshot.total_memory)} GiB) on startup. "
                 f"Desired GPU memory utilization is "
                 f"({self.cache_config.gpu_memory_utilization}, "
-                f"{GiB(self.requested_memory)} GiB). "
-                f"Actual usage is {GiB(self.model_runner.model_memory_usage)} "
-                f"GiB for weight, {GiB(self.peak_activation_memory)} GiB "
-                f"for peak activation, {GiB(self.non_torch_memory)} GiB "
-                f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
+                f"{format_gib(self.requested_memory)} GiB). "
+                f"Actual usage is {format_gib(self.model_runner.model_memory_usage)} "
+                f"GiB for weight, {format_gib(self.peak_activation_memory)} GiB "
+                f"for peak activation, {format_gib(self.non_torch_memory)} GiB "
+                f"for non-torch memory, and {format_gib(cuda_graph_memory_bytes)} "
                 f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
                 f"config with `--kv-cache-memory="
                 f"{kv_cache_memory_bytes_to_requested_limit}` "
-                f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
+                f"({format_gib(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
                 f"into requested memory, or `--kv-cache-memory="
                 f"{kv_cache_memory_bytes_to_gpu_limit}` "
-                f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
+                f"({format_gib(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
                 f"utilize gpu memory. Current kv cache memory in use is "
-                f"{GiB(self.available_kv_cache_memory_bytes)} GiB."
+                f"{format_gib(self.available_kv_cache_memory_bytes)} GiB."
             )
 
             logger.debug(msg)
@@ -552,18 +547,29 @@ class Worker(WorkerBase):
 
     def annotate_profile(self, scheduler_output):
         # add trace annotation so that we can easily distinguish
-        # new/cached request numbers in each iteration
+        # context/generation request numbers in each iteration.
+        # A context request is a request that has not yet generated any tokens
         if not self.profiler:
             return nullcontext()
 
         self.profiler.step()
 
-        num_new = len(scheduler_output.scheduled_new_reqs)
-        num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids)
-
-        return self.profiler.annotate_context_manager(
-            f"execute_new_{num_new}_cached_{num_cached}"
+        iteration_details = compute_iteration_details(scheduler_output)
+
+        annotation = "".join(
+            [
+                "execute_context_",
+                str(iteration_details.num_ctx_requests),
+                "(",
+                str(iteration_details.num_ctx_tokens),
+                ")_generation_",
+                str(iteration_details.num_generation_requests),
+                "(",
+                str(iteration_details.num_generation_tokens),
+                ")",
+            ]
         )
+        return self.profiler.annotate_context_manager(annotation)
 
     @torch.inference_mode()
     def sample_tokens(
@@ -574,7 +580,7 @@ class Worker(WorkerBase):
     @torch.inference_mode()
     def execute_model(
         self, scheduler_output: "SchedulerOutput"
-    ) -> ModelRunnerOutput | None:
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | None:
         intermediate_tensors = None
         forward_pass = scheduler_output.total_num_scheduled_tokens > 0
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
@@ -623,7 +629,9 @@ class Worker(WorkerBase):
             output = self.model_runner.execute_model(
                 scheduler_output, intermediate_tensors
             )
-            if isinstance(output, (ModelRunnerOutput, NoneType)):
+            if isinstance(
+                output, ModelRunnerOutput | AsyncModelRunnerOutput | NoneType
+            ):
                 return output
 
         assert isinstance(output, IntermediateTensors)
@@ -646,7 +654,12 @@ class Worker(WorkerBase):
 
     def profile(self, is_start: bool = True):
         if self.profiler is None:
-            raise RuntimeError("Profiling is not enabled.")
+            raise RuntimeError(
+                "Profiling is not enabled. Please set --profiler-config to enable "
+                "profiling. Example: "
+                "'--profiler-config.profiler=torch --profiler-config.torch_profiler_dir"
+                "=YOUR_DIR_PATH_TO_DUMP_TRACE'"
+            )
         if is_start:
             self.profiler.start()
         else:
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index 2bcc87b63bcdf8f575363d4e2d474a9200d2e5df..ca0868befd44244756f9998c9c99abb3fb1b2a86 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -7,13 +7,10 @@ Define KV connector functionality mixin for model runners.
 import copy
 from collections.abc import Generator
 from contextlib import AbstractContextManager, contextmanager, nullcontext
-from typing import (
-    TYPE_CHECKING,  # noqa: UP035
-)
+from typing import TYPE_CHECKING
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.distributed.kv_transfer import (
@@ -24,6 +21,7 @@ from vllm.distributed.kv_transfer import (
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig
 from vllm.v1.outputs import (
     EMPTY_MODEL_RUNNER_OUTPUT,
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index a67246146005cd91f5202a9d6af3d483eeb6f95c..b7d488ea1c182f6067bd81aa1216f9251b4e11e6 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -5,6 +5,7 @@ Define LoRA functionality mixin for model runners.
 """
 
 from contextlib import contextmanager
+from typing import TypeAlias
 
 import numpy as np
 import torch
@@ -13,14 +14,14 @@ import torch.nn as nn
 from vllm.config import VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
-from vllm.lora.layers import LoRAMapping
+from vllm.lora.layers import LoRAMapping, LoRAMappingType
 from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
-from vllm.model_executor.models import supports_lora, supports_multimodal
+from vllm.model_executor.models import supports_lora
 from vllm.v1.worker.gpu_input_batch import InputBatch as GPUInputBatch
 from vllm.v1.worker.tpu_input_batch import InputBatch as TPUInputBatch
 
-InputBatch = TPUInputBatch | GPUInputBatch
+InputBatch: TypeAlias = TPUInputBatch | GPUInputBatch
 
 logger = init_logger(__name__)
 
@@ -28,29 +29,28 @@ logger = init_logger(__name__)
 # Defined as a mixin for GPUModelRunner
 class LoRAModelRunnerMixin:
     def load_lora_model(
-        self, model: nn.Module, vllm_config: VllmConfig, device: torch.device
+        self,
+        model: nn.Module,
+        vllm_config: VllmConfig,
+        device: torch.device,
     ) -> nn.Module:
         if not supports_lora(model):
             raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.")
 
-        if supports_multimodal(model):
-            logger.warning(
-                "Regarding multimodal models, vLLM currently "
-                "only supports adding LoRA to language model."
-            )
         # Add LoRA Manager to the Model Runner
         self.lora_manager = LRUCacheWorkerLoRAManager(
             vllm_config,
             device,
             model.embedding_modules,
         )
-        return self.lora_manager.create_lora_manager(model)
+        return self.lora_manager.create_lora_manager(model, vllm_config)
 
     def _set_active_loras(
         self,
         prompt_lora_mapping: tuple[int, ...],
         token_lora_mapping: tuple[int, ...],
         lora_requests: set[LoRARequest],
+        mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE,
     ) -> None:
         self._ensure_lora_enabled()
 
@@ -59,7 +59,10 @@ class LoRAModelRunnerMixin:
         # On cuda platforms we use the same kernels for prefill and
         # decode and this flag is generally ignored.
         lora_mapping = LoRAMapping(
-            token_lora_mapping, prompt_lora_mapping, is_prefill=True
+            token_lora_mapping,
+            prompt_lora_mapping,
+            is_prefill=True,
+            type=mapping_type,
         )
         self.lora_manager.set_active_adapters(lora_requests, lora_mapping)
 
@@ -72,6 +75,7 @@ class LoRAModelRunnerMixin:
         input_batch: InputBatch,
         num_scheduled_tokens: np.ndarray,
         num_sampled_tokens: np.ndarray | None = None,
+        mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE,
     ) -> None:
         if num_sampled_tokens is None:
             num_sampled_tokens = np.ones_like(num_scheduled_tokens, dtype=np.int32)
@@ -83,7 +87,7 @@ class LoRAModelRunnerMixin:
             input_batch.make_lora_inputs(num_scheduled_tokens, num_sampled_tokens)
         )
         return self._set_active_loras(
-            prompt_lora_mapping, token_lora_mapping, lora_requests
+            prompt_lora_mapping, token_lora_mapping, lora_requests, mapping_type
         )
 
     @contextmanager
@@ -127,6 +131,7 @@ class LoRAModelRunnerMixin:
         self,
         lora_config: LoRAConfig | None,
         num_scheduled_tokens: np.ndarray,
+        mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE,
         num_sampled_tokens: np.ndarray | None = None,
         activate_lora: bool = True,
     ):
@@ -168,7 +173,10 @@ class LoRAModelRunnerMixin:
             }
 
             self._set_active_loras(
-                tuple(sample_lora_mapping), tuple(token_lora_mapping), lora_requests
+                tuple(sample_lora_mapping),
+                tuple(token_lora_mapping),
+                lora_requests,
+                mapping_type,
             )
 
             yield
@@ -181,11 +189,16 @@ class LoRAModelRunnerMixin:
         num_sampled_tokens: np.ndarray,
         activate_lora: bool = True,
         remove_lora: bool = True,
+        mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE,
     ):
         with (
             self.maybe_setup_dummy_loras(lora_config, remove_lora),
             self.maybe_select_dummy_loras(
-                lora_config, num_scheduled_tokens, num_sampled_tokens, activate_lora
+                lora_config,
+                num_scheduled_tokens,
+                mapping_type,
+                num_sampled_tokens,
+                activate_lora,
             ),
         ):
             yield
diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py
index 2ed65ca9d31cdc126770c714f4de59f828dbd6f4..3758a73ee4967ad9e8cd322ebfc70fbf92824291 100644
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -51,7 +51,6 @@ class InputBatch:
             pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
-        self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_computed_tokens_cpu_tensor = torch.zeros(
@@ -200,9 +199,6 @@ class InputBatch:
         start_idx = num_prompt_tokens
         end_idx = start_idx + len(request.output_token_ids)
         self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids
-        # Number of token ids in token_ids_cpu.
-        # NOTE(woosuk): This may include spec decode tokens.
-        self.num_tokens[req_index] = request.num_tokens
         # Number of tokens without spec decode tokens.
         self.num_tokens_no_spec[req_index] = request.num_tokens
 
@@ -344,10 +340,6 @@ class InputBatch:
             self.req_id_to_index[old_id_i2],
             self.req_id_to_index[old_id_i1],
         )
-        self.num_tokens[i1], self.num_tokens[i2] = (
-            self.num_tokens[i2],
-            self.num_tokens[i1],
-        )
         self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] = (
             self.num_tokens_no_spec[i2],
             self.num_tokens_no_spec[i1],
@@ -448,11 +440,10 @@ class InputBatch:
             self.req_output_token_ids[last_req_index] = None
             self.req_id_to_index[req_id] = empty_index
 
-            num_tokens = self.num_tokens[last_req_index]
+            num_tokens = self.num_tokens_no_spec[last_req_index]
             self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
                 last_req_index, :num_tokens
             ]
-            self.num_tokens[empty_index] = num_tokens
             self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[
                 last_req_index
             ]
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
deleted file mode 100644
index 283f21b779e38b494bd173c87a85960904a2b527..0000000000000000000000000000000000000000
--- a/vllm/v1/worker/tpu_model_runner.py
+++ /dev/null
@@ -1,2191 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import bisect
-import gc
-import time
-from typing import TYPE_CHECKING, Any, cast
-from unittest.mock import patch
-
-import numpy as np
-import torch
-import torch.nn as nn
-
-# TPU XLA related
-import torch_xla
-import torch_xla.core.xla_model as xm
-import torch_xla.distributed.spmd as xs
-import torch_xla.runtime as xr
-
-import vllm.envs as envs
-from vllm.attention.backends.abstract import AttentionType
-from vllm.attention.layer import Attention, MLAAttention
-from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
-from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
-from vllm.config import (
-    ParallelConfig,
-    VllmConfig,
-    get_layers_from_vllm_config,
-    update_config,
-)
-from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group
-from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
-from vllm.forward_context import set_forward_context
-from vllm.logger import init_logger
-from vllm.lora.layers import BaseLayerWithLoRA
-from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.model_executor.model_loader import get_model_loader
-from vllm.model_executor.model_loader.tpu import TPUModelLoader
-from vllm.model_executor.models.interfaces import (
-    SupportsMultiModal,
-    supports_transcription,
-)
-from vllm.model_executor.models.interfaces_base import (
-    is_pooling_model,
-    is_text_generation_model,
-)
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (
-    BatchedTensorInputs,
-    MultiModalKwargsItem,
-    PlaceholderRange,
-)
-from vllm.multimodal.utils import group_mm_kwargs_by_modality
-from vllm.sequence import IntermediateTensors
-from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils.math_utils import cdiv, prev_power_of_2
-from vllm.utils.platform_utils import is_pin_memory_available
-from vllm.v1.attention.backends.pallas import (
-    TPU_STR_DTYPE_TO_TORCH_DTYPE,
-    PallasAttentionBackend,
-    PallasMetadata,
-    get_page_size_bytes,
-)
-from vllm.v1.kv_cache_interface import (
-    AttentionSpec,
-    FullAttentionSpec,
-    KVCacheConfig,
-    KVCacheSpec,
-    MLAAttentionSpec,
-    SlidingWindowSpec,
-)
-from vllm.v1.outputs import (
-    EMPTY_MODEL_RUNNER_OUTPUT,
-    LogprobsLists,
-    LogprobsTensors,
-    ModelRunnerOutput,
-)
-from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata
-from vllm.v1.sample.tpu.sampler import Sampler as TPUSampler
-from vllm.v1.worker.kv_connector_model_runner_mixin import (
-    KVConnectorModelRunnerMixin,
-    KVConnectorOutput,
-)
-from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.tpu_input_batch import CachedRequestState, InputBatch
-
-from .utils import (
-    MultiModalBudget,
-    add_kv_sharing_layers_to_kv_cache_groups,
-    bind_kv_cache,
-    sanity_check_mm_encoder_outputs,
-)
-
-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
-
-logger = init_logger(__name__)
-
-INVALID_TOKEN_ID = -1
-# Smallest output size
-MIN_NUM_SEQS = 8
-
-
-#########################################################
-# Ways to avoid recompilation
-#########################################################
-#
-# The model executor has two primary components:
-# 1. preparing the model and sampler inputs
-# 2. executing the model and sampler.
-# The core idea is to avoid any TPU computation during input preparation. For
-# better compilation tracking and increased flexibility, the model execution and
-# sampler are divided into several distinct components.
-#
-# Below are the detailed steps:
-#
-# Step 1
-# It is recommended to avoid TPU operations when preparing the model and sampler
-# inputs. CPU tensors can be prepared and transferred to the XLA device using
-# cpu_tensor.to(xla_device), which only triggers CPU to TPU transfers and avoids
-# compilation.
-#
-# Step 2
-# The TPU execution should be decomposed into subgraphs (4 at the moment):
-# 1. the main model
-# 2. selecting hidden states for each request
-# 3. sampler
-# 4. encoder.
-# Each subgraph should be decorated in a torch.compile. This is used to make
-# sure that we have the same subgraph topology in both dummy_run and
-# xecute_model. The results from these subgraphs should either be passed to
-# other subgraphs, or transferred from TPU to CPU using xla_tensor.cpu() for
-# subsequent processing on the CPU.
-#
-# Step 3
-# The dummy_run should be comprehensive, ensuring all potential input shapes and
-# branch predictions are included as subgraph inputs to facilitate
-# pre-compilation.
-class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        device: torch.device,
-        original_parallel_config: ParallelConfig | None = None,
-    ):
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-        self.cache_config = vllm_config.cache_config
-        self.lora_config = vllm_config.lora_config
-        self.load_config = vllm_config.load_config
-        self.parallel_config = vllm_config.parallel_config
-        self.original_parallel_config = original_parallel_config
-        self.scheduler_config = vllm_config.scheduler_config
-        self.speculative_config = vllm_config.speculative_config
-        self.observability_config = vllm_config.observability_config
-        self.device_config = vllm_config.device_config
-
-        model_config = self.model_config
-        cache_config = self.cache_config
-        scheduler_config = self.scheduler_config
-        parallel_config = self.parallel_config
-        self.device = device
-        self.check_recompilation = envs.VLLM_XLA_CHECK_RECOMPILATION
-
-        # SPMD Related
-        self.use_spmd = envs.VLLM_XLA_USE_SPMD
-        if self.use_spmd:
-            num_devices = xr.global_runtime_device_count()
-            mesh_shape = (num_devices, 1)
-            device_ids = np.array(range(num_devices))
-            self.mesh = xs.Mesh(device_ids, mesh_shape, ("x", "y"))
-
-        self.enforce_eager = model_config.enforce_eager
-
-        self.num_xla_graphs = 0
-        self._update_num_xla_graphs("init")
-
-        self.pin_memory = is_pin_memory_available()
-        self.dtype = self.model_config.dtype
-        if cache_config.cache_dtype == "auto":
-            model_dtype = self.dtype
-            if isinstance(model_dtype, str):
-                self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
-            else:
-                self.kv_cache_dtype = model_dtype
-        else:
-            self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
-        self._hidden_states_dtype = self.dtype
-
-        self.sliding_window = model_config.get_sliding_window()
-        self.block_size = cache_config.block_size
-        self.max_model_len = model_config.max_model_len
-        self.most_model_len = envs.VLLM_TPU_MOST_MODEL_LEN
-        self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size)
-        self.num_blocks_per_most_len_req = (
-            cdiv(self.most_model_len, self.block_size)
-            if self.most_model_len is not None
-            else None
-        )
-        # InputBatch needs to work with sampling tensors greater than padding
-        # to avoid dynamic shapes. Also, avoid suboptimal alignment.
-        self.max_num_reqs = max(scheduler_config.max_num_seqs, MIN_NUM_SEQS)
-        self.num_tokens_paddings = _get_token_paddings(
-            min_token_size=16,
-            max_token_size=scheduler_config.max_num_batched_tokens,
-            padding_gap=envs.VLLM_TPU_BUCKET_PADDING_GAP,
-        )
-        # In case `max_num_tokens < max(num_tokens_paddings)` use the actual
-        # padded max value to pre-allocate data structures and pre-compile.
-        self.max_num_tokens = self.num_tokens_paddings[-1]
-
-        # Model-related.
-        self.num_attn_layers = model_config.get_num_layers_by_block_type(
-            parallel_config, "attention"
-        )
-        self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
-        self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
-        self.head_size = model_config.get_head_size()
-        self.inputs_embeds_size = model_config.get_inputs_embeds_size()
-        self.vocab_size = model_config.get_vocab_size()
-
-        # Multi-modal data support
-        self.mm_registry = MULTIMODAL_REGISTRY
-        self.uses_mrope = model_config.uses_mrope
-        self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            model_config
-        )
-        # TODO: Support M-RoPE (e.g, Qwen2-VL)
-        assert not self.uses_mrope, "TPU does not support M-RoPE yet."
-
-        self._num_slices_per_kv_cache_update_block = (
-            _get_num_slices_per_kv_cache_update_block(
-                get_page_size_bytes(
-                    block_size=self.block_size,
-                    num_kv_heads=self.num_kv_heads,
-                    head_size=self.head_size,
-                    kv_cache_dtype=self.kv_cache_dtype,
-                )
-            )
-        )
-
-        # Lazy initialization
-        self.model: nn.Module  # Set after load_model
-        self.kv_caches: list[torch.Tensor] = []
-        # mm_hash -> encoder_output
-        self.encoder_cache: dict[str, torch.Tensor] = {}
-
-        # Request states.
-        self.requests: dict[str, CachedRequestState] = {}
-        # NOTE(rob): num_prompt_logprobs only includes reqs
-        # that are currently in the prefill phase.
-        self.num_prompt_logprobs: dict[str, int] = {}
-
-        # Initialize input batch early to avoid AttributeError in _update_states
-        self.input_batch = InputBatch(
-            max_num_reqs=self.max_num_reqs,
-            max_model_len=self.max_model_len,
-            max_num_batched_tokens=self.max_num_tokens,
-            device=self.device,
-            pin_memory=self.pin_memory,
-            vocab_size=self.model_config.get_vocab_size(),
-            block_sizes=[self.block_size],
-            kernel_block_sizes=[self.cache_config.block_size],
-        )
-
-        # Cached torch/numpy tensor
-        # The pytorch tensor and numpy array share the same buffer.
-        # Sometimes the numpy op is faster so we create both.
-        self.input_ids_cpu = torch.zeros(
-            self.max_num_tokens, dtype=torch.int32, device="cpu"
-        )
-
-        self.positions_cpu = torch.zeros(
-            self.max_num_tokens, dtype=torch.int32, device="cpu"
-        )
-        self.positions_np = self.positions_cpu.numpy()
-        self.block_table_cpu = torch.zeros(
-            (self.max_num_reqs, self.max_num_blocks_per_req),
-            dtype=torch.int32,
-            device="cpu",
-        )
-        # adjust num_reqs to avoid SMEM OOM.
-        self.num_reqs_most_model_len = (
-            min(
-                PallasAttentionBackend.get_max_num_seqs(
-                    self.most_model_len, self.block_size
-                ),
-                self.max_num_reqs,
-            )
-            if self.most_model_len is not None
-            else None
-        )
-        self.num_reqs_max_model_len = min(
-            PallasAttentionBackend.get_max_num_seqs(
-                self.max_model_len, self.block_size
-            ),
-            self.max_num_reqs,
-        )
-        self.query_start_loc_cpu = torch.zeros(
-            self.max_num_tokens + 1,
-            dtype=torch.int32,
-            device="cpu",
-            pin_memory=self.pin_memory,
-        )
-        self.query_start_loc_np = self.query_start_loc_cpu.numpy()
-
-        self.seq_lens_cpu = torch.zeros(
-            self.max_num_tokens,
-            dtype=torch.int32,
-            device="cpu",
-            pin_memory=self.pin_memory,
-        )
-        self.seq_lens_np = self.seq_lens_cpu.numpy()
-
-        # Only relevant for multimodal models
-        if self.supports_mm_inputs:
-            self.is_mm_embed_cpu = torch.zeros(
-                self.max_num_tokens,
-                dtype=torch.bool,
-                device="cpu",
-                pin_memory=self.pin_memory,
-            )
-
-        # Range tensor with values [0 .. self.max_num_tokens - 1].
-        # Used to initialize positions / context_lens / seq_lens
-        # Keep in int64 to avoid overflow with long context
-        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int64)
-        self.num_reqs_paddings = _get_req_paddings(
-            min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs
-        )
-
-        # Layer pairings for cross-layer KV sharing.
-        # If an Attention layer `layer_name` is in the keys of this dict, it
-        # means this layer will perform attention using the keys and values
-        # from the KV cache of `shared_kv_cache_layers[layer_name]`.
-        self.shared_kv_cache_layers: dict[str, str] = {}
-
-        # tensors for structured decoding
-        self.grammar_bitmask_cpu = torch.zeros(
-            (self.max_num_reqs, cdiv(self.vocab_size, 32)),
-            dtype=torch.int32,
-            device="cpu",
-            pin_memory=self.pin_memory,
-        )
-        self.require_structured_out_cpu = torch.zeros(
-            (self.max_num_reqs, 1),
-            dtype=torch.bool,
-            device="cpu",
-            pin_memory=self.pin_memory,
-        )
-        self.structured_decode_arange = torch.arange(
-            0, 32, device="cpu", pin_memory=self.pin_memory
-        )
-
-        self.mm_budget = (
-            MultiModalBudget(
-                self.model_config,
-                self.scheduler_config,
-                self.mm_registry,
-            )
-            if self.supports_mm_inputs
-            else None
-        )
-
-        if not self.use_spmd:
-            self.sample_from_logits_func = torch.compile(
-                self.sample_from_logits,
-                backend="openxla",
-                fullgraph=True,
-                dynamic=False,
-            )
-        else:
-            self.sample_from_logits_func = self.sample_from_logits
-
-        # For passing scheduler_output between successive
-        # execute_model() and sample_tokens() calls.
-        self.scheduler_output: SchedulerOutput | None = None
-        self.mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None
-
-    def reset_mm_cache(self) -> None:
-        if self.mm_budget:
-            self.mm_budget.reset_cache()
-
-    def _update_num_xla_graphs(self, case_str):
-        check_comp = self.check_recompilation and not self.enforce_eager
-        if not check_comp:
-            return
-
-        total_cached_graphs = xr.get_num_cached_compilation_graph()
-        new_compiled_graphs = total_cached_graphs - self.num_xla_graphs
-        if new_compiled_graphs == 0:
-            return
-
-        logger.info(
-            "Add new %d compiled XLA graphs due to %s", new_compiled_graphs, case_str
-        )
-        self.num_xla_graphs += new_compiled_graphs
-
-    def _verify_num_xla_graphs(self, case_str):
-        check_comp = self.check_recompilation and not self.enforce_eager
-        if not check_comp:
-            return
-
-        curr_cached_graph = xr.get_num_cached_compilation_graph()
-        assert self.num_xla_graphs == curr_cached_graph, (
-            "Recompilation after warm up is detected during {}."
-            " num_xla_graphs = {} curr_cached_graph = {}".format(
-                case_str, self.num_xla_graphs, curr_cached_graph
-            )
-        )
-
-    def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
-        """Update the cached states and the persistent batch with the scheduler
-        output.
-
-        The updated states are used by the `_prepare_inputs` function to create
-        the input GPU tensors for the model.
-
-        Returns:
-            True if there is a new/resumed/paused/finished request.
-            If False, we can skip copying SamplingMetadata to the GPU.
-        """
-        # Remove finished requests from the cached states.
-        for req_id in scheduler_output.finished_req_ids:
-            self.requests.pop(req_id, None)
-            self.num_prompt_logprobs.pop(req_id, None)
-
-        # Remove the finished requests from the persistent batch.
-        # NOTE(woosuk): There could be an edge case where finished_req_ids and
-        # scheduled_req_ids overlap. This happens when a request is aborted and
-        # then resubmitted with the same ID. In this case, we treat them as two
-        # distinct requests - clearing the cached states for the first request
-        # and handling the second as a new request.
-        removed_req_indices: list[int] = []
-        for req_id in scheduler_output.finished_req_ids:
-            req_index = self.input_batch.remove_request(req_id)
-            if req_index is not None:
-                removed_req_indices.append(req_index)
-
-        # Free the cached encoder outputs.
-        for mm_hash in scheduler_output.free_encoder_mm_hashes:
-            self.encoder_cache.pop(mm_hash, None)
-
-        # Remove the unscheduled requests from the persistent batch.
-        # NOTE(woosuk): The unscheduled requests are either preempted requests
-        # or running requests that are not scheduled in this step. We remove
-        # them from the persistent batch but keep their cached states since
-        # they will be scheduled again sometime in the future.
-        scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys()
-        cached_req_ids = self.input_batch.req_id_to_index.keys()
-        unscheduled_req_ids = cached_req_ids - scheduled_req_ids
-        # NOTE(woosuk): The persistent batch optimization assumes that
-        # consecutive batches contain mostly the same requests. If batches
-        # have low request overlap (e.g., alternating between two distinct
-        # sets of requests), this optimization becomes very inefficient.
-        for req_id in unscheduled_req_ids:
-            req_index = self.input_batch.remove_request(req_id)
-            assert req_index is not None
-            removed_req_indices.append(req_index)
-
-        req_ids_to_add: list[str] = []
-        # Add new requests to the cached states.
-        for new_req_data in scheduler_output.scheduled_new_reqs:
-            assert new_req_data.sampling_params is not None, (
-                "Pooling is not supported in TPU yet"
-            )
-            req_id = new_req_data.req_id
-            sampling_params = new_req_data.sampling_params
-
-            self.requests[req_id] = CachedRequestState(
-                req_id=req_id,
-                prompt_token_ids=new_req_data.prompt_token_ids,
-                prompt_embeds=new_req_data.prompt_embeds,
-                mm_features=new_req_data.mm_features,
-                sampling_params=sampling_params,
-                pooling_params=None,
-                generator=None,
-                block_ids=new_req_data.block_ids,
-                num_computed_tokens=new_req_data.num_computed_tokens,
-                output_token_ids=[],
-                lora_request=new_req_data.lora_request,
-            )
-
-            if sampling_params and sampling_params.prompt_logprobs is not None:
-                self.num_prompt_logprobs[req_id] = (
-                    self.input_batch.vocab_size
-                    if sampling_params.prompt_logprobs == -1
-                    else sampling_params.prompt_logprobs
-                )
-
-            req_ids_to_add.append(req_id)
-
-        # Update the states of the running/resumed requests.
-        req_data = scheduler_output.scheduled_cached_reqs
-        for i, req_id in enumerate(req_data.req_ids):
-            req_state = self.requests[req_id]
-            num_computed_tokens = req_data.num_computed_tokens[i]
-            new_block_ids = req_data.new_block_ids[i]
-            resumed_from_preemption = req_id in req_data.resumed_req_ids
-
-            # Update the cached states.
-            req_state.num_computed_tokens = num_computed_tokens
-            if not resumed_from_preemption:
-                if new_block_ids is not None:
-                    # Append the new blocks to the existing block IDs.
-                    for block_ids, new_ids in zip(req_state.block_ids, new_block_ids):
-                        block_ids.extend(new_ids)
-            else:
-                assert new_block_ids is not None
-                # The request is resumed from preemption.
-                # Replace the existing block IDs with the new ones.
-                req_state.block_ids = new_block_ids
-
-            req_index = self.input_batch.req_id_to_index.get(req_id)
-            if req_index is None:
-                # The request is not in the persistent batch.
-                # The request was either preempted and resumed later, or was not
-                # scheduled in the previous step and needs to be added again.
-                req_ids_to_add.append(req_id)
-                continue
-
-            # Update the persistent batch.
-            self.input_batch.num_computed_tokens_cpu[req_index] = num_computed_tokens
-            if new_block_ids is not None:
-                self.input_batch.block_table.append_row(new_block_ids, req_index)
-
-        # Add the new or resumed requests to the persistent batch.
-        # The smaller empty indices are filled first.
-        removed_req_indices = sorted(removed_req_indices, reverse=True)
-        for req_id in req_ids_to_add:
-            req_state = self.requests[req_id]
-            # Fill the empty index or append to the end
-            req_index = removed_req_indices.pop() if removed_req_indices else None
-            self.input_batch.add_request(req_state, req_index)
-
-        # Condense the batched states if there are empty indices.
-        if removed_req_indices:
-            self.input_batch.condense(removed_req_indices)
-
-        return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
-
-    def get_model(self) -> nn.Module:
-        return self.model
-
-    def get_supported_generation_tasks(self) -> list[GenerationTask]:
-        model = self.get_model()
-        supported_tasks = list[GenerationTask]()
-
-        if is_text_generation_model(model):
-            supported_tasks.append("generate")
-
-        if supports_transcription(model):
-            if model.supports_transcription_only:
-                return ["transcription"]
-
-            supported_tasks.append("transcription")
-
-        return supported_tasks
-
-    def get_supported_pooling_tasks(self) -> list[PoolingTask]:
-        model = self.get_model()
-        if not is_pooling_model(model):
-            return []
-
-        return list(model.pooler.get_supported_tasks())
-
-    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
-        tasks = list[SupportedTask]()
-
-        if self.model_config.runner_type == "generate":
-            tasks.extend(self.get_supported_generation_tasks())
-        if self.model_config.runner_type == "pooling":
-            tasks.extend(self.get_supported_pooling_tasks())
-
-        return tuple(tasks)
-
-    def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
-        """
-        Generates the KVCacheSpec by parsing the kv cache format from each
-        Attention module in the static forward context.
-        Returns:
-            KVCacheSpec: A dictionary mapping layer names to their KV cache
-            format. Layers that do not need KV cache are not included.
-        """
-
-        layers = get_layers_from_vllm_config(
-            self.vllm_config,
-            AttentionLayerBase,  # type: ignore[type-abstract]
-        )
-        block_size = self.vllm_config.cache_config.block_size
-        cache_dtype_str = self.vllm_config.cache_config.cache_dtype
-
-        kv_cache_spec: dict[str, KVCacheSpec] = {}
-        for layer_name, attn_module in layers.items():
-            # Classic Attention path
-            if isinstance(attn_module, Attention):
-                if (
-                    kv_tgt_layer := attn_module.kv_sharing_target_layer_name
-                ) is not None:
-                    # The layer doesn't need its own KV cache and will use that of
-                    # the target layer. We skip creating a KVCacheSpec for it, so
-                    # that KV cache management logic will act as this layer does
-                    # not exist, and doesn't allocate KV cache for the layer. This
-                    # enables the memory saving of cross-layer kv sharing, allowing
-                    # a given amount of memory to accommodate longer context lengths
-                    # or enable more requests to be processed simultaneously.
-                    self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
-                    continue
-
-                if attn_module.attn_type == AttentionType.DECODER:
-                    if isinstance(attn_module, ChunkedLocalAttention):
-                        logger.warning_once(
-                            "Using irope in Pallas is not supported yet, it "
-                            "will fall back to global attention for long context."
-                        )
-                    if attn_module.sliding_window is not None:
-                        kv_cache_spec[layer_name] = SlidingWindowSpec(
-                            block_size=block_size,
-                            num_kv_heads=attn_module.num_kv_heads,
-                            head_size=attn_module.head_size,
-                            dtype=self.kv_cache_dtype,
-                            sliding_window=attn_module.sliding_window,
-                        )
-                    else:
-                        kv_cache_spec[layer_name] = FullAttentionSpec(
-                            block_size=block_size,
-                            num_kv_heads=attn_module.num_kv_heads,
-                            head_size=attn_module.head_size,
-                            dtype=self.kv_cache_dtype,
-                        )
-                elif attn_module.attn_type in (
-                    AttentionType.ENCODER,
-                    AttentionType.ENCODER_ONLY,
-                ):
-                    # encoder-only attention does not need KV cache.
-                    continue
-                elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
-                    raise NotImplementedError
-                else:
-                    raise ValueError(f"Unknown attention type: {attn_module.attn_type}")
-            # MLAAttention path
-            elif isinstance(attn_module, MLAAttention):
-                if layer_name in kv_cache_spec:
-                    continue
-                kv_cache_spec[layer_name] = MLAAttentionSpec(
-                    block_size=block_size,
-                    num_kv_heads=1,
-                    head_size=attn_module.head_size,
-                    dtype=self.kv_cache_dtype,
-                    cache_dtype_str=cache_dtype_str,
-                )
-            else:
-                continue
-
-        return kv_cache_spec
-
-    def _get_slot_mapping_metadata(
-        self, num_reqs, num_scheduled_tokens_per_req
-    ) -> np.ndarray:
-        """
-        Computes metadata for mapping slots to blocks in the key-value (KV)
-        cache for a batch of requests.
-
-        This function determines, for each request in the batch, how the
-        scheduled tokens are distributed across memory blocks, and generates
-        metadata needed to map slices of tokens to their corresponding positions
-        in the KV cache.
-
-        Args:
-            num_reqs (int): Number of requests in the current batch.
-            num_scheduled_tokens_per_req (int or np.ndarray): Number of tokens
-                to be scheduled for each request.
-
-        Returns:
-            np.ndarray: A 2D array of shape (total_block_len, 3), where each row
-                contains:
-                - kv_cache_start_index (int): The starting index in the KV cache
-                  for the corresponding slice.
-                - new_kv_start_index (int): The starting index in the new KV
-                  cache for the corresponding slice.
-                - slice_len (int): The length of the slice.
-        """
-        slices_start = self.input_batch.num_computed_tokens_cpu[:num_reqs]
-        slices_end = (
-            self.input_batch.num_computed_tokens_cpu[:num_reqs]
-            + num_scheduled_tokens_per_req
-        )
-        local_block_start_idx = slices_start // self.block_size
-        local_block_end_idx = (slices_end - 1) // self.block_size
-        no_repeat_req_indices = self.arange_np[:num_reqs]
-        global_block_start_idx = (
-            no_repeat_req_indices * self.max_num_blocks_per_req + local_block_start_idx
-        )
-        block_lens = local_block_end_idx - local_block_start_idx + 1
-        global_block_start_idx = np.repeat(global_block_start_idx, block_lens)
-        slice_arange = np.concatenate([self.arange_np[:n] for n in block_lens])
-        global_block_indices = global_block_start_idx + slice_arange
-        block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
-        block_numbers = block_table_cpu.flatten()[global_block_indices].numpy()
-        total_block_len = np.sum(block_lens)
-        slot_mapping_slices = np.repeat(
-            np.array([[0, self.block_size]], dtype=np.int32), total_block_len, axis=0
-        )
-        cu_block_lens = np.zeros(len(block_lens) + 1, dtype=np.int32)
-        np.cumsum(block_lens, out=cu_block_lens[1:])
-        for req_idx in range(num_reqs):
-            slot_mapping_slices[cu_block_lens[req_idx]][0] = (
-                slices_start[req_idx] % self.block_size
-            )
-            slot_mapping_slices[cu_block_lens[req_idx + 1] - 1][1] = (
-                slices_end[req_idx] - 1
-            ) % self.block_size + 1
-        slice_lens = slot_mapping_slices[:, 1] - slot_mapping_slices[:, 0]
-        cu_slices_lens = np.zeros(len(slice_lens) + 1, dtype=np.int32)
-        np.cumsum(slice_lens, out=cu_slices_lens[1:])
-        kv_cache_start_indices = slot_mapping_slices[:, 0] + (
-            block_numbers * self.block_size
-        )
-        new_kv_start_indices = cu_slices_lens[:-1]
-        slot_mapping_metadata = np.stack(
-            [kv_cache_start_indices, new_kv_start_indices, slice_lens], axis=1
-        )
-        return slot_mapping_metadata
-
-    def _prepare_inputs(self, scheduler_output: "SchedulerOutput", start_index: int):
-        assert scheduler_output.total_num_scheduled_tokens > 0
-        num_reqs = self.input_batch.num_reqs
-        assert num_reqs > 0
-        assert start_index < num_reqs
-
-        # Get the number of scheduled tokens for each request.
-        use_max_model_len = self.most_model_len is None
-        num_scheduled_tokens_per_req = []
-        max_num_scheduled_tokens_all_reqs = 0
-        end_index = start_index
-
-        # Use either most_model_len or max_model_len depending on request size.
-        for i in range(start_index, num_reqs):
-            req_id = self.input_batch.req_ids[i]
-            assert req_id is not None
-            num_tokens = scheduler_output.num_scheduled_tokens[req_id]
-            if (
-                not use_max_model_len
-                and self.most_model_len is not None
-                and num_tokens > self.most_model_len
-            ):
-                use_max_model_len = True
-            num_scheduled_tokens_per_req.append(num_tokens)
-        if use_max_model_len:
-            if len(num_scheduled_tokens_per_req) > self.num_reqs_max_model_len:
-                num_scheduled_tokens_per_req = num_scheduled_tokens_per_req[
-                    : self.num_reqs_max_model_len
-                ]
-                end_index = start_index + self.num_reqs_max_model_len
-            else:
-                end_index = num_reqs
-        else:
-            assert self.num_reqs_most_model_len is not None
-            if len(num_scheduled_tokens_per_req) > self.num_reqs_most_model_len:
-                num_scheduled_tokens_per_req = num_scheduled_tokens_per_req[
-                    : self.num_reqs_most_model_len
-                ]
-                end_index = start_index + self.num_reqs_most_model_len
-            else:
-                end_index = num_reqs
-        max_num_scheduled_tokens_all_reqs = max(num_scheduled_tokens_per_req)
-        num_scheduled_tokens_per_req = np.array(
-            num_scheduled_tokens_per_req, dtype=np.int32
-        )
-        total_num_scheduled_tokens = sum(num_scheduled_tokens_per_req)
-        assert max_num_scheduled_tokens_all_reqs > 0
-
-        num_reqs = len(num_scheduled_tokens_per_req)
-
-        # Get request indices.
-        # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
-        # For each scheduled token, what are the corresponding req index.
-        req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens_per_req)
-
-        # Get batched arange.
-        # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        # For each scheduled token, what is its position in corresponding req.
-        arange = np.concatenate(
-            [self.arange_np[:n] for n in num_scheduled_tokens_per_req]
-        )
-
-        # Get positions.
-        positions_np = self.positions_np[:total_num_scheduled_tokens]
-        np.add(
-            self.input_batch.num_computed_tokens_cpu[req_indices],
-            arange,
-            out=positions_np,
-        )
-
-        # Get token indices.
-        # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
-        # where M is the max_model_len.
-        token_indices = (
-            positions_np + req_indices * self.input_batch.token_ids_cpu.shape[1]
-        )
-
-        # NOTE(woosuk): We use torch.index_select instead of np.take here
-        # because torch.index_select is much faster than np.take for large
-        # tensors.
-        torch.index_select(
-            self.input_batch.token_ids_cpu_tensor.flatten(),
-            0,
-            torch.from_numpy(token_indices),
-            out=self.input_ids_cpu[:total_num_scheduled_tokens],
-        )
-
-        # Prepare the attention metadata.
-        self.query_start_loc_np[0] = 0
-        np.cumsum(
-            num_scheduled_tokens_per_req, out=self.query_start_loc_np[1 : num_reqs + 1]
-        )
-        self.query_start_loc_np[num_reqs + 1 :] = 1
-
-        self.seq_lens_np[:num_reqs] = (
-            self.input_batch.num_computed_tokens_cpu[:num_reqs]
-            + num_scheduled_tokens_per_req
-        )
-
-        # Do the padding and copy the tensors to the TPU.
-        padded_total_num_scheduled_tokens = _get_padded_token_len(
-            self.num_tokens_paddings, total_num_scheduled_tokens
-        )
-        # Zero out to avoid spurious values from prev iteration (last cp chunk)
-        self.input_ids_cpu[
-            total_num_scheduled_tokens:padded_total_num_scheduled_tokens
-        ] = 0
-        self.input_ids = self.input_ids_cpu[:padded_total_num_scheduled_tokens].to(
-            self.device
-        )
-        self.position_ids = self.positions_cpu[:padded_total_num_scheduled_tokens].to(
-            self.device
-        )
-        if use_max_model_len:
-            block_tables = self.block_table_cpu[
-                : self.num_reqs_max_model_len, : self.max_num_blocks_per_req
-            ]
-            block_tables[:num_reqs, : self.max_num_blocks_per_req] = (
-                self.input_batch.block_table[0].get_cpu_tensor()[:num_reqs]
-            )
-            query_start_loc = self.query_start_loc_cpu[
-                : self.num_reqs_max_model_len + 1
-            ].to(self.device)
-            seq_lens = self.seq_lens_cpu[: self.num_reqs_max_model_len].to(self.device)
-        else:
-            assert self.num_reqs_most_model_len is not None
-            block_tables = self.block_table_cpu[
-                : self.num_reqs_most_model_len, : self.num_blocks_per_most_len_req
-            ]
-            block_tables[:num_reqs, : self.num_blocks_per_most_len_req] = (
-                self.input_batch.block_table[0].get_cpu_tensor()[
-                    :num_reqs, : self.num_blocks_per_most_len_req
-                ]
-            )
-            query_start_loc = self.query_start_loc_cpu[
-                : self.num_reqs_most_model_len + 1
-            ].to(self.device)
-            seq_lens = self.seq_lens_cpu[: self.num_reqs_most_model_len].to(self.device)
-        block_tables = block_tables.to(self.device)
-
-        # Calculate the slot mapping
-        slot_mapping_metadata = self._get_slot_mapping_metadata(
-            num_reqs, num_scheduled_tokens_per_req
-        )
-        num_kv_update_slices = slot_mapping_metadata.shape[0]
-        padded_num_slices = _get_padded_num_kv_cache_update_slices(
-            padded_total_num_scheduled_tokens, self.max_num_reqs, self.block_size
-        )
-        slot_mapping_metadata = np.pad(
-            slot_mapping_metadata,
-            [[0, padded_num_slices - len(slot_mapping_metadata)], [0, 0]],
-            constant_values=0,
-        )
-        slot_mapping_metadata = np.transpose(slot_mapping_metadata)
-        slot_mapping_metadata = torch.tensor(slot_mapping_metadata, device=self.device)
-
-        if self.lora_config is not None:
-            # We need to respect padding when activating LoRA adapters
-            padded_num_scheduled_tokens_per_req = np.copy(
-                num_scheduled_tokens_per_req
-            )  # Copying to avoid accidental state corruption bugs
-            padded_num_scheduled_tokens_per_req[-1] += (
-                padded_total_num_scheduled_tokens - total_num_scheduled_tokens
-            )
-
-            self.set_active_loras(self.input_batch, padded_num_scheduled_tokens_per_req)
-
-        attn_metadata = PallasMetadata(
-            slot_mapping=slot_mapping_metadata,
-            block_tables=block_tables,
-            context_lens=seq_lens,
-            query_start_loc=query_start_loc,
-            num_seqs=torch.tensor([num_reqs], dtype=torch.int32, device=self.device),
-            num_kv_update_slices=torch.tensor(
-                [num_kv_update_slices], dtype=torch.int32, device=self.device
-            ),
-            num_slices_per_kv_cache_update_block=self._num_slices_per_kv_cache_update_block,
-        )
-        # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial
-        # request in the batch. While we should not sample any token from this
-        # partial request, we do so for simplicity. We will ignore the sampled
-        # token from the partial request.
-        # TODO: Support prompt logprobs.
-        padded_num_reqs = _get_padded_num_reqs_with_upper_limit(
-            num_reqs, self.max_num_reqs
-        )
-        # Indices at which we sample (positions of last token in the sequence).
-        # Padded to avoid recompiling when `num_reqs` varies.
-        logits_indices = self.query_start_loc_cpu[1 : padded_num_reqs + 1] - 1
-        logits_indices = logits_indices.to(self.device)
-
-        if self.lora_config is not None:
-            # We need to respect padding when activating LoRA adapters
-            padded_num_scheduled_tokens_per_req = np.copy(
-                num_scheduled_tokens_per_req
-            )  # Copying to avoid accidental state corruption bugs
-            padded_num_scheduled_tokens_per_req[-1] += (
-                padded_total_num_scheduled_tokens - total_num_scheduled_tokens
-            )
-
-            self.set_active_loras(self.input_batch, padded_num_scheduled_tokens_per_req)
-
-        layer_names = get_layers_from_vllm_config(self.vllm_config, Attention).keys()
-        per_layer_attn_metadata = {
-            layer_name: attn_metadata for layer_name in layer_names
-        }
-        return (
-            per_layer_attn_metadata,
-            logits_indices,
-            padded_num_reqs,
-            num_reqs,
-            end_index,
-        )
-
-    def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
-        if not scheduled_encoder_inputs:
-            return
-
-        # Batch the multi-modal inputs.
-        mm_kwargs = list[MultiModalKwargsItem]()
-        # List of tuple (mm_hash, pos_info)
-        mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
-        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            req_state = self.requests[req_id]
-
-            for mm_input_id in encoder_input_ids:
-                mm_feature = req_state.mm_features[mm_input_id]
-                if mm_feature.data is None:
-                    continue
-                mm_hash = mm_feature.identifier
-                mm_kwargs.append(mm_feature.data)
-                mm_hashes_pos.append((mm_hash, mm_feature.mm_position))
-
-        # Batch mm inputs as much as we can: if a request in the batch has
-        # multiple modalities or a different modality than the previous one,
-        # we process it separately to preserve item order.
-        # FIXME(ywang96): This is a hacky way to deal with multiple modalities
-        # in the same batch while still being able to benefit from batching
-        # multimodal inputs. The proper solution should be reordering the
-        # encoder outputs.
-        model = cast(SupportsMultiModal, self.model)
-        encoder_outputs = []
-        for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
-            mm_kwargs,
-            device=self.device,
-            pin_memory=self.pin_memory,
-        ):
-            # Run the encoder.
-            # `curr_group_outputs` is either of the following:
-            # 1. A tensor of shape (num_items, feature_size, hidden_size)
-            # in case feature_size is fixed across all multimodal items.
-            # 2. A list or tuple (length: num_items) of tensors, each of shape
-            # (feature_size, hidden_size) in case the feature size is dynamic
-            # depending on the input multimodal items.
-            torch_xla.sync(wait=False)
-            curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
-            torch_xla.sync(wait=False)
-
-            sanity_check_mm_encoder_outputs(
-                curr_group_outputs,
-                expected_num_items=num_items,
-            )
-
-            if isinstance(curr_group_outputs, torch.Tensor):
-                encoder_outputs.append(curr_group_outputs)
-            else:
-                assert isinstance(curr_group_outputs, (list, tuple))
-                for output in curr_group_outputs:
-                    encoder_outputs.append(output)
-
-        # Cache the encoder outputs.
-        # NOTE (NickLucche) here we diverge from logic in other runners, as we
-        # assume to only have whole mm items to process. Hence we avoid the
-        # intrinsic dynamism that `scatter_mm_placeholders` introduces.
-        for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs):
-            assert pos_info.is_embed is None, (
-                "Expected all positions to be contiguous and embeddings."
-            )
-            self.encoder_cache[mm_hash] = output
-
-    def _gather_mm_embeddings(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> tuple[list[torch.Tensor], torch.Tensor]:
-        total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        padded_total_num_scheduled_tokens = _get_padded_token_len(
-            self.num_tokens_paddings, total_num_scheduled_tokens
-        )
-
-        is_mm_embed = self.is_mm_embed_cpu
-        is_mm_embed[:padded_total_num_scheduled_tokens] = False
-        mm_embeds = list[torch.Tensor]()
-        req_start_idx = 0
-
-        for req_id in self.input_batch.req_ids:
-            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
-            req_state = self.requests[req_id]
-            num_computed_tokens = req_state.num_computed_tokens
-
-            # TODO unroll loop and assume/enforce --disable_chunked_mm_input
-            # NOTE (NickLucche) here we diverge from logic in other runners, as
-            # we assume to only have whole mm items to process. Hence we avoid
-            # the intrinsic dynamism that `gather_mm_placeholders` introduces.
-            for mm_feature in req_state.mm_features:
-                pos_info = mm_feature.mm_position
-                start_pos = pos_info.offset
-                num_encoder_tokens = pos_info.length
-
-                # The encoder output is needed if the two ranges overlap:
-                # [num_computed_tokens,
-                #  num_computed_tokens + num_scheduled_tokens) and
-                # [start_pos, start_pos + num_encoder_tokens)
-                if start_pos >= num_computed_tokens + num_scheduled_tokens:
-                    # The encoder output is not needed in this step.
-                    break
-                if start_pos + num_encoder_tokens <= num_computed_tokens:
-                    # The encoder output is already processed and stored
-                    # in the decoder's KV cache.
-                    continue
-
-                start_idx = max(num_computed_tokens - start_pos, 0)
-                end_idx = min(
-                    num_computed_tokens - start_pos + num_scheduled_tokens,
-                    num_encoder_tokens,
-                )
-                assert start_idx < end_idx
-
-                mm_hash = mm_feature.identifier
-                encoder_output = self.encoder_cache.get(mm_hash, None)
-                assert encoder_output is not None, f"Encoder cache miss for {mm_hash}."
-
-                assert pos_info.is_embed is None, (
-                    "Expected all positions to be contiguous and embeddings."
-                )
-
-                req_start_pos = req_start_idx + start_pos - num_computed_tokens
-                is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = True
-
-                # Only whole mm items are processed
-                mm_embeds.append(encoder_output)
-
-            req_start_idx += num_scheduled_tokens
-
-        is_mm_embed = is_mm_embed[:padded_total_num_scheduled_tokens].to(self.device)
-
-        return mm_embeds, is_mm_embed
-
-    def _get_model_inputs(
-        self,
-        input_ids: torch.Tensor,
-        mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None,
-    ):
-        if self.supports_mm_inputs:
-            mm_embeds, is_mm_embed = mm_embed_inputs or (None, None)
-
-            # NOTE(woosuk): To unify token ids and soft tokens (vision
-            # embeddings), we always use embeddings (rather than token ids)
-            # as input to the multimodal model, even when the input is text.
-            inputs_embeds = self.model.embed_input_ids(
-                input_ids,
-                multimodal_embeddings=mm_embeds,
-                is_multimodal=is_mm_embed,
-            )
-
-            return None, inputs_embeds
-        else:
-            # For text-only models, we use token ids as input.
-            # While it is possible to use embeddings as input just like the
-            # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
-            return input_ids, None
-
-    @torch.no_grad()
-    def execute_model(
-        self,
-        scheduler_output: "SchedulerOutput",
-        intermediate_tensors: IntermediateTensors | None = None,
-    ) -> ModelRunnerOutput | None:
-        if self.scheduler_output is not None:
-            raise RuntimeError(
-                "State error: sample_tokens() must be called "
-                "after execute_model() returns None."
-            )
-        # Update cached state
-        self._update_states(scheduler_output)
-        if not scheduler_output.total_num_scheduled_tokens:
-            if not has_kv_transfer_group():
-                # Return empty ModelRunnerOutput if there's no work to do.
-                return EMPTY_MODEL_RUNNER_OUTPUT
-
-            return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
-
-        mm_embed_inputs = None
-        if self.supports_mm_inputs:
-            # Run the multimodal encoder if any.
-            self._execute_mm_encoder(scheduler_output)
-            mm_embed_inputs = self._gather_mm_embeddings(scheduler_output)
-
-        torch_xla.sync(wait=False)
-
-        self.scheduler_output = scheduler_output
-        self.mm_embed_inputs = mm_embed_inputs
-        return None
-
-    @torch.no_grad()
-    def sample_tokens(
-        self, grammar_output: "GrammarOutput | None"
-    ) -> ModelRunnerOutput:
-        if self.scheduler_output is None:
-            # Nothing to do (PP non-final rank case), output isn't used.
-            return None  # type: ignore[return-value]
-        scheduler_output = self.scheduler_output
-        mm_embed_inputs = self.mm_embed_inputs
-        self.scheduler_output = None
-        self.mm_embed_inputs = None
-
-        # Prepare inputs, the requests might be split into multiple
-        # executions, combine the result of each execution.
-        start_index = 0
-        combined_selected_tokens: list[torch.Tensor] = []
-        combined_logprobs: list[LogprobsLists] = []
-
-        # NOTE: setup current batch's metadata for kv connector.
-        # Currently, only verified with NixlConnector
-        with set_forward_context(None, self.vllm_config):
-            self.maybe_setup_kv_connector(scheduler_output)
-
-        while start_index < self.input_batch.num_reqs:
-            attn_metadata, logits_indices, padded_num_reqs, num_reqs, end_index = (
-                self._prepare_inputs(scheduler_output, start_index)
-            )
-            input_ids, inputs_embeds = self._get_model_inputs(
-                self.input_ids, mm_embed_inputs
-            )
-            torch_xla.sync(wait=False)
-            # Run the decoder
-            with set_forward_context(
-                attn_metadata,
-                self.vllm_config,
-                num_tokens=scheduler_output.total_num_scheduled_tokens,
-            ):
-                hidden_states = self.model(
-                    input_ids=input_ids,
-                    positions=self.position_ids,
-                    inputs_embeds=inputs_embeds,
-                )
-            hidden_states = self.select_hidden_states(hidden_states, logits_indices)
-            logits = self.compute_logits(hidden_states)
-            tpu_sampling_metadata = TPUSupportedSamplingMetadata.from_input_batch(
-                self.input_batch, padded_num_reqs, self.device
-            )
-            if grammar_output is not None:
-                require_struct_decoding, grammar_bitmask_padded, arange = (
-                    self.prepare_structured_decoding_input(logits, grammar_output)
-                )
-                logits = self.structured_decode(
-                    require_struct_decoding, grammar_bitmask_padded, logits, arange
-                )
-            selected_token_ids = self.sample_from_logits_func(
-                logits, tpu_sampling_metadata
-            )
-            # NOTE (NickLucche) Use the original logits (before any penalties or
-            # temperature scaling) for the top-k logprobs. We can't enforce it
-            # due to recompilations outside torch.compiled code, so just make
-            # sure `sample_from_logits` does not modify the logits in-place.
-            logprobs = (
-                self.gather_logprobs(logits, selected_token_ids)
-                if tpu_sampling_metadata.logprobs
-                else None
-            )
-
-            # Remove padding on cpu and keep dynamic op outside of xla graph.
-            selected_token_ids = selected_token_ids.cpu()[:num_reqs]
-
-            combined_selected_tokens.append(selected_token_ids)
-            if tpu_sampling_metadata.logprobs:
-                combined_logprobs.append(logprobs.tolists())
-
-            start_index = end_index
-
-        # NOTE: current kv load and save get h2d/d2h copies involved.
-        # Those copies are blocking. Once they become async., kv_save
-        # should be called right after each single forward pass,
-        # instead of the forwards of the entire input batch.
-        self.maybe_wait_for_kv_save()
-        finished_sending, finished_recving = self.get_finished_kv_transfers(
-            scheduler_output
-        )
-
-        selected_token_ids = torch.cat(combined_selected_tokens, dim=0)
-        if tpu_sampling_metadata.logprobs:
-
-            def concat_lists(input_lists):
-                result = []
-                for input_list in input_lists:
-                    result.extend(input_list)
-                return result
-
-            logprobs_lists = LogprobsLists(
-                logprob_token_ids=concat_lists(
-                    [lp.logprob_token_ids for lp in combined_logprobs]
-                ),
-                logprobs=concat_lists([lp.logprobs for lp in combined_logprobs]),
-                sampled_token_ranks=concat_lists(
-                    [lp.sampled_token_ranks for lp in combined_logprobs]
-                ),
-            )
-        else:
-            logprobs_lists = None
-
-        # Update the cache state concurrently. Code above will not block until
-        # we use `selected_token_ids`. Add mark_step if post-processing changes
-        request_seq_lens: list[tuple[int, CachedRequestState, int]] = []
-        discard_sampled_tokens_req_indices = []
-        num_reqs = self.input_batch.num_reqs
-        for i, req_id in zip(range(num_reqs), self.input_batch.req_ids):
-            assert req_id is not None
-            req_state = self.requests[req_id]
-            seq_len = (
-                req_state.num_computed_tokens
-                + scheduler_output.num_scheduled_tokens[req_id]
-            )
-            if seq_len >= req_state.num_tokens:
-                request_seq_lens.append((i, req_state, seq_len))
-            else:
-                # Ignore the sampled token from the partial request.
-                # Rewind the generator state as if the token was not sampled.
-                generator = self.input_batch.generators.get(i)
-                if generator is not None:
-                    # This relies on cuda-specific torch-internal impl details
-                    generator.set_offset(generator.get_offset() - 4)
-
-                # Record the index of the request that should not be sampled,
-                # so that we could clear the sampled tokens before returning.
-                discard_sampled_tokens_req_indices.append(i)
-
-        assert all(
-            req_id is not None for req_id in self.input_batch.req_ids[:num_reqs]
-        ), "req_ids contains None"
-        req_ids = cast(list[str], self.input_batch.req_ids[:num_reqs])
-
-        prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
-        for req_id in self.input_batch.req_ids[:num_reqs]:
-            prompt_logprobs_dict[req_id] = None
-
-        max_gen_len = selected_token_ids.shape[-1]
-        if max_gen_len == 1:
-            valid_sampled_token_ids = selected_token_ids.tolist()
-
-            # Mask out the sampled tokens that should not be sampled.
-            # TODO: Keep in sync with gpu_model_runner.py, in particular
-            #       the "else" case here
-            for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[i].clear()
-
-            # Append sampled tokens
-            for i, req_state, seq_len in request_seq_lens:
-                token_id = valid_sampled_token_ids[i][0]
-                self.input_batch.token_ids_cpu[i, seq_len] = token_id
-                req_state.output_token_ids.append(token_id)
-                self.input_batch.num_tokens[i] += 1
-
-        else:
-            valid_mask = selected_token_ids != INVALID_TOKEN_ID
-            gen_lens = valid_mask.sum(dim=1).tolist()
-            valid_sampled_token_ids = [
-                seq.tolist() for seq in selected_token_ids[valid_mask].split(gen_lens)
-            ]
-            self.input_batch.num_tokens[:num_reqs] += gen_lens
-            for i, req_state, seq_len in request_seq_lens:
-                target_slice = slice(seq_len - gen_lens[i] + 1, seq_len + 1)
-                self.input_batch.token_ids_cpu[i, target_slice] = (
-                    valid_sampled_token_ids[i]
-                )
-                req_state.output_token_ids.extend(valid_sampled_token_ids[i])
-
-        kv_connector_output = (
-            None
-            if (finished_sending is None and finished_recving is None)
-            else KVConnectorOutput(
-                finished_sending=finished_sending,
-                finished_recving=finished_recving,
-            )
-        )
-
-        model_runner_output = ModelRunnerOutput(
-            req_ids=req_ids,
-            req_id_to_index=self.input_batch.req_id_to_index,
-            sampled_token_ids=valid_sampled_token_ids,
-            logprobs=logprobs_lists,
-            prompt_logprobs_dict=prompt_logprobs_dict,
-            pooler_output=[],
-            kv_connector_output=kv_connector_output,
-        )
-
-        # Check there are no new graphs compiled - all the graphs should be
-        # captured and compiled during warm up.
-        self._verify_num_xla_graphs("execute_model")
-
-        return model_runner_output
-
-    def update_config(self, overrides: dict[str, Any]) -> None:
-        # TODO: TPU config may need extra validation
-        # https://github.com/vllm-project/vllm/pull/20095#discussion_r2201497754
-        allowed_config_names = {"load_config", "model_config"}
-        for config_name, config_overrides in overrides.items():
-            assert config_name in allowed_config_names, (
-                f"Config `{config_name}` not supported. "
-                f"Allowed configs: {allowed_config_names}"
-            )
-            config = getattr(self, config_name)
-            new_config = update_config(config, config_overrides)
-            setattr(self, config_name, new_config)
-
-    def load_model(self) -> None:
-        self.device = self.device_config.device
-
-        # NOTE(woosuk): While the executor assigns the TP ranks to the worker
-        # process, the ranks can be different from the ranks internally assigned
-        # by the xm runtime. Therefore, there is a mismatch in the rank
-        # assignment between the gloo (cpu) runtime and the xm (tpu) runtime.
-        # This is not a problem in linear layers because all-reduce is
-        # rank-agnostic. However, it matters for all-gather as the ranks
-        # determine the order of concatenating the output tensors.
-        # As a workaround, we use the xm's rank assignment only when loading
-        # the embedding weights.
-        xm_tp_rank = xr.global_ordinal()
-        with patch(
-            "vllm.model_executor.layers.vocab_parallel_embedding."
-            "get_tensor_model_parallel_rank",
-            return_value=xm_tp_rank,
-        ):
-            try:
-                if self.use_spmd:
-                    tpu_loader = TPUModelLoader(
-                        load_config=self.vllm_config.load_config
-                    )
-                    model = tpu_loader.load_model(
-                        vllm_config=self.vllm_config,
-                        model_config=self.vllm_config.model_config,
-                        mesh=self.mesh,
-                    )
-                else:
-                    model_loader = get_model_loader(self.load_config)
-                    logger.info("Loading model from scratch...")
-                    model = model_loader.load_model(
-                        vllm_config=self.vllm_config, model_config=self.model_config
-                    )
-            except RuntimeError as e:
-                raise RuntimeError(
-                    f"Unable to load model, a likely reason is the model is "
-                    "too large for the current device's HBM memory. "
-                    "Consider switching to a smaller model "
-                    "or sharding the weights on more chips. "
-                    f"See the detailed error: {e}"
-                ) from e
-        if self.lora_config is not None:
-            model = self.load_lora_model(model, self.vllm_config, self.device)
-            replace_set_lora(model)
-
-        # Sync all pending XLA execution during model initialization and weight
-        # loading.
-        torch_xla.sync(wait=False)
-        xm.wait_device_ops()
-        if not hasattr(self, "model"):
-            self.model = model
-        self.sampler = TPUSampler()
-
-    def reload_weights(self) -> None:
-        assert getattr(self, "model", None) is not None, (
-            "Cannot reload weights before model is loaded."
-        )
-        model_loader = get_model_loader(self.load_config)
-        logger.info("Reloading weights inplace...")
-        model_loader.load_weights(self.model, model_config=self.model_config)
-
-    @torch.no_grad()
-    def _dummy_run(self, num_tokens: int, num_reqs: int, num_blocks: int) -> None:
-        if self.supports_mm_inputs:
-            input_ids = None
-            inputs_embeds = torch.zeros(
-                (num_tokens, self.inputs_embeds_size),
-                dtype=self.dtype,
-                device=self.device,
-            )
-        else:
-            input_ids = torch.zeros((num_tokens), dtype=torch.int32).to(self.device)
-            inputs_embeds = None
-        actual_num_reqs = min(num_tokens, num_reqs)
-        position_ids = torch.zeros(num_tokens, dtype=torch.int32).to(self.device)
-        padded_num_slices = _get_padded_num_kv_cache_update_slices(
-            num_tokens, self.max_num_reqs, self.block_size
-        )
-        num_kv_update_slices = torch.tensor([padded_num_slices], dtype=torch.int32).to(
-            self.device
-        )
-        slot_mapping = torch.zeros((3, padded_num_slices), dtype=torch.int32).to(
-            self.device
-        )
-        block_tables = torch.zeros((num_reqs, num_blocks), dtype=torch.int32).to(
-            self.device
-        )
-        query_lens = [1] * num_reqs
-        query_start_loc = torch.cumsum(
-            torch.tensor([0] + query_lens, dtype=torch.int32), dim=0, dtype=torch.int32
-        ).to(self.device)
-        context_lens = torch.ones((num_reqs,), dtype=torch.int32).to(self.device)
-        num_seqs = torch.tensor([actual_num_reqs], dtype=torch.int32).to(self.device)
-        attn_metadata = PallasMetadata(
-            slot_mapping=slot_mapping,
-            block_tables=block_tables,
-            context_lens=context_lens,
-            query_start_loc=query_start_loc,
-            num_seqs=num_seqs,
-            num_kv_update_slices=num_kv_update_slices,
-            num_slices_per_kv_cache_update_block=self._num_slices_per_kv_cache_update_block,
-        )
-
-        if self.supports_mm_inputs:
-            torch._dynamo.mark_dynamic(inputs_embeds, 0)
-        else:
-            torch._dynamo.mark_dynamic(input_ids, 0)
-        torch._dynamo.mark_dynamic(position_ids, 0)
-        torch._dynamo.mark_dynamic(attn_metadata.slot_mapping, 0)
-        torch._dynamo.mark_dynamic(attn_metadata.block_tables, (0, 1))
-        torch._dynamo.mark_dynamic(attn_metadata.context_lens, 0)
-        torch._dynamo.mark_dynamic(attn_metadata.query_start_loc, 0)
-
-        layer_names = get_layers_from_vllm_config(self.vllm_config, Attention).keys()
-        per_layer_attn_metadata = {
-            layer_name: attn_metadata for layer_name in layer_names
-        }
-
-        with (
-            self.maybe_select_dummy_loras(
-                self.lora_config, np.array([num_tokens], dtype=np.int32)
-            ),
-            set_forward_context(per_layer_attn_metadata, self.vllm_config, 0),
-        ):
-            out = self.model(
-                input_ids=input_ids, positions=position_ids, inputs_embeds=inputs_embeds
-            )
-        self._hidden_states_dtype = out.dtype
-
-    def _set_active_loras(
-        self, prompt_lora_mapping, token_lora_mapping, lora_requests
-    ) -> None:
-        torch_xla.sync(wait=False)  # Captures input updates
-        super()._set_active_loras(
-            prompt_lora_mapping, token_lora_mapping, lora_requests
-        )
-        torch_xla.sync(wait=False)  # Captures metadata updates
-
-    def _precompile_mm_encoder(self) -> None:
-        if not self.supports_mm_inputs:
-            return
-
-        # Pre-compile MM encoder for all supported data modalities.
-        hf_config = self.vllm_config.model_config.hf_config
-
-        mm_budget = self.mm_budget
-        assert mm_budget is not None
-
-        max_items_per_seq_by_modality = mm_budget.max_items_per_batch_by_modality  # noqa: E501
-
-        for mode, max_items_per_seq in max_items_per_seq_by_modality.items():
-            logger.info(
-                "Compiling Multimodal %s Encoder with different input shapes.", mode
-            )
-            start = time.perf_counter()
-            # No padding for MM encoder just yet.
-            for num_items in range(1, max_items_per_seq + 1):
-                logger.info("  -- mode: %s items: %d", mode, num_items)
-                batched_dummy_mm_inputs = self._get_mm_dummy_batch(
-                    mode,
-                    num_items,
-                )
-                # Run multimodal encoder.
-                torch_xla.sync(wait=False)
-                mm_embeds = self.model.embed_multimodal(**batched_dummy_mm_inputs)
-                torch_xla.sync(wait=False)
-                num_patches = mm_embeds[0].shape[0]
-                items_size = num_patches * num_items
-
-                # NOTE (NickLucche) pre-compile `embed_input_ids` when mm
-                # embeddings are present. We assume `--disable-mm-chunked`,
-                # hence only whole items can be scheduled. This implies we just
-                # need to compile when `num_items` fit the (padded) `input_ids`
-                for num_tokens in self.num_tokens_paddings:
-                    if num_tokens >= items_size:
-                        # XLA Workaround: if torch.zeros(..device) is used, XLA
-                        # compiles a scalar+expansion op, which won't match
-                        # the graph generated at runtime. CPU->TPU must be used
-                        placeholders_ids = torch.zeros(
-                            num_tokens, dtype=torch.int32, device="cpu"
-                        )
-                        # Align placeholders and actual num mm_embeddings.
-                        placeholders_ids[:items_size] = hf_config.image_token_index
-
-                        placeholders_ids = placeholders_ids.to(self.device)
-
-                        mm_mask = torch.tensor([False] * num_tokens)
-                        mm_mask[:items_size] = True
-                        mm_mask = mm_mask.to(self.device)
-                        # Assign outputs or the graph will be cut short.
-                        a, b = self._get_model_inputs(
-                            placeholders_ids,
-                            mm_embed_inputs=([mm_embeds], mm_mask),
-                        )
-                        assert a is None
-                        torch_xla.sync(wait=False)
-
-            # Pre-compile `embed_input_ids` when mm_embeddings are not
-            # present. Chunk is only made of text, no mm_placeholders.
-            for num_tokens in self.num_tokens_paddings:
-                placeholders_ids = torch.zeros(
-                    num_tokens, dtype=torch.int32, device="cpu"
-                )
-                placeholders_ids = placeholders_ids.to(self.device)
-                a, b = self._get_model_inputs(
-                    placeholders_ids,
-                    mm_embed_inputs=None,
-                )
-                assert a is None
-                torch_xla.sync(wait=False)
-
-            xm.wait_device_ops()
-            end = time.perf_counter()
-            logger.info(
-                "Multimodal %s Encoder compilation finished in in %.2f [secs].",
-                mode,
-                end - start,
-            )
-
-    def _precompile_backbone(self) -> None:
-        logger.info("Compiling the model with different input shapes.")
-        start = time.perf_counter()
-        for num_tokens in self.num_tokens_paddings:
-            logger.info("  -- num_tokens: %d", num_tokens)
-            self._dummy_run(
-                num_tokens, self.num_reqs_max_model_len, self.max_num_blocks_per_req
-            )
-            if self.most_model_len is not None:
-                self._dummy_run(
-                    num_tokens,
-                    self.num_reqs_most_model_len,
-                    self.num_blocks_per_most_len_req,
-                )
-        xm.wait_device_ops()
-        end = time.perf_counter()
-        logger.info("Compilation finished in %.2f [secs].", end - start)
-        self._update_num_xla_graphs("model backbone")
-
-    def _precompile_select_hidden_states(self) -> None:
-        # Compile hidden state selection function for bucketed
-        # n_tokens x max_num_reqs. Graph is really small so this is fine.
-        logger.info("Compiling select_hidden_states with different input shapes.")
-        start = time.perf_counter()
-        hsize = self.model_config.get_hidden_size()
-        for num_tokens in self.num_tokens_paddings:
-            dummy_hidden = torch.zeros(
-                (num_tokens, hsize), device=self.device, dtype=self._hidden_states_dtype
-            )
-            torch._dynamo.mark_dynamic(dummy_hidden, 0)
-            for num_reqs in self.num_reqs_paddings:
-                indices = torch.zeros(num_reqs, dtype=torch.int32, device=self.device)
-                torch._dynamo.mark_dynamic(indices, 0)
-                self.select_hidden_states(dummy_hidden, indices)
-                logger.info("  -- num_tokens: %d, num_seqs: %d", num_tokens, num_reqs)
-                # Requests can't be more than tokens. But do compile for the
-                # next bigger value in case num_tokens uses bucketed padding.
-                if num_reqs >= min(num_tokens, self.max_num_reqs):
-                    break
-        xm.wait_device_ops()
-        end = time.perf_counter()
-        logger.info("Compilation finished in %.2f [secs].", end - start)
-        self._update_num_xla_graphs("select_hidden_states")
-
-    def _precompile_compute_logits(self) -> None:
-        logger.info("Compiling compute_logits with different input shapes.")
-        start = time.perf_counter()
-        hsize = self.model_config.get_hidden_size()
-        for num_reqs in self.num_reqs_paddings:
-            dummy_hidden = torch.zeros(
-                (num_reqs, hsize), device=self.device, dtype=self._hidden_states_dtype
-            )
-            torch._dynamo.mark_dynamic(dummy_hidden, 0)
-            self.compute_logits(dummy_hidden)
-            logger.info("  -- num_seqs: %d", num_reqs)
-        xm.wait_device_ops()
-        end = time.perf_counter()
-        logger.info("Compilation finished in %.2f [secs].", end - start)
-        self._update_num_xla_graphs("compute_logits")
-
-    def _precompile_structured_decoding(self) -> None:
-        logger.info("Compiling structured_decoding with different input shapes.")
-        start = time.perf_counter()
-        for num_reqs in self.num_reqs_paddings:
-            dummy_logits = torch.zeros(
-                (num_reqs, self.vocab_size),
-                device=self.device,
-                dtype=self._hidden_states_dtype,
-            )
-            dummy_require_struct_decoding = self.require_structured_out_cpu[
-                :num_reqs
-            ].to(self.device)
-            dummy_grammar_bitmask = self.grammar_bitmask_cpu[:num_reqs].to(self.device)
-            # The first dimension of the above 3 dummy tensors cannot be
-            # mark_dynamic because some operations in structured_decode require
-            # them to be static.
-            arange = self.structured_decode_arange.to(self.device)
-            self.structured_decode(
-                dummy_require_struct_decoding,
-                dummy_grammar_bitmask,
-                dummy_logits,
-                arange,
-            )
-            logger.info("  -- num_seqs: %d", num_reqs)
-        xm.wait_device_ops()
-        end = time.perf_counter()
-        logger.info("Compilation finished in %.2f [secs].", end - start)
-        self._update_num_xla_graphs("structured_decoding")
-
-    def _precompile_sample_from_logits(self) -> None:
-        logger.info("Compiling sample_from_logits with different input shapes.")
-        start = time.perf_counter()
-        for num_reqs in self.num_reqs_paddings:
-            dummy_logits = torch.zeros(
-                (num_reqs, self.vocab_size),
-                device=self.device,
-                dtype=self._hidden_states_dtype,
-            )
-            # The first dimension of dummy_logits cannot be mark_dynamic
-            # because some operations in the sampler require it to be static.
-            for all_greedy in [False, True]:
-                generate_params_if_all_greedy = not all_greedy
-                sampling_metadata = TPUSupportedSamplingMetadata.from_input_batch(
-                    self.input_batch,
-                    num_reqs,
-                    self.device,
-                    generate_params_if_all_greedy,
-                )
-                sampling_metadata.all_greedy = all_greedy
-                with self.maybe_select_dummy_loras(
-                    self.lora_config, np.array([num_reqs], dtype=np.int32)
-                ):
-                    self.sample_from_logits_func(dummy_logits, sampling_metadata)
-            logger.info("  -- num_seqs: %d", num_reqs)
-        xm.wait_device_ops()
-        end = time.perf_counter()
-        logger.info("Compilation finished in %.2f [secs].", end - start)
-        self._update_num_xla_graphs("sample_from_logits")
-
-    def _precompile_gather_logprobs(self) -> None:
-        logger.info("Compiling gather_logprobs with different input shapes.")
-        start = time.perf_counter()
-        for num_reqs in self.num_reqs_paddings:
-            dummy_logits = torch.zeros(
-                (num_reqs, self.vocab_size),
-                device=self.device,
-                dtype=self._hidden_states_dtype,
-            )
-            dummy_tokens = torch.zeros((num_reqs, 1), dtype=torch.int64).to(self.device)
-            with self.maybe_select_dummy_loras(
-                self.lora_config, np.array([num_reqs], dtype=np.int32)
-            ):
-                self.gather_logprobs(dummy_logits, dummy_tokens)
-            logger.info("  -- num_seqs: %d", num_reqs)
-        xm.wait_device_ops()
-        end = time.perf_counter()
-        logger.info("Compilation finished in %.2f [secs].", end - start)
-        self._update_num_xla_graphs("gather_logprobs")
-
-    def capture_model(self) -> None:
-        """
-        Precompile all the subgraphs with possible input shapes.
-        """
-        with self.maybe_setup_dummy_loras(self.lora_config):
-            self._precompile_mm_encoder()
-            self._precompile_backbone()
-            self._precompile_select_hidden_states()
-            self._precompile_compute_logits()
-            self._precompile_structured_decoding()
-            self._precompile_sample_from_logits()
-            self._precompile_gather_logprobs()
-
-    def profile_run(
-        self,
-        num_tokens: int,
-    ) -> None:
-        # Profile with multimodal encoder & encoder cache.
-        if self.supports_mm_inputs:
-            mm_config = self.model_config.multimodal_config
-            if mm_config is not None and mm_config.skip_mm_profiling:
-                logger.info(
-                    "Skipping memory profiling for multimodal encoder and "
-                    "encoder cache."
-                )
-            else:
-                mm_budget = self.mm_budget
-                assert mm_budget is not None
-
-                # TODO: handle encoder-decoder models once we support them.
-                if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
-                    # NOTE: Currently model is profiled with a single non-text
-                    # modality with the max possible input tokens even when
-                    # it supports multiple.
-                    dummy_modality = mm_budget.get_modality_with_max_tokens()
-                    max_mm_items_per_batch = mm_budget.max_items_per_batch_by_modality[
-                        dummy_modality
-                    ]
-
-                    logger.info(
-                        "Encoder cache will be initialized with a budget of "
-                        "%s tokens, and profiled with %s %s items of the "
-                        "maximum feature size.",
-                        encoder_budget,
-                        max_mm_items_per_batch,
-                        dummy_modality,
-                    )
-
-                    # Create dummy batch of multimodal inputs.
-                    batched_dummy_mm_inputs = self._get_mm_dummy_batch(
-                        dummy_modality,
-                        max_mm_items_per_batch,
-                    )
-
-                    # Run multimodal encoder.
-                    # Isolate encoder graph from post-processing to minimize
-                    # impact of recompilation until it's fixed.
-                    start = time.perf_counter()
-                    torch_xla.sync(wait=False)
-                    dummy_encoder_outputs = self.model.embed_multimodal(
-                        **batched_dummy_mm_inputs
-                    )
-                    torch_xla.sync(wait=False)
-                    xm.wait_device_ops()
-                    end = time.perf_counter()
-                    logger.info(
-                        "Multimodal Encoder profiling finished in %.2f [secs].",
-                        end - start,
-                    )
-
-                    sanity_check_mm_encoder_outputs(
-                        dummy_encoder_outputs,
-                        expected_num_items=max_mm_items_per_batch,
-                    )
-
-                    # Cache the dummy encoder outputs.
-                    self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
-
-        # Trigger compilation for general shape.
-        self._dummy_run(
-            num_tokens, self.num_reqs_max_model_len, self.max_num_blocks_per_req
-        )
-        if self.most_model_len is not None:
-            self._dummy_run(
-                num_tokens,
-                self.num_reqs_most_model_len,
-                self.num_blocks_per_most_len_req,
-            )
-
-        torch_xla.sync(wait=False)
-        xm.wait_device_ops()
-        self.encoder_cache.clear()
-        gc.collect()
-
-    def maybe_setup_cross_layer_kv_sharing(
-        self,
-        kv_caches: dict[str, torch.Tensor],
-        kv_cache_config: KVCacheConfig,
-    ) -> None:
-        """
-        Add layers that re-use KV cache to KV cache group of its target layer.
-        Mapping of KV cache tensors happens in `initialize_kv_cache_tensors()`
-        """
-        if not self.shared_kv_cache_layers:
-            # No cross-layer KV sharing, return
-            return
-
-        add_kv_sharing_layers_to_kv_cache_groups(
-            self.shared_kv_cache_layers,
-            kv_cache_config.kv_cache_groups,
-        )
-
-        for layer_name, target_layer_name in self.shared_kv_cache_layers.items():
-            logger.debug("%s reuses KV cache of %s", layer_name, target_layer_name)
-            kv_caches[layer_name] = kv_caches[target_layer_name]
-
-    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
-        """
-        Initialize KV cache based on `kv_cache_config`.
-        Args:
-            kv_cache_config: Configuration for the KV cache, including the KV
-            cache size of each layer
-        """
-        if len(kv_cache_config.kv_cache_groups) > 1:
-            raise NotImplementedError(
-                "Hybrid models with more than one KV cache type are not supported yet."
-            )
-
-        if (
-            kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
-            != self.block_size
-        ):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.max_model_len,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=self.pin_memory,
-                vocab_size=self.model_config.get_vocab_size(),
-                block_sizes=[
-                    kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
-                ],
-                kernel_block_sizes=[
-                    kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
-                ],
-            )
-        # Verify dtype compatibility between block_table_cpu and input_batch
-        assert (
-            self.block_table_cpu.dtype
-            == self.input_batch.block_table[0].get_cpu_tensor().dtype
-        )
-
-        kv_cache_sizes = {}
-        for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
-            assert len(kv_cache_tensor.shared_by) == 1, (
-                "KV cache tensor shared by multiple layers is not supported in TPU."
-            )
-            kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
-
-        kv_caches: dict[str, torch.Tensor] = {}
-        for kv_cache_group in kv_cache_config.kv_cache_groups:
-            kv_cache_spec = kv_cache_group.kv_cache_spec
-            for layer_name in kv_cache_group.layer_names:
-                tensor_size = kv_cache_sizes[layer_name]
-                assert tensor_size % kv_cache_spec.page_size_bytes == 0
-                num_blocks = tensor_size // kv_cache_spec.page_size_bytes  # noqa
-                if isinstance(kv_cache_spec, AttentionSpec):
-                    if self.use_spmd:
-                        num_kv_heads = kv_cache_spec.num_kv_heads
-                        assert self.original_parallel_config is not None
-                        tp_size = self.original_parallel_config.tensor_parallel_size
-                        # TODO: Handle kv cache duplication under SPMD mode.
-                        assert num_kv_heads % tp_size == 0, (
-                            f"num_kv_heads {num_kv_heads} must be divisible by "
-                            f"tp_size {tp_size} under SPMD mode"
-                        )
-                    kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape(
-                        num_blocks,
-                        kv_cache_spec.block_size,
-                        kv_cache_spec.num_kv_heads,
-                        kv_cache_spec.head_size,
-                    )
-                    dtype = kv_cache_spec.dtype
-
-                    tpu_kv_cache = torch.zeros(kv_cache_shape, dtype=dtype).to(
-                        self.device
-                    )
-
-                    kv_caches[layer_name] = tpu_kv_cache
-                else:
-                    raise NotImplementedError
-
-        # Set up cross-layer KV cache sharing if needed
-        self.maybe_setup_cross_layer_kv_sharing(kv_caches, kv_cache_config)
-
-        bind_kv_cache(
-            kv_caches,
-            self.vllm_config.compilation_config.static_forward_context,
-            self.kv_caches,
-        )
-
-        if self.use_spmd:
-            # Shard KV Cache
-            for cache in self.kv_caches:
-                xs.mark_sharding(cache, self.mesh, (None, "x", None, None))
-
-        if has_kv_transfer_group():
-            get_kv_transfer_group().register_kv_caches(kv_caches)
-            get_kv_transfer_group().set_host_xfer_buffer_ops(copy_kv_blocks)
-
-    def reset_dynamo_cache(self):
-        # NOTE: We check `is_multimodal_model` instead of `supports_mm_inputs`
-        # since the compiled model object of the language backbone of a
-        # multimodal model needs to be extracted via `get_language_model`.
-        if self.model_config.is_multimodal_model:
-            compiled_model = self.model.get_language_model().model
-        else:
-            compiled_model = self.model.model
-        if isinstance(compiled_model, TorchCompileWithNoGuardsWrapper):
-            logger.info("Clear dynamo cache and cached dynamo bytecode.")
-            torch._dynamo.eval_frame.remove_from_cache(
-                compiled_model.original_code_object()
-            )
-            # Reset the wrapper to re-initialize.
-            compiled_model.compiled = False
-            TorchCompileWithNoGuardsWrapper.__init__(compiled_model)
-
-    @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
-    def select_hidden_states(self, hidden_states, indices_do_sample):
-        return hidden_states[indices_do_sample]
-
-    @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
-    def compute_logits(self, sample_hidden_states: torch.Tensor) -> torch.Tensor:
-        return self.model.compute_logits(sample_hidden_states)
-
-    # TODO: Under SPMD mode, sample_from_logits has correctness issue.
-    #       Re-enable the torch.compile once the issue is fixed in torchxla.
-    # @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
-    def sample_from_logits(
-        self, logits: torch.Tensor, sampling_metadata: TPUSupportedSamplingMetadata
-    ) -> torch.Tensor:
-        """
-        Sample with xla-friendly function. This function is to be traced
-        separately from `forward` for lighter compilation overhead.
-        """
-        if sampling_metadata.all_greedy:
-            out_tokens = torch.argmax(logits, dim=-1, keepdim=True)
-        else:
-            out_tokens = self.sampler(logits, sampling_metadata).sampled_token_ids
-        return out_tokens
-
-    @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
-    def gather_logprobs(
-        self, logits: torch.Tensor, sampled_tokens: torch.Tensor
-    ) -> LogprobsTensors:
-        """
-        Gather the top_logprobs with corresponding tokens. Use a fixed number
-        of logprobs as an alternative to having multiple pre-compiled graphs.
-        Select the number of logprobs actually demanded by each request on CPU.
-        """
-        logprobs = self.sampler.compute_logprobs(logits)
-        return self.sampler.gather_logprobs(
-            logprobs,
-            self.model_config.max_logprobs,
-            token_ids=sampled_tokens.squeeze(-1),
-        )
-
-    @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
-    def structured_decode(
-        self,
-        require_struct_decoding: torch.Tensor,
-        grammar_bitmask: torch.Tensor,
-        logits: torch.Tensor,
-        arange: torch.Tensor,
-    ) -> torch.Tensor:
-        return torch.where(
-            require_struct_decoding,
-            self.apply_grammar_bitmask(logits, grammar_bitmask, arange),
-            logits,
-        )
-
-    def apply_grammar_bitmask(
-        self, logits: torch.Tensor, grammar_bitmask: torch.Tensor, arange: torch.Tensor
-    ):
-        assert logits.shape[0] == grammar_bitmask.shape[0]
-        logits_cloned = logits.clone()
-        for i in range(logits.shape[0]):
-            unpacked_bitmask = (
-                torch.bitwise_right_shift(grammar_bitmask[i][:, None], arange[None, :])
-                & 1
-            ) == 0
-            unpacked_bitmask = unpacked_bitmask.reshape(-1)[: self.vocab_size]
-            logits_cloned[i] = logits_cloned[i].masked_fill(
-                unpacked_bitmask, -float("inf")
-            )
-        return logits_cloned
-
-    def embed_multimodal(self, *args, **kwargs):
-        return self.model.embed_multimodal(*args, **kwargs)
-
-    def embed_input_ids(self, *args, **kwargs):
-        return self.model.embed_input_ids(*args, **kwargs)
-
-    def prepare_structured_decoding_input(
-        self, logits: torch.Tensor, grammar_output: "GrammarOutput"
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        grammar_bitmask = grammar_output.grammar_bitmask
-        num_reqs, _ = logits.shape
-
-        # Reset pre-allocated tensors
-        self.grammar_bitmask_cpu.zero_()
-        self.require_structured_out_cpu.zero_()
-
-        cumulative_mask_idx = 0
-        for req_id in grammar_output.structured_output_request_ids:
-            if req_id not in self.input_batch.req_id_to_index:
-                continue
-            batch_index = self.input_batch.req_id_to_index[req_id]
-            self.grammar_bitmask_cpu[batch_index] = torch.from_numpy(
-                grammar_bitmask[cumulative_mask_idx]
-            )
-            # It's not guaranteed that all requests in this batch require
-            # structured output, so create a bool tensor to represent
-            # the requests that need structured output.
-            self.require_structured_out_cpu[batch_index] = True
-            cumulative_mask_idx += 1
-
-        return (
-            self.require_structured_out_cpu[:num_reqs].to(logits.device),
-            self.grammar_bitmask_cpu[:num_reqs].to(logits.device),
-            self.structured_decode_arange.to(logits.device),
-        )
-
-    def _get_mm_dummy_batch(
-        self,
-        modality: str,
-        max_items_per_batch: int,
-    ) -> BatchedTensorInputs:
-        """Dummy data for profiling and precompiling multimodal models."""
-        assert self.mm_budget is not None
-
-        dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
-            model_config=self.model_config,
-            seq_len=self.max_model_len,
-            mm_counts={modality: 1},
-            cache=self.mm_budget.cache,
-        )
-        dummy_mm_data = dummy_decoder_data.multi_modal_data
-
-        # Result in the maximum GPU consumption of the model
-        dummy_mm_item = dummy_mm_data[modality][0]
-        dummy_mm_items = [dummy_mm_item] * max_items_per_batch
-
-        return next(
-            grouped_mm_kwargs
-            for _, _, grouped_mm_kwargs in group_mm_kwargs_by_modality(
-                dummy_mm_items,
-                device=self.device,
-                pin_memory=self.pin_memory,
-            )
-        )
-
-
-def _get_req_paddings(min_req_size: int, max_req_size: int) -> list[int]:
-    logger.info("Preparing request paddings:")
-    # assert min_req_size is power of 2
-    assert (min_req_size & (min_req_size - 1) == 0) and min_req_size > 0
-    paddings: list = []
-    num = max(MIN_NUM_SEQS, min_req_size)
-    while num <= max_req_size and (len(paddings) == 0 or paddings[-1] != num):
-        paddings.append(num)
-        logger.info("    %d", num)
-        num = _get_padded_num_reqs_with_upper_limit(num + 1, max_req_size)
-    return paddings
-
-
-def _get_padded_num_reqs_with_upper_limit(x: int, upper_limit: int) -> int:
-    res = MIN_NUM_SEQS if x <= MIN_NUM_SEQS else 1 << (x - 1).bit_length()
-    return min(res, upper_limit)
-
-
-def _get_token_paddings(
-    min_token_size: int, max_token_size: int, padding_gap: int
-) -> list[int]:
-    """Generate a list of padding size, starting from min_token_size,
-    ending with a number that can cover max_token_size
-
-    If padding_gap == 0 then:
-        increase 2X each time (exponential)
-    else:
-        first increase the size to twice,
-        then increase the padding size by padding_gap.
-    """
-    # assert min_token_size is power of 2
-    assert (min_token_size & (min_token_size - 1) == 0) and min_token_size > 0
-    paddings = []
-    num = min_token_size
-
-    if padding_gap == 0:
-        logger.info("Using exponential token paddings:")
-        while True:
-            logger.info("    %d", num)
-            paddings.append(num)
-            if num >= max_token_size:
-                break
-            num *= 2
-    else:
-        logger.info("Using incremental token paddings:")
-        while num <= padding_gap:
-            logger.info("    %d", num)
-            paddings.append(num)
-            num *= 2
-        num //= 2
-        while num < max_token_size:
-            num += padding_gap
-            logger.info("    %d", num)
-            paddings.append(num)
-
-    return paddings
-
-
-def _get_padded_token_len(paddings: list[int], x: int) -> int:
-    """Return the first element in paddings list greater or equal to x."""
-    index = bisect.bisect_left(paddings, x)
-    assert index < len(paddings)
-    return paddings[index]
-
-
-def _get_padded_num_kv_cache_update_slices(
-    num_tokens: int, max_num_reqs: int, page_size: int
-) -> int:
-    """Calculates the padded number of KV cache update slices to avoid
-    recompilation."""
-    # NOTE(chengjiyao): let's say R_i is the token num for i-th request,
-    # so it occupies most 2 + R_i // page_size pages. The total maximum
-    # possible number of pages needed is sum(2 + R_i // page_size), which
-    # is <= 2 * max_num_reqs + sum(R_i) // page_size
-    # = 2 * max_num_reqs + num_tokens // page_size
-    padded_num_slices = 2 * max_num_reqs + num_tokens // page_size
-    padded_num_slices = min(padded_num_slices, num_tokens)
-    return padded_num_slices
-
-
-def _get_num_slices_per_kv_cache_update_block(page_size_bytes: int) -> int:
-    """Find the optimum number of slices to copy per Pallas program instance.
-
-    Increasing the number of slices copied in one instance of the kernel program
-    will increase HBM bandwidth utilization via more in-flight DMAs.
-
-    However, it will also use more VMEM, and experimentally, we observed
-    performance regression at 128 slices on v6e, likely due to running
-    out of scalar registers. Thus this function will limit the number of
-    slices to 64.
-    """
-    # The default vmem_limit_bytes of a pallas kernel is 32MB. Here we
-    # calculate num_slices_per_block based on 16MB in case any register spills.
-    vmem_limit = 16 * 1024 * 1024
-    num_slices_per_block = vmem_limit // page_size_bytes
-    assert num_slices_per_block > 0, "Number of slices should be positive"
-    num_slices_per_block = prev_power_of_2(num_slices_per_block)
-    if num_slices_per_block > 64:
-        num_slices_per_block = 64
-    return num_slices_per_block
-
-
-def replace_set_lora(model):
-    def _tpu_set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: torch.Tensor | None,
-    ):
-        # TODO: The integer index leads to a recompilation, but converting it
-        # to a tensor doesn't seem to work anymore. This might be fixed with a
-        # later release of torch_xla.
-        self._original_set_lora(index, lora_a, lora_b, embeddings_tensor)
-        torch_xla.sync(wait=False)
-
-    def _tpu_reset_lora(self, index: int):
-        self._original_reset_lora(index)
-        torch_xla.sync(wait=False)
-
-    for _, module in model.named_modules():
-        if isinstance(module, BaseLayerWithLoRA):
-            module._original_set_lora = module.set_lora
-            module._original_reset_lora = module.reset_lora
-            module.set_lora = _tpu_set_lora.__get__(  # type: ignore[method-assign]
-                module, module.__class__
-            )
-            module.reset_lora = _tpu_reset_lora.__get__(  # type: ignore[method-assign]
-                module, module.__class__
-            )
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 5f6136b178b46c269e0abda1fc6761df51a5b7a6..4c73d6c92d39187dedf0c3cde28b30d136833fb8 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -2,350 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A TPU worker class."""
 
-import os
-from collections.abc import Callable
-from typing import Any, TypeVar
+from typing import TypeVar
 
-import torch
-import torch.nn as nn
-
-import vllm.envs as envs
-from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.distributed import (
-    ensure_model_parallel_initialized,
-    init_distributed_environment,
-)
-from vllm.distributed.kv_transfer import (
-    ensure_kv_transfer_initialized,
-)
 from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor import set_random_seed
-from vllm.platforms import current_platform
 from vllm.platforms.tpu import USE_TPU_INFERENCE
-from vllm.tasks import SupportedTask
-from vllm.utils.math_utils import cdiv
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
-from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
-from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec
-from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.utils import report_usage_stats
-from vllm.v1.worker.utils import bind_kv_cache
 
 logger = init_logger(__name__)
 
 _R = TypeVar("_R")
 
-if not USE_TPU_INFERENCE:
-    logger.info("tpu_inference not found, using vLLM's TPUWorker.")
-    import torch_xla.core.xla_model as xm
-    import torch_xla.debug.profiler as xp
-    import torch_xla.runtime as xr
-
-    from vllm.v1.attention.backends.pallas import TPU_HEAD_SIZE_ALIGNMENT
-    from vllm.v1.worker.tpu_model_runner import TPUModelRunner
-
-
-class TPUWorker:
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        local_rank: int,
-        rank: int,
-        distributed_init_method: str,
-        is_driver_worker: bool = False,
-    ):
-        self.is_driver_worker = is_driver_worker
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-        self.cache_config = vllm_config.cache_config
-        self.lora_config = vllm_config.lora_config
-        self.load_config = vllm_config.load_config
-        self.parallel_config = vllm_config.parallel_config
-        self.use_spmd = envs.VLLM_XLA_USE_SPMD
-        self.original_parallel_config = None
-        if self.use_spmd:
-            # Under SPMD mode, distributed env is initialized as if there is
-            # only one worker/device.
-            self.original_parallel_config = self.parallel_config
-            self.parallel_config.tensor_parallel_size = 1
-            self.parallel_config.pipeline_parallel_size = 1
-            self.parallel_config.world_size = 1
-        self.scheduler_config = vllm_config.scheduler_config
-        self.device_config = vllm_config.device_config
-        self.speculative_config = vllm_config.speculative_config
-        self.observability_config = vllm_config.observability_config
-
-        self.parallel_config.rank = rank
-        self.local_rank = local_rank
-        self.rank = rank
-        self.distributed_init_method = distributed_init_method
-
-        if self.cache_config.cache_dtype == "auto":
-            self.cache_dtype = self.model_config.dtype
-        else:
-            self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[self.cache_config.cache_dtype]
-
-        if self.model_config.trust_remote_code:
-            # note: lazy import to avoid importing torch before initializing
-            from vllm.utils.import_utils import init_cached_hf_modules
-
-            init_cached_hf_modules()
-
-        # Delay profiler initialization to the start of the profiling.
-        # This is because in vLLM V1, MP runtime is initialized before the
-        # TPU Worker is initialized. The profiler server needs to start after
-        # MP runtime is initialized.
-        self.profiler = None
-        self.profile_dir = None
-        if vllm_config.profiler_config.profiler == "torch" and self.rank < 1:
-            # For TPU, we can only have 1 active profiler session for 1 profiler
-            # server. So we only profile on rank0.
-            self.profile_dir = vllm_config.profiler_config.torch_profiler_dir
-            logger.info(
-                "Profiling enabled. Traces will be saved to: %s", self.profile_dir
-            )
-
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
-        self.cache_config.num_gpu_blocks = num_gpu_blocks
-        self.cache_config.num_cpu_blocks = num_cpu_blocks
-
-    def init_device(self):
-        os.environ["PJRT_DEVICE"] = "TPU"
-        # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D
-        # ring, the xla tpu compiler flag
-        # `xla_tpu_force_1d_allreduce_at_chunk_count` is a temporary solution to
-        # fix this. It will be removed after the bug in XLA compiler is fixed.
-        os.environ["LIBTPU_INIT_ARGS"] = (
-            os.environ.get("LIBTPU_INIT_ARGS", "")
-            + " --xla_tpu_force_1d_allreduce_at_chunk_count=1"
-            " --xla_jf_conv_input_fusion=False"
-        )
-        # --xla_jf_conv_input_fusion=False is used to improve the perf of
-        # quantized matmul.
-        torch.set_grad_enabled(False)
-        torch.set_default_dtype(self.model_config.dtype)
-
-        # Initialize the distributed environment.
-        self._init_tpu_worker_distributed_environment(
-            self.vllm_config, self.rank, self.distributed_init_method, self.local_rank
-        )
-
-        # Device initialization should happen after initializing
-        # the distributed runtime.
-        self.device = xm.xla_device()
-        self.device_config.device = self.device
-
-        # Set random seed.
-        set_random_seed(self.model_config.seed)
-        xm.set_rng_state(self.model_config.seed, self.device)
-
-        # Increase the cache size limit, which is the maximum number of
-        # dynamo graphs that can be compiled.
-        # TODO (NickLucche) On gsm we compile 80+ graphs.
-        # Re-evaluate limit, with MM we may get close to this limit.
-        torch._dynamo.config.cache_size_limit = 128
-        # Use persistent cache to avoid XLA recompilation.
-        # NOTE(woosuk): Set per-rank cache path since different ranks
-        # can have slightly different XLA graphs.
-        world_size = self.parallel_config.world_size
-        rank = xr.global_ordinal()
-        # The PyTorch/XLA compilation cache uses the Torch IR to generate keys.
-        # Consequently, changes in optimization flags, which affect compilation
-        # results, don't change the cache key. This can result in the wrong
-        # compilation being used. To prevent this, disabling the XLA compilation
-        # cache during development is recommended.We can disable it by
-        # `export VLLM_XLA_CACHE_PATH=`
-        if envs.VLLM_XLA_CACHE_PATH:
-            per_rank_path = os.path.join(
-                envs.VLLM_XLA_CACHE_PATH, f"tp{world_size}_rank{rank}"
-            )
-            xr.initialize_cache(per_rank_path, readonly=False)
-
-        # Init ModelRunner here, so that we have access to self.device.
-        self.model_runner = TPUModelRunner(
-            self.vllm_config, self.device, self.original_parallel_config
-        )
-
-        if rank == 0:
-            # If usage stat is enabled, collect relevant info.
-            report_usage_stats(self.vllm_config)
-
-    def determine_available_memory(self) -> int:
-        kv_caches: dict[str, torch.Tensor] = {}
-        kv_cache_spec = self.model_runner.get_kv_cache_spec()
-        for layer_name, layer_spec in kv_cache_spec.items():
-            if isinstance(layer_spec, AttentionSpec):
-                dtype = layer_spec.dtype
-
-                # Use an empty tensor instead of `None` to force Dynamo to pass
-                # it by reference, rather by specializing on the value `None`.
-                tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
-                kv_caches[layer_name] = tpu_kv_cache
-            else:
-                raise NotImplementedError(
-                    f"Unsupported KV cache spec '{type(layer_spec)}'"
-                )
-
-        runner_kv_caches: list[torch.Tensor] = []
-        bind_kv_cache(
-            kv_caches,
-            self.vllm_config.compilation_config.static_forward_context,
-            runner_kv_caches,
-        )
-
-        # `max_num_tokens >= max_num_batched_tokens` due to padding.
-        with self.model_runner.maybe_setup_dummy_loras(self.lora_config):
-            self.model_runner.profile_run(self.model_runner.max_num_tokens)
-
-        # Synchronize before measuring the memory usage.
-        xm.wait_device_ops()
-
-        # During the profiling run, the model runs without KV cache. After
-        # the profiling run, the model always runs with KV cache. Here we clear
-        # the dynamo cache and cached bytecode to ensure the model always has
-        # one compiled bytecode. Having one FX graph/cached bytecode per
-        # compiled model is required for `support_torch_compile` decorator to
-        # skip dynamo guard.
-        with set_current_vllm_config(self.vllm_config):
-            self.model_runner.reset_dynamo_cache()
-
-        # Get the maximum amount of memory used by the model weights and
-        # intermediate activations.
-        if self.use_spmd:
-            # This is a workaround for the TPU SPMD mode. The get_memory_info
-            # API doesn't work with SPMD mode in PyTorch/XLA.
-            # TODO: use xm.get_memory_info for SPMD once it's supported in
-            # PyTorch/XLA.
-            import tpu_info
-
-            chip_type, _ = tpu_info.device.get_local_chips()
-            device_usage = tpu_info.metrics.get_chip_usage(chip_type)
-            total_memory_size = device_usage[0].total_memory
-            current_mem = device_usage[0].memory_usage
-        else:
-            m = xm.get_memory_info(self.device)
-            total_memory_size = m["bytes_limit"]
-            current_mem = m["bytes_used"]
-        # Ideally we would use profiled = m["peak_bytes_used"] to
-        # get weights + activations. But there is memory used during
-        # compilation / weight loading that impacts the peak and
-        # there is no way to reset peak memory in XLA, So we
-        # use the heuristic of 2% of weights.
-        profiled = current_mem * 1.02
-
-        # Calculate the TPU KV cache size based on profiling.
-        usable_memory_size = int(
-            total_memory_size * self.cache_config.gpu_memory_utilization
-        )
-        tpu_kv_cache_bytes = max(usable_memory_size - profiled, 0)
-        head_size = self.model_config.get_head_size()
-        if head_size > 0:
-            padded_head_size = (
-                cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
-            )
-            if padded_head_size != head_size:
-                logger.warning_once("head size is padded to %d", padded_head_size)
-            # We adjust the usable memory size for the KV cache to prevent OOM
-            # errors, even after padding the head_size.
-            tpu_kv_cache_bytes = tpu_kv_cache_bytes * head_size // padded_head_size
-        return int(tpu_kv_cache_bytes)
-
-    def sample_tokens(self, grammar_output: "GrammarOutput") -> ModelRunnerOutput:
-        return self.model_runner.sample_tokens(grammar_output)
-
-    def execute_model(
-        self, scheduler_output: "SchedulerOutput"
-    ) -> ModelRunnerOutput | None:
-        return self.model_runner.execute_model(scheduler_output)
-
-    def profile(self, is_start: bool = True):
-        if self.rank < 1:
-            if self.profile_dir is None:
-                raise RuntimeError("Profiler is not enabled.")
-            if is_start:
-                if self.profiler is None:
-                    self.profiler = xp.start_server(9012)
-                xp.start_trace(self.profile_dir)
-            else:
-                xp.stop_trace()
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        return self.model_runner.add_lora(lora_request)
-
-    def load_model(self) -> None:
-        self.model_runner.load_model()
-
-    def update_config(self, overrides: dict[str, Any]) -> None:
-        self.model_runner.update_config(overrides)
-
-    def reload_weights(self) -> None:
-        self.model_runner.reload_weights()
-
-    def compile_or_warm_up_model(self) -> None:
-        if not self.model_config.enforce_eager:
-            self.model_runner.capture_model()
-
-        # Reset the seed to ensure that the random state is not affected by
-        # the model initialization and profiling.
-        set_random_seed(self.model_config.seed)
-
-    def reset_mm_cache(self) -> None:
-        self.model_runner.reset_mm_cache()
-
-    def get_model(self) -> nn.Module:
-        return self.model_runner.get_model()
-
-    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
-        return self.model_runner.get_supported_tasks()
-
-    def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
-        return self.model_runner.get_kv_cache_spec()
-
-    def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
-        """Allocate GPU KV cache with the specified kv_cache_config."""
-        self.model_runner.initialize_kv_cache(kv_cache_config)
-
-    def check_health(self) -> None:
-        # worker will always be healthy as long as it's running.
-        return
-
-    def _init_tpu_worker_distributed_environment(
-        self,
-        vllm_config: VllmConfig,
-        rank: int,
-        distributed_init_method: str | None = None,
-        local_rank: int = -1,
-    ) -> None:
-        """Initialize the distributed environment."""
-        if self.use_spmd:
-            xr.use_spmd()
-        # NOTE(woosuk): This is just to initialize the TP group and broadcast
-        # the input objects on CPU. The all-reduce and all-gather ops on TPU
-        # are invoked by `xm.all_reduce` and `xm.all_gather` which use their
-        # own context.
-        parallel_config = vllm_config.parallel_config
-        init_distributed_environment(
-            world_size=parallel_config.world_size,
-            rank=rank,
-            local_rank=local_rank,
-            distributed_init_method=distributed_init_method or "env://",
-            backend=current_platform.dist_backend,
-        )
-        ensure_model_parallel_initialized(
-            parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size
-        )
-
-        ensure_kv_transfer_initialized(vllm_config)
-
-    def shutdown(self) -> None:
-        self.model_runner.ensure_kv_transfer_shutdown()
-
-    def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R:
-        """Apply a function on the model inside this worker."""
-        return fn(self.get_model())
-
-
+# TODO(weiyulin) Remove this file after adding an official way to use hardware plugin
 if USE_TPU_INFERENCE:
     from tpu_inference.worker.tpu_worker import TPUWorker as TpuInferenceWorker
 
diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py
index 44788476fc9c51d3da83f60369ca89f4806ef461..f6889173578d6eb6e33df8d1ca427fb364b863ac 100644
--- a/vllm/v1/worker/ubatch_utils.py
+++ b/vllm/v1/worker/ubatch_utils.py
@@ -27,14 +27,16 @@ class UBatchSlice:
 UBatchSlices: TypeAlias = list[UBatchSlice]
 
 
-def is_second_ubatch_empty(orig_num_tokens: int, padded_num_tokens: int) -> bool:
-    return (padded_num_tokens // 2) >= orig_num_tokens
+def is_last_ubatch_empty(
+    orig_num_tokens: int, padded_num_tokens: int, num_ubatches: int
+) -> bool:
+    return (padded_num_tokens // num_ubatches) * (num_ubatches - 1) >= orig_num_tokens
 
 
 def check_ubatch_thresholds(
     config: ParallelConfig, num_tokens: int, uniform_decode: bool
 ) -> bool:
-    if not config.enable_dbo:
+    if not config.use_ubatching:
         return False
     if uniform_decode:
         return num_tokens >= config.dbo_decode_token_threshold
@@ -42,21 +44,17 @@ def check_ubatch_thresholds(
         return num_tokens >= config.dbo_prefill_token_threshold
 
 
-# This just pads the second ubatch slice out to the total number of tokens
+# This pads the last ubatch slice out to the total number of tokens
 # (num_tokens + padding) since we do `create_ubatch_slices` before applying DP padding.
 def _pad_out_ubatch_slices(
     ubatch_slices: UBatchSlices, num_total_tokens: int, num_reqs_padded: int
 ) -> UBatchSlices:
-    # TODO(lucas): handle empty second ubatch
-    padded_second_request_slice = slice(
-        ubatch_slices[1].request_slice.start, num_reqs_padded
-    )
-    padded_second_token_slice = slice(
-        ubatch_slices[1].token_slice.start, num_total_tokens
-    )
-    return [
-        ubatch_slices[0],
-        UBatchSlice(padded_second_request_slice, padded_second_token_slice),
+    last_slice = ubatch_slices[-1]
+    padded_last_request_slice = slice(last_slice.request_slice.start, num_reqs_padded)
+    padded_last_token_slice = slice(last_slice.token_slice.start, num_total_tokens)
+
+    return ubatch_slices[:-1] + [
+        UBatchSlice(padded_last_request_slice, padded_last_token_slice)
     ]
 
 
@@ -65,40 +63,45 @@ def maybe_create_ubatch_slices(
     num_scheduled_tokens: np.ndarray,
     num_tokens_padded: int,
     num_reqs_padded: int,
-    split_point: int | None = None,
+    num_ubatches: int,
+    split_point: list[int] | int | None = None,
 ) -> tuple[UBatchSlices | None, UBatchSlices | None]:
     if not should_ubatch:
         return None, None
 
     if split_point is None:
-        split_point = int(num_tokens_padded) // 2
+        split_point = int(num_tokens_padded) // num_ubatches
+
+    token_split_points = [split_point * i for i in range(1, num_ubatches)]
 
     # TODO(lucas): Refactor the gpu_model_runner.py so we can pass
     # in cu_num_tokens directly (i.e. query_start_loc)
     cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
     np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu_num_tokens[1:])
 
-    first_ubatch_token_slice = slice(0, split_point)
-    second_ubatch_token_slice = slice(split_point, cu_num_tokens[-1])
+    ubatch_slices = []
+    start_token = 0
 
-    # Determine request slices using exclusive stop semantics
-    # First ubatch includes requests whose tokens overlap [0, split_point)
-    first_ubatch_req_stop = int(
-        np.searchsorted(cu_num_tokens, split_point, side="left")
-    )
-    first_ubatch_req_slice = slice(0, first_ubatch_req_stop)
+    # Add the end point to the split points to make iteration easier
+    all_points = token_split_points + [cu_num_tokens[-1]]
 
-    # Second ubatch starts at the request that contains the split_point
-    # or the request starting exactly at split_point (if on boundary)
-    second_ubatch_req_start = int(
-        np.searchsorted(cu_num_tokens, split_point, side="right") - 1
-    )
-    second_ubatch_req_slice = slice(second_ubatch_req_start, len(cu_num_tokens) - 1)
+    for end_token in all_points:
+        token_slice = slice(start_token, end_token)
 
-    ubatch_slices = [
-        UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice),
-        UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice),
-    ]
+        # Determine request slices using exclusive stop semantics
+        # Ubatch includes requests whose tokens overlap [start_token, end_token)
+
+        # Start at the request that contains the start_token
+        # or the request starting exactly at start_token (if on boundary)
+        req_start = int(np.searchsorted(cu_num_tokens, start_token, side="right") - 1)
+
+        # Stop at the request that starts at or after end_token
+        req_stop = int(np.searchsorted(cu_num_tokens, end_token, side="left"))
+
+        req_slice = slice(req_start, req_stop)
+        ubatch_slices.append(UBatchSlice(req_slice, token_slice))
+
+        start_token = end_token
 
     ubatch_slices_padded = _pad_out_ubatch_slices(
         ubatch_slices, num_tokens_padded, num_reqs_padded
diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py
index be8326e2fdbc16b6e1c6ffbb180dd1ca30a72b62..e7a947f2ea8ca0df7db4f4a468ba9d4dab9bd309 100644
--- a/vllm/v1/worker/ubatching.py
+++ b/vllm/v1/worker/ubatching.py
@@ -7,10 +7,15 @@ import torch
 
 from vllm import forward_context
 from vllm.forward_context import ForwardContext
+from vllm.logger import init_logger
 from vllm.utils.torch_utils import current_stream
 
+logger = init_logger(__name__)
+
 _THREAD_ID_TO_CONTEXT: dict = {}
-_CURRENT_CONTEXTS: list[Optional["UBatchContext"]] = [None, None]
+# Here we hardcode the number of microbatches to 2 for default.
+_NUM_UBATCHES: int = 2
+_CURRENT_CONTEXTS: list[Optional["UBatchContext"]] = []
 
 
 class UBatchContext:
@@ -48,6 +53,7 @@ class UBatchContext:
         global _CURRENT_CONTEXTS, _THREAD_ID_TO_CONTEXT
         _THREAD_ID_TO_CONTEXT[threading.get_ident()] = self.id
         _CURRENT_CONTEXTS[self.id] = self
+        # _NUM_UBATCHES is set in make_ubatch_contexts
         self.ready_barrier.wait()
 
         self.cpu_wait_event.wait()
@@ -181,7 +187,7 @@ dbo_switch_to_compute_sync = _register_ubatch_function(
 def dbo_register_recv_hook(recv_hook):
     if len(_THREAD_ID_TO_CONTEXT) > 0:
         ctx_idx = _THREAD_ID_TO_CONTEXT[threading.get_ident()]
-        next_ctx = _CURRENT_CONTEXTS[(ctx_idx + 1) % 2]
+        next_ctx = _CURRENT_CONTEXTS[(ctx_idx + 1) % _NUM_UBATCHES]
         next_ctx.recv_hook = recv_hook
 
 
@@ -202,7 +208,14 @@ def make_ubatch_contexts(
     ready_barrier: threading.Barrier,
     schedule: str = "default",
 ) -> list[UBatchContext]:
-    assert num_micro_batches == 2, "only been tested with 2 micro-batches"
+    global _NUM_UBATCHES, _CURRENT_CONTEXTS
+    assert num_micro_batches > 1, "num_micro_batches must be greater than 1"
+
+    _NUM_UBATCHES = num_micro_batches
+    # Ensure the global context list is large enough
+    if len(_CURRENT_CONTEXTS) < num_micro_batches:
+        _CURRENT_CONTEXTS.extend([None] * (num_micro_batches - len(_CURRENT_CONTEXTS)))
+
     """
     Create a context manager for micro-batching synchronization.
     """
@@ -210,8 +223,6 @@ def make_ubatch_contexts(
     gpu_comm_done_events = [torch.Event() for _ in range(num_micro_batches)]
     gpu_compute_done_events = [torch.Event() for _ in range(num_micro_batches)]
 
-    assert len(forward_contexts) == 2
-
     ctxs = []
     for i in range(num_micro_batches):
         ctx = UBatchContext(
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 2e8afec024ce92a5f94521d666cc4f34298580b0..85acc16795e2b2b232234816b8f1900134123454 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -1,20 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
 from collections import defaultdict
 from dataclasses import dataclass, field
 
 import torch
 from typing_extensions import deprecated
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import MultiModalEmbeddings
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.multimodal.cache import processor_only_cache_from_config
 from vllm.multimodal.registry import MultiModalRegistry
 from vllm.platforms import current_platform
+from vllm.utils.mem_utils import MemorySnapshot, format_gib
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
 from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
@@ -248,6 +250,29 @@ def gather_mm_placeholders(
     return placeholders[is_embed]
 
 
+def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> int:
+    """
+    Calculate the amount of memory required by vLLM, then validate
+    that the current amount of free memory is sufficient for that.
+    """
+    requested_memory = math.ceil(
+        init_snapshot.total_memory * cache_config.gpu_memory_utilization
+    )
+
+    if init_snapshot.free_memory < requested_memory:
+        raise ValueError(
+            f"Free memory on device {init_snapshot.device_} "
+            f"({format_gib(init_snapshot.free_memory)}/"
+            f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
+            f"is less than desired GPU memory utilization "
+            f"({cache_config.gpu_memory_utilization}, "
+            f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
+            f"utilization or reduce GPU memory used by other processes."
+        )
+
+    return requested_memory
+
+
 def add_kv_sharing_layers_to_kv_cache_groups(
     shared_kv_cache_layers: dict[str, str],
     kv_cache_groups: list[KVCacheGroupSpec],
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index e1ccb0c577f0b7d5d233e398288261ba21810d27..957b0a1108abb47cb44ed90b6a0fab90039d06e1 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -124,13 +124,19 @@ class WorkerBase:
         """Apply a function on the model inside this worker."""
         return fn(self.get_model())
 
+    def get_model_inspection(self) -> str:
+        """Return a transformers-style hierarchical view of the model."""
+        from vllm.model_inspection import format_model_inspection
+
+        return format_model_inspection(self.get_model())
+
     def load_model(self) -> None:
         """Load model onto target device."""
         raise NotImplementedError
 
     def execute_model(
         self, scheduler_output: SchedulerOutput
-    ) -> ModelRunnerOutput | None:
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | None:
         """If this method returns None, sample_tokens should be called immediately after
         to obtain the ModelRunnerOutput.
 
@@ -184,7 +190,6 @@ class WorkerWrapperBase:
 
     def __init__(
         self,
-        vllm_config: VllmConfig,
         rpc_rank: int = 0,
         global_rank: int | None = None,
     ) -> None:
@@ -200,21 +205,10 @@ class WorkerWrapperBase:
         """
         self.rpc_rank = rpc_rank
         self.global_rank = self.rpc_rank if global_rank is None else global_rank
-        self.worker: WorkerBase | None = None
 
-        # do not store this `vllm_config`, `init_worker` will set the final
-        # one.
-        # TODO: investigate if we can remove this field in `WorkerWrapperBase`,
-        # `init_cached_hf_modules` should be unnecessary now.
-        self.vllm_config: VllmConfig | None = None
-
-        # `model_config` can be None in tests
-        model_config = vllm_config.model_config
-        if model_config and model_config.trust_remote_code:
-            # note: lazy import to avoid importing torch before initializing
-            from vllm.utils.import_utils import init_cached_hf_modules
-
-            init_cached_hf_modules()
+        # Initialized after init_worker is called
+        self.worker: WorkerBase
+        self.vllm_config: VllmConfig
 
     def shutdown(self) -> None:
         if self.worker is not None:
@@ -247,27 +241,34 @@ class WorkerWrapperBase:
         Arguments are passed to the worker class constructor.
         """
         kwargs = all_kwargs[self.rpc_rank]
-        self.vllm_config = kwargs.get("vllm_config")
-        assert self.vllm_config is not None, (
+
+        vllm_config: VllmConfig | None = kwargs.get("vllm_config")
+        assert vllm_config is not None, (
             "vllm_config is required to initialize the worker"
         )
-        self.vllm_config.enable_trace_function_call_for_thread()
+        self.vllm_config = vllm_config
+
+        vllm_config.enable_trace_function_call_for_thread()
 
         from vllm.plugins import load_general_plugins
 
         load_general_plugins()
 
-        if isinstance(self.vllm_config.parallel_config.worker_cls, str):
-            worker_class = resolve_obj_by_qualname(
-                self.vllm_config.parallel_config.worker_cls
+        parallel_config = vllm_config.parallel_config
+        if isinstance(parallel_config.worker_cls, str):
+            worker_class: type[WorkerBase] = resolve_obj_by_qualname(
+                parallel_config.worker_cls
             )
         else:
             raise ValueError(
-                "passing worker_cls is no longer supported. Please pass keep the class in a separate module and pass the qualified name of the class as a string."  # noqa: E501
+                "passing worker_cls is no longer supported. "
+                "Please pass keep the class in a separate module "
+                "and pass the qualified name of the class as a string."
             )
-        if self.vllm_config.parallel_config.worker_extension_cls:
+
+        if parallel_config.worker_extension_cls:
             worker_extension_cls = resolve_obj_by_qualname(
-                self.vllm_config.parallel_config.worker_extension_cls
+                parallel_config.worker_extension_cls
             )
             extended_calls = []
             if worker_extension_cls not in worker_class.__bases__:
@@ -300,7 +301,7 @@ class WorkerWrapperBase:
                 "This argument is needed for mm_processor_cache_type='shm'."
             )
 
-            mm_config = self.vllm_config.model_config.multimodal_config
+            mm_config = vllm_config.model_config.multimodal_config
             if mm_config and mm_config.mm_processor_cache_type == "shm":
                 raise ValueError(msg)
             else:
@@ -309,7 +310,7 @@ class WorkerWrapperBase:
             self.mm_receiver_cache = None
         else:
             self.mm_receiver_cache = worker_receiver_cache_from_config(
-                self.vllm_config,
+                vllm_config,
                 MULTIMODAL_REGISTRY,
                 shared_worker_lock,
             )
@@ -317,7 +318,6 @@ class WorkerWrapperBase:
         with set_current_vllm_config(self.vllm_config):
             # To make vLLM config available during worker initialization
             self.worker = worker_class(**kwargs)
-            assert self.worker is not None
 
     def initialize_from_config(self, kv_cache_configs: list[Any]) -> None:
         kv_cache_config = kv_cache_configs[self.global_rank]
@@ -364,20 +364,15 @@ class WorkerWrapperBase:
             )
 
     def execute_model(
-        self,
-        scheduler_output: SchedulerOutput,
-        *args,
-        **kwargs,
-    ) -> ModelRunnerOutput | None:
+        self, scheduler_output: SchedulerOutput
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | None:
         self._apply_mm_cache(scheduler_output)
 
-        assert self.worker is not None
-        return self.worker.execute_model(scheduler_output, *args, **kwargs)
+        return self.worker.execute_model(scheduler_output)
 
     def reset_mm_cache(self) -> None:
         mm_receiver_cache = self.mm_receiver_cache
         if mm_receiver_cache is not None:
             mm_receiver_cache.clear_cache()
 
-        assert self.worker is not None
         self.worker.reset_mm_cache()
diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py
index 1faa1a24ff0ea75f6ed98e3e6b84fc7bc95561c8..fe0850771dd0d0115d697a3d2223c9f418e4af49 100644
--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -9,9 +9,9 @@ import torch.distributed
 from vllm.config import VllmConfig
 from vllm.distributed import get_world_group
 from vllm.logger import init_logger
-from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.profiler.wrapper import TorchProfilerWrapper
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 from vllm.v1.worker.xpu_model_runner import XPUModelRunner